# OST file parsing, backup, and pruning

I am creating this notebook to go through my old emails and clear up space, but also to glean any important information and get some good data out of them.
(This is all done after hours, for whoever finds this!)

In [None]:
import findspark

findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [None]:
# Create (or reuse) the Spark session for this notebook under the app name 'outlook'.
spark = SparkSession.builder.appName('outlook').getOrCreate()

In [None]:
import os
import datetime as dt

import pandas as pd
import numpy as np
import re

### Root Export Folder and Dictionary Directory

In [None]:
# Base folder under which the exported report attachments are written.
root_export_dir = 'd:/exports'
# Folder holding the lookup ("dictionary") files that are read with pd.read_csv further down.
dictionary_dir = 'd:/git/dictionaries'

# Connect to Outlook Worker

In [None]:
import win32com.client

# Attach to the Outlook application over COM and take its MAPI namespace;
# its .Folders collection is what the following cells iterate and index.
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Pull all Folders in gmcwilliams@example.co Email Account

In [None]:
# Print each top-level entry of the MAPI namespace so the store names can be copied below.
for top_level_folder in outlook.Folders:
    print(top_level_folder)

In [None]:
# Look up three top-level stores by the names printed in the previous cell.
account_gmcwilliams = outlook.Folders['gmcwilliams@example.co']
account_online_archive = outlook.Folders['Online Archive - cloud@example.co']
account_reports_manager = outlook.Folders['Reports Manager']

In [None]:
# Print each folder directly under the gmcwilliams store.
for account_folder in account_gmcwilliams.Folders:
    print(account_folder)

## Assign all folders variables for iteration later

In [None]:
# One handle per folder of the gmcwilliams store, looked up by folder name.
inbox = account_gmcwilliams.Folders['Inbox']
sent = account_gmcwilliams.Folders['Sent Items']
outbox = account_gmcwilliams.Folders['Outbox']
files = account_gmcwilliams.Folders['Files']
drafts = account_gmcwilliams.Folders['Drafts']
deleted = account_gmcwilliams.Folders['Deleted Items']
junk = account_gmcwilliams.Folders['Junk Email']

In [None]:
# Print each sub-folder of the Inbox.
for inbox_subfolder in inbox.Folders:
    print(inbox_subfolder)

# Create DataFrame from gmcwilliams@example.co Email Account

## Create Initial DataFrame from Inbox

In [None]:
# Collect a small sample of Inbox messages as plain dicts (one per message);
# the DataFrame cells below are built from `emails_list`.
MAX_MESSAGES = 10  # number of messages to sample while exploring

emails_list = []

messages = inbox.Items
# Second argument is Outlook's "descending" flag, so the most recently received come first.
messages.Sort('[ReceivedTime]', True)

# Field name -> accessor. Each field is read on its own so that an item which
# lacks one property still contributes the properties it does have.
field_readers = {
    'from': lambda m: m.SenderEmailAddress,
    'to': lambda m: m.To,
    'date': lambda m: m.ReceivedTime.strftime("%Y-%m-%d %H:%M:%S"),
    'subject': lambda m: m.Subject,
    'body': lambda m: m.Body,
}

for count, message in enumerate(messages, start=1):
    message_dict = {}
    for field, read_field in field_readers.items():
        try:
            message_dict[field] = read_field(message)
        except Exception as e:
            # Best effort: the key is simply left out, but say so instead of hiding it.
            print(f"Could not read '{field}' from message {count}: {e}")

    attachments_list = []
    for attachment in message.Attachments:
        try:
            attachments_list.append(str(attachment).lower())
        except Exception as e:
            print(e)

    message_dict['attachments'] = attachments_list
    # Keep the COM object itself so attachments can be saved from it later.
    message_dict['messageObject'] = message
    emails_list.append(message_dict)

    if count >= MAX_MESSAGES:
        break



In [None]:
# Spark schema for the message dicts in `emails_list`.
# Note that the 'messageObject' key of those dicts has no field here.
schema = [
    StructField("from", StringType(), True),
    StructField("to", StringType(), True),
    StructField("date", StringType(), True),
    StructField("subject", StringType(), True),
    StructField("body", StringType(), True),
    # Every message carries a *list* of attachment names, so declare an array
    # of strings rather than a single string.
    StructField("attachments", ArrayType(StringType()), True),
]

emails_list_schema = StructType(schema)

In [None]:
# Build a Spark DataFrame from the sampled message dicts using the explicit schema.
df_emails = spark.createDataFrame(emails_list, schema=emails_list_schema)

In [None]:
# Render the Spark DataFrame as a text table.
df_emails.show()

In [None]:
# Same sampled messages as a pandas DataFrame (all dict keys become columns, including 'messageObject').
df_inbox = pd.DataFrame(emails_list)

In [None]:
# Display the pandas DataFrame of sampled Inbox messages.
df_inbox

## Create and Concat Sub folders of Inbox onto Original Dataframe

In [None]:
# Collect every message from each sub-folder of the Inbox into `email_info`
# (one dict per message).
email_info = []

# Iterate the sub-folders of `inbox`, the folder assigned and listed in the cells above.
# (`reports_inbox` is only assigned much further down, so using it here fails on a fresh kernel.)
for folder in inbox.Folders:
    # Progress line only: `email_info` holds full message bodies, so it is never printed itself.
    print(f'Reading folder {folder}: {len(email_info)} messages collected so far')

    messages = folder.Items
    messages.Sort('[ReceivedTime]', True)

    for message in messages:
        message_dict = {
            'from': message.SenderEmailAddress,
            'to': message.To,
            'date': message.ReceivedTime.strftime("%Y-%m-%d %H:%M:%S"),
            'subject': message.Subject,
            'body': message.Body,
        }

        attachments_list = []
        for attachment in message.Attachments:
            try:
                attachments_list.append(str(attachment).lower())
            except Exception as e:
                print(e)

        message_dict['attachments'] = attachments_list
        # Keep the COM object so its attachments can be saved in the export step.
        message_dict['messageObject'] = message
        email_info.append(message_dict)

In [None]:
# pandas DataFrame of the sub-folder messages.
# NOTE: this rebinds `df_emails`, which an earlier cell assigned to a Spark DataFrame.
df_emails = pd.DataFrame(email_info)

In [None]:
# Parse the 'date' strings into datetimes; values that cannot be parsed become NaT.
df_emails = df_emails.assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce')
)

In [None]:
# Display the pandas DataFrame of sub-folder messages.
df_emails

# Shape and Export Email Report attachments

## Standardize Client Names

In [None]:
# Load the tab-separated client-name lookup and build {previousName: currentName}.
# NOTE(review): the column names follow the second copy of this cell further down;
# confirm them against the header of the .dict file.
df = pd.read_csv(f'{dictionary_dir}/client_name_standardization.dict', delimiter='\t')
client_rename_dict = {
    row['previousName']: row['currentName']
    for _, row in df.iterrows()
}

In [None]:
def client_names(c_name, rename_dict=None):
    """Map a subject line to a standardized client name.

    Each key of the rename dictionary is used as a regular expression. The
    first key whose substitution changes `c_name` wins, and its *value*
    (the whole replacement, not the substituted text) is returned.

    Parameters
    ----------
    c_name : str
        Text to search, e.g. an email subject. Non-string values (such as a
        missing subject) are returned unchanged.
    rename_dict : dict, optional
        Mapping of regex pattern -> standardized name. Defaults to the
        notebook-level `client_rename_dict`.

    Returns
    -------
    The matching standardized name, or `c_name` itself when nothing matched.
    """
    if rename_dict is None:
        rename_dict = client_rename_dict

    if not isinstance(c_name, str):
        return c_name

    for pattern, replacement in rename_dict.items():
        try:
            result = re.sub(pattern, replacement, c_name)
        except Exception as e:
            # One malformed entry must not stop the remaining entries from being tried.
            print(e)
            continue
        if result != c_name:
            print(f'Keyword found: {pattern}')
            print(f'Replacment value: {replacement}')
            print('\n')
            return replacement
    return c_name

In [None]:
# Derive the client name from each subject line.
# NOTE: a later cell defines another function called `client_names`; this line must run
# while the subject-standardizing version is the one in scope.
df_emails['clientName'] = df_emails['subject'].apply(client_names)

## Add Client Column to Source DataFrame

In [None]:
# Matches a run of 3-5 capital letters and/or '|' characters that has at least one
# non-word character on each side; the run itself is the captured group.
client_name_prog = re.compile(r'[^\w]+([A-Z\|]{3,5})[^\w]+')
clients_set = set()


def client_names(file):
    """Print the list of candidates that `client_name_prog` captures in `file`.

    Nothing is returned.

    NOTE(review): this reuses the name `client_names`, so running this cell replaces
    the subject-standardizing function defined in an earlier cell.
    """
    print(client_name_prog.findall(file))

### Standardize File Attachment Names

In [None]:
# Build {attachmentName: reportName} from the attachment-name lookup file.
# NOTE(review): the path is the dictionary folder name with '.csv' appended, not a file
# inside that folder like the other lookups - confirm this is the intended file.
df = pd.read_csv(f'{dictionary_dir}.csv')
attachment_name_dict = {
    record['attachmentName']: record['reportName']
    for _, record in df.iterrows()
}

### Iterate through known attachment names and replace with standard if found

In [None]:
def transform_name_to_standard(a_filename, name_dict=None):
    """Rewrite an attachment name using the first matching known pattern.

    Every key of the dictionary is tried in turn as a regular expression;
    the first one whose substitution changes `a_filename` decides the result.
    (Previously the function gave up after the first key.)

    Parameters
    ----------
    a_filename : str
        Attachment name without its extension.
    name_dict : dict, optional
        Mapping of regex pattern -> standard report name. Defaults to the
        notebook-level `attachment_name_dict`.

    Returns
    -------
    str or None
        The name after substitution, or None when no pattern matched.
    """
    if name_dict is None:
        name_dict = attachment_name_dict

    for pattern, standard_name in name_dict.items():
        result = re.sub(pattern, standard_name, a_filename)
        if result != a_filename:
            return result
    return None

### Create Labtech Report Dataframe and Rename, Export Attachments

In [None]:
# Keep only the emails whose subject mentions LabTech (either case for the L and the T).
# na=False counts a missing subject as "no match"; without it a NaN in the mask
# makes the boolean indexing raise.
df_labtech_reports = df_emails[df_emails['subject'].str.contains('[lL]ab[tT]ech', na=False)]

In [None]:
# Build {monthNumber: reportMonthFolder} from the archive-folder lookup file.
df = pd.read_csv(f'{dictionary_dir}/report_archive_folder_name.dict')
report_archive_folder_name_dict = {
    record['monthNumber']: record['reportMonthFolder']
    for _, record in df.iterrows()
}

In [None]:
def create_month_folder(month, folder_dict=None):
    """Translate a month number into its archive folder name.

    Parameters
    ----------
    month
        Month value to look up; compared with `==` against each dictionary key.
    folder_dict : dict, optional
        Mapping of month number -> folder name. Defaults to the notebook-level
        `report_archive_folder_name_dict`.

    Returns
    -------
    The folder name of the first key equal to `month`, or `month` itself
    when no key matches.
    """
    if folder_dict is None:
        folder_dict = report_archive_folder_name_dict

    for key, folder_name in folder_dict.items():
        try:
            is_match = bool(month == key)
        except Exception as e:
            # A comparison that cannot be evaluated ends the search; fall back to `month`.
            print(e)
            break
        if is_match:
            print(f'Keyword found: {key}')
            print(f'Replacment value: {folder_name}')
            print('\n')
            return folder_name
    return month

In [None]:
# Save every attachment of the LabTech report emails to
# <root_export_dir>/report_archives/<year>/<month folder>/ under a standardized name:
# "<client> - <YYYY - MM> - <report name>.<ext>".
archive_root = os.path.normpath(os.path.join(root_export_dir, 'report_archives'))
os.makedirs(archive_root, exist_ok=True)

file_ext_prog = re.compile(r'.*(antivirus_health|computer_audit|patch_compliance|third\sparty\spatch\saudit)([\w\d]+)$')

# Leading "[dd-dd-dddd dd.dd.dd]<up to 4 digits> - " stamp that is stripped from attachment names.
report_prefix_prog = re.compile(r'\[\d{2}\-\d{2}\-\d{4}\s\d{2}\.\d{2}\.\d{2}\]\d{,4}\s\-\s')
# Splits "<name>.<ext>" where the extension is at most 4 word characters.
name_and_ext_prog = re.compile(r'(.*)\.(\w{,4})$')

for index, row in df_labtech_reports.iterrows():
    if not row['attachments']:
        continue

    # Everything derived from the message date is identical for all of its attachments.
    year = row['date'].year
    month = row['date'].month
    month_folder = create_month_folder(month)
    export_dir = os.path.join(archive_root, str(year), str(month_folder))
    os.makedirs(export_dir, exist_ok=True)

    timestamp = str(row['date'].to_period('M')).replace('-', ' - ')
    client_name = row['clientName']
    print(client_name)

    # Attachments.Item() is addressed starting from 1, in the same order as the stored names.
    for i, attachment in enumerate(row['attachments'], start=1):
        org_filename = report_prefix_prog.sub("", attachment)
        split_attachment_name = name_and_ext_prog.match(org_filename)
        if split_attachment_name is None:
            # No "<name>.<ext>" shape to work with - report it instead of crashing the export.
            print(f"Skipping attachment [{attachment}]: no file extension found")
            continue

        stem, extension = split_attachment_name.groups()
        filename = transform_name_to_standard(stem)
        if filename is None:
            # Unknown report type: keep the original name rather than writing "None".
            filename = stem

        # Build the path once so the log line and the saved file always agree.
        target_path = os.path.join(export_dir, f"{client_name} - {timestamp} - {filename}.{extension}")
        print(f"Saving attachment [{attachment}] to ['{target_path}']")

        row['messageObject'].Attachments.Item(i).SaveAsFile(target_path)

In [None]:
import os
import datetime as dt

import pandas as pd
import numpy as np
import re

### Root Export Folder and Dictionary Directory

In [None]:
# Base folder under which the exported report attachments are written.
root_export_dir = 'd:/exports'
# Folder holding the lookup ("dictionary") files.
# NOTE: this reassigns `dictionary_dir` to a different path than the one set near the top of the notebook.
dictionary_dir = 'd:/git/example_infrastructure_data_dev/dictionaries'

# Connect to Outlook Worker

In [None]:
import win32com.client

# Attach to the Outlook application over COM and take its MAPI namespace.
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

# Pull all Folders in Reports Manager Email Account

In [None]:
# Print each top-level entry of the MAPI namespace.
for top_level_folder in outlook.Folders:
    print(top_level_folder)

In [None]:
# Handles to the reports online archive and to the Reports Manager store, looked up by name.
reports_archive = outlook.Folders['Online Archive - reports@example.co']
reports_manager = outlook.Folders['Reports Manager']

In [None]:
# Print each folder directly under the reports online archive.
for archive_folder in reports_archive.Folders:
    print(archive_folder)

## Assign all folders variables for iteration later

In [None]:
# One handle per folder, looked up by folder name.
# Note that the Inbox is taken from the online archive, while every other
# folder is taken from the Reports Manager store.
reports_inbox = reports_archive.Folders['Inbox']
reports_sent = reports_manager.Folders['Sent Items']
reports_outbox = reports_manager.Folders['Outbox']
reports_nonstandard = reports_manager.Folders['Non-Standard Reports']
reports_nonmonthly = reports_manager.Folders['Non-Monthly Reports']
reports_files = reports_manager.Folders['Files']
reports_drafts = reports_manager.Folders['Drafts']
reports_deleted = reports_manager.Folders['Deleted Items']
reports_junk = reports_manager.Folders['Junk Email']

In [None]:
# Print each sub-folder of the reports Inbox.
for inbox_subfolder in reports_inbox.Folders:
    print(inbox_subfolder)

# Create DataFrame from Reports Manager Email Account

## Create Initial DataFrame from Inbox

In [None]:
# Build one record per message that sits directly in the reports Inbox,
# then load the records into a pandas DataFrame.
emails_list = []

messages = reports_inbox.Items
messages.Sort('[ReceivedTime]', True)

for message in messages:
    record = {
        'from': message.SenderEmailAddress,
        'to': message.To,
        'date': message.ReceivedTime.strftime("%Y-%m-%d %H:%M:%S"),
        'subject': message.Subject,
        'body': message.Body,
    }

    # Lower-cased attachment names; an attachment whose name cannot be read is reported and left out.
    names = []
    for attachment in message.Attachments:
        try:
            names.append(str(attachment).lower())
        except Exception as e:
            print(e)

    record['attachments'] = names
    # The COM object is kept alongside the extracted fields.
    record['messageObject'] = message
    emails_list.append(record)

df_reports = pd.DataFrame(emails_list)

## Create and Concat Sub folders of Inbox onto Original Dataframe

In [None]:
# Collect every message from each sub-folder of the reports Inbox into `email_info`
# (one dict per message).
email_info = []

for folder in reports_inbox.Folders:
    # Progress line only: `email_info` holds full message bodies, so it is never printed itself.
    print(f'Reading folder {folder}: {len(email_info)} messages collected so far')

    messages = folder.Items
    messages.Sort('[ReceivedTime]', True)

    for message in messages:
        message_dict = {
            'from': message.SenderEmailAddress,
            'to': message.To,
            'date': message.ReceivedTime.strftime("%Y-%m-%d %H:%M:%S"),
            'subject': message.Subject,
            'body': message.Body,
        }

        attachments_list = []
        for attachment in message.Attachments:
            try:
                attachments_list.append(str(attachment).lower())
            except Exception as e:
                print(e)

        message_dict['attachments'] = attachments_list
        # Keep the COM object so its attachments can be saved in the export step.
        message_dict['messageObject'] = message
        email_info.append(message_dict)

In [None]:
# pandas DataFrame of the sub-folder messages collected in `email_info`.
df_emails = pd.DataFrame(email_info)

In [None]:
# Parse the 'date' strings into datetimes; values that cannot be parsed become NaT.
df_emails = df_emails.assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce')
)

In [None]:
# Display the pandas DataFrame of sub-folder messages.
df_emails

# Shape and Export Email Report attachments

## Standardize Client Names

In [None]:
# Build {previousName: currentName} from the client-name lookup file.
# NOTE(review): the earlier copy of this cell reads the same file with delimiter='\t';
# confirm which separator the file really uses.
df = pd.read_csv(f'{dictionary_dir}/client_name_standardization.dict')
client_rename_dict = {
    record['previousName']: record['currentName']
    for _, record in df.iterrows()
}

In [None]:
def client_names(c_name, rename_dict=None):
    """Map a subject line to a standardized client name.

    Each key of the rename dictionary is used as a regular expression. The
    first key whose substitution changes `c_name` wins, and its *value*
    (the whole replacement, not the substituted text) is returned.

    Parameters
    ----------
    c_name : str
        Text to search, e.g. an email subject. Non-string values (such as a
        missing subject) are returned unchanged.
    rename_dict : dict, optional
        Mapping of regex pattern -> standardized name. Defaults to the
        notebook-level `client_rename_dict`.

    Returns
    -------
    The matching standardized name, or `c_name` itself when nothing matched.
    """
    if rename_dict is None:
        rename_dict = client_rename_dict

    if not isinstance(c_name, str):
        return c_name

    for pattern, replacement in rename_dict.items():
        try:
            result = re.sub(pattern, replacement, c_name)
        except Exception as e:
            # One malformed entry must not stop the remaining entries from being tried.
            print(e)
            continue
        if result != c_name:
            print(f'Keyword found: {pattern}')
            print(f'Replacment value: {replacement}')
            print('\n')
            return replacement
    return c_name

In [None]:
# Derive the client name from each subject line.
# NOTE: a later cell defines another function called `client_names`; this line must run
# while the subject-standardizing version is the one in scope.
df_emails['clientName'] = df_emails['subject'].apply(client_names)

## Add Client Column to Source DataFrame

In [None]:
# Matches a run of 3-5 capital letters and/or '|' characters that has at least one
# non-word character on each side; the run itself is the captured group.
client_name_prog = re.compile(r'[^\w]+([A-Z\|]{3,5})[^\w]+')
clients_set = set()


def client_names(file):
    """Print the list of candidates that `client_name_prog` captures in `file`.

    Nothing is returned.

    NOTE(review): this reuses the name `client_names`, so running this cell replaces
    the subject-standardizing function defined in an earlier cell.
    """
    print(client_name_prog.findall(file))

### Standardize File Attachment Names

In [None]:
# Build {attachmentName: reportName} from the attachment-name lookup file.
# NOTE(review): the path is the dictionary folder name with '.csv' appended, not a file
# inside that folder like the other lookups - confirm this is the intended file.
df = pd.read_csv(f'{dictionary_dir}.csv')
attachment_name_dict = {
    record['attachmentName']: record['reportName']
    for _, record in df.iterrows()
}

### Iterate through known attachment names and replace with standard if found

In [None]:
def transform_name_to_standard(a_filename, name_dict=None):
    """Rewrite an attachment name using the first matching known pattern.

    Every key of the dictionary is tried in turn as a regular expression;
    the first one whose substitution changes `a_filename` decides the result.
    (Previously the function gave up after the first key.)

    Parameters
    ----------
    a_filename : str
        Attachment name without its extension.
    name_dict : dict, optional
        Mapping of regex pattern -> standard report name. Defaults to the
        notebook-level `attachment_name_dict`.

    Returns
    -------
    str or None
        The name after substitution, or None when no pattern matched.
    """
    if name_dict is None:
        name_dict = attachment_name_dict

    for pattern, standard_name in name_dict.items():
        result = re.sub(pattern, standard_name, a_filename)
        if result != a_filename:
            return result
    return None

### Create Labtech Report Dataframe and Rename, Export Attachments

In [None]:
# Keep only the emails whose subject mentions LabTech (either case for the L and the T).
# na=False counts a missing subject as "no match"; without it a NaN in the mask
# makes the boolean indexing raise.
df_labtech_reports = df_emails[df_emails['subject'].str.contains('[lL]ab[tT]ech', na=False)]

In [None]:
# Save every attachment of the LabTech report emails to
# <root_export_dir>/report_archives/<year>/ under a standardized name:
# "<client> - <YYYY - MM> - <report name>.<ext>".
archive_root = os.path.normpath(os.path.join(root_export_dir, 'report_archives'))
os.makedirs(archive_root, exist_ok=True)

file_ext_prog = re.compile(r'.*(antivirus_health|computer_audit|patch_compliance|third\sparty\spatch\saudit)([\w\d]+)$')

# Leading "[dd-dd-dddd dd.dd.dd]<up to 4 digits> - " stamp that is stripped from attachment names.
report_prefix_prog = re.compile(r'\[\d{2}\-\d{2}\-\d{4}\s\d{2}\.\d{2}\.\d{2}\]\d{,4}\s\-\s')
# Splits "<name>.<ext>" where the extension is at most 4 word characters.
name_and_ext_prog = re.compile(r'(.*)\.(\w{,4})$')

for index, row in df_labtech_reports.iterrows():
    if not row['attachments']:
        continue

    # Everything derived from the message date is identical for all of its attachments.
    export_dir = os.path.join(archive_root, str(row['date'].to_period('Y')))
    os.makedirs(export_dir, exist_ok=True)

    timestamp = str(row['date'].to_period('M')).replace('-', ' - ')
    client_name = row['clientName']
    print(client_name)

    # Attachments.Item() is addressed starting from 1, in the same order as the stored names.
    for i, attachment in enumerate(row['attachments'], start=1):
        org_filename = report_prefix_prog.sub("", attachment)
        split_attachment_name = name_and_ext_prog.match(org_filename)
        if split_attachment_name is None:
            # No "<name>.<ext>" shape to work with - report it instead of crashing the export.
            print(f"Skipping attachment [{attachment}]: no file extension found")
            continue

        stem, extension = split_attachment_name.groups()
        filename = transform_name_to_standard(stem)
        if filename is None:
            # Unknown report type: keep the original name rather than writing "None".
            filename = stem

        # Build the path once so the log line and the saved file always agree.
        target_path = os.path.join(export_dir, f"{client_name} - {timestamp} - {filename}.{extension}")
        print(f"Saving attachment [{attachment}] to ['{target_path}']")

        row['messageObject'].Attachments.Item(i).SaveAsFile(target_path)