## Joey's version in root folder of CRA_* repository

# Transform text files to emails
* File names contain: 
    * an immutable email id
    * the file name that contains the email
    * the line number that locates the email

2022-01-17
Author: Joey Jingze
Small Modifications: Gordon Erlebacher

In [4]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
import os
import io
import sys
import re
import csv
import pandas as pd
from tqdm import tqdm_notebook, tnrange, notebook
from datetime import datetime
from collections import defaultdict
import function_library3 as fctlib

In [22]:
from dataclasses import dataclass, field 

### Global Variables

In [23]:
# Global Variables
#inputpath = "./email_txt/downloaded_from_data_tallahassee/"
#infile = "9-1-Adam-Corey-2012-1-0.txt"

In [24]:
def print_list(li):
    for ele in li:
        print(ele)
def print_dict(dic):
    for key,val in dic.items():
        print(key, val)

In [25]:
headerReLib = {}

# headerReLib[re.compile(r'^Subject\:\s?(.*)')] = 'Subject' # possibly empty subject

headerReLib[re.compile(r'^From\:\s?(.*)')] = 'From'
headerReLib[re.compile(r'^Sent\:\s?(.*)')] = 'Sent'
headerReLib[re.compile(r'^Date\:\s?(.*)')] = 'Sent'
headerReLib[re.compile(r'^To\:\s?(.*)')] = 'To'
headerReLib[re.compile(r'^C[Cc]\:\s?(.*)')] = 'CC'
headerReLib[re.compile(r'^B[Cc][Cc]\:\s?(.*)')] = 'BCC'
headerReLib[re.compile(r'^Subject\:\s?(.*)')] = 'Subject' # possibly empty subject
# headerReLib[re.compile(r'^Attachments?\:\s?(.*)')] = 'Attachments' # search for Attachment/ARachment/AFachment
headerReLib[re.compile(r'^A(tt|R|F)achments?\:\s?(.*)')] = 'Attachments' # search for Attachment/ARachment/AFachment
# headerReLib[re.compile(r'^ARachments?\:\s?(.*)')] = 'Attachments' 
# headerReLib[re.compile(r'^AFachments?\:\s?(.*)')] = 'Attachments'  # need to correct spelling

headerReLib[re.compile(r'Importance\:\s?(.*)')] = 'Importance'

# headerReLib[re.compile(r'Read\:\s?(.*)')] = 'Read'

In [26]:
cleanReLib = {}
cleanReLib[re.compile(r'^Page\s\d+$')] = 'Page number'
cleanReLib[re.compile(r'^Page\s\d+\sof\s\d+$')] = 'Page number'
cleanReLib[re.compile(r'^\d{:3}\-\d{:3}$')] = 'Page number'

In [27]:
def saveEmail(email):
    # filecount and output path are global veriables
    global email_count
    global outputpath
    global inputfile
    global line_no
    saving_order = ['From', 'Sent', 'To', 'CC', 'Bcc', 'Subject', 'Attachment', 'Importance', 'Body']
    str_email_count = f"%05d" % email_count
    filenm = str(str_email_count)+'_fn_'+inputfile.split('.')[0]+'_ln_'+str(line_no)+'.txt'
    # print("NEW EMAIL FILE: ", outputpath, filenm)
    filenm_path = outputpath + '/' + filenm # will not work on Windows
    with open(filenm_path, 'w+', encoding = 'utf-8') as f_out:
        for key in saving_order:
            if not key in email:
                continue
            if key == 'Body':
                # print("Body: ", email['Body'])
                f_out.write('Body:\n')
                f_out.write(' '.join(email[key]) + '\n')
            elif key == 'Sent': # Make sure Date: is changed to Sent:
                f_out.write('Sent'+''.join(email[key])[4:] + '\n')
            else:
                f_out.write(' '.join(email[key]) + '\n')
    email_count += 1

In [28]:
def validEmail(email, save = True, saveAny = False):
    # if missing From or To, not a valid email. Read notice is skipped too.
    valid = False
    # email is a dictiory with keys: 'From', 'To', etc
    if 'From' in email and 'To' in email and 'Read:' not in ''.join(email['Subject']):
        valid = True
    if saveAny or (save and valid):
        saveEmail(email)
    return valid

In [29]:
def completeEmail(email):
    global stack
    global displacement_sections
    global displacement
    if stack:
        email['Body'] = email['Body'] + stack
        stack = [] 

    validEmail(email, save = True)
    
    displacement_sections = []
    displacement = False

In [30]:
def isUselessLine(line):
    for regex, cleaning_reason in cleanReLib.items(): 
        if regex.match(line):
            return True
    return False

In [31]:
@dataclass
class SavingArgs:
    inputpath: str
    infile: str
    outputpath: str
    email_count: int = 0
    prev_section: str = ''
        
@dataclass
class SplittingArgs:
    stack: list = field(default_factory=list)
    displacement_sections: list = field(default_factory=list)


In [33]:
sv_arg = SavingArgs(inputpath = "./test_input/", 
                    infile = "9-1-Adam-Corey-2012-1-0.txt",
                    outputpath = "./test_output/")
sv_arg

SavingArgs(inputpath='./test_input/', infile='9-1-Adam-Corey-2012-1-0.txt', outputpath='./test_output/', email_count=0, prev_section='')

In [34]:
spt_arg = SplittingArgs()
spt_arg

SplittingArgs(stack=[], displacement_sections=[])

In [12]:
sv_arg = SavingArgs(inputpath = "./test_input/", 
                    infile = "9-1-Adam-Corey-2012-1-0.txt",
                    outputpath = "./test_output/")
sv_arg

SavingArgs(inputpath='./test_input/', infile='9-1-Adam-Corey-2012-1-0.txt', outputpath='./test_output/', email_count=0, prev_section='')

# Testing: process dates (Sent:, Date:), all capitalizations

In [10]:
def process_dates_from_one_file(inputpath, infile):
    global stack, line_no
    
    match_date_header = re.compile('(Sent|Date):')

    f_in = open(inputpath + infile, encoding="utf8")
    outputpath = "./"
    stack = []

    empty_line_count = 0
    # All sections are better to be lists. This is designed for the block displacement issue
    email = defaultdict(list)
    dates = defaultdict(str)
    line_nos = defaultdict(int)

    # Iterate through the whole file
    for line_no_, line in enumerate(f_in.readlines()):
        # Find Sent: or Date: 
        match = match_date_header.match(line)
        if match:
            #print("================================")
            # print("==> ", line[5:])
            line = fctlib.standardize_date_string(line)
            #print("  > ", line[5:])
            #datetime(line)
            #print(len(line[5:]))
            
            # 19 Mar 2015 15:20:50
            try:
                # Month date year, 24-h:month AM/PM
                date = datetime.strptime(line[5:], "%B %d %Y %I:%M %p")
            except Exception as err:
                # print("==============================================")
                # print("===> Top level exception")
                # print("lineno: ", line_no_)
                #print("l#: ", line_no_, ", Original line: ", line)
                # print("  before process from new string")
                date = fctlib.process_dates_new_string(line[5:])
                # print(" date from fctlib:    > ", date)
                # print(Exception, err)
        pass

In [11]:
inputpath = "./email_txt/downloaded_from_data_tallahassee/"
#infile = "9-1-Adam-Corey-2012-1-0.txt"

files = os.listdir(inputpath)
print(files[5])
email_count = 0
file_nb = 0
search_range = files[file_nb : ]
# search_range = files

for inputfile in search_range:  # files[0:1] must be a list
    # print("==> ", inputpath, inputfile, email_count)
    process_dates_from_one_file(inputpath, inputfile)
    

9-1-Adam-Corey-2012-1-0.txt
... dt: ...September...
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ...12 52012 10:15 AM...
... dt: ......
... dt: ...12 52012 10:11 AM...
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ...February 72013 4:02 PM...
... dt: ...February 72013 4:02 PM...
... dt: ...February 72013 4:02 PM...
... dt: ...3 92013 1:28 AM...
... dt: ...3 92013 1:28 AM...
... dt: ......
... dt: ......
... dt: ......
... dt: ...4 22013 2:13 PM...
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ...4 22013 2:13 PM...
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ......
... dt: ...July 11th...
... dt: ...Jul

In [19]:
# all global variables
inputpath = './test/'
# infile = 'test_emails.txt'

def process_emails_from_one_file(inputpath, infile, email_count):
    global stack, outputpath, line_no
    
    f_in = open(inputpath + infile, encoding="utf8")
    outputpath = "./test"
    stack = []
    displacement_sections = []
    displacement = False

    empty_line_count = 0
    # All sections are better to be lists. This is designed for the block displacement issue
    email = defaultdict(list)
    # print("email.keys(): ", list(email.keys()))
    prev_section = None
    prev_empty = False
    insideAttachment = False

    # Iterate through the whole file
    for line_no_, line in enumerate(f_in.readlines()):
        # print("line: ", line)
        # Do not rely on availability of the loop index line_no_ outside the loop
        line_no = line_no_
        # GE: Should the > be stripped? We lose track of what was a thread. Perhaps thre should be a isThread flag?
        if line.count(">") > line.count("<"):
            email['isThread'] = True
        line = line.strip("\f").strip(">").strip()
        
        # print("\n==>>>=== line: ", line)

        # Skip some useless rows, but not empty rows
        if isUselessLine(line):
            # print("This line seems useless")
            # print(line)
            continue
            
        # Where is the To: header processed? I do not follow. (GE)

        # For the case that From/Subject is wrongly appended to the end
        # If people use "From/Subject:" in the content, this creates an incorrect split 
        # keys =['From', 'Sent', 'To', 'CC', 'Bcc', 'Subject', 'Attachment', 'Importance', 'Body']

        # print("email.keys(): ", list(email.keys()))
        
        # If a line contains one of the keys=[...] above, assume that the non-empty section to 
        # the left of the key is the last line in the email body
        # for key in keys:
        #     if key in line: 
        #         spt = line.split(key)
        #         if spt[0]:
        #             # spt[0] is tail of previous email body
        #             stack.append(spt[0])  # add to body
        #             completeEmail(email)
        #             email = defaultdict(list)
        #             line = key + ":" + spt[1].strip()
        #             break
        
        # Assume that the last line of an email can contain a header.
        # print("==> line: ", line)
        if "From:" in line:
            spt = line.split("From:")
            if spt[0]:
                # spt[0] is tail of previous email body
                # print("sp[0], From not empty")
                stack.append(spt[0])
                completeEmail(email)
                email = defaultdict(list)
                line = "From: " + spt[1].strip()  # GE: added a space and strip()
        elif "Subject:" in line:
            spt = line.split("Subject:")
            if spt[0]:
                # print("sp[0], Subject not empty")
                # stp[0] is tail of previous email body
                stack.append(spt[0])
                completeEmail(email)
                email = defaultdict(list)
                line = "Subject: " + spt[1].strip()  # GE: added a space and strip()
        # elif "To:" in line:
        #     # print("INSIDE To: on last line of mail")
        #     spt = line.split("To:")
        #     if spt[0]:
        #         print("sp[0], To: not empty")
        #         # stp[0] is tail of previous email body
        #         stack.append(spt[0])
        #         completeEmail(email)
        #         email = defaultdict(list)
        #         line = "To: " + spt[1].strip()  # GE: added a space and strip()
        
                # print("line: ", line)
        # Scanned documents are found. And usually contains To: that could mess up the splitting.
        # Scanned documents usually have a title line that all capital letters
        # if line.isupper():
        
        ### GE: Shouldl this be an elif? 
        ### GE: why the need for all(word.isupper() ....
        if line.isupper() and len(line) > 5 and all(word.isupper() for word in line.split()) and not '.' in line:
            insideAttachement = True
            completeEmail(email)
            email = defaultdict(list)
            continue
        #     if line =='':
        #         empty_line_count += 1
        #     else:
        #         empty_line_count = 0
        #     if empty_line_count >= 2:
        #         email = defaultdict(list)
        # if no pattern match, put into stack
        line_to_stack = True

        # search if line match any pattern
        # section  could be any one of keys()
        # keys =['From', 'Sent', 'To', 'CC', 'Bcc', 'Subject', 'Attachment', 'Importance', 'Body']

        for regex, section in headerReLib.items():
            if regex.match(line):
                # print("6: email.keys(): ", list(email.keys()))
                if section in email:  # repeat headers, indicating current email ends
                    # print(f"section {section} in email body")
                    # print("Email body: ", stack)
                    completeEmail(email)
                    email = defaultdict(list)
                elif stack:
                    if prev_section == None:
                        print("error!!!, prev_section == None")
                    # If error!!!, the email key is None (not very robust)
                    email[prev_section] = email[prev_section] + stack
                    stack = []
                    if displacement_sections:
                        displacement_sections.pop(0)
                    # displacement = False
                print("before, line: ", line)
                ### Searching for the regular expression in the line. Nothing found. Why? after all, 
                ### regex.math(line) returns true
                print("regex: ", regex)
                # Extract the pattern being searched from line
                line = regex.findall(line)[0]
                print("regex.findall(line): ", regex.findall(line))
                email[section].append(line)   # 
                if line == '': # found empty header
                    displacement_sections.append(section)
                
                prev_section = section
                # if match, don't append to stack.
                # have to use flag here, because there are multiple patterns testing
                line_to_stack = False
                break

        if line_to_stack and not insideAttachment:
            stack.append(line)
            
        # print("10: email.keys(): ", list(email.keys()))
        # if block displacement is found, and there are non-header lines in the stack
        if displacement and displacement_sections and stack:
            popline = stack.pop(0)
            if not popline == "":  # might be an empty line between the block of headers and block of fillers
                section = displacement_sections.pop(0)
                prev_section = section
                email[section].append(popline)
    
    # don't forget the last email
    completeEmail(email)

In [20]:
#inputpath = "./email_txt/downloaded_from_data_tallahassee/"
inputpath = "./email_txt/test_files/"
#infile = "9-1-Adam-Corey-2012-1-0.txt"
#outputpath = "./test_2022-01-19/"

files = os.listdir(inputpath)
email_count = 0

for inputfile in files: #notebook.tqdm(files[5]):
    print(inputpath, inputfile, email_count)
    process_emails_from_one_file(inputpath, inputfile, email_count)

./email_txt/test_files/ gordon2.txt 0
before, line:  From:
regex:  re.compile('^From\\:\\s?(.*)')
after line:  []
before, line:  Sent:
regex:  re.compile('^Sent\\:\\s?(.*)')
after line:  []
before, line:  To:
regex:  re.compile('^To\\:\\s?(.*)')
after line:  []
before, line:  Subject:
regex:  re.compile('^Subject\\:\\s?(.*)')
after line:  []
before, line:  From:
regex:  re.compile('^From\\:\\s?(.*)')
after line:  []
before, line:  Sent:
regex:  re.compile('^Sent\\:\\s?(.*)')
after line:  []
before, line:  To:
regex:  re.compile('^To\\:\\s?(.*)')
after line:  []
before, line:  Subject:
regex:  re.compile('^Subject\\:\\s?(.*)')
after line:  []
before, line:  From:
regex:  re.compile('^From\\:\\s?(.*)')
after line:  []
before, line:  Sent:
regex:  re.compile('^Sent\\:\\s?(.*)')
after line:  []
before, line:  To:
regex:  re.compile('^To\\:\\s?(.*)')
after line:  []
before, line:  Subject:
regex:  re.compile('^Subject\\:\\s?(.*)')
after line:  []
./email_txt/test_files/ gordon.txt 3
before,

TypeError: expected string or bytes-like object