### Joey's version in root CRA folder

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Joey's-version-in-root-CRA-folder" data-toc-modified-id="Joey's-version-in-root-CRA-folder-0.0.1"><span class="toc-item-num">0.0.1&nbsp;&nbsp;</span>Joey's version in root CRA folder</a></span></li></ul></li></ul></li><li><span><a href="#Transform-text-files-to-emails" data-toc-modified-id="Transform-text-files-to-emails-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Transform text files to emails</a></span></li><li><span><a href="#Make-dicts-and-def-functions" data-toc-modified-id="Make-dicts-and-def-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Make dicts and def functions</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Usage</a></span></li><li><span><a href="#non-function-code-below" data-toc-modified-id="non-function-code-below-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>non function code below</a></span></li></ul></div>

# Transform text files to emails
* Each output file name contains:
    * an immutable email id (a zero-padded running counter)
    * the name of the input file that contains the email (without its extension)
    * the input line number at which the email was completed

# Make dicts and def functions

**All cells below are necessary**

In [1]:
import os
import io
import sys
import re
import csv
import pandas as pd
from tqdm.notebook import tqdm, tnrange
from datetime import datetime
from collections import defaultdict
from dataclasses import dataclass, field 

In [2]:
# Header detection table: compiled regex -> canonical section name.
# Every regex must contain exactly ONE capture group (the header value).
# Order matters: the first pattern that matches a line wins.
headerReLib = {
    re.compile(r'^From\:\s?(.*)'): 'From',
    re.compile(r'^Sent\:\s?(.*)'): 'Sent',
    re.compile(r'^Date\:\s?(.*)'): 'Sent',  # "Date:" is stored under 'Sent'
    re.compile(r'^To\:\s?(.*)'): 'To',
    re.compile(r'^C[Cc]\:\s?(.*)'): 'CC',
    re.compile(r'^B[Cc][Cc]\:\s?(.*)'): 'BCC',
    re.compile(r'^Subject\:\s?(.*)'): 'Subject',
    re.compile(r'^Attachments\:\s?(.*)'): 'Attachments',  # Attachment: is not reliable in 9-1
    re.compile(r'Importance\:\s?(.*)'): 'Importance',
    re.compile(r'Priority\:\s?(.*)'): 'Importance',  # "Priority:" is stored under 'Importance'
}



In [3]:
# Lines matching any of these patterns are noise and get dropped.
# Maps compiled regex -> human-readable reason for dropping the line.
cleanReLib = {
    re.compile(r'^Page\s\d+$'): 'Page number',
    re.compile(r'^Page\s\d+\sof\s\d+$'): 'Page number',
    re.compile(r'^\d{,3}\-\d{,3}$'): 'Page number and file number',
}


In [4]:
# Known misspellings found in the text -> their corrected form.
typoLib = {
    'ARachments': 'Attachments',
    'AFachments': 'Attachments',
}


In [91]:
def saveEmail(email, sv_arg):
    """Write one parsed email to its own text file and advance the email counter.

    The output file is named ``<NNNNN>_fn_<infile stem>_ln_<line_no>.txt`` where
    ``NNNNN`` is ``sv_arg.email_count`` zero-padded to five digits, the stem is
    the part of ``sv_arg.infile`` before the first '.', and ``line_no`` is
    ``sv_arg.line_no``. The file is created inside ``sv_arg.outputpath``.

    Parameters
    ----------
    email : mapping
        Section name -> list of strings, or a bool for the flag sections.
    sv_arg : object
        Must expose ``email_count``, ``infile``, ``line_no``, ``outputpath`` and
        ``line_nb`` (section name -> list of line numbers).

    Side effects
    ------------
    Creates/overwrites the output file and increments ``sv_arg.email_count``.
    """
    # Sections are written in this fixed order. The key is 'BCC' (not 'Bcc')
    # because that is the section name the header table stores it under.
    saving_order = ['From', 'Sent', 'To', 'CC', 'BCC', 'Subject', 'Attachments', 'Importance', 'Body', 'isThread', 'isAutoMessage', 'isDisplacement', 'hasAllCapLine', 'hasBadDate']
    bool_headers = ['isThread', 'isAutoMessage', 'hasAllCapLine', 'hasBadDate', 'isDisplacement']

    str_email_count = "%05d" % sv_arg.email_count
    filenm = str_email_count + '_fn_' + sv_arg.infile.split('.')[0] + '_ln_' + str(sv_arg.line_no) + '.txt'
    filenm_path = os.path.join(sv_arg.outputpath, filenm)  # portable path join

    with open(filenm_path, 'w+', encoding='utf-8') as f_out:
        for key in saving_order:
            if key not in email:
                continue
            # .get() avoids inserting empty entries and also works with a plain dict
            line_no = sv_arg.line_nb.get(key, [])
            if key == 'Body':
                # body lines keep their line structure; no line-number prefix
                f_out.write(key + ': ' + '\n'.join(email[key]) + '\n')
            elif key in bool_headers:
                # flag sections hold a bool rather than a list of strings
                f_out.write(f'{line_no} ' + key + ': ' + str(email[key]) + '\n')
            else:
                # multi-line header values are flattened onto one line
                f_out.write(f'{line_no} ' + key + ': ' + ' '.join(email[key]) + '\n')
    sv_arg.email_count += 1

In [110]:
def validEmail(email, sv_arg, save=True, saveAny=False):
    """Decide whether `email` looks like a real email and optionally save it.

    An email is valid when at least two of the sections "From", "To" and
    "Sent" are present. It is saved when `saveAny` is true, or when `save`
    is true and the email is valid. A one-line status message is printed in
    either case.

    Returns True if the email is valid, False otherwise.
    """
    # count how many of the key sections were found
    valid_count = sum(1 for section in ("From", "To", "Sent") if section in email)
    valid = valid_count >= 2

    should_save = saveAny or (save and valid)
    if not should_save:
        print(f"GE: Email not saved, line_no: {sv_arg.line_no}, valid_count: {valid_count}")
        return valid

    saveEmail(email, sv_arg)
    print(f"GE: Email saved, line_no: {sv_arg.line_no}, valid_count: {valid_count}")
    return valid

In [115]:
def completeEmail(email, sv_arg, spt_arg):
    """Close the current email: flush the stack into its body, save it, reset state.

    Every line still waiting in `spt_arg.stack` is appended to the email's
    "Body" section, the current line number is recorded for "Body", and the
    email is handed to `validEmail` (saved only if valid). Afterwards the
    splitting state (stack, displacement_sections, prev_section) is cleared.
    """
    # whatever is left in the stack belongs to the body
    pending_lines = spt_arg.stack
    email["Body"] = email["Body"] + pending_lines
    sv_arg.line_nb["Body"].append(sv_arg.line_no)

    validEmail(email, sv_arg, save=True, saveAny=False)

    finished_section = spt_arg.prev_section
    print(f"GE: Reset stack, displacement_sections, prev_section ({finished_section})")

    # start the next email from a clean state
    spt_arg.stack = []
    spt_arg.displacement_sections = []
    spt_arg.prev_section = None  # GE: Correction. 2022-01-20

In [102]:
def isUselessLine(line, file_no = ''):
    """Return True when `line` carries no content worth keeping.

    A line is useless when it is empty, equals the file number `file_no`,
    or matches one of the patterns in `cleanReLib`.
    """
    if line in ('', file_no):
        return True
    return any(pattern.match(line) for pattern in cleanReLib)

In [103]:
def getFileNumber(infile):
    """Extract the file number prefix from a file name.

    The name is split on '-'. The first piece is always kept; the second is
    appended (joined by '-') only when it consists solely of digits,
    e.g. "9-1-Adam-Corey-2012-1-0.txt" -> "9-1".
    """
    pieces = infile.split('-')
    if len(pieces) > 1 and pieces[1].isdigit():
        return '-'.join(pieces[:2])
    return pieces[0]

In [104]:
@dataclass
class SavingArgs:
    """State shared between the splitter and the code that writes emails to disk."""
    inputpath: str # path to a folder ending with /
    outputpath: str # path to a folder ending with /
    infile: str = '' # filename only
    email_count: int = 0 # running counter used as the email id in output file names
    line_no: int = -1 # index of the input line currently being processed
    # section name -> list of input line numbers recorded for that section.
    # Declared here (instead of being attached dynamically) so that a fresh
    # instance can be used for saving without going through process() first.
    line_nb: dict = field(default_factory=lambda: defaultdict(list))
    
@dataclass
class SplittingArgs:
    """Mutable parser state carried from line to line while splitting a file into emails."""
    # Name of the most recently matched header section; None until one is found
    # and again after an email is completed.
    prev_section: str = None
    # Lines read but not yet assigned to a section or to the body.
    stack: list = field(default_factory=list)
    # Header sections that were found with an empty value and still await their text.
    displacement_sections: list = field(default_factory=list)
    # Decremented on every line and set to 2 on a header match; the all-caps
    # attachment check only fires while this is <= 0.
    capTolerance: int = 0
#     appendingMode: bool = False   # Autoreply and attachment must be treated differently
    # True while inside an auto-generated message (read receipt, meeting reply, ...).
    isAutoMessage: bool = False
    # Set once a displaced header block has been detected.
    isDisplacement: bool = False
    # True while the following lines are treated as attachment text.
    isAttachment: bool = False

In [149]:
def process(sv_arg):
    """Split one text file into individual emails and save the valid ones.

    Reads ``sv_arg.inputpath + sv_arg.infile`` and walks it line by line.
    Header lines (see ``headerReLib``) are collected into sections of the
    current email; other lines are stacked until they can be attributed to a
    section or to the body. A repeated header, or text in front of an inline
    "From:"/"Subject:", closes the current email via ``completeEmail`` and a
    new one is started.

    Three situations switch to "append everything to the body" mode:
    an auto-message keyword, an all-caps title line (embedded attachment),
    and a repeated Sent/Date header whose value has no clock time.

    Parameters
    ----------
    sv_arg : SavingArgs
        ``inputpath``, ``outputpath`` and ``infile`` must be set. This function
        overwrites ``sv_arg.line_nb`` and ``sv_arg.line_no`` and may set the
        debug attribute ``sv_arg.GE_subject_complete_email``.
    """
    # A Sent/Date value counts as well-formed when it contains a clock time such as 4:52
    re_wrong_date = re.compile(r"\d{1,2}:\d{2}")

    # Read everything up front so the file handle is closed before parsing starts
    with open(sv_arg.inputpath + sv_arg.infile, encoding="utf8") as f_in:
        lines = f_in.readlines()
    file_no = getFileNumber(sv_arg.infile)

    # initialize SplittingArgs and email object
    spt_arg = SplittingArgs()
    email = defaultdict(list)
    sv_arg.line_nb = defaultdict(list)

    for line_no, line in enumerate(lines):
        sv_arg.line_no = line_no
        # decrement capTolerance (starts at 0; a header match resets it to 2)
        spt_arg.capTolerance -= 1

        # if line start with >, it's in a thread
        line = line.strip("\f").strip()
        if len(line) > 0 and line[0] == '>':
            email['isThread'] = True
            sv_arg.line_nb['isThread'].append(line_no)
        line = line.strip(">").strip()

        # Skip some useless rows, including empty rows
        if isUselessLine(line, file_no):
            continue

        # correct typos
        for typo, correction in typoLib.items():
            if typo in line:
                line = line.replace(typo, correction)

        # Autoreply and attachments are unlikely to have From: inside, but very likely to have To and Date.
        # If found them, turn back to only consider From as the beginning of an email
        # Only case to turn off appendingMode
        if "From:" in line:
            # maxsplit=1 keeps everything after the first marker, even if the marker repeats
            spt = line.split("From:", 1)
            if spt[0]:
                # text in front of "From:" still belongs to the previous email
                spt_arg.stack.append(spt[0])
                completeEmail(email, sv_arg, spt_arg)
                email = defaultdict(list)
                sv_arg.line_nb = defaultdict(list)
                line = "From:" + spt[1]
            if spt_arg.isAutoMessage or spt_arg.isAttachment:
                spt_arg.isAutoMessage = False
                spt_arg.isAttachment = False
                completeEmail(email, sv_arg, spt_arg)
                email = defaultdict(list)
                sv_arg.line_nb = defaultdict(list)
        elif "Subject:" in line:
            spt = line.split("Subject:", 1)
            if spt[0]:
                print(f"split subject, line: {line_no}")
                spt_arg.stack.append(spt[0])
                print("GE, Subject: before completeEmail(0), line_no: ", line_no)
                completeEmail(email, sv_arg, spt_arg)
                email = defaultdict(list)
                sv_arg.line_nb = defaultdict(list)
                line = "Subject:" + spt[1]
                print(f"new subject line: {line}")
            if spt_arg.isAttachment:
                spt_arg.isAttachment = False
                completeEmail(email, sv_arg, spt_arg)
                email = defaultdict(list)
                sv_arg.line_nb = defaultdict(list)

        # append everything into body
        if spt_arg.isAutoMessage or spt_arg.isAttachment:
            spt_arg.stack.append(line)
            continue

        # if these string shows in line, it's probably the end of body. So everything below might be attachment.
        end_of_body = ['Tallahassee, FL', 'Phone:', 'FAX:']
        for pattern in end_of_body:
            if pattern in line:
                spt_arg.isAttachment = True

        # Autoreplys or autoforwards are found. First of three cases triggers appendingMode.
        keywordsInAutomessage = ["Read:", "Not read:", "Sender:", "Tentative:", "Accepted:", "Declined:"]
        for keyword in keywordsInAutomessage:
            if keyword in line:
                spt_arg.isAutoMessage = True
                email['isAutoMessage'] = True
                sv_arg.line_nb['isAutoMessage'].append(line_no)

        # Embedded attachments are found. Second of three cases triggers appendingMode.
        # Attachments usually have a title line that all capital letters
        # Add tolerance for all cap lines if comes right after a header section
        if spt_arg.capTolerance <= 0 and len(line) > 5 and all(word.isupper() for word in line.split()) and not '.' in line and not ':' in line:
            spt_arg.isAttachment = True
            spt_arg.stack.append(line)
            email['hasAllCapLine'] = True
            sv_arg.line_nb['hasAllCapLine'].append(line_no)
            continue

        # if no pattern match, put into stack
        line_to_stack = True

        # search if line match any pattern
        for regex, section in headerReLib.items():
            if regex.match(line):
                # if any header section is found, the next line can be all cap without trigger appendingMode
                # capTolerance = 2 actually has 1 line tolerance. Because decrement is earlier than checking
                spt_arg.capTolerance = 2 # can be larger, but with cautious

                # found a repeating header section, indicating current email ends
                if section in email:
                    # Bad date found. Third of three cases triggers appendingMode.
                    # This can be an attachment or an appointment
                    if section == 'Sent' and not re_wrong_date.search(line):
                        spt_arg.stack.append(line)
                        spt_arg.isAttachment = True
                        email['hasBadDate'] = True
                        sv_arg.line_nb['hasBadDate'].append(line_no)
                        line_to_stack = False
                        break
                    else:
                        if section == 'Subject':
                            print("GE, Subject: before completeEmail(1), line_no: ", line_no)
                            sv_arg.GE_subject_complete_email = True
                            print(f"GE,    email: {email}")
                        print(f"Call completeEmail (1), email: {email}")
                        completeEmail(email, sv_arg, spt_arg)
                        print("GE: reset email and line_nb dictionaries\n")
                        email = defaultdict(list)
                        sv_arg.line_nb = defaultdict(list)

                # section not in email, all lines in stack belongs to prev_section
                elif spt_arg.stack:
                    if spt_arg.prev_section is None:
                        print("If this is not happenning at the beginning of a file, it is an error")
                        print('current line no:', line_no)
                        spt_arg.stack = []
                    else:
                        email[spt_arg.prev_section] = email[spt_arg.prev_section] + spt_arg.stack
                        sv_arg.line_nb[spt_arg.prev_section].append(line_no)   # I could use append so I can see trace
                        spt_arg.stack = []

                    if spt_arg.displacement_sections:
                        print(f"if spt_arg.displacement_sections, line_no: {line_no}")
                        print(f"GE,    email: {email}")
                        spt_arg.displacement_sections.pop(0)

                # get rid of the header
                line = regex.findall(line)[0]
                email[section].append(line)
                sv_arg.line_nb[section].append(line_no)   # last line nb

                # found empty, maybe it's a block displacement
                if line == '':
                    print(f"line is empty: two quotes. line_no: {line_no}")
                    print(f"GE,     email: {email}")
                    spt_arg.displacement_sections.append(section)
                    print(f"GE,     displacement_sections: ", spt_arg.displacement_sections)
                    print(f"GE,     email: {email}")
                    spt_arg.capTolerance += 1 # if found empty header, there can have one more line all cap
                spt_arg.prev_section = section

                # if match, don't append to stack.
                # have to use flag here, because there are multiple patterns testing
                line_to_stack = False
                break

        if line_to_stack:
            spt_arg.stack.append(line)

        # if block displacement is found, and there are lines in the stack
        if spt_arg.displacement_sections and spt_arg.stack:
            print(f"in displacement_sections and stack is NOT empty, line_no: {line_no}")
            print(f"   stack: {spt_arg.stack}")
            print(f"   email: {email}")
            spt_arg.isDisplacement = True
            email['isDisplacement'] = True
            sv_arg.line_nb['isDisplacement'].append(line_no)
            # the oldest stacked line fills the oldest empty header
            popline = spt_arg.stack.pop(0)
            section = spt_arg.displacement_sections.pop(0)
            email[section].append(popline)
            sv_arg.line_nb[section].append(line_no)
            spt_arg.prev_section = section

    # don't forget the last email
    completeEmail(email, sv_arg, spt_arg)

# Usage

- Initialize SavingArgs with an inputpath and an outputpath as 
```python
sv_arg = SavingArgs(inputpath = "./test_input/", outputpath = "./test_output/")
```
- For single file
```python
sv_arg.infile = "9-1-Adam-Corey-2012-1-0.txt"
process(sv_arg)
```
- For multiple files
```python
for file in filenames:
    sv_arg.infile = file
    process(sv_arg)
```

**Note**
If you want to rerun, it's better to reinitialize sv_arg, because sv_arg.email_count keeps increasing.

In [147]:
# Split gordon4.txt from ./email_txt/test_files/ and write the emails to ./gordon4/.
sv_ge = SavingArgs(
    inputpath="./email_txt/test_files/",
    outputpath="./gordon4/",
    infile="gordon4.txt",
    email_count=0,
)
process(sv_ge)

Call completeEmail (1), email: defaultdict(<class 'list'>, {'From': ['Stephanie Hafer'], 'Sent': ['Thursday, March 05, 2015 4:52 PM'], 'To': ['Serena Moyle; Patti Hilaman; Paige Carter-Smith; PaigeCarter-Smith; Audra Pittman;', 'Courtney Ewing; Roderick Durham; Jay Revell; Mike Bellamy'], 'CC': ['Mallory.Izbicki@lungse.org; NicolaBarnack; Alex Medina'], 'Subject': ['Event Day Practice']})
GE: Email saved, line_no: 22, valid_count: 3
GE: Reset stack, displacement_sections, prev_section (Subject)
GE: reset email and line_nb dictionaries

Call completeEmail (1), email: defaultdict(<class 'list'>, {'From': ['Courtney Ewing'], 'Sent': ['Thursday, March 19, 2015 1:17 PM'], 'To': ["Mallory Izbicki; 'Bellamy, Mike Fire Dept'; 'Jay Revell'; Roderick Durham", "roderickdurham@gmail.com; 'PattiHilaman'; 'Paige Carter-Smith'; 'Audra Pittman';", "'SerenaMoyle'"], 'CC': ['Nicola Barnack'], 'Subject': ['RE: Oxygen Ball Cocktail Hour Wrap Up']})
GE: Email saved, line_no: 33, valid_count: 3
GE: Reset st

In [107]:
# Split gordon5.txt, reading from and writing to ./gordon5/.
sv_ge = SavingArgs(
    inputpath="./gordon5/",
    outputpath="./gordon5/",
    infile="gordon5.txt",
    email_count=0,
)
process(sv_ge)

In [108]:
# Split gordon6.txt, reading from and writing to ./gordon6/.
sv_ge = SavingArgs(
    inputpath="./gordon6/",
    outputpath="./gordon6/",
    infile="gordon6.txt",
    email_count=0,
)
process(sv_ge)

GE: Email not saved, line_no: 299, valid_count: 0
GE: Email not saved, line_no: 2006, valid_count: 1


In [15]:
# Split gordon7.txt, reading from and writing to ./gordon7/.
sv_ge = SavingArgs(
    inputpath="./gordon7/",
    outputpath="./gordon7/",
    infile="gordon7.txt",
    email_count=0,
)
process(sv_ge)

In [16]:
# Split 23-11-PCSgmail2014-2017.txt, reading from and writing to ./23-11/.
sv_ge = SavingArgs(
    inputpath="./23-11/",
    outputpath="./23-11/",
    infile="23-11-PCSgmail2014-2017.txt",
    email_count=0,
)
process(sv_ge)

In [17]:
# Split 17-2_subset.txt, reading from and writing to ./17-2/.
sv_ge = SavingArgs(
    inputpath="./17-2/",
    outputpath="./17-2/",
    infile="17-2_subset.txt",
    email_count=0,
)
process(sv_ge)

In [18]:
# Split 17-2.txt, reading from and writing to ./17-2a/.
sv_ge = SavingArgs(
    inputpath="./17-2a/",
    outputpath="./17-2a/",
    infile="17-2.txt",
    email_count=0,
)
process(sv_ge)

In [19]:
# Split 17-2-IB2013-1-0.txt, reading from and writing to ./17-2b/.
sv_ge = SavingArgs(
    inputpath="./17-2b/",
    outputpath="./17-2b/",
    infile="17-2-IB2013-1-0.txt",
    email_count=0,
)
process(sv_ge)

In [20]:
# Split 23-11.txt, reading from and writing to ./23-11a/.
sv_ge = SavingArgs(
    inputpath="./23-11a/",
    outputpath="./23-11a/",
    infile="23-11.txt",
    email_count=0,
)
process(sv_ge)

In [21]:
# Split 34-3.txt, reading from and writing to ./34-3/.
sv_ge = SavingArgs(
    inputpath="./34-3/",
    outputpath="./34-3/",
    infile="34-3.txt",
    email_count=0,
)
process(sv_ge)

In [22]:
# Split 18-4.txt, reading from and writing to ./18-4/.
sv_ge = SavingArgs(
    inputpath="./18-4/",
    outputpath="./18-4/",
    infile="18-4.txt",
    email_count=0,
)
process(sv_ge)

In [23]:
# Split 18-4a.txt, reading from and writing to ./18-4/.
sv_ge = SavingArgs(
    inputpath="./18-4/",
    outputpath="./18-4/",
    infile="18-4a.txt",
    email_count=0,
)
process(sv_ge)

In [24]:
# Split 18-4b.txt, reading from and writing to ./18-4/.
sv_ge = SavingArgs(
    inputpath="./18-4/",
    outputpath="./18-4/",
    infile="18-4b.txt",
    email_count=0,
)
process(sv_ge)

In [25]:
# Split 18-4d.txt, reading from and writing to ./18-4/.
sv_ge = SavingArgs(
    inputpath="./18-4/",
    outputpath="./18-4/",
    infile="18-4d.txt",
    email_count=0,
)
process(sv_ge)

In [26]:
# Split 34-1a.txt, reading from and writing to ./34-1/.
sv_ge = SavingArgs(
    inputpath="./34-1/",
    outputpath="./34-1/",
    infile="34-1a.txt",
    email_count=0,
)
process(sv_ge)

In [27]:
# Split 34-1b.txt, reading from and writing to ./34-1/.
sv_ge = SavingArgs(
    inputpath="./34-1/",
    outputpath="./34-1/",
    infile="34-1b.txt",
    email_count=0,
)
process(sv_ge)

In [30]:
# Split 34-1c.txt, reading from and writing to ./34-1/.
sv_ge = SavingArgs(
    inputpath="./34-1/",
    outputpath="./34-1/",
    infile="34-1c.txt",
    email_count=0,
)
process(sv_ge)

In [31]:
def process_case(path, filenm):
    """Split the file `filenm` located in folder `path`, saving the emails into `path`.

    A fresh SavingArgs is created for every call, so email numbering starts at 0.
    """
    case_args = SavingArgs(
        inputpath=path,
        outputpath=path,
        infile=filenm,
        email_count=0,
    )
    process(case_args)

In [39]:
# Split 21-10.txt in place inside ./21-10/.
process_case(path="./21-10/", filenm="21-10.txt")

In [40]:
# Split 21-11.txt in place inside ./21-11/.
process_case(path="./21-11/", filenm="21-11.txt")

In [41]:
# Split 23-4.txt in place inside ./23-4/.
process_case(path="./23-4/", filenm="23-4.txt")

In [None]:
# Split 21-6.txt in place inside ./21-6/.
process_case(path="./21-6/", filenm="21-6.txt")

In [150]:
# Test files in ./10-8/ to run (names without the .txt extension).
# Earlier selections: ["test1", "test2", "test3"] and ["test20", "test21"]
files = ["test23"]
for f in files:
    process_case(path="./10-8/", filenm=f"{f}.txt")

line is empty: two quotes. line_no: 6
GE,     email: defaultdict(<class 'list'>, {'Subject': ["Zaxby's VIP Grand Opening xxx", 'Location: xx1', 'yyy  <<< LAST LINE OF BODY INCLUDED IN SUBJECT'], 'From': ['']})
GE,     displacement_sections:  ['From']
GE,     email: defaultdict(<class 'list'>, {'Subject': ["Zaxby's VIP Grand Opening xxx", 'Location: xx1', 'yyy  <<< LAST LINE OF BODY INCLUDED IN SUBJECT'], 'From': ['']})
line is empty: two quotes. line_no: 7
GE,     email: defaultdict(<class 'list'>, {'Subject': ["Zaxby's VIP Grand Opening xxx", 'Location: xx1', 'yyy  <<< LAST LINE OF BODY INCLUDED IN SUBJECT'], 'From': [''], 'Sent': ['']})
GE,     displacement_sections:  ['From', 'Sent']
GE,     email: defaultdict(<class 'list'>, {'Subject': ["Zaxby's VIP Grand Opening xxx", 'Location: xx1', 'yyy  <<< LAST LINE OF BODY INCLUDED IN SUBJECT'], 'From': [''], 'Sent': ['']})
line is empty: two quotes. line_no: 8
GE,     email: defaultdict(<class 'list'>, {'Subject': ["Zaxby's VIP Grand Openi

In [29]:
# Batch run: split every file in the download folder into per-email text files.
sv_ge = SavingArgs(inputpath= "./email_txt/downloaded_from_data_tallahassee/", outputpath = "./test_output/")

# IMPORTANT to reset email_count (the attribute name was previously misspelled,
# which created a stray attribute instead of resetting the counter)
sv_ge.email_count = 0

# os.listdir returns entries in arbitrary order; sort so the running email ids
# in the output file names are the same on every run.
files = sorted(os.listdir(sv_ge.inputpath))

# files[0:] processes everything; raise the start index to resume part-way
for inputfile in tqdm(files[0:]):
    sv_ge.infile = inputfile
    # print(sv_ge.inputpath, inputfile)
    process(sv_ge)

  0%|          | 0/124 [00:00<?, ?it/s]

GE: Email saved, line_no: 26, valid_count: 3
GE: Reset stack, displacement_sections, prev_section (Attachments)
GE: Email saved, line_no: 57, valid_count: 3
GE: Reset stack, displacement_sections, prev_section (Subject)
GE: Email saved, line_no: 81, valid_count: 3
GE: Reset stack, displacement_sections, prev_section (Subject)
GE: Email saved, line_no: 113, valid_count: 3
GE: Reset stack, displacement_sections, prev_section (Subject)
Call completeEmail (1), email: defaultdict(<class 'list'>, {'From': ['Gail Stansberry-Ziffer'], 'Sent': ['Wednesday, May 07, 2014 9:33 PM'], 'To': ['Karen Wendland; Vince Dix, Ph.D.; Jerome Novey; JoannaNovey; Shannon L. Novey; Jay', 'Newman; SueNewman; Audrey E. Post; Rebeccah Cantley Lutz; Rebeccah Lutz; April Salter;', 'Chris Fagiano; Drew Jones; Gretchen Jones; RosanneDunkelberger; Marsha Cantrell; Ed Jaffry;', 'Coni WhitfieldCarney; Brenda Boggs; Theresa Flury Esq.; Nancy Miller; Nancy Miller; Robin', 'Hassler Thompson, J.D.; Kris Knab; Martha Olive-Ha

NameError: name 'stack' is not defined

In [37]:
# Batch run: split every file in the download folder into per-email text files.
sv_ge = SavingArgs(inputpath= "./email_txt/downloaded_from_data_tallahassee/", outputpath = "./test_output/")

sv_ge.email_count = 0   # IMPORTANT to reset email_count

# os.listdir returns entries in arbitrary order; sort so the running email ids
# in the output file names are the same on every run.
files = sorted(os.listdir(sv_ge.inputpath))

# files[0:] processes everything; raise the start index to resume part-way
for inputfile in tqdm(files[0:]):
    sv_ge.infile = inputfile
    # print(sv_ge.inputpath, inputfile)
    process(sv_ge)

  0%|          | 0/124 [00:00<?, ?it/s]

If this is not happenning at the beginning of a file, is an error
current line no: 1
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 1
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 1
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error
current line no: 2
If this is not happenning at the beginning of a file, is an error

# non function code below