### Joey's version in root CRA folder

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Transform text files to emails
* Each email is written to its own output file, whose name contains:
    * an immutable email id
    * the name of the input file that contains the email
    * the line number that locates the email

In [4]:
import os
import io
import sys
import re
import csv
import pandas as pd
from tqdm import tqdm_notebook, tnrange
from datetime import datetime
from collections import defaultdict

In [5]:
from dataclasses import dataclass, field 

In [6]:
def print_list(li):
    """Print every element of ``li`` on its own line."""
    for item in li:
        print(item)
def print_dict(dic):
    """Print each entry of ``dic`` as ``key value``, one entry per line."""
    for pair in dic.items():
        print(*pair)

In [7]:
# Header patterns -> name of the email section they introduce.
# Every pattern must have one and only one pair of parentheses: the single
# capture group is the header's value. 'Sent:' and 'Date:' both feed 'Sent'.
headerReLib = {
    re.compile(pattern): section
    for pattern, section in (
        (r'^From\:\s?(.*)', 'From'),
        (r'^Sent\:\s?(.*)', 'Sent'),
        (r'^Date\:\s?(.*)', 'Sent'),
        (r'^To\:\s?(.*)', 'To'),
        (r'^C[Cc]\:\s?(.*)', 'CC'),
        (r'^B[Cc][Cc]\:\s?(.*)', 'BCC'),
        (r'^Subject\:\s?(.*)', 'Subject'),  # possibly empty subject
        # search for Attachment/ARachment/AFachment
        (r'^Attachments?\:\s?(.*)', 'Attachments'),
        # (r'^ARachments?\:\s?(.*)', 'Attachments'),
        # (r'^AFachments?\:\s?(.*)', 'Attachments'),  # need to correct spelling
        (r'Importance\:\s?(.*)', 'Importance'),
        # (r'Read\:\s?(.*)', 'Read'),
    )
}

In [8]:
# Scratch check of the capture group: findall returns only the header value,
# without the 'Attachments:' prefix.
p = re.compile(r'^Attachments?\:\s?(.*)')
p.findall('Attachments: hi')

['hi']

In [9]:
# Patterns for lines that carry no email content (page markers).
# Keys are compiled patterns; values are a human-readable reason.
cleanReLib = {}
cleanReLib[re.compile(r'^Page\s\d+$')] = 'Page number'
cleanReLib[re.compile(r'^Page\s\d+\sof\s\d+$')] = 'Page number'
# '{:3}' is not a regex quantifier (Python treats it as literal text), so the
# previous pattern could never match a page range such as '12-345'.
# '{1,3}' matches one to three digits on each side of the dash.
cleanReLib[re.compile(r'^\d{1,3}\-\d{1,3}$')] = 'Page number'


In [10]:
def saveEmail(email):
    """Write one parsed email to its own text file and advance the email counter.

    The file is created at
    ``<outputpath>/<email_count>-fn_<infile up to its first '.'>-ln_<line_no>.txt``.
    Each section present in ``email`` is written on one line as
    ``<section>: <joined values>``, in a fixed order; absent sections are skipped.

    Args:
        email: Mapping from section name to a list of strings.

    Reads the module-level ``outputpath``, ``infile`` and ``line_no``, and
    reads and increments the module-level ``email_count``.
    """
    # email count, output path, input file name and line number are global variables
    global email_count
    global outputpath
    global infile
    global line_no
    # 'BCC' must be spelled exactly like the section name produced by the header
    # patterns; the former 'Bcc' never matched, so BCC values were never written.
    saving_order = ['From', 'Sent', 'To', 'CC', 'BCC', 'Subject', 'Attachments', 'Importance', 'Body']
    out_name = outputpath + '/' + str(email_count) + '-fn_' + infile.split('.')[0] + '-ln_' + str(line_no) + '.txt'
    with open(out_name, 'w+', encoding = 'utf-8') as f_out:
        for key in saving_order:
            if key in email:
                # Body lines are joined by a space ' '; header fragments are joined by ''.
                separator = ' ' if key == 'Body' else ''
                f_out.write(key + ': ' + separator.join(email[key]) + '\n')
    email_count += 1

In [11]:
def validEmail(email, save = True, saveAny = False):
    """Decide whether ``email`` counts as a real email and optionally save it.

    An email is valid when it has both a 'From' and a 'To' section and its
    subject does not contain 'Read:' (read notices are skipped).

    Args:
        email: Mapping from section name to a list of strings.
        save: When true, the email is saved if it is valid.
        saveAny: When true, the email is saved whether or not it is valid.

    Returns:
        True if the email is valid, False otherwise.
    """
    # 'From' and 'To' are keys of the email dict, so they carry no trailing colon.
    # Open question (GE): nothing here checks that the body is complete.
    has_sender_and_recipient = 'From' in email and 'To' in email
    # email['Subject'] is only looked up when both keys exist; on a defaultdict
    # that lookup creates an empty 'Subject' entry as a side effect.
    valid = has_sender_and_recipient and 'Read:' not in ''.join(email['Subject'])
    should_save = saveAny or (save and valid)
    if should_save:
        saveEmail(email)
    return valid

In [12]:
# Why not add a "Body:" to the emails we create?
def completeEmail(email):
    """Close the current email: flush pending lines into its body, validate/save it, reset state.

    Appends the module-level ``stack`` to ``email['Body']`` and empties the
    stack, calls ``validEmail`` with saving enabled, then clears the
    module-level ``displacement_sections`` and sets ``prev_section`` to None.

    Args:
        email: Mapping from section name to a list of strings; its 'Body'
            entry is replaced by the old body plus the pending stack lines.
    """
    global stack, displacement_sections, displacement, prev_section

    # Whatever is still on the stack belongs to the body of the email being closed.
    email['Body'] = email['Body'] + stack
    stack = []

    validEmail(email, save = True)

    # The next email starts with no pending empty headers and no current section
    # (prev_section reset added by GE, 2022-01-20).
    displacement_sections, prev_section = [], None
    


In [13]:
def isUselessLine(line):
    """Return True when ``line`` matches any pattern in ``cleanReLib``, else False."""
    return any(pattern.match(line) for pattern in cleanReLib)

In [14]:
@dataclass
class SavingArgs:
    """Input/output settings plus running counters, bundled in one object.

    The field names mirror the module-level variables of the same names used
    by the parsing cell and by saveEmail.
    NOTE(review): presumably meant to replace those globals once the
    commented-out process(sv_arg, spt_arg) is written — confirm.
    """
    # Folder containing the input text file.
    inputpath: str
    # Name of the input text file.
    infile: str
    # Folder for the output files.
    outputpath: str
    # Counter with default 0; presumably the number of emails saved so far — TODO confirm.
    email_count: int = 0
    # Defaults to ''; note the parsing cell initialises its own prev_section to None.
    prev_section: str = ''
        
@dataclass
class SplittingArgs:
    """Mutable parser state bundled in one object; both fields start as fresh empty lists.

    The field names mirror the module-level ``stack`` and
    ``displacement_sections`` lists used by the parsing cell.
    """
    # default_factory gives every instance its own list instead of a shared one.
    stack: list = field(default_factory=list)
    displacement_sections: list = field(default_factory=list)


In [15]:
# Example SavingArgs pointing at the test input/output folders; the bare name on
# the last line makes the notebook display the instance.
sv_arg = SavingArgs(inputpath = "./test_input/", 
                    infile = "9-1-Adam-Corey-2012-1-0.txt",
                    outputpath = "./test_output/")
sv_arg

SavingArgs(inputpath='./test_input/', infile='9-1-Adam-Corey-2012-1-0.txt', outputpath='./test_output/', email_count=0, prev_section='')

In [16]:
# Example SplittingArgs built from defaults only; the bare name on the last line
# makes the notebook display the instance.
spt_arg = SplittingArgs()
spt_arg

SplittingArgs(stack=[], displacement_sections=[])

In [17]:
# def process(sv_arg, spt_arg):
#     f_in = open(inputpath + infile, encoding="utf8")
#     email = defaultdict(list)
    

In [24]:
# Split one input text file into individual emails and save each one.
# All state lives in module-level variables because the helper functions
# (saveEmail, completeEmail) read and write them through `global`.
#inputpath = "./test_input/"
inputpath = "./read_original_text_files/email_txt/test_files/"


# infile = "9-1-Adam-Corey-2012-1-0.txt"
infile = "gordon1.txt"

# inputpath = './test/'
# infile = 'test_emails.txt'
# Read every line inside a context manager so the input file is closed as soon
# as it has been read, instead of staying open for the rest of the session.
with open(inputpath + infile, encoding="utf8") as f_in:
    input_lines = f_in.readlines()
outputpath = "./test_output/"
outputpath = "./read_original_text_files/test_output/"
email_count = 0

# Non-header lines waiting to be attached to a section or to the body.
stack = []
# Headers seen with an empty value, in order of appearance; later non-header
# lines are used to fill them (the "block displacement" case).
displacement_sections = []
    
empty_line_count = 0
# All sections are better to be lists. This is designed for the block displacement issue
email = defaultdict(list)
prev_section = None
prev_empty = False
insideAttachment = False

# Iterate through the whole file
for line_no, line in enumerate(input_lines):
    # GE: Should the > be stripped? We lose track of what was a thread. Perhaps there should be an isThread flag?
    if line.count(">") > line.count("<"):     ### GE: DO NOT FOLLOW
        email['isThread'] = True
    line = line.strip("\f").strip(">").strip()

    # Skip some useless rows, but not empty rows
    if isUselessLine(line):
#         print("This line seems useless")
#         print(line)
        continue

    # For the case that From/Subject is wrongly appended to the end of another line.
    # If people use "From/Subject:" in the content, this creates an incorrect split.
    # maxsplit=1 keeps everything after the first marker; an unlimited split
    # silently dropped the text following a second marker on the same line.
    if "From:" in line:
        spt = line.split("From:", 1)
        if spt[0]:
            stack.append(spt[0])
            completeEmail(email)
            email = defaultdict(list)
            line = "From:" + spt[1]
    elif "Subject:" in line:
        spt = line.split("Subject:", 1)
        if spt[0]:
            stack.append(spt[0])
            completeEmail(email)
            email = defaultdict(list)
            line = "Subject:" + spt[1]
    # Scanned documents are found. And usually contains To: that could mess up the splitting.
    # Scanned documents usually have a title line that is all capital letters
    # GE:   SHOULD BE "elif"?  <<<<<<<
    ### Only capture attachments if they have a capitalized line longer than 5 characters
    # Will not capture emails written ALL-CAP. (do we want that?)
    elif len(line) > 5 and all(word.isupper() for word in line.split()) and '.' not in line:
        insideAttachment = True
        completeEmail(email)
        email = defaultdict(list)
        # process the next line
        continue
    #     if line =='':
    #         empty_line_count += 1
    #     else:
    #         empty_line_count = 0
    #     if empty_line_count >= 2:
    #         email = defaultdict(list)
    # if no pattern match, put into stack
    line_to_stack = True

    # search if line match any pattern
    for regex, section in headerReLib.items():
        if regex.match(line):
            insideAttachment = False
            if section in email:  # repeat headers, indicating current email ends
                completeEmail(email)
                email = defaultdict(list)
            elif stack:   # section not in email, but there are pending non-header lines
                # The pending lines are treated as a continuation of the previous header.
                # With no previous header they are filed under the key None, which
                # saveEmail never writes; the message below flags that case.
                if prev_section is None:
                    print("error!!!")
                email[prev_section] = email[prev_section] + stack
                stack = []
                if displacement_sections:
                    displacement_sections.pop(0)
        
            # Keep only the header value (the pattern's single capture group).
            line = regex.findall(line)[0]
            
            email[section].append(line)
            if line == '': # found empty header
                displacement_sections.append(section)
                
            prev_section = section
            # if match, don't append to stack.
            # have to use flag here, because there are multiple patterns testing
            line_to_stack = False
            break
            
    if line_to_stack and not insideAttachment:
        stack.append(line)
    # if block displacement is found, and there are non-header lines in the stack
    if displacement_sections and stack:
        popline = stack.pop(0)
        if not popline == "":  # might be an empty line between the block of headers and block of fillers
            section = displacement_sections.pop(0)
            prev_section = section
            email[section].append(popline)
    

# don't forget the last email
completeEmail(email)

before findall, line:  From:  >>> 1
after findall, line:   >>> 1
before findall, line:  Sent: >>> 2
after findall, line:  >>> 2
before findall, line:  To:  >>> 3
after findall, line:   >>> 3
before findall, line:  Subject: >>> 4
after findall, line:  >>> 4
before findall, line:  To: gordon
after findall, line:  gordon
before findall, line:  From: Wanda
after findall, line:  Wanda
