### Joey's version in root CRA folder

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Joey's-version-in-root-CRA-folder" data-toc-modified-id="Joey's-version-in-root-CRA-folder-0.0.1"><span class="toc-item-num">0.0.1&nbsp;&nbsp;</span>Joey's version in root CRA folder</a></span></li></ul></li></ul></li><li><span><a href="#Transform-text-files-to-emails" data-toc-modified-id="Transform-text-files-to-emails-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Transform text files to emails</a></span></li><li><span><a href="#Make-dicts-and-def-functions" data-toc-modified-id="Make-dicts-and-def-functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Make dicts and def functions</a></span></li><li><span><a href="#Usage" data-toc-modified-id="Usage-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Usage</a></span></li><li><span><a href="#non-function-code-below" data-toc-modified-id="non-function-code-below-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>non function code below</a></span></li></ul></div>

# Analyze text files
* Does the text file contain page numbers?
* What are the first 5 non-empty lines after the first 20 page numbers identified?


# Make dicts and def functions

**All cells below are necessary**

In [17]:
import csv
import io
import os
import re
import sys
from collections import defaultdict
from copy import copy
from dataclasses import dataclass, field
from datetime import datetime
from typing import Optional

import pandas as pd
from tqdm.notebook import tqdm, tnrange

In [2]:
# Maps a compiled header regex to the email section it identifies.
# Every regex in headerReLib must contain one and ONLY one pair of parentheses.
# Several patterns may map to the same section name (e.g. 'Sent:' and 'Date:').
headerReLib = {  # dict{regex: str}
    re.compile(r'^From\:\s?(.*)'): 'From',
    re.compile(r'^Sent\:\s?(.*)'): 'Sent',
    re.compile(r'^Date\:\s?(.*)'): 'Sent',
    re.compile(r'^To\:\s?(.*)'): 'To',
    re.compile(r'^C[Cc]\:\s?(.*)'): 'CC',
    re.compile(r'^B[Cc][Cc]\:\s?(.*)'): 'BCC',
    re.compile(r'^Subject\:\s?(.*)'): 'Subject',
    # Attachment: is not reliable in 9-1
    re.compile(r'^Attachments\:\s?(.*)'): 'Attachments',
    # These two patterns carry no leading '^' anchor.
    re.compile(r'Importance\:\s?(.*)'): 'Importance',
    re.compile(r'Priority\:\s?(.*)'): 'Importance',
}



In [3]:
# Maps a compiled regex for a line that should be discarded to the reason
# why it is discarded.
cleanReLib = {  # dict{regex: str}
    re.compile(r'^Page\s\d+$'): 'Page number',
    re.compile(r'^Page\s\d+\sof\s\d+$'): 'Page number',
}
# Disabled pattern, kept for reference:
# cleanReLib[re.compile(r'^\d{,3}\-\d{,3}$')] = 'Page number and file number'


In [4]:
# Maps a known misspelling found in the text files to its corrected form.
typoLib = {  # dict{str: str}
    'ARachments': 'Attachments',
    'AFachments': 'Attachments',
}


In [18]:
def saveEmail(email, sv_arg):
    """Write one email to its own text file and advance the email counter.

    The file is created inside ``sv_arg.outputpath`` and is named
    ``<count>_fn_<stem>_ln_<line>.txt`` where ``<count>`` is
    ``sv_arg.email_count`` zero-padded to 5 digits, ``<stem>`` is the part of
    ``sv_arg.infile`` before its first '.', and ``<line>`` is
    ``sv_arg.line_no``.

    Args:
        email: mapping of section name to its content. 'Body' and the header
            sections hold lists of strings; the flag sections hold a value
            that is written with ``str()``.
        sv_arg: object providing ``outputpath``, ``infile``, ``email_count``
            and ``line_no`` (e.g. a SavingArgs). ``email_count`` is
            incremented by one after the file has been written.
    """
    # Sections are written in this fixed order; sections missing from the
    # email are skipped. 'BCC' is the section name produced by headerReLib;
    # the older spelling 'Bcc' is still honoured for emails keyed that way.
    saving_order = ['From', 'Sent', 'To', 'CC', 'BCC', 'Bcc', 'Subject',
                    'Attachments', 'Importance', 'Body', 'isThread',
                    'isAutoreply', 'hasAllCapLine', 'hasBadDate']
    bool_headers = ('isThread', 'isAutoreply', 'hasAllCapLine', 'hasBadDate')

    infile_stem = sv_arg.infile.split('.')[0]
    filenm = f"{sv_arg.email_count:05d}_fn_{infile_stem}_ln_{sv_arg.line_no}.txt"
    # os.path.join works whether or not outputpath ends with a separator and
    # uses the right separator on every platform.
    filenm_path = os.path.join(sv_arg.outputpath, filenm)

    with open(filenm_path, 'w', encoding='utf-8') as f_out:
        for key in saving_order:
            if key not in email:
                continue
            value = email[key]
            if key == 'Body':
                text = '\n'.join(value)  # body lines keep their line breaks
            elif key in bool_headers:
                text = str(value)        # flags are written as-is
            else:
                text = ' '.join(value)   # header fragments are joined by ' '
            f_out.write(f"{key}: {text}\n")
    sv_arg.email_count += 1

In [19]:
def validEmail(email, sv_arg, save = True, saveAny = False):
    """Tell whether an email has enough header sections to be considered valid.

    An email is valid when at least two of the sections 'From', 'To' and
    'Sent' are present as keys of ``email``.

    Args:
        email: mapping of section name to content.
        sv_arg: saving arguments, forwarded to saveEmail when the email is saved.
        save: when True, a valid email is saved with saveEmail.
        saveAny: when True, the email is saved whether it is valid or not.

    Returns:
        True if the email is valid, False otherwise.
    """
    present = sum(1 for name in ('From', 'To', 'Sent') if name in email)
    is_valid = present >= 2

    if saveAny or (save and is_valid):
        saveEmail(email, sv_arg)
    return is_valid

In [20]:
def completeEmail(email, sv_arg, spt_arg):
    """Finish the current email and reset the splitting state.

    Every line still held in ``spt_arg.stack`` (empty lines included) is
    appended to the email body, the email is passed to validEmail (which saves
    it only when it is valid), and the per-email fields of ``spt_arg`` are
    cleared.
    """
    # Build a new list rather than extending in place.
    body_lines = email['Body'] + spt_arg.stack
    email['Body'] = body_lines

    validEmail(email, sv_arg, save = True, saveAny = False)

    # Reset the state for the next email.
    spt_arg.stack = []
    spt_arg.displacement_sections = []
    spt_arg.prev_section = None  # GE: Correction. 2022-01-20

In [21]:
def isUselessLine(line, file_no = ''):
    """Tell whether a line carries no email content.

    A line is useless when it is empty, when it is equal to ``file_no``, or
    when it matches (from its start) one of the regexes in cleanReLib.

    Returns:
        True if the line should be skipped, False otherwise.
    """
    if line in ('', file_no):
        return True
    return any(pattern.match(line) for pattern in cleanReLib)

In [22]:
def getFileNumber(infile):
    """Extract the file number from the start of a file name.

    The name is split on '-'. The result is the first piece, followed by
    '-' and the second piece when that second piece consists only of digits.
    """
    head, *rest = infile.split('-')
    if rest and rest[0].isdigit():
        return head + '-' + rest[0]
    return head

In [23]:
@dataclass
class SavingArgs:
    """Paths and counters shared between the processing and saving functions."""
    inputpath: str  # path to a folder ending with /
    outputpath: str  # path to a folder ending with /
    infile: str = ''  # filename only
    email_count: int = 0  # used to number the saved emails; incremented by saveEmail
    line_no: int = -1  # index of the line being read; set by process
    # Set by process for the file it just scanned. Declared here so that the
    # attribute exists (as False) even before process has been called.
    pdf_has_page_numbers: bool = False
    
@dataclass
class SplittingArgs:
    """Mutable state carried along while a text file is split into emails."""
    # Reset to None by completeEmail when an email is finished.
    prev_section: Optional[str] = None
    # Pending lines; completeEmail appends them to the email body and clears the list.
    stack: list = field(default_factory=list)
    # Cleared by completeEmail when an email is finished.
    displacement_sections: list = field(default_factory=list)
    capTolerance: int = 0
#     appendingMode: bool = False   # Autoreply and attachment must be treated differently
    isAutoreply: bool = False
    isAttachment: bool = False

In [30]:
def process(sv_arg):
    """Scan one text file and record whether it contains page-number lines.

    The file ``sv_arg.infile`` inside ``sv_arg.inputpath`` is read line by
    line. Each line is tested with isUselessLine; as soon as one line is
    flagged, ``sv_arg.pdf_has_page_numbers`` becomes True. After a flagged
    line, up to 5 following lines are collected; collected lines are printed
    only while the print budget (2 blocks) is not exhausted.

    Side effects:
        - sets ``sv_arg.line_no`` to the index of the line being read;
        - sets ``sv_arg.pdf_has_page_numbers`` (False when nothing is flagged);
        - prints to stdout.

    Args:
        sv_arg: saving arguments providing ``inputpath`` and ``infile``.
    """
    file_no = getFileNumber(sv_arg.infile)
    new_page_number = False  # True while collecting the lines after a flagged line
    top_page_lines = []
    nb_pages = 0
    sv_arg.pdf_has_page_numbers = False

    # The context manager guarantees the file is closed, and iterating the
    # handle avoids loading the whole file in memory.
    with open(os.path.join(sv_arg.inputpath, sv_arg.infile), encoding="utf8") as f_in:
        for line_no, line in enumerate(f_in):
            sv_arg.line_no = line_no
            if new_page_number and len(top_page_lines) < 5:
                top_page_lines.append(line)
            else:
                new_page_number = False
                # NOTE(review): nb_pages is incremented for every line read
                # while not collecting, not once per completed page -- confirm
                # that this is the intended print budget.
                nb_pages += 1
                if nb_pages <= 2:
                    print("==================================================")
                    for l in top_page_lines:
                        print(l)
                top_page_lines = []

            # This check must run for every line of the file: skipping it once
            # the print budget is exhausted would leave pdf_has_page_numbers
            # reflecting only the first few lines.
            # NOTE(review): lines still carry their trailing newline here, so
            # the `line == ''` and `line == file_no` tests of isUselessLine can
            # only succeed on a last line without newline -- confirm whether
            # the line should be stripped first.
            if isUselessLine(line, file_no):
                sv_arg.pdf_has_page_numbers = True
                # collect the next lines of the file
                new_page_number = True
                top_page_lines = []
        


# Usage

- Initialize SavingArgs with an inputpath and an outputpath as 
```python
sv_arg = SavingArgs(inputpath = "./test_input/", outputpath = "./test_output/")
```
- For single file
```python
sv_arg.infile = "9-1-Adam-Corey-2012-1-0.txt"
process(sv_arg)
```
- For multiple files
```python
for file in filenames:
    sv_arg.infile = file
    process(sv_arg)
```

**Note**
If you want to rerun, reinitialize `sv_arg` first: `sv_arg.email_count` is never reset and keeps increasing, so the numbering of the saved emails would continue from the previous run.

In [31]:
# Saving arguments for this run; the email counter starts at 0.
sv_ge = SavingArgs(
    inputpath="./email_txt/downloaded_from_data_tallahassee/",
    outputpath="./test_output/",
    email_count=0,
)

In [33]:
# Run process on the input files and report, for each one, whether page
# numbers were detected.
files = os.listdir(sv_ge.inputpath)
pdf_with_pages = []  # list of (file name, pdf_has_page_numbers)

# Only the first listed file is processed for now.
for inputfile in files[:1]:
    sv_ge.infile = inputfile
    process(sv_ge)
    pdf_with_pages.append((sv_ge.infile, sv_ge.pdf_has_page_numbers))

for name, has_page_numbers in pdf_with_pages:
    print(name, has_page_numbers)

23-11-PCSgmail2014-2017.txt False


# non function code below