# Transform text files to emails
* File names contain: 
    * an immutable email id
    * the file name that contains the email
    * the line number that locates the email

2022-01-17
Author: Joey Jingze
Small Modifications: Gordon Erlebacher

In [16]:
import os
import io
import sys
import re
import csv
import pandas as pd
from tqdm import tqdm_notebook, tnrange, notebook
from datetime import datetime
from collections import defaultdict

### Global Variables

In [14]:
# Global Variables
inputpath = "./email_txt/downloaded_from_data_tallahassee/"
infile = "9-1-Adam-Corey-2012-1-0.txt"

In [3]:
def print_list(li):
    for ele in li:
        print(ele)
def print_dict(dic):
    for key,val in dic.items():
        print(key, val)

In [4]:
headerReLib = {}
headerReLib[re.compile(r'^From\:\s?(.*)')] = 'From'
headerReLib[re.compile(r'^Sent\:\s?(.*)')] = 'Sent'
headerReLib[re.compile(r'^Date\:\s?(.*)')] = 'Sent'
headerReLib[re.compile(r'^To\:\s?(.*)')] = 'To'
headerReLib[re.compile(r'^C[Cc]\:\s?(.*)')] = 'CC'
headerReLib[re.compile(r'^Subject\:\s?(.*)')] = 'Subject' # possibly empty subject
headerReLib[re.compile(r'^A(tt|R|F)achments?\:\s?(.*)')] = 'Attachment' # search for Attachment/ARachment/AFachment
headerReLib[re.compile(r'Importance\:\s?(.*)')] = 'Importance'

# headerReLib[re.compile(r'Read\:\s?(.*)')] = 'Read'

In [5]:
cleanReLib = {}
cleanReLib[re.compile(r'^Page\s\d+$')] = 'Page number'
cleanReLib[re.compile(r'^Page\s\d+\sof\s\d+$')] = 'Page number'
cleanReLib[re.compile(r'^\d{:3}\-\d{:3}$')] = 'Page number'


In [6]:
def saveEmail(email):
    # filecount and output path are global veriables
    global email_count
    global outputpath
    global infile
    global line_no
    saving_order = ['From', 'Sent', 'To', 'CC', 'Bcc', 'Subject', 'Attachment', 'Importance', 'Body']
    with open(outputpath + '/'+str(email_count)+'-fn_'+infile.split('.')[0]+'-ln_'+str(line_no)+'.txt', 'w+', encoding = 'utf-8') as f_out:
        for key in saving_order:
            if key in email:
                if key == 'Body':
                    f_out.write('Body:\n')
                    f_out.write(' '.join(email[key]) + '\n')
                if key == 'Sent': # Make sure Date: is changed to Sent:
                    f_out.write('Sent'+''.join(email[key])[4:] + '\n')
                else:
                    f_out.write(''.join(email[key]) + '\n')
    email_count += 1

In [7]:
def validEmail(email, save = True, saveAny = False):
    # if missing From or To, not a valid email. Read notice is skipped too.
    valid = False
    # GE: replace 'From' by 'From:' and 'To' by 'To:'? 
    if 'From' in email and 'To' in email and 'Read:' not in ''.join(email['Subject']):
        valid = True
    if saveAny or (save and valid):
        saveEmail(email)
    return valid

In [8]:
def completeEmail(email):
    global stack
    global displacement_sections
    global displacement
    # if stack:
    email['Body'] = email['Body'] + stack
    stack = [] 

    validEmail(email, save = True)
    
    displacement_sections = []
    displacement = False

In [9]:
def isUselessLine(line):
    for regex, cleaning_reason in cleanReLib.items(): 
        if regex.match(line):
            return True
    return False

In [12]:
# all global variables
# inputpath = './test/'
# infile = 'test_emails.txt'

def process_emails_from_one_file(inputpath, infile):
    f_in = open(inputpath + infile, encoding="utf8")
    outputpath = "./test"
    email_count = 0
    stack = []
    displacement_sections = []
    displacement = False

    empty_line_count = 0
    # All sections are better to be lists. This is designed for the block displacement issue
    email = defaultdict(list)
    prev_section = None

    # Iterate through the whole file
    for line_no, line in enumerate(f_in.readlines()):
        # GE: Should the > be stripped? We lose track of what was a thread. Perhaps thre should be a isThread flag?
        line = line.strip("\f").strip(">").strip()

        # Skip some useless rows, but not empty rows
        if isUselessLine(line):
            print("This line seems useless")
            print(line)
            continue

        # For the case that From/Subject is wrongly appended to the end
        # If people use "From/Subject:" in the content, this creates an incorrect split 
        if "From:" in line:
            spt = line.split("From:")
            if spt[0]:
                stack.append(spt[0])
                completeEmail(email)
                email = defaultdict(list)
                line = "From:" + spt[1]
        if "Subject:" in line:
            spt = line.split("Subject:")
            if spt[0]:
                stack.append(spt[0])
                completeEmail(email)
                email = defaultdict(list)
                line = "Subject:" + spt[1]
        # Scanned documents are found. And usually contains To: that could mess up the splitting.
        # Scanned documents usually have a title line that all capital letters
        if line.isupper():
            completeEmail(email)
            email = defaultdict(list)

        #     if line =='':
        #         empty_line_count += 1
        #     else:
        #         empty_line_count = 0
        #     if empty_line_count >= 2:
        #         email = defaultdict(list)
        # if no pattern match, put into stack
        line_to_stack = True

        # search if line match any pattern
        for regex, section in headerReLib.items():
            if regex.match(line):

                if section in email:  # repeat headers, indicating current email ends
                    completeEmail(email)
                    email = defaultdict(list)
                elif stack:
                    if prev_section == None:
                        print("error!!!")
                    email[prev_section] = email[prev_section] + stack
                    stack = []
                    displacement = False

                email[section].append(line)
                if email[section] == [section + ":"] or (
                    section == "Sent" and email[section] == ["Date:"]
                ):
                    displacement_sections.append(section)
                    if len(displacement_sections) >= 2:  # when block displacement happen
                        displacement = True
                prev_section = section
                # if match, don't append to stack.
                # have to use flag here, because there are multiple patterns testing
                line_to_stack = False
                break

        # if block displacement is found, and there are non-header lines in the stack
        if displacement and displacement_sections and stack:
            popline = stack.pop(0)
            if (
                not popline == ""
            ):  # might be an empty line between the block of headers and block of fillers
                section = displacement_sections.pop(0)
                prev_section = section
                email[section].append(popline)
        if line_to_stack:
            stack.append(line)

In [17]:
for inputfile in notebook.tqdm(os.listdir(inputpath)):
    process_emails_from_one_file(inputpath, infile)
    # don't forget the last email
    completeEmail(email)

  0%|          | 0/124 [00:00<?, ?it/s]

This line seems useless
Page 1 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 1 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 1 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 1 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 1 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 1 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page 3 of 3
This line seems useless
Page 2 of 3
This line seems useless
Page