<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [146]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import re
from function_library import *
from function_library2 import *

In [623]:
# Search for "On Behalf" in the string.
re_behalf = re.compile(r'(.*)[Oo]n [Bb]ehalf')

# Search anywhere on the line for <mailto:xxxx> where xxx is the mailing address. 
# The patterns also consider () and [] instead of <>. Extract the last occurence of 
# an mail address via ().
re_bracket = re.compile(r'(.*)[\[\<\(]mailto\:(.*)[\]\>\)]')

# Search anywhere on the line for <xxxx>, (xxxx), or [xxxx]. Extract xxxx. If there are multiple 
# occurences, extract the last one because .* is greedy.
re_bracket2 = re.compile(r'(.*)[\[\<\(](.*)[\]\>\)]')

# Search anywhere in the line for a sequence of lower and upper cases letters, followed by a space (\s)
# followed by 0 or 1 capital letters [A-Z]? followed by 0 or a dots \.? For example, "GORdon A."
# ERROR: only allowing for a single space after the first name. Should be one or more spaces (\s+ intead of \s?) ?
# After the (first_name + initial + dot), allow for one sapce, and a last name formed from 1 or more [A-Za-z]. 
# Extract the last name with ()
#re_name1 = re.compile(r'.*?([A-Za-z]+\s?[A-Z]?\.?)\s([A-Za-z]+)')  # Written by Joey

# I would allow for multiple initials: Gordon A. Z. Erlebacher, or Gordon A Z Erlebacher (dots might be missing), 
#  or Gordon AZ Erlebacher  (Multiple names, or concatenated initials)
# re_name1 has no comma
# PROBLEM: if the first name has unicode characters, it is not found, and the first name   switches to the last name. 
# That is probably the result of \b, and the fact I allow zero or more sections to the first name. 
# I would like to capture the first name REGARDLESS OF THE CHARACTERS IN THE NAME, even numbers. So use \w and the UNICODE FLAG.
re_name1 = re.compile(r""".*?   
    (                         # Capture first name +  middle names (abbreviated or not)
        #[A-Za-z]+             # First name (one or more characters)
        (?: \s?
            #(?:\b[A-Za-z]+?\b\.?\s*)+   # Not captured: Abbreviation structure
            (?:\b\w+?\b\.?\s*)+   # Not captured: Abbreviation structure
        )+                       # Not captured: 1j or more sequence of abbreviations
        #)?                       # Not captured: 0 or more sequence of abbreviations
    )                            # end of first name + initials capture
    #\s*\b([A-Za-z]+\b)             # Capture last name preceded by one or more spaces. 
    \s*\b(\w+\b)             # Capture last name preceded by one or more spaces. 
                                 # \b forces the last name to start at a word boundary
""", re.X)

# Extract last name with (), followed by a comma, (the last name only contains lower and upper case, no apostrophes or dashes)
# Leave a single space, followed by first name ([A-Za-z], zero or more (?) spaces (\s) (ERROR: should be 1 ore more spaces (+), 
# followed by 0 or 1 initials [A-Z] followed by 0 or 1 dots (\.?). NOT GENERAL ENOUGH. What about Erle, Gordon A.E.F. or 
#   Erle, Gordon A. E.   F. ? 
#re_name2 = re.compile(r'.*?([A-Za-z]+),\s([A-Za-z]+\s?[A-Z]?\.?)')  # Joey's version
# Gordon's version
re_name2 = re.compile(r""".*?
    #([A-Za-z]+)\b,\s*                  # Capture last name followed by comma
    (\w+)\b,\s*                  # Capture last name followed by comma
    (                                  # Capture first name and initials
        (?: \s*
            #(?:\b[A-Za-z]+\b\.?\s*)?   # Not captured: Abbreviation structure
            (?:\b\w+\b\.?\s*)?   # Not captured: Abbreviation structure
        )*                             # Not captured: 0 or more sequence of abbreviations
    )   
""", re.X)  

re_name3 = re.compile(r""".*?
    ([\w^0-9]+)
""", re.X | re.UNICODE)



# Capture an email: that is a very complex exercise, so it it unlikely that this approarch works, but it is likely good enough. 
# Here is an more complex solution: https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s01.html
# Email string is defined as the first occurance of the expression in (): 
#   The email is a series of [a-zA-Z_] followed by zero or one dots \.? followed by one or more letter/number (/w), followed by '@'. 
#   followed by [a-zA-Z_0-9] (equiv to [\w] zero or more times, followed by a dot (0 or 1 times), followed by [a-zA-Z_] 0 or more, 
#  followed by period 0 or more, followed by 2 or three letters 
re_email =  re.compile(r'.*?([a-zA-Z_]*\.?\w+@[a-zA-Z_0-9]*\.?[a-zA-Z_]*\.?[a-zA-Z]{2,3})')

#  The following special characters are allowed in an email name:  ! # $ % & ' * + - / = ? ^ _ ` { |
#  For now, we ignore them. 
#  A domain suffix is required, so the domain after @ has at least one period. The full domain must be less than 64 characters long. 
#  We ignore this constraint. 
# Hyphens are allowed, but must be surrounded by characters: (?:[A-Za-z0-9]+\-?)+[A-Za-z0-9]+
# Domain name rules: https://www.dynadot.com/community/blog/domain-name-rules.html
#                    https://www.20i.com/support/domain-names/domain-name-restrictions
re_email1 = re.compile(r'.*?([\w.]*@[A-Za-z0-9\-]*\.?[a-zA-Z_]*\.?[a-zA-Z])')

re_domain = re.compile(r""".*?(  
     (?: (?:  [A-Za-z0-9]+\-?)+[A-Za-z0-9]+\.)+ (?: [A-Za-z]+)
)""", re.X)

# Rewritten by G. Erlebacher, 2022-02-13.
re_email = re.compile(r""".*?(
     [\w.]*@    # email name: upper/lower case, numerals, dots, underscores
     (?:        # non-captured domain name
         (?:    # non-captured
             [A-Za-z0-9]+\-?    # sequence of letters/numbers followed by one hyphen (`seqA`)
         )+                     # one or more of `seqA`
         [A-Za-z0-9]+\.         # one or more letters after the last hyphen, followed by a dot
     )+                         # non-capture: one or more of `seqB`
     (?: [A-Za-z]+)             # the final domain segment, after the last dot
    )   # capture full email
""", re.X)

In [700]:
str1 = "gord.erl.e@gmail.com"
str2 = "gord..erle.bach@gmail.com"
str3 = "3_gord..erle.ba3ch@gma+_-3il.com"
email = '3__42_gord@3sd-532-asd.523-asd3.adsf.com'
re_email3.match(email).groups(0)

name = "Gordon Ad.C.EDC. dd. Dd.   D. Erlebacher"
#print(re_name1.match(name).groups())

name = "Erlebacher, Gordon E. EF H. "
name = ", Gordon E. EF H. "

re_name2.match(name)

# Two f's are a result of pdf translation. 
name = "jeﬀ barbacci"  
re_name1.match(name).groups() 

# name = "mccraw, rick"
# re_name2.match(name).groups()

# name = "john t burnette"
# re_name1.match(name).groups()

name = "atwell, scott f."
re_name2.match(name).groups()

('atwell', 'scott f.')

In [625]:
# either the sender or the recipient
# df = pd.read_csv('new_clean_output.csv',index_col = 0)
df = pd.read_csv('output_0211.csv')

# df = df.drop_duplicates(keep='first',subset=['Sent'])
# df = df.reset_index(drop=True)

from_list = df['From'].values.tolist()
to_list = df['To'].values.tolist()
cc_list = df['CC'].values.tolist()
print(len(from_list))

71143


In [716]:
unrecognized_names = set()

In [737]:
def check_name(tname, tname_orig, temail_orig):
    if re_name2.match(tname): 
        name = re_name2.findall(tname)[0]
        first_name = name[1]
        last_name = name[0]
        if first_name == '' or last_name == '':
            print("--> full name: ", tname)
            print("     first: ", first_name, "    last: ", last_name)
    elif re_name1.match(tname):
        name = re_name1.findall(tname)[0]
        first_name = name[0]
        last_name = name[1]
        if first_name == '' or last_name == '':
            print("==> full name: ", tname)
            print("     first: ", first_name, "    last: ", last_name)
    else:
        first_name = 'unrecognized name'
        last_name = tname
        unrecognized_names.add((tname_orig.strip(), temail_orig.strip()))
        print('error: unrecognized name:',tname)
    # if first_name == '' or last_name == '':
    #     print("--> fullname : ",  tname)
        
    return first_name, last_name
    

In [738]:
def check_email(f, regex):
    tname, temail = regex.findall(f)[0]
    tname=tname.lower().strip()
    temail=temail.lower()
    return tname, temail

In [739]:

# find unique complete person in From section
def search_from_section(from_list):
    print("==> search_from_section")
    for f in from_list: 
        if pd.isnull(f):
            continue
        f = f.lower()
        if re_behalf.match(f):
            f = re_behalf.findall(f)[0]
        if re_bracket.match(f):
            tname, temail = check_email(f, re_bracket)
            if re_email.match(temail) and re_email.match(tname) == None:
                email = re_email.findall(temail)[0]
                if len(tname.split()) != 2:
                    first_name = tname   # You do not know it is the first name
                    last_name = ' '
                else:
                    first_name, last_name = check_name(tname, tname, temail)

                if email not in named_email_list:
                    person = (first_name, last_name, email)
                    people_list.append(person)
                    named_email_list.append(email)
                    name_list.append(first_name + ' ' + last_name)

# find unique complete person in TO section
def search_to_section(to_list):
    print("==> search_to_section")
    for ts in to_list: 
        if pd.isnull(ts):  # if nan
            continue
        ts = ts.lower()
        ts = ts.split(';')
        for t in ts:
            t = t.strip("'")
            if re_bracket.match(t) or re_bracket2.match(t):
                if re_bracket.match(t):
                    tname, temail = re_bracket.findall(t)[0]
                else:
                    tname, temail = re_bracket2.findall(t)[0]
                tname_orig = tname
                temail_orig = temail
                tname=tname.lower().strip()
                temail=temail.lower()
                if tname == '':
                    continue

                if re_email.match(temail) and re_email.match(tname) == None:
                    email = re_email.findall(temail)[0]
                    if len(tname.split()) != 2:
                        first_name = tname
                        last_name = ' '
                    else:
                        first_name, last_name = check_name(tname, tname_orig, temail_orig)

                    if email not in named_email_list:
                        person = (first_name, last_name, email)
                        people_list.append(person)
                        named_email_list.append(email)
                        name_list.append(first_name + ' ' + last_name)
                        
def search_cc_section(cc_list):
    print("==> search_cc_section")
    # find unique complete person in CC section
    for ccs in cc_list: 
        if pd.isnull(ccs):
            continue
        #ccs = ccs.lower()  # not needed. tname is lowered further down
        ccs = ccs.split(';')
        for cc in ccs:
            cc = cc.strip("'")
            if re_bracket.match(cc) or re_bracket2.match(cc):
                if re_bracket.match(cc):
                    tname, temail = re_bracket.findall(cc)[0]
                else:
                    tname, temail = re_bracket2.findall(cc)[0]
                tname_orig = tname
                tname=tname.lower().strip()  
                temail_orig = temail
                temail=temail.lower()
                if tname == '':
                    continue

                if re_email.match(temail) and re_email.match(tname) == None:
                    email = re_email.findall(temail)[0]
                    if len(tname.split()) != 2:
                        print("     split != 2, tname: ", tname)
                        first_name, last_name = check_name(tname, tname_orig, temail_orig)
                        # first_name = tname
                        # last_name = ' '
                        print("     first: ", first_name, "     last: ", last_name)
                        print("     temail: ", temail,    "     tname: ", tname)
                        print()
                    else:
                        first_name, last_name = check_name(tname, tname_orig, temail)

                    if email not in named_email_list:
                        person = (first_name, last_name, email)
                        people_list.append(person)
                        ## Obviously, person must have only two commas)
                        split_person0 = person[0].split(',')
                        if len(split_person0) != 1:  # SHOULD NOT HAPPEN in a perfect world
                            print("split_person: ", split_person0)  # <<< Identifies errors
                            print("person: ", person)
                            print("   tname: ", tname)
                            print()
                        named_email_list.append(email)
                        name_list.append(first_name + ' ' + last_name)

In [740]:
people_list = []
named_email_list = []
name_list = []
unrecognized_names = set()

search_from_section(from_list)
print(len(people_list))
search_to_section(to_list)
print(len(people_list))
search_cc_section(cc_list)
print(len(people_list))

==> search_from_section
error: unrecognized name: prospere•, poldine
error: unrecognized name: prospere•, poldine
826
==> search_to_section
error: unrecognized name: arthur"buddy" jacobs
error: unrecognized name: charles"charlie" dudley
error: unrecognized name: frederick "fred"springer
2650
==> search_cc_section
     split != 2, tname:  elias mathes- tallahassee downtown improvement authority
     first:  elias       last:  mathes
     temail:  elias.tallahasseedowntown@gmail.com      tname:  elias mathes- tallahassee downtown improvement authority

     split != 2, tname:  curry, chris, maleszewski, victoria, pollard, david, poole, carrie, jeremiah gerald
     first:  chris      last:  curry
     temail:  jeremiah@sixelconsul4ng.com      tname:  curry, chris, maleszewski, victoria, pollard, david, poole, carrie, jeremiah gerald

     split != 2, tname:  jtburnette
error: unrecognized name: jtburnette
     first:  unrecognized name      last:  jtburnette
     temail:  jt@inkbridge.com

In [741]:
unrecognized_names

{("'", 'lesliehsmith@gmail.com'),
 ("'AdamCorey", 'acorey@101tally.com'),
 ("'ClaudetteCromartie", 'CromartieC08@gmail.com'),
 ("'JayLanders'", 'jwlanders@jwlanders.com'),
 ("'PhillipSingleton", 'phillip@pittman-law.com'),
 ('Adam', 'ACorey@gunster.com'),
 ('AdamCorey', 'acorey@101tally.com'),
 ('AlanWilliams', 'Alan.Williams@myfloridahouse.gov'),
 ('AndrewGillum', 'gilluma@talgov.com'),
 ('BarbraWilliams', 'barbra0614@hotmail.com'),
 ('BobArbuthnot', 'bobarbuthnot@gmail.com'),
 ('BrendaWilliams', 'brenda@tallha.org'),
 ('BrookeLochore', 'blochore@goodwillbigbend.com'),
 ('ByronBurroughs', 'byronburroughs@yahoo.com'),
 ('CarrieInfinger', 'carrieinfinger@doralbankusa.com'),
 ('ChadKittrel', 'ckittrell@hunterharpholdings.com'),
 ('ChristyCameron', 'christycartercameron@gmail.com'),
 ('ClaudetteCromartie', 'CromartieC08@gmail.com'),
 ('DannyManausa', 'danny@manausalaw.com'),
 ('DavidCastillo', 'castman63@gmail.com'),
 ('DavidRamsay', 'davidramsay3@gmail.com'),
 ('DavidWildes', 'wildesd@tc

In [503]:
new_from_list = []

unknown_idx = 0
# replace the From section with unique people information

for f in from_list: 
    if pd.isnull(f):
        person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), 'f'+str(unknown_idx)+'_'+'l'+str(unknown_idx))
        unknown_idx = unknown_idx + 1
        new_from_list.append(person)
        continue
    email_exist_flag = 0
    f = f.lower().strip("'")
    email = ''
    first_name = ''
    last_name = ''
    if re_behalf.match(f):
        f = re_behalf.findall(f)[0]
        
    if re_email.match(f):
        email = re_email.findall(f)[0]
        email = email.lower()
        email_exist_flag = 1
    if len(f.split()) != 2:
        first_name = f
        last_name = ' '
    else:
        if re_name1.match(f):
            name = re_name1.findall(f)[0]
            first_name = name[0]
            last_name = name[1]
        elif re_name2.match(f): 
            name = re_name2.findall(f)[0]
            first_name = name[1]
            last_name = name[0]
        elif email_exist_flag == 1:
            pass
        else:
            print('error: cannot find name and email in f')
            print('f:', f)
            first_name = 'fake'
            last_name = f
            
    name = first_name + ' ' + last_name

    
    if email in named_email_list:
        idx = named_email_list.index(email)
        new_from_list.append(people_list[idx])
    elif name in name_list:
        idx = name_list.index(name)
        new_from_list.append(people_list[idx])
    elif email_exist_flag == 1:
        person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), email)
        new_from_list.append(person)
        unknown_idx = unknown_idx + 1
    else:
        person = (first_name, last_name, first_name + '_' + last_name)
        new_from_list.append(person)

In [504]:
len(new_from_list)

71143

In [505]:
len(from_list)

71143

In [506]:
new_to_lists = []

# replace the To section with unique people information

for ts in to_list:
    if pd.isnull(ts):
        new_to_lists.append([])
        continue
    ts = ts.lower()
    ts = ts.split(';')
    new_to_list = []
    for t in ts:
        t = t.strip("'")
        email_exist_flag = 0
        email = ''
        first_name = ''
        last_name = ''
        if re_behalf.match(t):
            t = re_behalf.findall(t)[0]

        if re_email.match(t):
            email = re_email.findall(t)[0]
            email = email.lower()
            email_exist_flag = 1
        if len(t.split()) != 2:
            first_name = t
            last_name = ' '
        else:
            if re_name1.match(t):
                name = re_name1.findall(t)[0]
                first_name = name[0]
                last_name = name[1]
            elif re_name2.match(t): 
                name = re_name2.findall(t)[0]
                first_name = name[1]
                last_name = name[0]
            elif email_exist_flag == 1:
                pass
            else:
                first_name = 'fake'
                last_name = t
                print('error: cannot find name and email in t, make fake name')
                print('t:', t)
#                 break

        name = first_name + ' ' + last_name


        if email in named_email_list:
            idx = named_email_list.index(email)
            new_to_list.append(people_list[idx])
        elif name in name_list:
            idx = name_list.index(name)
            new_to_list.append(people_list[idx])
        elif email_exist_flag == 1:
            person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), email)
            new_to_list.append(person)
            unknown_idx = unknown_idx + 1
        else:
            person = (first_name, last_name, first_name + '_' + last_name)
            new_to_list.append(person)
    new_to_lists.append(new_to_list)

In [507]:
len(to_list)

71143

In [508]:
len(new_to_lists)

71143

In [509]:
new_to_lists[:10]

[[('brooks ', 'hayes', 'brooks@culpeppercc.com')],
 [('', 'coons', 'kristen.coons@talgov.com')],
 [('kim ', 'rivers', 'kim@inkbridge.com')],
 [('', 'faris', 'alison.faris@talgov.com')],
 [('', 'gillum', 'andrew.gillum@talgov.com')],
 [('allen ', 'thompson', 'downtownmarket@earthlink.net')],
 [('', 'alfano', '_alfano')],
 [('bill ', 'proctor', 'proctorb@leoncountyfl.gov')],
 [('chad ', 'kittrell', 'chad@hunterandharp.com')],
 [('sue ', 'dick', 'sdick@talchamber.com'),
  (' dana ', 'noles', ' dana _noles'),
  (' whitney ', 'weeks', ' whitney _weeks'),
  (' ed edward murray jr.', ' ', ' ed edward murray jr._ '),
  (' kristin ', 'dozier', ' kristin _dozier'),
  (' cecilia ', 'homison', ' cecilia _homison'),
  ('f4871', 'l4871', 'lonnie.ballard@talgov.com'),
  ('scott ', 'balog', 'balogs@tcc.fl.edu'),
  ('kathy ', 'bell', 'kgb@coloneybell.com'),
  ('reggie l. bouthillier', ' ', 'rbouthillier@stearnsweaver.com'),
  ('jt ', 'burnette', 'jt@inkbridge.com'),
  ('william f butler', ' ', 'will@re

In [510]:
new_cc_lists = []
# replace the cc section with unique people information

for ccs in cc_list: 
    if pd.isnull(ccs):
        new_cc_lists.append([])
        continue
    ccs = ccs.lower()
    ccs = ccs.split(';')
    new_cc_list = []
    if len(ccs) == 1 and ccs[0].strip() == '':
        new_cc_lists.append(new_cc_list)
        continue
    for cc in ccs:
        cc = cc.strip("'")
        email_exist_flag = 0
        email = ''
        first_name = ''
        last_name = ''
        if re_behalf.match(cc):
            cc = re_behalf.findall(cc)[0]

        if re_email.match(cc):
            email = re_email.findall(cc)[0]
            email = email.lower()
            email_exist_flag = 1
        if len(cc.split()) != 2:
            first_name = cc
            last_name = ' '
        else:
            if re_name1.match(cc):
                name = re_name1.findall(cc)[0]
                first_name = name[0]
                last_name = name[1]
            elif re_name2.match(cc): 
                name = re_name2.findall(cc)[0]
                first_name = name[1]
                last_name = name[0]
            elif email_exist_flag == 1:
                pass
            else:
                first_name = 'fake'
                last_name = cc
                print('error: cannot find name and email in cc, make fake name')
                print('cc:', cc)

        name = first_name + ' ' + last_name

        if email in named_email_list:
            idx = named_email_list.index(email)
            new_cc_list.append(people_list[idx])
        elif name in name_list:
            idx = name_list.index(name)
            new_cc_list.append(people_list[idx])
        elif email_exist_flag == 1:
            person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), email)
            new_cc_list.append(person)
            unknown_idx = unknown_idx + 1
        else:
            person = (first_name, last_name, first_name + '_' + last_name)
            new_cc_list.append(person)
    new_cc_lists.append(new_cc_list)

In [511]:
len(cc_list)

71143

In [512]:
len(new_cc_lists)

71143

In [513]:
new_cc_lists[:10]

[[('ryan ', 'grindler', 'rgrindler@101tally.com'),
  ('', 'rosenfeld', 'david.rosenfeld@talgov.com')],
 [('', 'hurley', '_hurley')],
 [('jon ', 'brown', 'brownjon@leoncountyfl.gov'),
  (' mathieu ', 'cavell', ' mathieu _cavell'),
  ('', 'cmr', '_cmr'),
  (' victoria ', 'connell', ' victoria _connell'),
  (' deborah ', 'craig', ' deborah _craig'),
  (' stephanie ', 'holloway', ' stephanie _holloway'),
  (' lindsay ', 'jordan', ' lindsay _jordan'),
  (' angeline ', 'taylor', ' angeline _taylor'),
  (' jay ', 'townsend', ' jay _townsend'),
  (' jessicamiller', ' ', ' jessicamiller_ ')],
 [('paige ', 'carter', 'paige.tallahasseedowntown@gmail.com'),
  (' parade springtime tallahassee', ' ', ' parade springtime tallahassee_ '),
  ('', 'kring', 'edward.kring@talgov.com')],
 [('gary ', 'yordon', 'gary@zprgroup.com')],
 [],
 [('', 'daniels', 'tanya.daniels@meritagehomes.com'),
  (' simpson, roxanne m', ' ', ' simpson, roxanne m_ ')],
 [('', 'gillum', 'andrew.gillum@talgov.com')],
 [('jt ', 'bu

In [514]:
df_new = df.copy()
df_new['From'] = new_from_list
df_new['To'] = new_to_lists
df_new['CC'] = new_cc_lists
df_new


Unnamed: 0,filenm,From,Sent,To,CC,Bcc,Subject,Attachments,Importance,isThread,isAutoMessage,isDisplacement,hasAllCapLine,hasBadDate,Body
0,29142_fn_10-4-Cascade-2015-1-0_ln_42056.txt,"(shane a. moniz, , smoniz@connandassociates.com)","Friday, May 08, 2015 11:38 AM","[(brooks , hayes, brooks@culpeppercc.com)]","[(ryan , grindler, rgrindler@101tally.com), (,...",,Edison RFI's,RFI 008 Response.pdf; RFI 042 Response.pdf; RF...,,False,False,False,False,True,"['Response.pdf', 'Brooks, attached is our resp..."
1,41353_fn_10-3-Cascade-2014-2-0_ln_16811.txt,"(, beaudoin, mark.beaudoin@talgov.com)","Tuesday, August 19, 2014 5:21 PM","[(, coons, kristen.coons@talgov.com)]","[(, hurley, _hurley)]",,RE: Revised lease,,,False,False,False,False,False,['Good by me!']
2,01216_fn_17-2-IB2013-1-0_ln_44062.txt,"(ken , morris, morrisk@leoncountyfl.gov)","Tuesday, May 28, 2013 1:22 PM","[(kim , rivers, kim@inkbridge.com)]","[(jon , brown, brownjon@leoncountyfl.gov), ( m...",,Proposed Imagine Schedule,,,False,False,False,False,False,"['Kim,', 'See comments in blue regarding the p..."
3,32414_fn_32-1-PaigeCS-1-1_ln_18843.txt,"(jennifer , naff, director@springtimetallahass...","Wednesday, March 15, 2017 5:32 PM","[(, faris, alison.faris@talgov.com)]","[(paige , carter, paige.tallahasseedowntown@gm...",,RE: Springtime Parade,,,False,False,False,True,False,"['Hi Alison,', 'Dont apologize, I know its a l..."
4,56710_fn_31-2-GaryYordon2-3_ln_41027.txt,"(paul , lamaster, paullamaster250@comcast.net)","Sunday, June 7, 2015 at 1:33:39 PM Eastern Day...","[(, gillum, andrew.gillum@talgov.com)]","[(gary , yordon, gary@zprgroup.com)]",,Tony Carvajal: Develop strategy to get ahead o...,,,False,False,True,False,False,['Highly recommend you take a moment out of yo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71138,42570_fn_31-2-GaryYordon2-1_ln_10355.txt,"(paul , lamaster, paullamaster250@comcast.net)","Thursday, August 10, 2017 at 11:35:27 AM Easte...","[(, gillum, andrew.gillum@talgov.com)]",[],,Tallahassee Democrat E-Edi4on Ar4cle,,,False,False,True,False,False,"['nancy.miller@talgov.com, scoS.maddox@talgov...."
71139,36011_fn_32-1-PaigeCS-1-2_ln_5990.txt,"(ryan , grindler, ryan@edisontally.com)","Wednesday, February 22, 2017 at 6:27:34 PM Eas...","[( paige , carter, paige _carter)]","[( steve adams, ashley edwards, allison scor, ...",,Re: Summer Movie Night Series Discussion Mee-ng,,,False,False,True,False,False,"['All,', 'We are s-ll working on some logis-ca..."
71140,22004_fn_34-2-ScottMaddox2_ln_53965.txt,"(, fleming, allison.fleming@talgov.com)","Tuesday, May 28, 2013 2:38 PM","[(scott , maddox, scott@scottmaddox.com)]","[(paige , carter, paigecartersmith@gmail.com)]",,Call Sheet,Call List 5-28.xlsx,,False,False,True,False,False,['Current Call sheet attached - including this...
71141,64016_fn_16-5-HunterHarpHoldings2015-2017_ln_6...,"(scott , williams, scott@hunterandharp.com)","Monday, April 27, 2015 5:24 PM","[(scott , rowse, scott@morethanbuildings.com)]",[],,FW: Gateway Sprinklers and Sidewalks,,,False,False,False,False,False,"['Do you have a contact for the sprinklers?', ..."


In [167]:
df_new.to_csv('output_0211_name_standardized.csv', index=0)

In [138]:
f = pd.read_csv('output_0211_name_standardized.csv')

In [139]:
f.To.head()

0      [('brooks', 'hayes', 'brooks@culpeppercc.com')]
1    [('kristen', 'coons', 'kristen.coons@talgov.co...
2             [('kim', 'rivers', 'kim@inkbridge.com')]
3     [('alison', 'faris', 'alison.faris@talgov.com')]
4    [('andrew', 'gillum', 'andrew.gillum@talgov.co...
Name: To, dtype: object

In [141]:
f.CC.head()

0    [('ryan', 'grindler', 'rgrindler@101tally.com'...
1            [('patrick', 'hurley', 'patrick_hurley')]
2    [('jon', 'brown', 'brownjon@leoncountyfl.gov')...
3    [('paige', 'carter', 'paige.tallahasseedowntow...
4           [('f72764', 'l72764', 'alan.williams@my')]
Name: CC, dtype: object

In [145]:
eval(f.CC.head()[0])

[('ryan', 'grindler', 'rgrindler@101tally.com'),
 ('david', 'rosenfeld', 'david.rosenfeld@talgov.com')]