<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [56]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict
import re
from function_library import *
from function_library2 import *

In [2]:
# Search for "On Behalf" in the string.
re_behalf = re.compile(r'(.*)[Oo]n [Bb]ehalf')

# Search anywhere on the line for <mailto:xxxx> where xxx is the mailing address. 
# The patterns also consider () and [] instead of <>. Extract the last occurence of 
# an mail address via ().
re_bracket = re.compile(r'(.*)[\[\<\(]mailto\:(.*?)[\]\>\)]')

# Search anywhere on the line for <xxxx>, (xxxx), or [xxxx]. Extract xxxx. If there are multiple 
# occurences, extract the last one because .* is greedy.
re_bracket2 = re.compile(r'(.*)[\[\<\(](.*?)[\]\>\)]')

# Fix email retrieved by bracket: removed special ending and beginning non-alphanumer characters, and 
# remove any spaces

# Search anywhere in the line for a sequence of lower and upper cases letters, followed by a space (\s)
# followed by 0 or 1 capital letters [A-Z]? followed by 0 or a dots \.? For example, "GORdon A."
# ERROR: only allowing for a single space after the first name. Should be one or more spaces (\s+ intead of \s?) ?
# After the (first_name + initial + dot), allow for one sapce, and a last name formed from 1 or more [A-Za-z]. 
# Extract the last name with ()
#re_name1 = re.compile(r'.*?([A-Za-z]+\s?[A-Z]?\.?)\s([A-Za-z]+)')  # Written by Joey

# I would allow for multiple initials: Gordon A. Z. Erlebacher, or Gordon A Z Erlebacher (dots might be missing), 
#  or Gordon AZ Erlebacher  (Multiple names, or concatenated initials)
# re_name1 has no comma
# PROBLEM: if the first name has unicode characters, it is not found, and the first name   switches to the last name. 
# That is probably the result of \b, and the fact I allow zero or more sections to the first name. 
# I would like to capture the first name REGARDLESS OF THE CHARACTERS IN THE NAME, even numbers. So use \w and the UNICODE FLAG.
re_name1 = re.compile(r""".*?   
    (                         # Capture first name +  middle names (abbreviated or not)
        #[A-Za-z]+             # First name (one or more characters)
        (?: \s?
            #(?:\b[A-Za-z]+?\b\.?\s*)+   # Not captured: Abbreviation structure
            (?:\b\w+?\b\.?\s*)+   # Not captured: Abbreviation structure
        )+                       # Not captured: 1j or more sequence of abbreviations
        #)?                       # Not captured: 0 or more sequence of abbreviations
    )                            # end of first name + initials capture
    #\s*\b([A-Za-z]+\b)             # Capture last name preceded by one or more spaces. 
    \s*\b(\w+\b)             # Capture last name preceded by one or more spaces. 
                                 # \b forces the last name to start at a word boundary
""", re.X)

# Extract last name with (), followed by a comma, (the last name only contains lower and upper case, no apostrophes or dashes)
# Leave a single space, followed by first name ([A-Za-z], zero or more (?) spaces (\s) (ERROR: should be 1 ore more spaces (+), 
# followed by 0 or 1 initials [A-Z] followed by 0 or 1 dots (\.?). NOT GENERAL ENOUGH. What about Erle, Gordon A.E.F. or 
#   Erle, Gordon A. E.   F. ? 
#re_name2 = re.compile(r'.*?([A-Za-z]+),\s([A-Za-z]+\s?[A-Z]?\.?)')  # Joey's version
# Gordon's version
re_name2 = re.compile(r""".*?
    #([A-Za-z]+)\b,\s*                  # Capture last name followed by comma
    (\w+)\b,\s*                  # Capture last name followed by comma
    (                                  # Capture first name and initials
        (?: \s*
            (?:\b\w+\b\.?\s*)?   # Not captured: Abbreviation structure
        )*                             # Not captured: 0 or more sequence of abbreviations
    )   
""", re.X)  

re_name3 = re.compile(r""".*?
    ([\w^0-9]+)
""", re.X | re.UNICODE)



# Capture an email: that is a very complex exercise, so it it unlikely that this approarch works, but it is likely good enough. 
# Here is an more complex solution: https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s01.html
# Email string is defined as the first occurance of the expression in (): 
#   The email is a series of [a-zA-Z_] followed by zero or one dots \.? followed by one or more letter/number (/w), followed by '@'. 
#   followed by [a-zA-Z_0-9] (equiv to [\w] zero or more times, followed by a dot (0 or 1 times), followed by [a-zA-Z_] 0 or more, 
#  followed by period 0 or more, followed by 2 or three letters 
re_email =  re.compile(r'.*?([a-zA-Z_]*\.?\w+@[a-zA-Z_0-9]*\.?[a-zA-Z_]*\.?[a-zA-Z]{2,3})')

#  The following special characters are allowed in an email name:  ! # $ % & ' * + - / = ? ^ _ ` { |
#  For now, we ignore them. 
#  A domain suffix is required, so the domain after @ has at least one period. The full domain must be less than 64 characters long. 
#  We ignore this constraint. 
# Hyphens are allowed, but must be surrounded by characters: (?:[A-Za-z0-9]+\-?)+[A-Za-z0-9]+
# Domain name rules: https://www.dynadot.com/community/blog/domain-name-rules.html
#                    https://www.20i.com/support/domain-names/domain-name-restrictions
re_email1 = re.compile(r'.*?([\w.]*@[A-Za-z0-9\-]*\.?[a-zA-Z_]*\.?[a-zA-Z])')  # not used

re_domain = re.compile(r""".*?(  
     (?: (?:  [A-Za-z0-9]+\-?)+[A-Za-z0-9]+\.)+ (?: [A-Za-z]+)
)""", re.X)

# Rewritten by G. Erlebacher, 2022-02-13.
re_email = re.compile(r"""(.*?)(
     [\w.]*@    # email name: upper/lower case, numerals, dots, underscores
     (?:        # non-captured domain name
         (?:    # non-captured
             [A-Za-z0-9]+\-?    # sequence of letters/numbers followed by one hyphen (`seqA`)
         )+                     # one or more of `seqA`
         [A-Za-z0-9]+\.         # one or more letters after the last hyphen, followed by a dot
     )+                         # non-capture: one or more of `seqB`
     (?: [A-Za-z]+)             # the final domain segment, after the last dot
    )   # capture full email
""", re.X)


re_cap = re.compile(r'[A-Z]')

In [None]:
'cur.s@morethanbuildings.com': ('Cur.s', 'Whigham'),
'anita.favors.thompson@talgov.com': ('anita.favors.', 'thompson'),
 'Chris.an.Doolin@talgov.com': ('Chris.an', 'Doolin')
Interesting:
[rmills@fsuoldschool.com] => {('Randall', 'Mills'), ('Adam', 'Corey'), ('Randall Charles', 'Mills'), ('Randy', 'Mills'), ('randy', 'mills'), ('', 'unrecognized')}

    

In [181]:
email = "Gordon, Erlebacher <mailto:gordon@gail.c>"
#email = "    gordon@gmail.com"
name = re_email.match(email).groups()
print("name: ", name)
mail = name[-1]
print(mail)
print(len(name))
for i in range(len(name)):
    print(f"name[{i}]: {name[i]}")
re_name1.match(name[1])

email = "<marycfrederick@aol.com?adsf?>"
# email = "<mailto:tim@theedmondgroup. comcastbiz.net>"
re_bracket2.match(email).groups()

name:  ('Gordon, Erlebacher <mailto:', 'gordon@gail.c')
gordon@gail.c
2
name[0]: Gordon, Erlebacher <mailto:
name[1]: gordon@gail.c


('', 'marycfrederick@aol.com?adsf?')

In [4]:
str1 = "gord.erl.e@gmail.com"
str2 = "gord..erle.bach@gmail.com"
str3 = "3_gord..erle.ba3ch@gma+_-3il.com"
email = '3__42_gord@3sd-532-asd.523-asd3.adsf.com'
re_email.match(email).groups(0)

name = "Gordon Ad.C.EDC. dd. Dd.   D. Erlebacher"
#print(re_name1.match(name).groups())

name = "Erlebacher, Gordon E. EF H. "
name = ", Gordon E. EF H. "

re_name2.match(name)

# Two f's are a result of pdf translation. 
name = "jeﬀ barbacci"  
re_name1.match(name).groups() 

# name = "mccraw, rick"
# re_name2.match(name).groups()

# name = "john t burnette"
# re_name1.match(name).groups()

name = "atwell, scott f."
re_name2.match(name).groups()

('atwell', 'scott f.')

In [5]:
# either the sender or the recipient
# df = pd.read_csv('new_clean_output.csv',index_col = 0)
df = pd.read_csv('output_0211.csv')

# df = df.drop_duplicates(keep='first',subset=['Sent'])
# df = df.reset_index(drop=True)

from_list = df['From'].values.tolist()
to_list = df['To'].values.tolist()
cc_list = df['CC'].values.tolist()
print(len(from_list))

71143


In [6]:
unrecognized_names = set()

In [7]:
name="GordonErlebacher fred"
# search = re.search(r'^\s*([A-Z][^A-Z]+)([A-Z][^A-Z]+)[^/w]$', name)
name.count('A-Z')
count = re.compile(r'[A-Z]')
re_cap.findall(name)


['G', 'E']

In [8]:
re_first_last = re.compile(r'^([A-Z][a-z]*)([A-Z][a-z]*)$')
name = "GordonErlebacher "


In [157]:
def check_name(tname, tname_orig, temail_orig):
    if re_name2.match(tname): 
        name = re_name2.findall(tname)[0]
        first_name = name[1]
        last_name = name[0]
        # if first_name == '' or last_name == '':
        #     print("--> full name: ", tname)
        #     print("     first: ", first_name, "    last: ", last_name)
    elif re_name1.match(tname):
        name = re_name1.findall(tname)[0]
        first_name = name[0]
        last_name = name[1]
        # if first_name == '' or last_name == '':
        #     print("==> full name: ", tname)
        #     print("     first: ", first_name, "    last: ", last_name)
    else:
        first_name = ''   # I am throwing away the incorrect string
        last_name = 'unrecognized'
        # How many letters are capitalized in tname_orig
        # search = re.search(r'(?:[A-Z][^A-Z]+){2}', tname_orig)
        # if len(re_cap.findall(tname_orig)) == 2:
        first_last = re_first_last.match(tname_orig.strip())
        if first_last: 
            first_name = first_last.group(1).lower()
            last_name = first_last.group(2).lower()
            # print("*** valid name: ", tname_orig, first_name, last_name)
        else:
            unrecognized_names.add((tname_orig.strip(), temail_orig.strip()))
        
    return first_name.strip(), last_name.strip()
    

In [150]:
def check_email(f, regex):
    tname, temail = regex.findall(f)[0]
    tname=tname.lower().strip()
    temail=temail.lower()
    return tname, temail

In [186]:
name = "gor  fra alis?"
re.sub("[\s\?]+", "", name)

'gorfraalis'

In [194]:
def check_email1(f):
    bracket = re_bracket.match(f)
    bracket2 = re_bracket2.match(f)
    email = re_email.match(f)
    # print("f: ", f)
    if bracket:
        # print("bracket ")
        temail = bracket.groups()[1]
        tname = bracket.groups()[0]
    elif bracket2:
        # print("bracket 2")
        temail = bracket2.groups()[1]
        tname = bracket2.groups()[0]
    elif email:
        # print("re_email")
        temail = email.groups()[1]
        tname = email.groups()[0]
    else:
        temail = ""
        tname = f
        
    temail = re.sub("[\s\?]+", "", temail)
    first_name, last_name = check_name(tname, tname, temail)
    # print("tname: ", tname, ",  temail: ", temail)
    # print("           first: ", first_name, ", last: ", last_name)
    return first_name.strip(), last_name.strip(), temail.strip()



In [195]:
def ge_search_from(from_list):
    for f in from_list:
        if pd.isnull(f):
            # print("from is null: ", f)
            continue
        is_behalf = ''
        if re_behalf.match(f):
            f = re_behalf.findall(f)[0]
            is_behalf = 'b_'

        first, last, email = check_email1(f)
        email_to_names[email].add((is_behalf+first, is_behalf+last))
        name_to_emails[(first, last)].add(is_behalf+email)

In [196]:
def ge_search_list_of_lists(the_list):
    # print("==> search_to_section")
    for ts in the_list: 
        if pd.isnull(ts):  # if nan
            continue
        # ts = ts.lower()
        ts = ts.split(';')
        for t in ts:
            t = t.strip("'")
            first, last, email = check_email1(t)
            email_to_names[email].add((first, last))
            name_to_emails[(first, last)].add(email)                 

In [197]:
email_to_names = defaultdict(set)
name_to_emails = defaultdict(set)

ge_search_from(from_list)
ge_search_list_of_lists(to_list)
ge_search_list_of_lists(cc_list)

In [254]:
for name, emails in name_to_emails.items():
#
    
        print(f"[{name}] => {emails}")

[('Shane A.', 'Moniz')] => {'', 'smoniz@connandassociates.com'}
[('Mark', 'Beaudoin')] => {'', 'RGrindler@101tally.com', 'scott@scottmaddox.com', 'republicparking@comcast.net', 'republic_parking@comcast.net', 'Mark.Beaudoin@talqov.com', 'Mark.Beaudoin@talgov.com', 'mark.beaudoin@talgov.com'}
[('Ken', 'Morris')] => {'', 'MorrisK@leoncountyﬂ.gov', 'MorrisK@leoncountyfl.gov', 'morrisk@leoncountyfl.gov'}
[('Jennifer', 'Naff')] => {'director@springtimetallahassee.com', ''}
[('Paul', 'LaMaster')] => {'', 'paullamaster250@comcast.net', 'PAULLAMASTER250@comcast.net', 'whaQield@tallahassee.com', 'paullamaster@comcast.net', 'dougwray@comcast.net', '@comcast.net'}
[('Allison', 'Behrman')] => {'', 'Allison.Behrman@talgov.com', 'allison.behrman@talgov.com', 'downtownmarket@earthlink.net', 'Paige.tallahasseedowntown@gmail.com', 'kpriest@ccys.org'}
[('Michael', 'Alfano')] => {''}
[('Rick', 'McCraw')] => {'', 'gary@govinc.net', 'wes@mckibbon.com', 'downtownmarket@earthlink.net', 'paige.tallahasseedown

In [None]:
# Create dictionaries email -> name and name -> email
# Consider each element of name_to_emails. If there are two elements and one is empty, then name_email[name] = 'non_empty_email'
# If there are 3 elements, and one is empty, choose the email with most overlap between name and email owner. 
# If there is a single element that is empty, create a fake email:   last_first@fake.com
# If there are 4 elements, do not change them. Must be checked manually.
# Ignore the empty name the maps to many emails. 

# Consider an element of email_to_names. Ingore the email that is empty. 
# IF there is one entry in the values set, then email_name[email] = 'name'
# If there are two entries and one is empty, then email_name[email] = 'non-empty name'
# If there are three entries and one is empty, 

In [319]:
def overlap(name, email):
    # return the number of common characters
    
    email = set(re.sub("@.*$","", email))
    first = re.sub("^_b", "", name[0])
    last  = re.sub("^_b", "", name[1])
    new_name = set(re.sub("[^\w]", "", first+last))
    # print("email: ", email, ",  name: ", name)
    return len(email.intersection(new_name))

In [349]:
def compute_email_to_chosen_name():
    # An email should have only one name associated with it. 

    for email, names in email_to_names.items():
        # Remove lines that are likely incorrect
        if len(email) > 50 or len(names) > 5: continue
        # Do not consider an email with no "@"
        if not re.match(".*@", email): continue
        if email == "": continue
        lg = 0
        chosen_name = ''
        # print("\n",email, names)
        for name in names:
            if name[1] == 'unrecognized': continue
            # if first or last name contains a dot, flag it: 
            if re.match(r'.*\.[a-z]', name[0]) or re.match(r'.*\.[a-z]', name[1]):
                # print("email: ", email, ",   name= ", name)
                email_to_names_with_periods[email] = name
                continue
            # if last name ends in '\bgov' or '[\b_]com' or \bus, save it
            #  'us'          'bus' 
            if re.match(r'\w*(:?gov|_?com|us|edu)$', name[1]):
                if not re.match(r'.*\wus', name[1]):
                    # if re.match('.*(:?gov)', name[1]):
                    # print("==> email: ", email, ",   name: ", name)
                    email_to_names_with_periods[email] = name
                    continue
            lgo = overlap(name, email)
            if lgo > lg:
                lg = lgo
                chosen_name = name
        if chosen_name == '':
            null_chosen_names.append((email, names))
        else:
            email_to_chosen[email] = chosen_name

        # print("chosen name: ", chosen_name)
        # break


In [359]:
def compute_name_to_chosen_email():
    # An email should have only one name associated with it. 

    for name, emails in name_to_emails.items():
        # Remove lines that are likely incorrect
        if len(emails) > 10 or len(name) > 50: continue
#         print("name: ", name, "email: ", emails)


#         # Do not consider an email with no "@"
#         if not re.match(".*@", email): continue
#         if email == "": continue
#         lg = 0
        chosen_email = ''
        # print("\n",email, names)
        lg = 0
        for email in emails:
        #     if name[1] == '': continue
        #     # if first or last name contains a dot, flag it: 
        #     # if re.match(r'.*\.[a-z]', name[0]) or re.match(r'.*\.[a-z]', name[1]):
        #     #     # print("email: ", email, ",   name= ", name)
        #     #     email_to_names_with_periods[email] = name
        #     #     continue
        #     # if last name ends in '\bgov' or '[\b_]com' or \bus, save it
        #     if re.match(r'\w*(:?gov|_?com|us|edu)$', name[1]):
        #         if not re.match(r'.*\wus', name[1]):
        #             # if re.match('.*(:?gov)', name[1]):
        #             # print("==> email: ", email, ",   name: ", name)
        #             email_to_names_with_periods[email] = name
        #             continue
            lgo = overlap(name, email)
            if lgo > lg:
                lg = lgo
                chosen_email = email
        if chosen_email == '':
            null_chosen_emails.append((email, names))
        else:
            name_to_chosen[name] = chosen_email

        # print("chosen name: ", chosen_name)
        # break

In [410]:
emails_to_remove = []  # not used
null_chosen_names = []
email_to_chosen = {}
email_to_names_with_periods = {}

compute_email_to_chosen_name()

In [412]:
email_to_chosen
# email_to_names_with_periods

{'scott@hunterandharp.com': ('hunterandharp.', 'com'),
 'JessicaE.Brown@talgov.com': ('Brown', 'com'),
 'jlee@stctaxbenefits.com': ('stctaxbenefits.', 'com'),
 'adam@unconventionalstrategies.com': ('unconventionalstrategies.', 'com'),
 'cleandri@101tally.com': ('101tally.', 'com'),
 'BOSS_SYSTEM@talgov.com': ('talgov.', 'com'),
 'anna@hunterandharp.com': ('hunterandharp.', 'com'),
 'cityprojects@talgov.com': ('talgov.', 'com'),
 'FMS90FSYS90@talgov.com': ('talgov.', 'com'),
 'audrapittman@gmail.com': ('gmail.', 'com'),
 'MR_Street@doh.state.fl.us': ('doh.state.fl.', 'us'),
 'calendarno.ﬁca.on@google.com>': ('unconven.onalstrategies.', 'com'),
 'john@hunterandharp.com': ('hunterandharp.', 'com'),
 'shamaddox@embarqmail.com': ('embarqmail.', 'com'),
 'angela.whitaker@talgov.com': ('Whitaker', 'com'),
 'dan@gohbi.com': ('gohbi.', 'com'),
 'mfollmar@emaila2z.com': ('emaila2z.', 'com'),
 'catherine@sheltondean.com': ('sheltondean.', 'com'),
 'larry@produc.onsupportgroup.com': ('produc.onsup

In [360]:
names_to_remove = [] # not used
null_chosen_emails = []
name_to_chosen = {}
compute_name_to_chosen_email()

In [413]:
name_to_chosen

{('Shane A.', 'Moniz'): 'smoniz@connandassociates.com',
 ('Mark', 'Beaudoin'): 'Mark.Beaudoin@talqov.com',
 ('Ken', 'Morris'): 'MorrisK@leoncountyﬂ.gov',
 ('Jennifer', 'Naff'): 'director@springtimetallahassee.com',
 ('Paul', 'LaMaster'): 'paullamaster250@comcast.net',
 ('Allison', 'Behrman'): 'Allison.Behrman@talgov.com',
 ('Rick', 'McCraw'): 'Richard.McCraw@talgov.com',
 ('Matt', 'Thompson'): 'matt@madisonsocial.com',
 ('Patricia', 'Sanzone'): 'Patricia.Sanzone@dep.state.fl.us',
 ('Suzanna', 'Thomas'): 'sthomas@taledc.com',
 ('Theresa', 'Lamb'): 'Theresa.Lamb@talgov.com',
 ('Deputy County Administrator Jay Townsend',
  'Rosenzweig'): 'b_Attachment#1).Analysis:Attachment#1isthesummaryminutesfortheLeonCountySalesTaxCommitteemeetingheldonNovember15,2012,4:00p.m.intheFirstFloorProgramRoomoftheMainLibrary.TheLeonCountyClerkofCourtsOfficemaintainsavoicerecordingofallmeeting.AtranscriptionofthemeetingminuteswillbeprovidedbytheClerk’sofficeuponrequest.Options:1.ApprovethesummaryminutesfortheN

In [375]:
email_to_chosen_exceptions = {}
emails_not_found_in_email_to_chosen = []
non_name_match = []
name_match = []

# Is name_to_chosen and email_to_chosen consistent? Let us find out? 
for name, email in name_to_chosen.items():
    if len(email) > 50: continue
    try:
        new_name = email_to_chosen[email]
    except:
        # print("Exception, name: ", name, "email: ", email)
        emails_not_found_in_email_to_chosen.append(email)
        continue
        
    if name != new_name:
        non_name_match.append((name, new_name))
    else:
        name_match.append((name, new_name))
        # print("name: ", name, ",   new_name: ", new_name)
        
print("non name match: ", len(non_name_match))
print("name match: ", len(name_match))
print("emails_not_found_in_email_to_chosen: ", len(emails_not_found_in_email_to_chosen))

non name match:  397
name match:  3306
emails_not_found_in_email_to_chosen:  751


In [415]:
names_not_found_in_name_to_chosen = []
non_email_match = []
email_match = []

# Is name_to_chosen and email_to_chosen consistent? Let us find out? 
for email, name in email_to_chosen.items():
    if len(name) > 50: continue
    try:
        new_email = name_to_chosen[name]
    except:
        # print("Exception, name: ", name, "email: ", email)
        names_not_found_in_name_to_chosen.append(name)
        continue
        
    if email != new_email:
        non_email_match.append((email, new_email, name))
    else:
        email_match.append((email, new_email, name))
        # print("name: ", name, ",   new_name: ", new_name)

non_email_match.sort(key=lambda x: x[0])
email_match.sort(key=lambda x: x[0])
names_not_found_in_name_to_chosen.sort(key=lambda x: x[0])

print("non email match: ", len(non_email_match))
print("email match: ", len(email_match))
print("names_not_found_in_name_to_chosen: ", len(names_not_found_in_name_to_chosen))

non email match:  640
email match:  3306
names_not_found_in_name_to_chosen:  153


In [409]:
print( email_to_chosen["CraigD@leoncountyfl.gov"] )
print( email_to_chosen["CraigD@leoncountyﬂ.gov"] )
print( email_to_chosen["b_Deborah.Craig@talgov.com
print( email_to_names["CraigD@leoncountyﬂ.gov"] )
print( email_to_names["CraigD@leoncountyfl.gov"] )
print( name_to_emails[('Deborah', 'Craig')] )

# One missing email: Deborah.Crag@talgov.com

('Deborah', 'Craig')
('Deborah', 'Craig')
{('Deborah', 'Craig')}
{('Deborah', 'Craig'), ('', 'unrecognized')}
{'', 'b_Deborah.Craig@talgov.com', 'CraigD@leoncountyfl.gov', 'Deborah.Craig@talgov.com', 'CraigD@leoncountyﬂ.gov'}


In [419]:
name_to_emails

defaultdict(set,
            {('Shane A.', 'Moniz'): {'', 'smoniz@connandassociates.com'},
             ('Mark', 'Beaudoin'): {'',
              'Mark.Beaudoin@talgov.com',
              'Mark.Beaudoin@talqov.com',
              'RGrindler@101tally.com',
              'mark.beaudoin@talgov.com',
              'republic_parking@comcast.net',
              'republicparking@comcast.net',
              'scott@scottmaddox.com'},
             ('Ken', 'Morris'): {'',
              'MorrisK@leoncountyfl.gov',
              'MorrisK@leoncountyﬂ.gov',
              'morrisk@leoncountyfl.gov'},
             ('Jennifer', 'Naff'): {'', 'director@springtimetallahassee.com'},
             ('Paul', 'LaMaster'): {'',
              '@comcast.net',
              'PAULLAMASTER250@comcast.net',
              'dougwray@comcast.net',
              'paullamaster250@comcast.net',
              'paullamaster@comcast.net',
              'whaQield@tallahassee.com'},
             ('Allison', 'Behrman'): {'',
     

In [417]:
# How can these happen in non_email_match
# 'mailto:LBridges@flcities.com'),
# 'calendarno.ﬁca.on@google.com>', 'paige.tallahasseedowntown@gmail.com'),
# 

non_email_match
# email_match

[("'acdevents@yahoo.com'", 'anthony@tallha.org', ('Anthony', 'Davis')),
 ('.Williams@talgov.com',
  'Yolande.Williams@talgov.com',
  ('Yolande', 'Williams')),
 ('5dick@talchamber.come', 'sdick@talchamber.coma', ('Sue', 'Dick')),
 ('Allison.Fleming@talgov.com>', 'Scof.Maddox@talgov.com', ('Scof', 'Maddox')),
 ('Andrea.Rosser@talgov.com>', 'Edward.Kring@talgov.com', ('Edward', 'Kring')),
 ('Andrew.Gillum@talgov.com>', 'John.Marks@talgov.com', ('John', 'Marks')),
 ('Angela.Baldwin@talgov.com',
  'Angela.Baldwin@talgov.com>',
  ('Angela', 'Baldwin')),
 ('Appointed@talgov.com>',
  'ccoffice@talgov.com',
  ('City Commission', 'Office')),
 ('Audra@cocanet.org', 'audrapittman@gmail.com', ('Audra', 'Pittman')),
 ('Autumn.Calder@talgov.com',
  'Autumn.Calder@blueprint2000.org',
  ('Autumn', 'Calder')),
 ('BGABORDI@tallahassee.com', 'bgabordi@tallahassee.com', ('Bob', 'Gabordi')),
 ('Ben.Pingree@Tlcplace.org',
  'Ben.Pingree@blueprint2000.org',
  ('Ben', 'Pingree')),
 ('Ben.pingree@blueprint2000.

In [327]:
len(email_to_names_with_periods), email_to_names_with_periods;
# Incorrect removals: 
# 'cur.s@morethanbuildings.com': ('Cur.s', 'Whigham'),
# 'anita.favors.thompson@talgov.com': ('anita.favors.', 'thompson'),
#  'Chris.an.Doolin@talgov.com': ('Chris.an', 'Doolin')
# 3 out of 300 or 1% wrong removal. The reason is that the names were not capitalized in one case, 
#   an abbreviation was not capitalized (Cur.s instead of Cur.S), Same for Chris.an; the "an" should probably be capitalized.

In [418]:
non_name_match

[(('Shane A.', 'Moniz'), ('Shane', 'Moniz')),
 (('Matt', 'Thompson'), ('matt', 'thompson')),
 (('Gary', 'Yordon'), ('b_', 'b_unrecognized')),
 (('Janice', 'Elyea'), ('b_Janice', 'b_Elyea')),
 (('Davisson Dunlap', 'III'), ('Davisson', 'Dunlap')),
 (('Kelly', 'Dozier'), ('kelly', 'dozier')),
 (('Edward Martí', 'Kring'), ('Eddie Marti', 'Kring')),
 (('hunterandharp.', 'com'), ('John', 'McNeill')),
 (('Hays', 'Layerd'), ('Adam', 'Corey')),
 (('Barney T.', 'Bishop'), ('Barney', 'Bishop')),
 (('Bill', 'Moor'), ('bill', 'moor')),
 (('Christy', 'Cameron'), ('christy', 'cameron')),
 (('Adam B', 'Corey'), ('adam', 'corey')),
 (('Elias Mathes', 'TDIA'), ('Jessica', 'Brown')),
 (('Judy', 'Donahoe'), ('Judy Monday', 'Donahoe')),
 (('Jonathan', 'Kilpatrick'), ('Jonathan.', 'Kilpatrick')),
 (('Tiﬀany', 'Davis'), ('Tiffany', 'Davis')),
 (('Audra', 'Pittman'), ('audra', 'pittman')),
 (('Josh', 'Aubuchon'), ('josh', 'aubuchon')),
 (('April', 'Salter'), ('april', 'salter')),
 (('Liz', 'Joyner'), ('b_', '

In [322]:
email_to_chosen

{'director@springtimetallahassee.com': ('Jennifer', 'Naff'),
 'Richard.McCraw@talgov.com': ('Rick', 'McCraw'),
 'Patricia.Sanzone@dep.state.fl.us': ('Patricia', 'Sanzone'),
 'nanettes@moorecommgroup.com': ('Nanette', 'Schimpf'),
 'Venus.Childs@talgov.com': ('Venus', 'Childs'),
 'scott@hunterandharp.com': ('scott', 'williams'),
 'Stacey.Campbell@talgov.com': ('Stacey', 'Campbell'),
 'morrisk@leoncountyfl.gov': ('Ken', 'Morris'),
 'JessicaE.Brown@talgov.com': ('Jessica', 'Brown'),
 'chad@hunterandharp.com': ('Chad', 'Kittrell'),
 'Janice.Elyea@talgov.com': ('b_Janice', 'b_Elyea'),
 'danny@manausalaw.com': ('danny', 'manausa'),
 'Alison.Faris@talgov.com': ('Alison', 'Faris'),
 'davissoniii@dunlapshipman.com': ('Davisson', 'Dunlap'),
 'Tonya.Herron@talgov.com': ('Tonya', 'Herron'),
 'mail2bonita@comcast.net': ('Bonita Davis', 'Paige'),
 'SwainN@leoncountyfl.gov': ('Nan', 'Swain'),
 'jlee@stctaxbenefits.com': ('Jonathan', 'Lee'),
 'adam@unconventionalstrategies.com': ('Adam', 'Corey'),
 'of

In [265]:
len(email_to_chosen)

4294

In [202]:
for email, names in email_to_names.items():
    if len(names) > 1 and len(names) < 10:
        print(f"[{email}] => {names}")

[director@springtimetallahassee.com] => {('Jennifer', 'Naff'), ('', 'unrecognized')}
[Richard.McCraw@talgov.com] => {('', 'unrecognized'), ('Rick', 'McCraw'), ('Rick', 'Mccraw')}
[Venus.Childs@talgov.com] => {('Venus', 'Childs'), ('', 'unrecognized')}
[scott@hunterandharp.com] => {('', 'unrecognized'), ('scott', 'williams'), ('Scott', 'Williams'), ('hunterandharp.', 'com')}
[morrisk@leoncountyfl.gov] => {('Ken', 'Morris'), ('', 'unrecognized')}
[JessicaE.Brown@talgov.com] => {('Ryan Grindler Richard', 'Auwaerter'), ('Richard', 'Auwaerter'), ('', 'unrecognized'), ('Jessica', 'Brown'), ('Brown', 'com')}
[chad@hunterandharp.com] => {('', 'unrecognized'), ('rick.', 'mccraw'), ('Chad', 'Kittrell')}
[Janice.Elyea@talgov.com] => {('b_Janice', 'b_Elyea'), ('M', 'Ingram'), ('', 'unrecognized'), ('Janice', 'Elyea')}
[danny@manausalaw.com] => {('Danny', 'Manausa'), ('danny', 'manausa'), ('', 'unrecognized')}
[Alison.Faris@talgov.com] => {('Alison', 'Faris'), ('Alison Monday', 'Faris'), ('', 'unre

In [119]:

# find unique complete person in From section
def search_from_section(from_list):
    print("==> search_from_section")
    for f in from_list[0:200]: 
        if pd.isnull(f):
            continue
        f = f.lower()
        if re_behalf.match(f):
            f = re_behalf.findall(f)[0]
        
        first, last, email = check_email1(f)
        if email == "":
            print(f"email'', first: {first}, last: {last}, email: {email}")
        if first == "":
            print(f"first'', first: {first}, last: {last}, email: {email}")
        if last == "":
            print(f"last'', first: {first}, last: {last}, email: {email}")
        
        if email not in named_email_list:
            person = (first, last, email)
            people_list.append(person)
            named_email_list.append(email)
            name_list.append(first + ' ' + last)  # should not be needed
            

            
# find unique complete person in TO section
def search_to_section(to_list):
    print("==> search_to_section")
    for ts in to_list[0:200]: 
        if pd.isnull(ts):  # if nan
            continue
        ts = ts.lower()
        ts = ts.split(';')
        for t in ts:
            t = t.strip("'")
            email_match = re_email.match(t)
            bracket_match = re_bracket.match(t)
            bracket2_match = re_bracket2.match(t)
            if bracket_match:
                print("bracket: ", bracket_match.groups())
            if bracket2_match:
                print("bracket2: ", bracket2_match.groups())
            continue
        
            if re_bracket.match(t) or re_bracket2.match(t):
                if re_bracket.match(t):
                    tname, temail = re_bracket.findall(t)[0]
                else:
                    tname, temail = re_bracket2.findall(t)[0]
                tname_orig = tname
                temail_orig = temail
                tname=tname.lower().strip()
                temail=temail.lower()
                if tname == '':
                    continue

                if re_email.match(temail) and re_email.match(tname) == None:
                    email = re_email.findall(temail)[0]
                    if len(tname.split()) != 2:
                        first_name = tname
                        last_name = ' '
                    else:
                        first_name, last_name = check_name(tname, tname_orig, temail_orig)

                    if email not in named_email_list:
                        person = (first_name, last_name, email)
                        people_list.append(person)
                        named_email_list.append(email)
                        name_list.append(first_name + ' ' + last_name)
                        
def search_cc_section(cc_list):
    print("==> search_cc_section")
    # find unique complete person in CC section
    for ccs in cc_list: 
        if pd.isnull(ccs):
            continue
        #ccs = ccs.lower()  # not needed. tname is lowered further down
        ccs = ccs.split(';')
        for cc in ccs:
            cc = cc.strip("'")
            if re_bracket.match(cc) or re_bracket2.match(cc):
                if re_bracket.match(cc):
                    tname, temail = re_bracket.findall(cc)[0]
                else:
                    tname, temail = re_bracket2.findall(cc)[0]
                tname_orig = tname
                tname=tname.lower().strip()  
                temail_orig = temail
                temail=temail.lower()
                if tname == '':
                    continue

                if re_email.match(temail) and re_email.match(tname) == None:
                    email = re_email.findall(temail)[0]
                    if len(tname.split()) != 2:
                        first_name, last_name = check_name(tname, tname_orig, temail_orig)
                    else:
                        first_name, last_name = check_name(tname, tname_orig, temail)

                    if email not in named_email_list:
                        person = (first_name, last_name, email)
                        people_list.append(person)
                        ## Obviously, person must have only two commas)
                        split_person0 = person[0].split(',')
                        if len(split_person0) != 1:  # SHOULD NOT HAPPEN in a perfect world
                            print("SHOULD NOT HAPPEN: split_person: ", split_person0)  # <<< Identifies errors
                            print("person: ", person)
                            print("   tname: ", tname)
                            print()
                        named_email_list.append(email)
                        name_list.append(first_name + ' ' + last_name)

In [None]:
people_list = []
named_email_list = []
name_list = []
unrecognized_names = set()
email_set = set()
people_set = set()
name_set = set()
# name_dict[('joe', 'blow')] = list of emails
name_dict = defaultdict(list) 

search_from_section(from_list)
print(len(people_list))
# search_to_section(to_list)
# print(len(people_list))
# search_cc_section(cc_list)
# print(len(people_list))

In [13]:
people_set = set(people_list)
named_email_set = set(named_email_list)
name_set = set(name_list)

In [14]:
people_list[0:2], named_email_list[0:2], name_list[0:2]

([('jennifer ', 'naff', ('', 'director@springtimetallahassee.com')),
  ('rick', 'mccraw', ('', 'richard.mccraw@talgov.com'))],
 [('', 'director@springtimetallahassee.com'),
  ('', 'richard.mccraw@talgov.com')],
 ['jennifer  naff', 'rick mccraw'])

In [15]:
unrecognized_names

{("'", 'lesliehsmith@gmail.com'),
 ("'AdamCorey", 'acorey@101tally.com'),
 ("'ClaudetteCromartie", 'CromartieC08@gmail.com'),
 ("'JayLanders'", 'jwlanders@jwlanders.com'),
 ("'PhillipSingleton", 'phillip@pittman-law.com'),
 ('Adam', 'ACorey@gunster.com'),
 ('Gmax1', 'gmax1@comcast.net'),
 ('JTBurnette', 'jt@inkbridge.com'),
 ('JamesMcFaddin', 'mcfaddin@sostrategy.com'),
 ('Jeff', 'estimator@rtelectricllc.com'),
 ('JessicaLowe-Minor', 'LWVFExecutiveDirector@gmail.com'),
 ('Lo & Be', 'loandbedesigns@gmail.com'),
 ('Maisbel (Mae) Mendez', 'mmendez@101tally.com'),
 ('PhyllisDePreist', 'republic_parking@comcast.net'),
 ('Pons', 'communityinfo@leonschools.n et'),
 ('RickMcCraw', 'rick.mccraw@talgov.com'),
 ('Rivers', 'kim@inkbridge.com'),
 ('VerlaLawson-Grady', 'Verla.Lawson-Grady@djj.state.fl.us'),
 ('VerlaLawson-Grady', 'Verla.LawsonGrady@djj.state.fl.us'),
 ('arthur"buddy" jacobs', 'aijacobs@bellsouth.net'),
 ('charles"charlie" dudley', 'cdudley@bcmdm.com'),
 ('frederick "fred"springer', 

In [31]:
named_email_list

[('', 'director@springtimetallahassee.com'),
 ('', 'richard.mccraw@talgov.com'),
 ('', 'patricia.sanzone@dep.state.fl.us'),
 ('', 'venus.childs@talgov.com'),
 ('', 'scott@hunterandharp.com'),
 ('', 'paige.tallahasseedowntown@gmail.com'),
 ('', 'gary@govinc.net'),
 ('', 'chad@hunterandharp.com'),
 ('', 'danny@manausalaw.com'),
 ('', 'davissoniii@dunlapshipman.com'),
 ('', 'mail2bonita@comcast.net'),
 ('', 'kim@inkbridge.com'),
 ('', 'adam@unconventionalstrategies.com'),
 ('', 'office@unconventionalstrategies.com'),
 ('', 'mike.tadros@talgov.com'),
 ('', 'daniellee@leoncountyfl.gov'),
 ('', 'agillum@pfaw.org'),
 ('', 'wes@mckibbon.com'),
 ('', 'dena.strickland@boystown.org'),
 ('', 'mark.beaudoin@talgov.com'),
 ('', 'morrisk@leoncountyfl.gov'),
 ('', 'anna@hunterandharp.com'),
 ('', 'cristin@newleafmarket.coop'),
 ('', 'gary@zprgroup.com'),
 ('', 'ben.pingree@blueprint2000.org'),
 ('', 'sarah.valentine@talgov.com'),
 ('', 'ekring@101tally.com'),
 ('', 'catherine@sheltondean.com'),
 ('', 

In [29]:
def analyze_from_list(from_list):
    new_from_list = []

    unknown_idx = 0
    # replace the From section with unique people information
    # from_list: all the names from the From: column, without removing duplicates
    # Purpose: ...

    for f in from_list[0:200]: 
        if pd.isnull(f):  # NaN
            person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), 'f'+str(unknown_idx)+'_'+'l'+str(unknown_idx))
            unknown_idx = unknown_idx + 1
            new_from_list.append(person)
            continue
        email_exist_flag = 0
        f = f.lower().strip("'")  #  "Why single quote? 
        email = ''
        first_name = ''
        last_name = ''
        if re_behalf.match(f):
            # f should be first/last name + email
            f = re_behalf.findall(f)[0]
            # print("re_behalf, f= ", f)


        email_match = re_email.match(f)
        if email_match:
            # print("re_email matched, f: ", f)
            # print("findall: ", re_email.findall(f))
            email = email_match.groups()[-1]
            first_last = email_match.groups()[0:-1]
            # print("email_match, email: ", email)
            # print("       f: ", f)
            # print("             first_last: ", first_last)
            if re_name2.match(first_last[0]):
                # print("         re_name2 match: ", re_name2.match(f).groups())
                last_name, first_name = re_name2.match(first_last[0]).groups()[0:2]
                # print("name2: ", first_name, last_name)
            elif re_name1.match(first_last[0]):
                # print("         re_name1 match: ", re_name1.match(f).groups())
                # print("first_last: ", first_last)
                first_name, last_name = re_name1.match(first_last[0]).groups()[0:2]
                # print("name1: ", first_name, last_name)
            else:
                # print("NO NAME MATCH, t: ", f)
                first_name = 'fake'
                last_name = 'fake'

            email = email.lower()
            email_exist_flag = 1
            ## If email exists, why not add it to the named_email_list? 
            ## ANSWER: because named_email_list requires first/last name + email
        else:
            # print("No email match, f= ", f)
            if re_name2.match(f):
                # print("         re_name2 match: ", re_name2.match(f).groups())
                last_name, first_name = re_name2.match(f).groups()[0:2]
                # print("name2: ", first_name, last_name)
            elif re_name1.match(f):
                # print("         re_name1 match: ", re_name1.match(f).groups())
                first_name, last_name = re_name1.match(f).groups()[0:2]
                # print("name1: ", first_name, last_name)
            else:
                # if there is no name match, invent first and last names
                # print("NO NAME MATCH, f: ", f)
                # first_name = 'fake'
                # last_name = 'fake'
                first_name = 'f'+str(unknown_idx)
                last_name = 'l'+str(unknown_idx)
                unknown_idx = unknown_idx + 1
        # if len(f.split()) != 2:    # CHECK
        #     first_name = f
        #     last_name = ' '     # why space and not empty
        # else:   # two words separated by space
        #     if re_name1.match(f):
        #         name = re_name1.findall(f)[0]
        #         first_name = name[0]
        #         last_name = name[1]
        #     elif re_name2.match(f): 
        #         name = re_name2.findall(f)[0]
        #         first_name = name[1]
        #         last_name = name[0]
        #     elif email_exist_flag == 1:
        #         pass
        #     else:
        #         print('error: cannot find name and email in f')
        #         print('f:', f)
        #         first_name = 'fake'
        #         last_name = f

        name = first_name + ' ' + last_name
        if first_name == 'fake':
            print(f"(fake name), f: {f}\n          first: {first_name}, last: {last_name}")
            pass
        
        ### NEED NEW DATASTRUCTURES

        # all entries in named_email_list have a valid entry in people_list at the same index
        if email in named_email_list:   # named_email_list: only emails
            idx = named_email_list.index(email)
            new_from_list.append(people_list[idx])
        elif name in name_list:   ## WE MIGHT NOT NEED THIS (GE). Handle person with multiple emails
            idx = name_list.index(name)
            new_from_list.append(people_list[idx])
        elif email_exist_flag == 1:   # email by itself
            person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), email)
            print(f"==> mail by itself, f: {f}, \n          person: ", person)
            print("              name: ", name)
            new_from_list.append(person)
            unknown_idx = unknown_idx + 1
        else:
            person = (first_name, last_name, first_name + '_' + last_name)
            new_from_list.append(person)

In [30]:
analyze_from_list(from_list)

==> mail by itself, f: mccraw, rick [mailto:richard.mccraw@talgov.com], 
          person:  ('f0', 'l0', 'richard.mccraw@talgov.com')
              name:  rick  mccraw
==> mail by itself, f: sanzone, patricia [mailto:patricia.sanzone@dep.state.fl.us], 
          person:  ('f1', 'l1', 'patricia.sanzone@dep.state.fl.us')
              name:  patricia  sanzone
==> mail by itself, f: childs, venus [mailto:venus.childs@talgov.com], 
          person:  ('f2', 'l2', 'venus.childs@talgov.com')
              name:  venus  childs
==> mail by itself, f: campbell, stacey <stacey.campbell@talgov.com, 
          person:  ('f3', 'l3', 'stacey.campbell@talgov.com')
              name:  stacey  campbell
==> mail by itself, f:  elyea, janice <janice.elyea@talgov.com, 
          person:  ('f4', 'l4', 'janice.elyea@talgov.com')
              name:  janice  elyea
(fake name), f: <serles>, randall <randall.serles@talgov.com
          first: fake, last: fake
==> mail by itself, f: <serles>, randall <randall.

In [112]:
len(new_from_list)

71143

In [113]:
len(from_list)

71143

In [140]:
def analyze_list_of_lists(list_of_lists):
    new_to_lists = []

    # replace the To section with unique people information

    for ts in to_list:
        if pd.isnull(ts):
            new_to_lists.append([])
            continue
        ts = ts.lower()
        ts = ts.split(';')
        new_to_list = []
        for t in ts:       # ts: list of recipients
            t = t.strip("'")
            email_exist_flag = 0
            email = ''
            first_name = ''
            last_name = ''
            if re_behalf.match(t):
                t = re_behalf.findall(t)[0]

            if re_email.match(t):
                email = re_email.findall(t)[0]
                email = email.lower()
                email_exist_flag = 1
            if len(t.split()) != 2:
                first_name = t
                last_name = ' '
            else:
                if re_name1.match(t):
                    name = re_name1.findall(t)[0]
                    first_name = name[0]
                    last_name = name[1]
                elif re_name2.match(t): 
                    name = re_name2.findall(t)[0]
                    first_name = name[1]
                    last_name = name[0]
                elif email_exist_flag == 1:
                    pass
                else:
                    first_name = 'fake'
                    last_name = t
                    print('error: cannot find name and email in t, make fake name')
                    print(f"t: {t},  first: {first_name}, last: {last_name}")
    #                 break

            name = first_name + ' ' + last_name


            if email in named_email_list:
                idx = named_email_list.index(email)
                new_to_list.append(people_list[idx])
            elif name in name_list:
                idx = name_list.index(name)
                new_to_list.append(people_list[idx])
            elif email_exist_flag == 1: # email by itself
                person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), email)
                new_to_list.append(person)
                unknown_idx = unknown_idx + 1
            else:
                person = (first_name, last_name, first_name + '_' + last_name)
                new_to_list.append(person)
        new_to_lists.append(new_to_list)
    return new-to_list

In [141]:
new_to_list = analyze_list_of_lists(to_list)
new_cc_list = analyze_list_of_lists(cc_list)

UnboundLocalError: local variable 'unknown_idx' referenced before assignment

In [115]:
len(to_list)

71143

In [116]:
len(new_to_lists)

71143

In [123]:
#new_to_lists[:10]
# for el in new_to_lists:
for i, el in enumerate(new_to_lists):
    ##if i > 5: break
    print(el)   # How can element of new_to_lists be empty?
    ell = el[0]
    if ell[0] == ' ' or ell[1] == ' ':
        print(ell)

[('brooks ', 'hayes', 'brooks@culpeppercc.com')]
[('kristen', 'coons', 'kristen.coons@talgov.com')]
[('kim ', 'rivers', 'kim@inkbridge.com')]
[('alison', 'faris', 'alison.faris@talgov.com')]
[('andrew', 'gillum', 'andrew.gillum@talgov.com')]
[('allen ', 'thompson', 'downtownmarket@earthlink.net')]
[('michael', 'alfano', 'michael_alfano')]
[('bill ', 'proctor', 'proctorb@leoncountyfl.gov')]
[('chad ', 'kittrell', 'chad@hunterandharp.com')]
[('sue ', 'dick', 'sdick@talchamber.com'), (' dana ', 'noles', ' dana _noles'), (' whitney ', 'weeks', ' whitney _weeks'), (' ed edward murray jr.', ' ', ' ed edward murray jr._ '), (' kristin ', 'dozier', ' kristin _dozier'), (' cecilia ', 'homison', ' cecilia _homison'), ('f4877', 'l4877', 'lonnie.ballard@talgov.com'), ('scott ', 'balog', 'balogs@tcc.fl.edu'), ('kathy ', 'bell', 'kgb@coloneybell.com'), ('reggie l. bouthillier', ' ', 'rbouthillier@stearnsweaver.com'), ('jt ', 'burnette', 'jt@inkbridge.com'), ('william f butler', ' ', 'will@realestate

IndexError: list index out of range

In [510]:
new_cc_lists = []
# replace the cc section with unique people information

for ccs in cc_list: 
    if pd.isnull(ccs):
        new_cc_lists.append([])
        continue
    ccs = ccs.lower()
    ccs = ccs.split(';')
    new_cc_list = []
    if len(ccs) == 1 and ccs[0].strip() == '':
        new_cc_lists.append(new_cc_list)
        continue
    for cc in ccs:
        cc = cc.strip("'")
        email_exist_flag = 0
        email = ''
        first_name = ''
        last_name = ''
        if re_behalf.match(cc):
            cc = re_behalf.findall(cc)[0]

        if re_email.match(cc):
            email = re_email.findall(cc)[0]
            email = email.lower()
            email_exist_flag = 1
        if len(cc.split()) != 2:
            first_name = cc
            last_name = ' '
        else:
            if re_name1.match(cc):
                name = re_name1.findall(cc)[0]
                first_name = name[0]
                last_name = name[1]
            elif re_name2.match(cc): 
                name = re_name2.findall(cc)[0]
                first_name = name[1]
                last_name = name[0]
            elif email_exist_flag == 1:
                pass
            else:
                first_name = 'fake'
                last_name = cc
                print('error: cannot find name and email in cc, make fake name')
                print('cc:', cc)

        name = first_name + ' ' + last_name

        if email in named_email_list:
            idx = named_email_list.index(email)
            new_cc_list.append(people_list[idx])
        elif name in name_list:
            idx = name_list.index(name)
            new_cc_list.append(people_list[idx])
        elif email_exist_flag == 1:
            person = ('f'+str(unknown_idx), 'l'+str(unknown_idx), email)
            new_cc_list.append(person)
            unknown_idx = unknown_idx + 1
        else:
            person = (first_name, last_name, first_name + '_' + last_name)
            new_cc_list.append(person)
    new_cc_lists.append(new_cc_list)

In [511]:
len(cc_list)

71143

In [512]:
len(new_cc_lists)

71143

In [513]:
new_cc_lists[:10]

[[('ryan ', 'grindler', 'rgrindler@101tally.com'),
  ('', 'rosenfeld', 'david.rosenfeld@talgov.com')],
 [('', 'hurley', '_hurley')],
 [('jon ', 'brown', 'brownjon@leoncountyfl.gov'),
  (' mathieu ', 'cavell', ' mathieu _cavell'),
  ('', 'cmr', '_cmr'),
  (' victoria ', 'connell', ' victoria _connell'),
  (' deborah ', 'craig', ' deborah _craig'),
  (' stephanie ', 'holloway', ' stephanie _holloway'),
  (' lindsay ', 'jordan', ' lindsay _jordan'),
  (' angeline ', 'taylor', ' angeline _taylor'),
  (' jay ', 'townsend', ' jay _townsend'),
  (' jessicamiller', ' ', ' jessicamiller_ ')],
 [('paige ', 'carter', 'paige.tallahasseedowntown@gmail.com'),
  (' parade springtime tallahassee', ' ', ' parade springtime tallahassee_ '),
  ('', 'kring', 'edward.kring@talgov.com')],
 [('gary ', 'yordon', 'gary@zprgroup.com')],
 [],
 [('', 'daniels', 'tanya.daniels@meritagehomes.com'),
  (' simpson, roxanne m', ' ', ' simpson, roxanne m_ ')],
 [('', 'gillum', 'andrew.gillum@talgov.com')],
 [('jt ', 'bu

In [514]:
df_new = df.copy()
df_new['From'] = new_from_list
df_new['To'] = new_to_lists
df_new['CC'] = new_cc_lists
df_new


Unnamed: 0,filenm,From,Sent,To,CC,Bcc,Subject,Attachments,Importance,isThread,isAutoMessage,isDisplacement,hasAllCapLine,hasBadDate,Body
0,29142_fn_10-4-Cascade-2015-1-0_ln_42056.txt,"(shane a. moniz, , smoniz@connandassociates.com)","Friday, May 08, 2015 11:38 AM","[(brooks , hayes, brooks@culpeppercc.com)]","[(ryan , grindler, rgrindler@101tally.com), (,...",,Edison RFI's,RFI 008 Response.pdf; RFI 042 Response.pdf; RF...,,False,False,False,False,True,"['Response.pdf', 'Brooks, attached is our resp..."
1,41353_fn_10-3-Cascade-2014-2-0_ln_16811.txt,"(, beaudoin, mark.beaudoin@talgov.com)","Tuesday, August 19, 2014 5:21 PM","[(, coons, kristen.coons@talgov.com)]","[(, hurley, _hurley)]",,RE: Revised lease,,,False,False,False,False,False,['Good by me!']
2,01216_fn_17-2-IB2013-1-0_ln_44062.txt,"(ken , morris, morrisk@leoncountyfl.gov)","Tuesday, May 28, 2013 1:22 PM","[(kim , rivers, kim@inkbridge.com)]","[(jon , brown, brownjon@leoncountyfl.gov), ( m...",,Proposed Imagine Schedule,,,False,False,False,False,False,"['Kim,', 'See comments in blue regarding the p..."
3,32414_fn_32-1-PaigeCS-1-1_ln_18843.txt,"(jennifer , naff, director@springtimetallahass...","Wednesday, March 15, 2017 5:32 PM","[(, faris, alison.faris@talgov.com)]","[(paige , carter, paige.tallahasseedowntown@gm...",,RE: Springtime Parade,,,False,False,False,True,False,"['Hi Alison,', 'Dont apologize, I know its a l..."
4,56710_fn_31-2-GaryYordon2-3_ln_41027.txt,"(paul , lamaster, paullamaster250@comcast.net)","Sunday, June 7, 2015 at 1:33:39 PM Eastern Day...","[(, gillum, andrew.gillum@talgov.com)]","[(gary , yordon, gary@zprgroup.com)]",,Tony Carvajal: Develop strategy to get ahead o...,,,False,False,True,False,False,['Highly recommend you take a moment out of yo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71138,42570_fn_31-2-GaryYordon2-1_ln_10355.txt,"(paul , lamaster, paullamaster250@comcast.net)","Thursday, August 10, 2017 at 11:35:27 AM Easte...","[(, gillum, andrew.gillum@talgov.com)]",[],,Tallahassee Democrat E-Edi4on Ar4cle,,,False,False,True,False,False,"['nancy.miller@talgov.com, scoS.maddox@talgov...."
71139,36011_fn_32-1-PaigeCS-1-2_ln_5990.txt,"(ryan , grindler, ryan@edisontally.com)","Wednesday, February 22, 2017 at 6:27:34 PM Eas...","[( paige , carter, paige _carter)]","[( steve adams, ashley edwards, allison scor, ...",,Re: Summer Movie Night Series Discussion Mee-ng,,,False,False,True,False,False,"['All,', 'We are s-ll working on some logis-ca..."
71140,22004_fn_34-2-ScottMaddox2_ln_53965.txt,"(, fleming, allison.fleming@talgov.com)","Tuesday, May 28, 2013 2:38 PM","[(scott , maddox, scott@scottmaddox.com)]","[(paige , carter, paigecartersmith@gmail.com)]",,Call Sheet,Call List 5-28.xlsx,,False,False,True,False,False,['Current Call sheet attached - including this...
71141,64016_fn_16-5-HunterHarpHoldings2015-2017_ln_6...,"(scott , williams, scott@hunterandharp.com)","Monday, April 27, 2015 5:24 PM","[(scott , rowse, scott@morethanbuildings.com)]",[],,FW: Gateway Sprinklers and Sidewalks,,,False,False,False,False,False,"['Do you have a contact for the sprinklers?', ..."


In [167]:
df_new.to_csv('output_0211_name_standardized.csv', index=0)

In [138]:
f = pd.read_csv('output_0211_name_standardized.csv')

In [139]:
f.To.head()

0      [('brooks', 'hayes', 'brooks@culpeppercc.com')]
1    [('kristen', 'coons', 'kristen.coons@talgov.co...
2             [('kim', 'rivers', 'kim@inkbridge.com')]
3     [('alison', 'faris', 'alison.faris@talgov.com')]
4    [('andrew', 'gillum', 'andrew.gillum@talgov.co...
Name: To, dtype: object

In [141]:
f.CC.head()

0    [('ryan', 'grindler', 'rgrindler@101tally.com'...
1            [('patrick', 'hurley', 'patrick_hurley')]
2    [('jon', 'brown', 'brownjon@leoncountyfl.gov')...
3    [('paige', 'carter', 'paige.tallahasseedowntow...
4           [('f72764', 'l72764', 'alan.williams@my')]
Name: CC, dtype: object

In [145]:
eval(f.CC.head()[0])

[('ryan', 'grindler', 'rgrindler@101tally.com'),
 ('david', 'rosenfeld', 'david.rosenfeld@talgov.com')]