# Name Reduction
The starting point of this analysis is a file of mails where each element of From, To, Cc: is a single string, either a name or an email.

In [120]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [77]:
# import pandas as pd
import numpy as np
import regex as rex
import pandas as pd
from datetime import datetime
from dateutil import parser
from dateutil.tz import gettz
from unidecode import unidecode
import process_to_library as tolib
import standardize_library as standlib
import name_matching_lib as nmlib

In [190]:
# comma-delimited fields have been transformed to semi-colon-delimited fields
df = pd.read_csv("output_to_cc.csv")

In [191]:
stand = standlib.StandardizeNames(df)

In [192]:
stand.process()

71143 71143 71143
=> Created field_dict
=> Created field_dict1
True
len:  40629
len(field_dict):  40629
len(email_to_names):  3636
len(name_to_emails):  12557
len(self.email_to_names), len(self.name_to_emails), len(self.clean_names_without_emails), len(self.clean_names_with_emails)
3636 12557 8915 2096
=> Created email_to_names and name_to_emails
=> Created names_without_emails
=> Created field_dict2
71143
End count=  71035
Number exceptions:  0
len my_list, new_my_list:  71143 71143
End count=  178051
nb nan:  35009
Number exceptions:  0
len my_list, new_my_list:  71143 71143
End count=  330813
nb nan:  0
Number exceptions:  0
len my_list, new_my_list:  71143 71143
=> Create new (From, To, Cc) headers


In [193]:
len(stand.name_to_emails), len(stand.email_to_names)

(12557, 3636)

In [194]:
stand.name_to_emails
stand.print_dict(stand.email_to_names, min_length=2, nb_to_print=600)

True
len:  3636


In [195]:
names = list(stand.name_to_emails.keys())
len(names)

12557

In [196]:
len(stand.new_to_list)

71143

In [236]:
names_emails = set()
names = set()
emails = set()
emails_prefix = set()
for v in stand.field_dict2.values():
    names_emails.add(v)
    if not rex.match(r'.*@', v):
        names.add(v)
    elif rex.match(r'.*@', v):
        emails.add(v)
        emails_prefix.add(rex.sub('@.*$', '', v))
len(names), len(emails), len(names_emails), len(emails_prefix)

(11012, 14076, 25088, 12314)

In [239]:
# Create a dictionary (name1, name2, cosine similarity)
names = list(names)  # was a set
names_emails = list(names_emails)
emails = list(emails)
emails_prefix = list(emails_prefix)

In [243]:
def name_similarities(names, cs_thresh=0.5, sort=True):
    # df1 = nmlib.name_matches(names, cs_thresh=cs_thresh, ngram_size=1) # top=100
    df2 = nmlib.name_matches(names, cs_thresh=cs_thresh, ngram_size=2)
    df3 = nmlib.name_matches(names, cs_thresh=cs_thresh, ngram_size=3)
    df4 = nmlib.name_matches(names, cs_thresh=cs_thresh, ngram_size=4)
    df = pd.concat([df1, df2, df3, df4])
    df = df.drop_duplicates(['left_side','right_side'])
    if sort:
        df = df.sort_values('left_side')
    return df

In [244]:
len(names)

11012

In [245]:
for n in names:
    if rex.match('.*maddox', n):
        print(n)

scottvalentinemaddox
rickimaddox
scodmaddox
maddoxgaryl
maddoxscoy
scofcharlesmaddox
bobbymaddox
maddoxaidefleming
maddoxnick
maddoxcampaign
shamaddox
scottmaddoxemail
maddoxscos
maddox
scottcharlesmaddox
kerrymaddox
charliemaddox
allieflemingforscottmaddox
scottmaddoxscott
maddoxfleming
scottmaddox
scoimaddox
maddoxscott
scofmaddox
nickmaddox


In [246]:
# How to make sure that we do not have name1,name2  and  name2,name1? 
# One way: make sure that left_side always < right_side
dfsim = name_similarities(emails_prefix, cs_thresh=.4, sort=True)
dfsim.shape

(230, 3)

In [247]:
left = dfsim['left_side'].values
right = dfsim['right_side'].values
for l,r in zip(left, right):
    print(l, r)

allisonfleming alliefleming
angela.ivy angela
angela.ivy angelah
ashleyei ashleyedwards
brook halbrook044
brook sbrooks
brook abrooks
brook brooks
brook tlbrooks
brook bob.brooks
brook brook.pace
brook lecia.brooks
brook brooke.evans
brook brookehobbs
brook michael.brooks
brook jbrooks123
brook randygbrooks
brook kbrooks
brook tbrooks
cassandraleland cassandrajackson
cassandraleland cassandradecoste
cassandraleland cassandracarter
catherinebray catherinebaker
catherinebray catherinekunst
catherinebray catherinejones
city.manager manager
city.manager managers
city.manager citymanagr
city.manager kirbymanager
citycommissione citycommissionaides
citycommissione leoncountycommissionaides
citycommissione planningcommission
citycommissione citycommissionoffice
citycommissione cityhallcommissionchambers
colleenroland jasonroland
dbarber barb.barnett
dbarber dbarlowe
dbarber barbara.jones
dbarber barbara.boles
dbarber cherigarbark
dbarber barbarablow
dbarber chuchabarber
dbarber barbara.robins

In [150]:
stand.name_to_emails['alliefleming']

{'', 'allison.fleming@talgov.com'}

In [147]:
stand.email_to_names['tim.davis@talgov.com']

{'timdavis'}

In [89]:
stand.name_to_emails['adamcrowley']

{'acorey@101tally.com'}

In [86]:
stand.print_dict(stand.name_to_emails, nb_to_print=1000)

True
len:  1951
aarengraves______{''}
abdullahfmsfuncponalusers______{''}
abenaojetayo______{''}
aceystinson______{''}
adambaker______{'adam.jacobs@talgov.com'}
adambcorey______{'', 'acorey@101tally.com'}
adambelcher______{''}
adambronakoski______{'adam.jacobs@talgov.com'}
adamcollins______{''}
adamcorey______{'', 'adambcorey@gmail.com', 'acorey@gunster.com', 'acorey@101tally.com', 'adam@unconventionalstrategies.com'}
adamcrowley______{'acorey@101tally.com'}
adamjacobs______{'', 'adam.jacobs@talgov.com'}
adamjohansen______{'adam.jacobs@talgov.com'}
adriannekautz______{''}
adriennebellflower______{''}
adrienneframe______{''}
advisoryboards______{''}
aeshahmcqueen______{''}
agustinguscorbella______{'corbella@gtlaw.com'}
ahmadkhalilfavors______{'afavors@tallha.org', 'khalil.favors@gmail.com'}
aiwuyao______{''}
akhenatonthomas______{''}
akinakinyemi______{'akinyemia@leoncountyfl.gov'}
alanaguinard______{''}
alanataylor______{''}
alangale______{''}
alanhanstein______{''}
alanhooper______{''

In [16]:
stand.print_dict(stand.field_dict1, nb_to_print=10)

True
len:  24888
.williams@talgov.com______('unrecognized', '.williams@talgov.com')
0@gtlaw.com______('unrecognized', '0@gtlaw.com')
100bmtally@gmail.com______('unrecognized', '100bmtally@gmail.com')
10pointconstruction@gmail.com______('unrecognized', '10pointconstruction@gmail.com')
123obamayeswecan@gmail.com______('unrecognized', '123obamayeswecan@gmail.com')
13scribes@gmail.com______('unrecognized', '13scribes@gmail.com')
19jenkins71@gmail.com______('unrecognized', '19jenkins71@gmail.com')
19tjenkins71@gmail.com______('unrecognized', '19tjenkins71@gmail.com')
1cogburn@cogburnbros.com______('unrecognized', '1cogburn@cogburnbros.com')
1stcoastelectric@att.net______('unrecognized', '1stcoastelectric@att.net')


In [17]:
stand.name_to_mails

AttributeError: 'StandardizeNames' object has no attribute 'name_to_mails'

In [93]:
stand.print_dict(stand.field_dict1, min_length=0, nb_to_print=10000)

True
len:  6001
______('unrecognized', '')
  Adam  Jacobs______('adamjacobs', '')
  Andrea  Rosser______('andrearosser', '')
  Angela  Baldwin______('angelabaldwin', '')
  Case  Elaine______('caseelaine', '')
  ChrisPna______('chrispna', '')
  Cornet  Gibson______('cornetgibson', '')
  Edward  Kring______('edwardkring', '')
  Edward.YoungJr@talgov.com ______('unrecognized', 'edward.youngjr@talgov.com')
  Fred  Harris______('fredharris', '')
  Lawrence  Ransom______('lawrenceransom', '')
  Liz Joyner______('lizjoyner', '')
  Sheppard  Aziz______('sheppardaziz', '')
  anita.favors.thompson@talgov.com______('unrecognized', 'anita.favors.thompson@talgov.com')
  kwdix@embarqmail.com ______('unrecognized', 'kwdix@embarqmail.com')
  piers rawling@gmail.com  piers rawling@gmail.com   piers.rawling@gmail.com______('unrecognized', 'rawling@gmail.com')
  piers.rawling@gmail.com   piers.rawling@gmail.com______('unrecognized', 'piers.rawling@gmail.com')
 "Weingarden, Lauren" <lweingarden@fsu.edu___

In [96]:
stand.print_dict(stand.field_dict2, nb_to_print=1000, pattern='@')

True
len:  6001


In [94]:
for k,v in stand.field_dict2.items():
    if rex.match(r'.*@', v):
        print(v)

director@springtimetallahassee.com
richard.mccraw@talgov.com
patricia.sanzone@dep.state.fl.us
nanettes@moorecommgroup.com
venus.childs@talgov.com
scott@hunterandharp.com
paige.tallahasseedowntown@gmail.com
stacey.campbell@talgov.com
gary@govinc.net
morrisk@leoncountyfl.gov
jessicae.brown@talgov.com
chad@hunterandharp.com
janice.elyea@talgov.com
randall.serles@talgov.com
rkenon@comcast.net
danny@manausalaw.com
stacey.campbell@talgov.com
alison.faris@talgov.com
davissoniii@dunlapshipman.com
tonya.herron@talgov.com
mail2bonita@comcast.net
kim@inkbridge.com
swainn@leoncountyfl.gov
kim@inkbridge.com
jlee@stctaxbenefits.com
adam@unconventionalstrategies.com
office@unconventionalstrategies.com
mike.kissane@talgov.com
cleandri@101tally.com
dietrich.simmons@talgov.com
debra.thomas@talgov.com
mike.tadros@talgov.com
daniellee@leoncountyfl.gov
mlisa.ingram@talgov.com
chad@hunterandharp.com
rob.mcgarrah@talgov.com
heather.beary@talgov.com
agillum@pfaw.org
connie.williams@talgov.com
wes@mckibbon.com