## Gender guesser do file  

#### Purpose: take list of contacts provided by GIZ and guess their gender based on first names
Outline:   
Part 1: Set up paths, import necessary packages  
Part 2: Load datasets (mailing lists provided by GIZ and gendered name lists)  
Part 3: Run gender-guesser package  
Part 4: Fuzzy matching with existing gendered names lists to guess gender of remaining names  
Part 5: Export results

## Part 1
#### Download any necessary packages, import and set up paths

In [47]:
## Install these packages if you don't have them already (remove the #)

#!pip install gender_guesser
#!pip install earthpy
#!pip install fuzzywuzzy
#!pip install python-Levenshtein

In [48]:
import csv
import pandas as pd
import re
import gender_guesser.detector as gender
import os
import earthpy as et
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [6]:
github_path = os.path.join(et.io.HOME, "Documents","GitHub","giz-pema-ecommerce","sampling-email-experiment")

try:
    gdrive_path = os.path.join(et.io.HOME, "Google Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")
    os.chdir(gdrive_path)
except:
    gdrive_path = os.path.join(et.io.HOME, "Google Drive","My Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")

## Part 2

#### Load datasets and merge

In [10]:
## GIZ contact list (ungendered)

contacts = pd.read_csv(os.path.join(gdrive_path,'intermediate','giz_contact_list_ungendered.csv'), dtype='string' )
contacts = contacts[['firmname', 'name', 'email', 'firstname', 'lastname', 'origin']]
contacts.head()

Unnamed: 0,firmname,name,email,firstname,lastname,origin
0,3Dwave,Ferid kamel,feridkamel@gmail.com,Ferid,kamel,pema
1,ABAPLAST,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema
2,ABIN CONSULTING,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema
3,ABP,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema
4,ABSHORE,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema


In [61]:
## Gendered names list

names = pd.read_csv(os.path.join(gdrive_path,'intermediate','gendered_names.csv'), dtype='string' )
names = names[['firstname','gender']]
names.head()

Unnamed: 0,firstname,gender
0,Aabidah,f
1,Aabirah,f
2,Aabish,f
3,Aadab,f
4,Aadila,f


## Part 3
#### Clean data and run gender-guesser package

In [16]:
# Remove duplicates

contacts = contacts.drop_duplicates(subset='email', keep="first")

# Drop NAs

contacts = contacts[contacts['firstname'].notna()]
contacts.shape

(5001, 6)

In [19]:
# Lower case first names: 

contacts['firstname'] = contacts['firstname'].apply(lambda x: x.lower() if type(x) == str else x)

# Recapitalize: 

contacts['firstname'] = contacts['firstname'].apply(lambda x: x.capitalize() if type(x) == str else x)

# Lower case firm names: 

contacts['firmname'] = contacts['firmname'].apply(lambda x: x.lower() if type(x) == str else x)

# Trim first name

contacts['firstname'] = contacts['firstname'].apply(lambda x: x.strip() if type(x) == str else x)

# Normalize key accents

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)

contacts['firstname'] = contacts['firstname'].apply(lambda x: x.translate(repl) if type(x) == str else x)

contacts.head()

Unnamed: 0,firmname,name,email,firstname,lastname,origin
0,3dwave,Ferid kamel,feridkamel@gmail.com,Ferid,kamel,pema
1,abaplast,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema
2,abin consulting,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema
3,abp,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema
4,abshore,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema


##### Run the gender guesser

In [20]:
d = gender.Detector()

contacts['gender'] = contacts['firstname'].apply(lambda x: d.get_gender(x))
contacts['gender'].value_counts()

male             2847
unknown          1625
female            489
mostly_female      18
mostly_male        14
andy                8
Name: gender, dtype: int64

##### Now the same but on the surnames

In [21]:
# Lower case last names: 

contacts['lastname'] = contacts['lastname'].apply(lambda x: x.lower() if type(x) == str else x)

# Recapitalize: 

contacts['lastname'] = contacts['lastname'].apply(lambda x: x.capitalize() if type(x) == str else x)

# Trim last name

contacts['lastname'] = contacts['lastname'].apply(lambda x: x.strip() if type(x) == str else x)

# Normalize key accents

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)

contacts['lastname'] = contacts['lastname'].apply(lambda x: x.translate(repl) if type(x) == str else x)

contacts.head()

Unnamed: 0,firmname,name,email,firstname,lastname,origin,gender
0,3dwave,Ferid kamel,feridkamel@gmail.com,Ferid,Kamel,pema,male
1,abaplast,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema,male
2,abin consulting,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema,male
3,abp,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema,female
4,abshore,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema,female


In [22]:
contacts['gender1'] = contacts['lastname'].apply(lambda x: d.get_gender(x))
contacts['gender1'].value_counts()

unknown          4576
male              355
female             60
mostly_female       5
andy                4
mostly_male         1
Name: gender1, dtype: int64

## Part 4
#### Fuzzy matching with existing gendered names lists to guess gender of remaining names

This still leaves about a third of names as 'Unknown'. Let's use fuzzy matching with lists of gendered names to address this. 

In [28]:
# Pick only the ones unknown

grouped = contacts.groupby(['gender','gender1'])
df_unknown = grouped.get_group(("unknown", "unknown"))
df_unknown = df_unknown[['firmname', 'name', 'email', 'firstname', 'lastname', 'origin']]
df_unknown.shape

(1432, 6)

In [31]:
# One quick fix, many names start with 'Mr' and are male: 

fnames = {'Mr' : "male"}

df_unknown['gender2'] = df_unknown['firstname'].map(fnames)
df_unknown['gender2'].value_counts()

male    9
Name: gender2, dtype: int64

In [38]:
# Casting the first name columns into lists

df1_names = list(df_unknown['firstname'].unique())
df2_names = list(names['firstname'].unique())

In [49]:
#Defining a function to return the match and similarity score of the fuzz.ratio() scorer. The function will take in a term(name), list of terms(list_names), and a minimum similarity score(min_score) to return the match. 
def match_names(name, list_names, min_score=0):
    max_score = -1
    max_name = ''
    for x in list_names:
        score = fuzz.ratio(name, x)
        if (score > min_score) & (score > max_score):
            max_name = x
            max_score = score
    return (max_name, max_score)

In [63]:
#For loop to create a list of tuples with the first value being the name from the second dataframe (name to replace) and the second value from the first dataframe (string replacing the name value). Then, casting the list of tuples as a dictionary. 
firstnames = []
for x in df1_names:
    match = match_names(x, df2_names, 75)
    if match[1] >= 75:
        name = ('(' + str(x), str(match[0]) + ')')
        firstnames.append(name)
name_dict = dict(firstnames)
name_dict

{'(Hidri': 'Heidi)',
 '(Abderrazk': 'Abderrazak)',
 '(Foued': 'Foued)',
 '(Zoubeir': 'Zubair)',
 '(Montasser': 'Montasser)',
 '(Nidhal': 'Nihal)',
 '(Rabah': 'Rabah)',
 '(Mouley': 'Morley)',
 '(Med': 'Medb)',
 '(Nejah': 'Najah)',
 '(Sofiene': 'Sofie)',
 '(Saloua': 'Saoa)',
 '(Abdelaziz': 'Abdellaziz)',
 '(Dorra': 'Dora)',
 '(Mr': 'Mor)',
 '(Ghassen': 'Ghassen)',
 '(Beligh': 'Beligh)',
 '(Sabeur': 'Sabur)',
 '(Mariem': 'Marie)',
 '(Adnen': 'Adnen)',
 '(Hassene': 'Hassine)',
 '(Aymen': 'Aymen)',
 '(Bochra': 'Bora)',
 '(Hmida': 'Hmida)',
 '(Skander': 'Sikandar)',
 '(Chahir': 'Chahir)',
 '(Chema': 'Chema)',
 '(Islem': 'Islem)',
 '(Moez': 'Moeez)',
 '(Takwa': 'Taqwa)',
 '(Meher': 'Meher)',
 '(Souheib': 'Suheb)',
 '(Najeh': 'Najah)',
 '(Henda': 'Henda)',
 '(Abdessalem': 'Abdessalem)',
 '(Feriel': 'Feivel)',
 '(Chafik': 'Chafik)',
 '(Rhouma': 'Roma)',
 '(Boubaker': 'Boubaker)',
 '(Wiem': 'Wim)',
 '(Meria': 'Mercia)',
 '(Sofienne': 'Sofie)',
 '(Aissa': 'Arissa)',
 '(Hajer': 'Hajar)',
 '(Naoufe

In [65]:
names['firstname'] = names['firstname'].replace(name_dict)

In [66]:
df_new = pd.merge(df_unknown, names, how='left', on='firstname')

In [71]:
df_new['gender'].value_counts()

m      521
f      120
?       21
f,m      1
Name: gender, dtype: Int64

## Part 5:
#### Merge and export files  
Using Excel to avoid spacing issues 

In [72]:
frames = [df_new, contacts]

df_names = pd.concat(frames)
df_names.head()

Unnamed: 0,firmname,name,email,firstname,lastname,origin,gender2,gender,gender1
0,actia,Yemen Zegneni,yemen.zegneni@actia.engineering.tn,Yemen,zegneni,pema,,,
1,admedera,Hidri,Wael.hidri@admedera.com.tn,Hidri,,pema,,,
2,advans pharma,Abderrazk FEKI,abderrazek.feki@advanspharma.com,Abderrazk,Feki,pema,,,
3,adwya,foued maknassy,fmaknassy@adwya.com.tn,Foued,Maknassy,pema,,m,
4,aetech,Zoubeir Chaieb,z.chaieb@aetech-solutions.com,Zoubeir,Chaieb,pema,,,


In [73]:
# Export to Excel

df_names.to_excel(os.path.join(gdrive_path,'intermediate','giz_contact_list.xlsx'))