## Gender guesser do file  

#### Purpose: take list of contacts provided by GIZ and guess their gender based on first names
Outline:   
Part 1: Set up paths, import necessary packages  
Part 2: Load datasets (mailing lists provided by GIZ) and merge  
Part 3: Clean data and run gender-guesser package  
Part 4: (Still under construction!!!) Load existing lists of Arabic names to guess gender of remaining names  
Part 5: Export results

## Part 1
#### Download any necessary packages, import and set up paths

In [None]:
## Install these packages if you don't have them already (remove the #)

#!pip install gender_guesser
#!pip install earthpy

In [1]:
import csv
import pandas as pd
import re
import gender_guesser.detector as gender
import os
import earthpy as et

In [6]:
github_path = os.path.join(et.io.HOME, "Documents","GitHub","giz-pema-ecommerce","sampling-email-experiment")
gdrive_path = os.path.join(et.io.HOME, "Google Drive","My Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")
os.chdir(my_path)

## Part 2

#### Load datasets and merge
(Prior to this, all datasets were saved in .csv format)

In [7]:
## First PEMA dataset

pema = pd.read_csv(os.path.join(gdrive_path,'raw','BD Pema II.csv'), dtype='string' )
pema.rename(columns={'Entreprises': 'firmname', 'Prénom et nom': 'name', 'Email': 'email'}, inplace=True)
pema

Unnamed: 0,firmname,name,email
0,3Dwave,Ferid kamel,feridkamel@gmail.com
1,ABAPLAST,Akram Ben amor,abaplast@topnet.tn
2,ABIN CONSULTING,Elyes Grar,elyesgrar@gmail.com
3,ABP,Ayda Bouassida,aydabouassidaa@gmail.com
4,ABSHORE,Asma Mechri,asma.mechri@abshore.com
...,...,...,...
622,CLOUD TEMPLE,Amina brik,Amina.BRIK@cloud-temple.com
623,CLOUD TEMPLE,Ibtissem dhiab,ibtissem.DHIAB@cloud-temple.com
624,Wevioo,Amine Aloulou,Amine.Aloulou@wevioo.com
625,GET'IT,Nourchene Bouissa,nbouissa@getit-tunisia.com


##### This one needs a bit of cleaning before it can be merged

In [8]:
pema['name'] = pema['name'].str.strip()
pema[['firstname','lastname']] = pema["name"].str.split(" ", 1, expand=True)
pema['origin'] = 'pema'
pema

Unnamed: 0,firmname,name,email,firstname,lastname,origin
0,3Dwave,Ferid kamel,feridkamel@gmail.com,Ferid,kamel,pema
1,ABAPLAST,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema
2,ABIN CONSULTING,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema
3,ABP,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema
4,ABSHORE,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema
...,...,...,...,...,...,...
622,CLOUD TEMPLE,Amina brik,Amina.BRIK@cloud-temple.com,Amina,brik,pema
623,CLOUD TEMPLE,Ibtissem dhiab,ibtissem.DHIAB@cloud-temple.com,Ibtissem,dhiab,pema
624,Wevioo,Amine Aloulou,Amine.Aloulou@wevioo.com,Amine,Aloulou,pema
625,GET'IT,Nourchene Bouissa,nbouissa@getit-tunisia.com,Nourchene,Bouissa,pema


In [9]:
# Second PEMA dataset

pema2 = pd.read_csv(os.path.join(gdrive_path,'raw','BD Event lancement PEMA II.csv'), skiprows=3, dtype='string' )
pema2.rename(columns={'Nom d’utilisateur (nom original)': 'name', 'Prénom': 'firstname', 'Nom de famille': 'lastname', 'Adresse électronique': 'email'}, inplace=True)
pema2 = pema2[['name','firstname', 'lastname', 'email']]
pema2['origin'] = 'pema2'
pema2

Unnamed: 0,name,firstname,lastname,email,origin
0,Mohamed Anis,Mohamed Anis,BEN ROMDHANE,anis.benromdhane@geomatics-engineering.com,pema2
1,nabil zarai,nabil,zarai,nabil@naza.dev,pema2
2,Fatma,Fatma,Gattoufi,fattoumagatt@gmail.com,pema2
3,Noura Aloui,Noura,Aloui,nawaranour67@yahoo.fr,pema2
4,Emna,Emna,JEMMALI,emnajemmali1@gmail.com,pema2
...,...,...,...,...,...
658,Sihem Hssini,Sihem,Hssini,soussou.sassouki@gmail.com,pema2
659,RAFIKA BENHLOUA,RAFIKA,BENHLOUA,info.pharmin@yahoo.fr,pema2
660,Amira M'barek,Amira,M'barek,export@topoliva.com,pema2
661,taboubi aymen,taboubi,aymen,ataboubi@advanstunisie.com,pema2


In [11]:
# API dataset

api = pd.read_csv(os.path.join(gdrive_path,'raw','APII-BD ApiiGizAfrica.csv'), dtype='string' )
api.rename(columns={'raison_sociale': 'firmname', 'responsable': 'name'}, inplace=True)
api = api[['firmname','name', 'email']]
api['origin'] = 'api'
api

Unnamed: 0,firmname,name,email,origin
0,STE METLINE RAS JEBEL TEXTILE,FATHI MOUSSA,meratex.fm@gmail.com,api
1,STE MODERNE D'ELEVAGE,HATEM BACCAR,sme@planet.tn,api
2,AKWEL MATEUR TUNISIA,YASSINE TAGHOUTI,yassinetaghouti@akwel-automotive.com,api
3,STE BEN AISSA FRERES,ALI BEN AISSA,benaissa_freres@yahoo.fr,api
4,ALMES II,MOHAMED BECHIR EL KHIARI,almes.mateur@poulina.com.tn,api
...,...,...,...,...
5064,ABDESSATAR BEN SALHA,ABDESSATAR BEN SALHA,,api
5065,STE HASNA CUIR COLLECTION,INSAF BEN HASSINE,,api
5066,STE CHARCOLIVE INTERNATIONAL,ALEXANDRE REKIK,alexrekik@gmail.com,api
5067,STE STAMROP,DIDIER GERMAIN,alexis.stamrop@gmail.com,api


In [12]:
# Split name into first and last

api['name'] = api['name'].str.strip()
api[['firstname','lastname']] = api["name"].str.split(" ", 1, expand=True)
api

Unnamed: 0,firmname,name,email,origin,firstname,lastname
0,STE METLINE RAS JEBEL TEXTILE,FATHI MOUSSA,meratex.fm@gmail.com,api,FATHI,MOUSSA
1,STE MODERNE D'ELEVAGE,HATEM BACCAR,sme@planet.tn,api,HATEM,BACCAR
2,AKWEL MATEUR TUNISIA,YASSINE TAGHOUTI,yassinetaghouti@akwel-automotive.com,api,YASSINE,TAGHOUTI
3,STE BEN AISSA FRERES,ALI BEN AISSA,benaissa_freres@yahoo.fr,api,ALI,BEN AISSA
4,ALMES II,MOHAMED BECHIR EL KHIARI,almes.mateur@poulina.com.tn,api,MOHAMED,BECHIR EL KHIARI
...,...,...,...,...,...,...
5064,ABDESSATAR BEN SALHA,ABDESSATAR BEN SALHA,,api,ABDESSATAR,BEN SALHA
5065,STE HASNA CUIR COLLECTION,INSAF BEN HASSINE,,api,INSAF,BEN HASSINE
5066,STE CHARCOLIVE INTERNATIONAL,ALEXANDRE REKIK,alexrekik@gmail.com,api,ALEXANDRE,REKIK
5067,STE STAMROP,DIDIER GERMAIN,alexis.stamrop@gmail.com,api,DIDIER,GERMAIN


In [13]:
# Plenty of missing emails, drop

api = api[api['email'].notna()]
api.shape

(4016, 6)

In [14]:
# Merge

frames = [pema, pema2, api]

df = pd.concat(frames)
df

Unnamed: 0,firmname,name,email,firstname,lastname,origin
0,3Dwave,Ferid kamel,feridkamel@gmail.com,Ferid,kamel,pema
1,ABAPLAST,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema
2,ABIN CONSULTING,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema
3,ABP,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema
4,ABSHORE,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema
...,...,...,...,...,...,...
5059,MAISON NATURE,OUSSAMA BRAHEM,contact@maisonnature.tn,OUSSAMA,BRAHEM,api
5062,SOCIETE HAMMAMI KHALED ET FILS,KHALED HAMMAMI,proplast.tunisie@gmail.com,KHALED,HAMMAMI,api
5066,STE CHARCOLIVE INTERNATIONAL,ALEXANDRE REKIK,alexrekik@gmail.com,ALEXANDRE,REKIK,api
5067,STE STAMROP,DIDIER GERMAIN,alexis.stamrop@gmail.com,DIDIER,GERMAIN,api


## Part 3
#### Clean data and run gender-guesser package

In [15]:
# Remove duplicates

df = df.drop_duplicates(subset='email', keep="first")
df.shape

(5002, 6)

In [16]:
# Lower case first names: 

df['firstname'] = df['firstname'].str.lower()

# Recapitalize: 

df['firstname'] = df['firstname'].str.capitalize()

# Lower case firm names: 

df['firmname'] = df['firmname'].str.lower()

# Trim first name

df['firstname'] = df['firstname'].str.strip()

# Normalize key accents

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)

df['firstname'] = df['firstname'].str.translate(repl)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['firstname'] = df['firstname'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['firstname'] = df['firstname'].str.capitalize()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['firmname'] = df['firmname'].str.lower()
A value is trying to be set on a copy of a slice from a DataFram

Unnamed: 0,firmname,name,email,firstname,lastname,origin
0,3dwave,Ferid kamel,feridkamel@gmail.com,Ferid,kamel,pema
1,abaplast,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema
2,abin consulting,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema
3,abp,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema
4,abshore,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema
...,...,...,...,...,...,...
5059,maison nature,OUSSAMA BRAHEM,contact@maisonnature.tn,Oussama,BRAHEM,api
5062,societe hammami khaled et fils,KHALED HAMMAMI,proplast.tunisie@gmail.com,Khaled,HAMMAMI,api
5066,ste charcolive international,ALEXANDRE REKIK,alexrekik@gmail.com,Alexandre,REKIK,api
5067,ste stamrop,DIDIER GERMAIN,alexis.stamrop@gmail.com,Didier,GERMAIN,api


##### Run the gender guesser

In [17]:
d = gender.Detector()

df['gender'] = df['firstname'].apply(lambda x: d.get_gender(x))
df['gender'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = df['firstname'].apply(lambda x: d.get_gender(x))


male             2847
unknown          1626
female            489
mostly_female      18
mostly_male        14
andy                8
Name: gender, dtype: int64

##### Now the same but on the surnames

In [18]:
# Lower case first names: 

df['lastname'] = df['lastname'].str.lower()

# Recapitalize: 

df['lastname'] = df['lastname'].str.capitalize()

# Trim first name

df['lastname'] = df['lastname'].str.strip()

# Normalize key accents

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)

df['lastname'] = df['lastname'].str.translate(repl)

df

df['gender1'] = df['lastname'].apply(lambda x: d.get_gender(x))
df['gender1'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lastname'] = df['lastname'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lastname'] = df['lastname'].str.capitalize()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lastname'] = df['lastname'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
T

unknown          4577
male              355
female             60
mostly_female       5
andy                4
mostly_male         1
Name: gender1, dtype: int64

## Part 4 [INCOMPLETE!]
#### Load existing lists of Arabic names to guess gender of remaining names
#### Currently not used – these datasets don't change anything atm

This still leaves about a third of names as 'Unknow'. Let's use some other approaches to fix this. 

In [19]:
# Pick only the ones unknown

grouped = df.groupby(['gender','gender1'])
df_unknown = grouped.get_group(("unknown", "unknown"))
df_unknown.shape

(1433, 8)

In [20]:
# One quick fix, many names start with 'Mr' and are male: 

#mask = (df_unknown['firstname'] == 'Mr') 
#df_unknown['gender'][mask] = 'male'

fnames = {'Mr' : "male"}

df_unknown['gender2'] = df_unknown['firstname'].map(fnames)
df_unknown['gender2'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_unknown['gender2'] = df_unknown['firstname'].map(fnames)


male    9
Name: gender2, dtype: int64

In [167]:
# Load list of names from French gov't 

fra_names = pd.read_csv(os.path.join(gdrive_path,'raw','names_france.csv'), encoding= "utf-8")
fra_names.rename(columns={'01_prenom': 'names', '02_genre': 'gender3', '03_langage': 'language', '04_fréquence': 'freq'}, inplace=True)
fra_names

Unnamed: 0,names,gender3,language,freq
0,aaliyah,f,english (modern),0.0
1,aapeli,m,finnish,0.0
2,aapo,m,finnish,0.0
3,aaren,"m,f",english,0.0
4,aarne,m,finnish,0.0
...,...,...,...,...
11622,zvi,m,jewish,0.1
11623,zvonimir,m,croatian,0.0
11624,zvonimira,f,croatian,0.0
11625,zvonko,m,croatian,0.0


Some cleaning is needed

In [168]:
# Trim names

fra_names['names'] = fra_names['names'].str.strip()

fra_names[['firstname','lastname']] = fra_names['names'].str.split(" ", 1, expand=True)


# Normalize key accents

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)

fra_names['firstname'] = fra_names['firstname'].str.translate(repl)

fra_names

Unnamed: 0,names,gender3,language,freq,firstname,lastname
0,aaliyah,f,english (modern),0.0,aaliyah,
1,aapeli,m,finnish,0.0,aapeli,
2,aapo,m,finnish,0.0,aapo,
3,aaren,"m,f",english,0.0,aaren,
4,aarne,m,finnish,0.0,aarne,
...,...,...,...,...,...,...
11622,zvi,m,jewish,0.1,zvi,
11623,zvonimir,m,croatian,0.0,zvonimir,
11624,zvonimira,f,croatian,0.0,zvonimira,
11625,zvonko,m,croatian,0.0,zvonko,


In [170]:
# Merge with our dataframe

df_french = pd.merge(df_unknown,fra_names, on = 'firstname', how="left")
df_french.shape

(1433, 14)

In [67]:
# Load list of names from Kalmasoft

names = pd.read_csv(os.path.join(gdrive_path,'raw','KDBGIVE.txt'), skiprows=81, sep='\t+')
names = names[['Roman', 'Gender']]
names.rename(columns={'Roman': 'capname', 'Gender': 'Gender'}, inplace=True)
names.head()

  names = pd.read_csv('/Users/teofirpo/Desktop/KDBGIVE.txt', skiprows=81, sep='\t+')


Unnamed: 0,capname,Gender
0,Abosaymah,M
1,Aboussaima,M
2,Albastji,M
3,Elbestdji,M
4,Albastaji,M


In [68]:
# Merge with our dataframe

df_arabic = pd.merge(df_french,names, on = 'capname', how="left")

## Part 5:
#### Export files  
Using Excel to avoid spacing issues 

In [21]:
# Export to Excel

df.to_excel(os.path.join(gdrive_path,'intermediate','giz_contact_list.xlsx'))