## Samp – correct

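#### Purpose: Clean the contact list and the gendered-names list (trim whitespace, normalize accents and capitalization, drop duplicates and rows without a first name)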
Outline:   
Part 1: Set up paths, import necessary packages  
Part 2: Load intermediate files  
Part 3: Correct files  
Part 4: Export

## Part 1  
#### Install any missing packages, import them, and set up paths

In [1]:
## Install these packages if you don't have them already (remove the leading #).
## Inside Jupyter, `%pip install earthpy` is the preferred form because it
## installs into the environment of the running kernel.

#!pip install earthpy

In [1]:
import os
import earthpy as et
import csv
import pandas as pd
import re

In [2]:
# Local clone of the project repository.
github_path = os.path.join(et.io.HOME, "Documents","GitHub","giz-pema-ecommerce","sampling-email-experiment")

# Folder layout of the project data inside Google Drive. The two candidate
# locations below differ only in the extra "My Drive" level.
_gdrive_subdirs = ("Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")

_gdrive_primary = os.path.join(et.io.HOME, "Google Drive", *_gdrive_subdirs)
gdrive_path = _gdrive_primary

try:
    # chdir doubles as an existence/permission probe for the primary location.
    os.chdir(gdrive_path)
except OSError:
    # Only a missing or inaccessible folder triggers the fallback; any other
    # exception (e.g. KeyboardInterrupt, a typo) is no longer swallowed.
    gdrive_path = os.path.join(et.io.HOME, "Google Drive","My Drive", *_gdrive_subdirs)
    if not os.path.isdir(gdrive_path):
        # Fail here with both candidates named, instead of later inside read_csv.
        raise FileNotFoundError(
            f"Project data folder not found at '{_gdrive_primary}' or '{gdrive_path}'"
        )

## Part 2  
#### Load intermediate files

In [3]:
# Both inputs live in the 'intermediate' sub-folder of the Drive data directory.
intermediate_dir = os.path.join(gdrive_path, 'intermediate')

# Contact list that has not yet been matched to a gender.
contacts = pd.read_csv(
    os.path.join(intermediate_dir, 'giz_contact_list_ungendered.csv'),
    encoding="utf-8",
    index_col=0,
)

# Lookup table of first names.
df_names = pd.read_csv(
    os.path.join(intermediate_dir, 'gendered_names.csv'),
    encoding="utf-8",
    index_col=0,
)

## Part 3  
#### Correct files

#### Some cleaning and dropping duplicates

#### Start with df_names

In [4]:
# Trim names

df_names['firstname'] = df_names['firstname'].str.strip()

# Normalize key accents
#
# Names may still contain capital letters at this point, so the table maps the
# upper-case accented letters as well. With a lower-case-only table, a name
# such as "HÉLÈNE" would keep its accents here and end up as "Hélène" once it
# is lower-cased and re-capitalized.

accented_chars = "áéúíóçèîêàöëù"
plain_chars = "aeuioceieaoeu"

repl = str.maketrans(
    accented_chars + accented_chars.upper(),
    plain_chars + plain_chars.upper()
)

df_names['firstname'] = df_names['firstname'].str.translate(repl)

df_names.shape

(17568, 2)

In [5]:
def _lower_if_str(value):
    """Lower-case plain strings; return any other value untouched."""
    return value.lower() if type(value) is str else value


def _capitalize_if_str(value):
    """Capitalize plain strings; return any other value untouched."""
    return value.capitalize() if type(value) is str else value


# Lower-case every string cell of the table (first names and gender alike).
df_names = df_names.applymap(_lower_if_str)

# Then restore an initial capital on the first names only.
df_names['firstname'] = df_names['firstname'].apply(_capitalize_if_str)

In [6]:
# Discard rows without a first name, then keep only the first row seen for
# each distinct first name.
df_names = (
    df_names
    .dropna(subset=['firstname'])
    .drop_duplicates(subset='firstname', keep="first")
)

df_names.shape

(16421, 2)

#### Now with contacts

In [7]:
# One row per e-mail address (first occurrence wins), then discard contacts
# that have no first name.
# NOTE(review): the e-mail column is compared as-is, so addresses differing
# only in letter case count as distinct, and rows with a missing e-mail are
# presumably treated as duplicates of each other — confirm this is intended.
contacts = (
    contacts
    .drop_duplicates(subset='email', keep="first")
    .dropna(subset=['firstname'])
)
contacts.shape

(5001, 11)

In [8]:
# Normalize key accents (lower-case letters only: names are lower-cased
# before the table is applied)

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)


def _normalize_firstname(value):
    """Return a cleaned first name; non-string values pass through unchanged.

    The steps run in this order on purpose:
    1. strip      - otherwise leading whitespace stops capitalize() from
                    reaching the first letter (" ferid" would end as "ferid").
    2. lower      - so the lower-case accent table matches every letter.
    3. translate  - replace the accented letters with their plain form.
    4. capitalize - last, so a leading accented capital ("É") has already
                    been normalized before it is upper-cased again.
    """
    if type(value) != str:
        return value
    return value.strip().lower().translate(repl).capitalize()


# Trim, lower-case, strip accents and recapitalize first names:

contacts['firstname'] = contacts['firstname'].apply(_normalize_firstname)

# Lower case firm names:

contacts['firmname'] = contacts['firmname'].apply(lambda x: x.lower() if type(x) == str else x)

contacts.head()

Unnamed: 0,firmname,name,email,firstname,lastname,origin,governorate,town,sector,fte,export
0,3dwave,Ferid kamel,feridkamel@gmail.com,Ferid,kamel,pema,,,,,
1,abaplast,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema,,,,,
2,abin consulting,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema,,,,,
3,abp,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema,,,,,
4,abshore,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema,,,,,


In [9]:
# Normalize key accents (lower-case letters only: names are lower-cased
# before the table is applied)

repl = str.maketrans(
    "áéúíóçèîêàöëù",
    "aeuioceieaoeu"
)


def _normalize_lastname(value):
    """Return a cleaned last name; non-string values pass through unchanged.

    The steps run in this order on purpose:
    1. strip      - otherwise leading whitespace stops capitalize() from
                    reaching the first letter.
    2. lower      - so the lower-case accent table matches every letter.
    3. translate  - replace the accented letters with their plain form.
    4. capitalize - last, so a leading accented capital has already been
                    normalized before it is upper-cased again. Only the first
                    letter of the whole string is capitalized
                    ("ben amor" -> "Ben amor").
    """
    if type(value) != str:
        return value
    return value.strip().lower().translate(repl).capitalize()


# Trim, lower-case, strip accents and recapitalize last names:

contacts['lastname'] = contacts['lastname'].apply(_normalize_lastname)

contacts.head()

Unnamed: 0,firmname,name,email,firstname,lastname,origin,governorate,town,sector,fte,export
0,3dwave,Ferid kamel,feridkamel@gmail.com,Ferid,Kamel,pema,,,,,
1,abaplast,Akram Ben amor,abaplast@topnet.tn,Akram,Ben amor,pema,,,,,
2,abin consulting,Elyes Grar,elyesgrar@gmail.com,Elyes,Grar,pema,,,,,
3,abp,Ayda Bouassida,aydabouassidaa@gmail.com,Ayda,Bouassida,pema,,,,,
4,abshore,Asma Mechri,asma.mechri@abshore.com,Asma,Mechri,pema,,,,,


## Part 4  
#### Export

In [10]:
# Write both cleaned tables to the 'intermediate' folder as CSV.
# NOTE: these are the same file names the notebook loaded its inputs from, so
# the original intermediate files are overwritten in place.
export_dir = os.path.join(gdrive_path, 'intermediate')

contacts.to_csv(os.path.join(export_dir, 'giz_contact_list_ungendered.csv'))
df_names.to_csv(os.path.join(export_dir, 'gendered_names.csv'))