## Samp – import all files  

#### Purpose: import list of contacts provided by GIZ and merge them; import gendered names lists and merge them
Outline:   
Part 1: Set up paths, import necessary packages  
Part 2: Load datasets (mailing lists provided by GIZ) 
Part 3: Load datasets (gendered names lists)   
Part 4: Save results as intermediate files

## Part 1  
#### Download any necessary packages, import and set up paths

In [None]:
## Install these packages if you don't have them already
## (remove the leading # from the line below and run this cell once)
## Note: inside a notebook, `%pip install earthpy` installs into the running kernel's environment.

#!pip install earthpy

In [1]:
import csv
import pandas as pd
import re
import os
import earthpy as et

In [2]:
# Location of the local clone of the project repository.
github_path = os.path.join(et.io.HOME, "Documents","GitHub","giz-pema-ecommerce","sampling-email-experiment")

# The shared data folder is looked up in two places: directly under "Google Drive",
# or under "Google Drive/My Drive". os.chdir doubles as the existence check for the
# first location; only filesystem errors (missing folder, not a directory, no
# permission) trigger the fallback, so interrupts and programming errors are no
# longer silently swallowed.
# Note: the working directory is changed only when the first location exists; the
# later cells visible here build their paths from gdrive_path rather than the cwd.
try:
    gdrive_path = os.path.join(et.io.HOME, "Google Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")
    os.chdir(gdrive_path)
except OSError:
    gdrive_path = os.path.join(et.io.HOME, "Google Drive","My Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")

## Part 2  
#### Load GIZ datasets and merge  
(Prior to this, all datasets were saved in .csv format)

In [3]:
## First PEMA dataset

# Read every column as a string and switch the French headers to short English names.
pema = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'BD Pema II.csv'),
    dtype='string',
).rename(columns={'Entreprises': 'firmname', 'Prénom et nom': 'name', 'Email': 'email'})
pema.head()

Unnamed: 0,firmname,name,email
0,3Dwave,Ferid kamel,feridkamel@gmail.com
1,ABAPLAST,Akram Ben amor,abaplast@topnet.tn
2,ABIN CONSULTING,Elyes Grar,elyesgrar@gmail.com
3,ABP,Ayda Bouassida,aydabouassidaa@gmail.com
4,ABSHORE,Asma Mechri,asma.mechri@abshore.com


In [4]:
# Second PEMA dataset (launch-event export)

# The first three lines of the file are skipped before the header row is read.
pema2 = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'BD Event lancement PEMA II.csv'),
    skiprows=3,
    dtype='string',
).rename(columns={
    'Nom d’utilisateur (nom original)': 'name',
    'Prénom': 'firstname',
    'Nom de famille': 'lastname',
    'Adresse électronique': 'email',
})
# Keep only the contact columns.
pema2 = pema2[['name','firstname', 'lastname', 'email']]
pema2.head()

Unnamed: 0,name,firstname,lastname,email
0,Mohamed Anis,Mohamed Anis,BEN ROMDHANE,anis.benromdhane@geomatics-engineering.com
1,nabil zarai,nabil,zarai,nabil@naza.dev
2,Fatma,Fatma,Gattoufi,fattoumagatt@gmail.com
3,Noura Aloui,Noura,Aloui,nawaranour67@yahoo.fr
4,Emna,Emna,JEMMALI,emnajemmali1@gmail.com


In [5]:
# APII dataset (file 'APII-BD ApiiGizAfrica.csv')

# Read everything as strings, translate the French column names, then keep the
# columns of interest ('email' and 'export' already carry their final names).
api = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'APII-BD ApiiGizAfrica.csv'),
    dtype='string',
).rename(columns={
    'raison_sociale': 'firmname',
    'gouvernorat' : 'governorate',
    'delegation':'town',
    'secteur_nat':'sector',
    'responsable': 'name',
    'effectif_total':'fte',
})
api = api[['firmname','name', 'email', 'governorate', 'town', 'sector','fte', 'export']]
api.head()

Unnamed: 0,firmname,name,email,governorate,town,sector,fte,export
0,STE METLINE RAS JEBEL TEXTILE,FATHI MOUSSA,meratex.fm@gmail.com,Bizerte,Ras el jebel,Industries textiles et habillement,83,TOTALEMENT EXPORTATRICE
1,STE MODERNE D'ELEVAGE,HATEM BACCAR,sme@planet.tn,Bizerte,Utique,Industries agricoles et alimentaires,54,NON TOTALEMENT EXPORTATRICE
2,AKWEL MATEUR TUNISIA,YASSINE TAGHOUTI,yassinetaghouti@akwel-automotive.com,Bizerte,Mateur,Fabrication de matériel de transport,973,TOTALEMENT EXPORTATRICE
3,STE BEN AISSA FRERES,ALI BEN AISSA,benaissa_freres@yahoo.fr,Bizerte,El alia,Industries agricoles et alimentaires,21,NON TOTALEMENT EXPORTATRICE
4,ALMES II,MOHAMED BECHIR EL KHIARI,almes.mateur@poulina.com.tn,Bizerte,Mateur,Industries agricoles et alimentaires,160,NON TOTALEMENT EXPORTATRICE


## Part 3  
#### Load gendered names lists and merge

In [6]:
# French government first-name list (source: https://www.data.gouv.fr/fr/datasets/liste-de-prenoms/)

fra_names = pd.read_csv(os.path.join(gdrive_path, 'raw', 'names_france.csv'), encoding="utf-8")
# Keep the first-name and gender columns, then rename them.
# The language/frequency entries of the mapping match no remaining column, so
# rename leaves them unused.
fra_names = fra_names[['01_prenom', '02_genre']].rename(
    columns={'01_prenom': 'firstname', '02_genre': 'gender', '03_langage': 'language', '04_fréquence': 'freq'}
)
fra_names.head()

Unnamed: 0,firstname,gender
0,aaliyah,f
1,aapeli,m
2,aapo,m
3,aaren,"m,f"
4,aarne,m


In [7]:
# Kalmasoft given-names list (tab-separated text file)

names = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'KDBGIVE.txt'),
    sep='\t+',        # one or more tabs between fields (regex separator, hence the python engine)
    engine='python',
    skiprows=81,      # skip the first 81 lines of the file before the header row
)
# Keep the romanised spelling and the gender flag under the shared column names.
names = names[['Roman', 'Gender']].rename(columns={'Roman': 'firstname', 'Gender': 'gender'})
names.head()

Unnamed: 0,firstname,gender
0,Abosaymah,M
1,Aboussaima,M
2,Albastji,M
3,Elbestdji,M
4,Albastaji,M


In [8]:
# Names list from Florian (1): 'noms_manquants.csv', read as-is

missing_names = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'noms_manquants.csv'),
    encoding="utf-8",
)
missing_names.head()

Unnamed: 0,firstname,_freq,gender
0,Abbes,1,M
1,Abdderazek,1,M
2,Abdeladel,1,M
3,Abdelamjid,1,M
4,Abdelatif,1,M


In [9]:
# Names list from Florian (2): male first names

# Column headers are switched to the names shared by the other name lists.
male_names = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'males.csv'),
    encoding="utf-8",
).rename(columns={'Name': 'firstname', 'Gender': 'gender'})
male_names.head()

Unnamed: 0,firstname,gender
0,aaban,M
1,aabid,M
2,aadil,M
3,aahil,M
4,aalam,M


In [10]:
# Names list from Florian (2): female first names

# Column headers are switched to the names shared by the other name lists.
female_names = pd.read_csv(
    os.path.join(gdrive_path, 'raw', 'females.csv'),
    encoding="utf-8",
).rename(columns={'Name': 'firstname', 'Gender': 'gender'})
female_names.head()

Unnamed: 0,firstname,gender
0,aabidah,F
1,aabirah,F
2,aabish,F
3,aadab,F
4,aadila,F


In [15]:
# Names list cleaned by Kaïs (semicolon-separated, read from the intermediate folder)

missing = pd.read_csv(
    os.path.join(gdrive_path, 'intermediate', 'missing_gender.csv'),
    sep=";",
    encoding="utf-8",
).rename(columns={'name': 'firstname'})
missing.head()

Unnamed: 0,firstname,gender
0,'آمال قاسم حرم,female
1,'أنيس,male
2,'رابح,male
3,سحر,unknown
4,'قرفال,unknown


## Part 4
#### Save files as CSVs in the intermediate data folder

In [16]:
# Write every loaded table to the intermediate folder, one CSV per DataFrame.
# The mapping keeps the original write order (dicts preserve insertion order).
frames_to_save = {
    'pema.csv': pema,
    'pema2.csv': pema2,
    'api.csv': api,
    'fra_names.csv': fra_names,
    'names.csv': names,
    'missing_names.csv': missing_names,
    'male_names.csv': male_names,
    'female_names.csv': female_names,
    'missing.csv': missing,
}
for csv_name, frame in frames_to_save.items():
    # Default to_csv settings: the row index is written as the first column.
    frame.to_csv(os.path.join(gdrive_path, 'intermediate', csv_name))