## Samp – clean

#### Purpose: Clean files
Outline:   
Part 1: Set up paths, import necessary packages  
Part 2: Load intermediate files  
Part 3: Clean files  
Part 4: Export

## Part 1  
#### Download any necessary packages, import and set up paths

In [1]:
## Install these packages if you don't have them already (remove the #)

#!pip install earthpy

In [1]:
import os
import earthpy as et
import csv
import pandas as pd
import re

In [2]:
github_path = os.path.join(et.io.HOME, "Documents","GitHub","giz-pema-ecommerce","sampling-email-experiment")

try:
    gdrive_path = os.path.join(et.io.HOME, "Google Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")
    os.chdir(gdrive_path)
except:
    gdrive_path = os.path.join(et.io.HOME, "Google Drive","My Drive", "Research_GIZ_Tunisia_exportpromotion","1. Intervention I – E-commerce","data","0-sampling-email-experiment")

## Part 2  
#### Load intermediate files

In [3]:
pema = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'pema.csv'), encoding= "utf-8", index_col=0)
pema2 = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'pema2.csv'), encoding= "utf-8", index_col=0)
api = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'api.csv'), encoding= "utf-8", index_col=0)
fra_names = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'fra_names.csv'), encoding= "utf-8", index_col=0)
names = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'names.csv'), encoding= "utf-8", index_col=0)
missing_names = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'missing_names.csv'), encoding= "utf-8", index_col=0)
male_names = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'male_names.csv'), encoding= "utf-8", index_col=0)
female_names = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'female_names.csv'), encoding= "utf-8", index_col=0)
missing = pd.read_csv(os.path.join(gdrive_path,'intermediate', 'missing.csv'), encoding= "utf-8", index_col=0)

## Part 3  
#### Clean files

#### Clean 'Pema'

In [4]:
# Split name into first and last

pema['name'] = pema['name'].str.strip()
pema[['firstname','lastname']] = pema["name"].str.split(" ", 1, expand=True)

# Add column identifying the origin of the contact
pema['origin'] = 'pema'

#### Clean 'Pema 2'

In [5]:
# Add column identifying the origin of the contact

pema2['origin'] = 'pema2'

#### Clean 'API'

In [6]:
# Add column identifying the origin of the contact

api['origin'] = 'api'

In [7]:
# Split name into first and last

api['name'] = api['name'].str.strip()
api[['firstname','lastname']] = api["name"].str.split(" ", 1, expand=True)

In [8]:
# Plenty of missing emails, drop

api = api[api['email'].notna()]
api.shape

(4016, 11)

#### Clean 'fra_names'

In [9]:
# Replace 'm' with 'male' etc

fra_names['gender'] = fra_names['gender'].str.replace('m','male')
fra_names['gender'] = fra_names['gender'].str.replace('f','female')
fra_names['gender'] = fra_names['gender'].str.replace('m,f','mostly male')
fra_names['gender'] = fra_names['gender'].str.replace('f,m','mostly female')
fra_names['gender'].value_counts()

male           5878
female         5313
male,female     289
female,male     147
Name: gender, dtype: int64

#### Clean 'names'

In [10]:
names['gender'] = names['gender'].str.replace('M','male')
names['gender'] = names['gender'].str.replace('F','female')
names['gender'].value_counts()

male      502
female    152
Name: gender, dtype: int64

#### Clean 'missing_names'

In [11]:
missing_names['gender'] = missing_names['gender'].str.replace('M','male')
missing_names['gender'] = missing_names['gender'].str.replace('F','female')
missing_names['gender'] = missing_names['gender'].str.replace('?','unknown')
missing_names['gender'].value_counts()

male       276
unknown     63
female      49
Name: gender, dtype: int64

#### Clean 'male_names'

In [12]:
male_names['gender'] = male_names['gender'].str.replace('M','male')
male_names['gender'].value_counts()

male    2066
Name: gender, dtype: int64

#### Clean 'female_names'

In [13]:
female_names['gender'] = female_names['gender'].str.replace('F','female')
female_names['gender'].value_counts()

female    2445
Name: gender, dtype: int64

## Part 4  
#### Save files  as csvs in the intermediate data folder

In [14]:
pema.to_csv(os.path.join(gdrive_path,'intermediate','pema.csv'))
pema2.to_csv(os.path.join(gdrive_path,'intermediate','pema2.csv'))
api.to_csv(os.path.join(gdrive_path,'intermediate','api.csv'))
fra_names.to_csv(os.path.join(gdrive_path,'intermediate','fra_names.csv'))
names.to_csv(os.path.join(gdrive_path,'intermediate','names.csv'))
missing_names.to_csv(os.path.join(gdrive_path,'intermediate','missing_names.csv'))
male_names.to_csv(os.path.join(gdrive_path,'intermediate','male_names.csv'))
female_names.to_csv(os.path.join(gdrive_path,'intermediate','female_names.csv'))
missing.to_csv(os.path.join(gdrive_path,'intermediate','missing.csv'))