# Import Needed Libraries

In [1]:
# To import configurations from config.ini files
import configparser
# For dataframe processes
import pandas as pd
import numpy as np

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

# Show every column when a DataFrame is displayed (optional)
pd.options.display.max_columns = None

# Importing Configuration
I am getting into the habit of creating a configuration file, config.ini. 
This file will allow me to edit my paths in one location so that I don't have to 
constantly update paths in every single file. This configuration file can even
be used for other purposes, such as saving usernames and passwords, and I can 
even add it to the .gitignore file.

In [3]:
# Import and read my config.ini file.
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so check that result here instead of failing later
# with a confusing KeyError on config['paths'].
config_file = "../src/config.ini"
config = configparser.ConfigParser()
loaded_files = config.read(config_file)
if not loaded_files:
    raise FileNotFoundError(f"Could not read configuration file: {config_file}")
# Echo the list of parsed files as the cell output
loaded_files

['../src/config.ini']

# Importing Given Data

In [6]:
# Import training and testing data.
# The variable names match the ones every later cell uses
# (train_labels, train_vals, test_vals).
train_labels = pd.read_csv(config['paths']['train_labels'])
train_vals = pd.read_csv(config['paths']['train_values'])
test_vals = pd.read_csv(config['paths']['test_data'])

# Import the submission form (sub_form)
sub_form = pd.read_csv(config['paths']['sub_form'])

In [9]:
# Report the (rows, columns) shape of every loaded dataframe
for label, frame in [('train_vals:', train_vals),
                     ('train_labels', train_labels),
                     ('test_vals', test_vals),
                     ('sub_form', sub_form)]:
    print(label, frame.shape)

train_vals: (59400, 40)
train_labels (59400, 2)
test_vals (14850, 40)
sub_form (14850, 2)


In [10]:
# Preview the first five rows of the training values
train_vals.head(5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [11]:
# Preview the first five rows of the training labels
train_labels.head(5)

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


In [12]:
# Preview the first five rows of the test values
test_vals.head(5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [13]:
# Preview the first five rows of the submission form
sub_form.head(5)

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


# Data Cleaning

In [20]:
# Attach each row's label to its feature values by joining on the shared 'id'
# key. validate='one_to_one' makes pandas raise MergeError if an id is
# duplicated on either side, instead of silently multiplying rows.
df = pd.merge(train_vals, train_labels, on='id', how='inner',
              validate='one_to_one')

In [21]:
# (rows, columns) of the merged training frame
df.shape

(59400, 41)

I see there are some nulls, so I'll take a look at those.

In [23]:
# Per-column non-null counts and dtypes; used here to spot columns with nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [26]:
# Names of the training columns that contain at least one missing value
null_cols = df.columns[df.isnull().any()].tolist()
# Display those columns; they all appear to be categorical or boolean
df.loc[:, null_cols]

Unnamed: 0,funder,installer,subvillage,public_meeting,scheme_management,scheme_name,permit
0,Roman,Roman,Mnyusi B,True,VWC,Roman,False
1,Grumeti,GRUMETI,Nyamara,,Other,,True
2,Lottery Club,World vision,Majengo,True,VWC,Nyumba ya mungu pipe scheme,True
3,Unicef,UNICEF,Mahakamani,True,VWC,,True
4,Action In A,Artisan,Kyanyamisa,True,,,True
...,...,...,...,...,...,...,...
59395,Germany Republi,CES,Kiduruni,True,Water Board,Losaa Kia water supply,True
59396,Cefa-njombe,Cefa,Igumbilo,True,VWC,Ikondo electrical water sch,True
59397,,,Madungulu,True,VWC,,False
59398,Malec,Musa,Mwinyi,True,VWC,,True


In [27]:
# Names of the test columns that contain at least one missing value
null_cols_test = test_vals.columns[test_vals.isnull().any()].tolist()
# Display those columns; they all appear to be categorical or boolean
test_vals.loc[:, null_cols_test]

Unnamed: 0,funder,installer,subvillage,public_meeting,scheme_management,scheme_name,permit
0,Dmdd,DMDD,Magoma,True,Parastatal,,True
1,Government Of Tanzania,DWE,Kimnyak,True,VWC,TPRI pipe line,True
2,,,Msatu,True,VWC,P,
3,Finn Water,FINN WATER,Kipindimbi,,VWC,,True
4,Bruder,BRUDER,Losonga,,Water Board,BRUDER,True
...,...,...,...,...,...,...,...
14845,Danida,Da,Yombo,True,VWC,Bagamoyo wate,True
14846,Hiap,HIAP,Mkondoa,True,VWC,,False
14847,,,Juhudi,True,VWC,,
14848,Germany,DWE,Namakinga B,True,VWC,Mradi wa maji wa maposeni,True


Luckily for me, the same columns have nulls in the original training and testing
sets. This means I can apply the same imputing methods to each.

In [96]:
# Recompute which test columns still hold missing values and display them
has_null = test_vals.isnull().any()
null_cols_test = list(has_null[has_null].index)
test_vals[null_cols_test]

Unnamed: 0,funder,installer,subvillage,public_meeting,scheme_management,scheme_name,permit
0,Dmdd,DMDD,Magoma,True,Parastatal,,True
1,Government Of Tanzania,DWE,Kimnyak,True,VWC,TPRI pipe line,True
2,,,Msatu,True,VWC,P,
3,Finn Water,FINN WATER,Kipindimbi,,VWC,,True
4,Bruder,BRUDER,Losonga,,Water Board,BRUDER,True
...,...,...,...,...,...,...,...
14845,Danida,Da,Yombo,True,VWC,Bagamoyo wate,True
14846,Hiap,HIAP,Mkondoa,True,VWC,,False
14847,,,Juhudi,True,VWC,,
14848,Germany,DWE,Namakinga B,True,VWC,Mradi wa maji wa maposeni,True


Getting rid of these columns will get rid of any unnecessary information.
<br>
i.e. extraction_type_group and extraction_type_class essentially provide the 
same information

In [97]:
# Columns removed from both the training and the test frame.
# NOTE(review): 'quantity_group' is kept even though the sample rows show it
# equal to 'quantity' -- confirm whether it should be dropped as well.
to_drop = [
    'date_recorded',
    'wpt_name',
    'region_code',
    'scheme_name',
    'extraction_type_group',
    'extraction_type_class',
    'management_group',
    'payment',
    'quality_group',
    'source_type',
    'source_class',
    'waterpoint_type_group',
]

In [98]:
# Remove the columns listed in to_drop from both frames.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution finds the
# columns already gone and is a no-op rather than a KeyError.
# Caveat: errors='ignore' also skips a misspelled name in to_drop silently.
df = df.drop(columns=to_drop, errors='ignore')
test_vals = test_vals.drop(columns=to_drop, errors='ignore')

I need to impute null values. I chose the placeholders "None" or "unknown" for
text columns and False for boolean columns, since these values are unknown in
the first place.

In [28]:
# Look at the columns that are about to be imputed
cols_to_impute = ['funder', 'installer', 'subvillage', 'public_meeting', 'permit']
df.loc[:, cols_to_impute]

Unnamed: 0,funder,installer,subvillage,public_meeting,permit
0,Roman,Roman,Mnyusi B,True,False
1,Grumeti,GRUMETI,Nyamara,,True
2,Lottery Club,World vision,Majengo,True,True
3,Unicef,UNICEF,Mahakamani,True,True
4,Action In A,Artisan,Kyanyamisa,True,True
...,...,...,...,...,...
59395,Germany Republi,CES,Kiduruni,True,True
59396,Cefa-njombe,Cefa,Igumbilo,True,True
59397,,,Madungulu,True,False
59398,Malec,Musa,Mwinyi,True,True


In [30]:
# Frequency of each funder value, most common first
df['funder'].value_counts()

Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Rarymond Ekura               1
Justine Marwa                1
Municipal Council            1
Afdp                         1
Samlo                        1
Name: funder, Length: 1897, dtype: int64

In [101]:
# Replace missing values in the training frame with explicit placeholders:
# text columns get a sentinel string, the two flag columns get False.
df = df.fillna({'funder': 'None', 'installer': 'unknown', 'subvillage': 'unknown',
                'public_meeting': False, 'permit': False,
                'scheme_management': 'None'})
# Cast the flags explicitly. Newer pandas no longer downcasts a filled object
# column to bool automatically, and the later select_dtypes(include='bool')
# split would otherwise miss these two columns.
df = df.astype({'public_meeting': bool, 'permit': bool})

In [102]:
# Replace missing values in the test frame with the same placeholders that are
# used for the training frame: sentinel strings for text, False for flags.
test_vals = test_vals.fillna({'funder': 'None', 'installer': 'unknown',
                              'subvillage': 'unknown',
                              'public_meeting': False, 'permit': False,
                              'scheme_management': 'None'})
# Cast the flags explicitly. Newer pandas no longer downcasts a filled object
# column to bool automatically, and the later select_dtypes(include='bool')
# split would otherwise miss these two columns.
test_vals = test_vals.astype({'public_meeting': bool, 'permit': bool})

In [103]:
# Re-check non-null counts and dtypes after dropping columns and filling nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 59400 non-null  int64  
 1   amount_tsh         59400 non-null  float64
 2   funder             59400 non-null  object 
 3   gps_height         59400 non-null  int64  
 4   installer          59400 non-null  object 
 5   longitude          59400 non-null  float64
 6   latitude           59400 non-null  float64
 7   num_private        59400 non-null  int64  
 8   basin              59400 non-null  object 
 9   subvillage         59400 non-null  object 
 10  region             59400 non-null  object 
 11  district_code      59400 non-null  int64  
 12  lga                59400 non-null  object 
 13  ward               59400 non-null  object 
 14  population         59400 non-null  int64  
 15  public_meeting     59400 non-null  bool   
 16  recorded_by        594

In [104]:
# Names of the object-dtype feature columns, excluding the target column.
# Computed from df directly because `objs` is only created in a later cell,
# so referencing it here raises NameError on a fresh top-to-bottom run.
df.select_dtypes(include='O').drop(columns='status_group').columns

Index(['funder', 'installer', 'basin', 'subvillage', 'region', 'lga', 'ward',
       'recorded_by', 'scheme_management', 'extraction_type', 'management',
       'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source',
       'waterpoint_type'],
      dtype='object')

In [107]:
# Partition the training frame by dtype; the label column is kept separately
targets = df['status_group']
objs = df.select_dtypes(include='object').drop(columns=['status_group'])
nums = df.select_dtypes(include=['int64', 'float64'])
bools = df.select_dtypes(include='bool')

In [108]:
# Partition the test frame by dtype (it has no label column to set aside)
bools_test = test_vals.select_dtypes(include='bool')
nums_test = test_vals.select_dtypes(include=['int64', 'float64'])
objs_test = test_vals.select_dtypes(include='object')

In [109]:
# One-hot encode the object columns of each set.
# NOTE(review): objs_dummies / objs_dummies_test are not created in any cell
# shown in this notebook, so this cell fails with NameError on a fresh run.
# pd.get_dummies with default options is assumed here -- confirm against the
# encoding that was originally used.
objs_dummies = pd.get_dummies(objs)
objs_dummies_test = pd.get_dummies(objs_test)

# Give both frames the union of the dummy columns; a category that is absent
# from one set becomes an all-zero column there.
train, test = objs_dummies.align(objs_dummies_test, join='outer',
                                 axis=1, fill_value=0)

In [110]:
# Join the original object, numeric and boolean columns (and, for training,
# the target) back onto the aligned dummy frames, matching rows by index.
on_index = dict(left_index=True, right_index=True)

train = (
    train
    .merge(objs, **on_index)
    .merge(nums, **on_index)
    .merge(bools, **on_index)
    .merge(targets, **on_index)
)
test = (
    test
    .merge(objs_test, **on_index)
    .merge(nums_test, **on_index)
    .merge(bools_test, **on_index)
)

In [113]:
# (rows, columns) of the assembled training set
train.shape

(59400, 28343)

In [114]:
# (rows, columns) of the assembled test set
test.shape

(14850, 28342)

# Saving Transformed Data

In [121]:
# Persist the transformed sets under the configured data directory
data_dir = config['paths']['data_path']
train.to_csv(f"{data_dir}training_set.csv")
test.to_csv(f"{data_dir}testing_set.csv")

# Importing Transformed Data

In [37]:
# Reload the saved sets. to_csv wrote the row index as an unnamed first
# column, so read that column back as the index instead of letting it turn
# into a spurious 'Unnamed: 0' feature column.
train = pd.read_csv(config['paths']['data_path'] + 'training_set.csv',
                    index_col=0)
test = pd.read_csv(config['paths']['data_path'] + 'testing_set.csv',
                   index_col=0)