# Import Needed Libraries

In [9]:
# to import configured input and output paths
import configparser
# Dataframe building and manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

# Lift the column cap so rendered DataFrames show every column
pd.options.display.max_columns = None

# Importing Configuration
I frequently switch between my desktop and laptop, so I am getting into the
habit of keeping a configuration file, config.ini. It lets me edit my paths in
one place instead of updating them in every notebook. A configuration file can
serve many other purposes too, such as storing usernames and passwords, and
because I can list it in the .gitignore file, those machine-specific or
sensitive values never have to be committed!

In [2]:
# import my config.ini file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed, so fail here with a clear message instead of an
# opaque KeyError when config['paths'] is first used.
loaded_configs = config.read("../src/config.ini")
if not loaded_configs:
    raise FileNotFoundError(
        "Could not read '../src/config.ini'; create it with a [paths] "
        "section that defines data_path."
    )
# Display the list of configuration files that were loaded
loaded_configs

['../src/config.ini']

# Importing Given Data

In [91]:
# Path prefix for the data files, taken from config.ini
data_dir = config['paths']['data_path']

# Training data: feature values and their labels
train_vals = pd.read_csv(data_dir + 'training_set_values.csv')
train_labels = pd.read_csv(data_dir + 'training_set_labels.csv')

# Files for the competition submission
sub_form = pd.read_csv(data_dir + 'SubmissionFormat.csv')
test_vals = pd.read_csv(data_dir + 'test_set_values.csv')

In [92]:
# Report the (rows, columns) shape of each loaded dataframe
for label, frame in (('train_vals:', train_vals),
                     ('train_labels', train_labels),
                     ('test_vals', test_vals)):
    print(label, frame.shape)

train_vals: (59400, 40)
train_labels (59400, 2)
test_vals (14850, 40)


# Data Cleaning

In [93]:
# Join the labels onto the feature rows through their shared 'id' key,
# keeping ids that appear in either frame (outer join)
df = train_vals.merge(train_labels, on='id', how='outer')

I see there are some nulls, so I'll take a look at those.

In [94]:
# Summarize every column's non-null count and dtype to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 41 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     59400 non-null  int64  
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [95]:
# Names of the columns holding at least one missing value, in frame order
null_cols = df.columns[df.isnull().any()].tolist()
# Display just those columns
df[null_cols]

Unnamed: 0,funder,installer,subvillage,public_meeting,scheme_management,scheme_name,permit
0,Roman,Roman,Mnyusi B,True,VWC,Roman,False
1,Grumeti,GRUMETI,Nyamara,,Other,,True
2,Lottery Club,World vision,Majengo,True,VWC,Nyumba ya mungu pipe scheme,True
3,Unicef,UNICEF,Mahakamani,True,VWC,,True
4,Action In A,Artisan,Kyanyamisa,True,,,True
...,...,...,...,...,...,...,...
59395,Germany Republi,CES,Kiduruni,True,Water Board,Losaa Kia water supply,True
59396,Cefa-njombe,Cefa,Igumbilo,True,VWC,Ikondo electrical water sch,True
59397,,,Madungulu,True,VWC,,False
59398,Malec,Musa,Mwinyi,True,VWC,,True


Luckily for me, the same columns contain nulls in both the original training and
testing sets. This means I can apply the same imputation steps to each.

In [96]:
# Names of the test-set columns holding at least one missing value
null_cols_test = test_vals.columns[test_vals.isnull().any()].tolist()
# Display just those columns
test_vals[null_cols_test]

Unnamed: 0,funder,installer,subvillage,public_meeting,scheme_management,scheme_name,permit
0,Dmdd,DMDD,Magoma,True,Parastatal,,True
1,Government Of Tanzania,DWE,Kimnyak,True,VWC,TPRI pipe line,True
2,,,Msatu,True,VWC,P,
3,Finn Water,FINN WATER,Kipindimbi,,VWC,,True
4,Bruder,BRUDER,Losonga,,Water Board,BRUDER,True
...,...,...,...,...,...,...,...
14845,Danida,Da,Yombo,True,VWC,Bagamoyo wate,True
14846,Hiap,HIAP,Mkondoa,True,VWC,,False
14847,,,Juhudi,True,VWC,,
14848,Germany,DWE,Namakinga B,True,VWC,Mradi wa maji wa maposeni,True


Dropping these columns removes unnecessary information.
<br>
e.g. extraction_type_group and extraction_type_class essentially provide the
same information

In [97]:
# Columns to remove from both the training and the test frame
to_drop = [
    'date_recorded',
    'wpt_name',
    'region_code',
    'scheme_name',
    'extraction_type_group',
    'extraction_type_class',
    'management_group',
    'payment',
    'quality_group',
    'source_type',
    'source_class',
    'waterpoint_type_group',
]

In [98]:
# Remove the unneeded columns from both frames. Rebinding the result (instead
# of inplace=True) together with errors='ignore' keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError on columns that
# are already gone.
# Caveat: errors='ignore' also skips misspelled names in to_drop, so verify the
# remaining column count afterwards.
df = df.drop(columns=to_drop, errors='ignore')
test_vals = test_vals.drop(columns=to_drop, errors='ignore')

I need to impute the null values. I fill them with a placeholder label ("None" or
"unknown") and False where I am able to, since these values are unknown in the
first place.

In [99]:
# Preview the columns with missing values that are about to be imputed
df.loc[:, ['funder', 'installer', 'subvillage', 'public_meeting', 'permit']]

Unnamed: 0,funder,installer,subvillage,public_meeting,permit
0,Roman,Roman,Mnyusi B,True,False
1,Grumeti,GRUMETI,Nyamara,,True
2,Lottery Club,World vision,Majengo,True,True
3,Unicef,UNICEF,Mahakamani,True,True
4,Action In A,Artisan,Kyanyamisa,True,True
...,...,...,...,...,...
59395,Germany Republi,CES,Kiduruni,True,True
59396,Cefa-njombe,Cefa,Igumbilo,True,True
59397,,,Madungulu,True,False
59398,Malec,Musa,Mwinyi,True,True


In [100]:
# Frequency of each scheme_management category (nulls are not counted)
df['scheme_management'].value_counts()

VWC                 36793
WUG                  5206
Water authority      3153
WUA                  2883
Water Board          2748
Parastatal           1680
Private operator     1063
Company              1061
Other                 766
SWC                    97
Trust                  72
None                    1
Name: scheme_management, dtype: int64

In [101]:
# Impute missing values: placeholder labels for the categorical columns and
# False for the two boolean flags.
df = df.fillna({
    'funder': 'None',
    'installer': 'unknown',
    'subvillage': 'unknown',
    'public_meeting': False,
    'permit': False,
    'scheme_management': 'None',
})
# Cast the flags to bool explicitly. A later cell collects them with
# select_dtypes(include='bool'); relying on fillna to downcast the former
# object columns is deprecated in recent pandas releases, where they would
# otherwise remain object dtype and be picked up as categorical instead.
df = df.astype({'public_meeting': bool, 'permit': bool})

In [102]:
# Impute the test set with the same placeholders used for the training frame:
# labels for the categorical columns and False for the two boolean flags.
test_vals = test_vals.fillna({
    'funder': 'None',
    'installer': 'unknown',
    'subvillage': 'unknown',
    'public_meeting': False,
    'permit': False,
    'scheme_management': 'None',
})
# Cast the flags to bool explicitly. A later cell collects them with
# select_dtypes(include='bool'); relying on fillna to downcast the former
# object columns is deprecated in recent pandas releases, where they would
# otherwise remain object dtype and be picked up as categorical instead.
test_vals = test_vals.astype({'public_meeting': bool, 'permit': bool})

In [103]:
# Re-check non-null counts and dtypes after dropping columns and imputing
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 0 to 59399
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 59400 non-null  int64  
 1   amount_tsh         59400 non-null  float64
 2   funder             59400 non-null  object 
 3   gps_height         59400 non-null  int64  
 4   installer          59400 non-null  object 
 5   longitude          59400 non-null  float64
 6   latitude           59400 non-null  float64
 7   num_private        59400 non-null  int64  
 8   basin              59400 non-null  object 
 9   subvillage         59400 non-null  object 
 10  region             59400 non-null  object 
 11  district_code      59400 non-null  int64  
 12  lga                59400 non-null  object 
 13  ward               59400 non-null  object 
 14  population         59400 non-null  int64  
 15  public_meeting     59400 non-null  bool   
 16  recorded_by        594

In [104]:
# List the object-dtype feature columns (everything except the target).
# Computed straight from df because `objs` is only assigned in a later cell,
# so referencing it here raises NameError on a fresh Restart & Run All.
df.select_dtypes(include='O').drop(columns='status_group').columns

Index(['funder', 'installer', 'basin', 'subvillage', 'region', 'lga', 'ward',
       'recorded_by', 'scheme_management', 'extraction_type', 'management',
       'payment_type', 'water_quality', 'quantity', 'quantity_group', 'source',
       'waterpoint_type'],
      dtype='object')

In [107]:
# Split the training frame by dtype; the target is kept on its own so it is
# not treated as a feature.
targets = df['status_group']
bools = df.select_dtypes(include='bool')
nums = df.select_dtypes(include=['int64', 'float64'])

# Object-dtype features, with the target column removed
object_frame = df.select_dtypes(include='O')
objs = object_frame.drop(columns='status_group')

In [108]:
# Same dtype split for the test set (it has no target column to remove)
bools_test = test_vals.select_dtypes(include='bool')
nums_test = test_vals.select_dtypes(include=['int64', 'float64'])
objs_test = test_vals.select_dtypes(include='O')

In [109]:
# NOTE(review): objs_dummies / objs_dummies_test were not assigned anywhere in
# the notebook, so this cell raised NameError after Restart & Run All. They are
# presumably the one-hot encodings of the object columns — confirm that the
# default pd.get_dummies options match the encoding originally used.
objs_dummies = pd.get_dummies(objs)
objs_dummies_test = pd.get_dummies(objs_test)

# Align on the union of dummy columns so both frames expose the same set;
# a column present in only one frame is added to the other filled with 0.
train, test = objs_dummies.align(objs_dummies_test, join='outer',
                                 axis=1, fill_value=0)

In [110]:
# Stitch the aligned dummy columns back together with the remaining pieces
# using index-on-index merges.
index_join = {'left_index': True, 'right_index': True}

train = (
    train
    .merge(objs, **index_join)
    .merge(nums, **index_join)
    .merge(bools, **index_join)
    .merge(targets, **index_join)
)
test = (
    test
    .merge(objs_test, **index_join)
    .merge(nums_test, **index_join)
    .merge(bools_test, **index_join)
)

In [113]:
# (rows, columns) of the assembled training frame
train.shape

(59400, 28343)

In [114]:
# (rows, columns) of the assembled test frame
test.shape

(14850, 28342)

# Saving Transformed Data

In [121]:
# Persist both transformed frames under the configured data path.
# With to_csv defaults, the row index is written as an unnamed first column.
out_dir = config['paths']['data_path']
test.to_csv(out_dir + 'testing_set.csv')
train.to_csv(out_dir + 'training_set.csv')

# Importing Transformed Data

In [37]:
# Reload the transformed frames written by the save cell. to_csv() stored the
# row index as an unnamed first column, so read it back as the index instead
# of letting it surface as a spurious 'Unnamed: 0' feature column.
train = pd.read_csv(config['paths']['data_path'] + 'training_set.csv', index_col=0)
test = pd.read_csv(config['paths']['data_path'] + 'testing_set.csv', index_col=0)