In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier
import statsmodels.api as sm

In [2]:
# Read the competition files, using the waterpoint `id` column as the index
# of every frame: training features, test features, training labels.
X_train, X_test, y_train = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        'data/Training_set_values.csv',
        'data/Test_set_values.csv',
        'data/Training_set_labels.csv',
    )
)

In [3]:
X_train.head()

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
y_train.head()

Unnamed: 0_level_0,status_group
id,Unnamed: 1_level_1
69572,functional
8776,functional
34310,functional
67743,non functional
19728,functional


## Merging the X and y train data.

In [5]:
df = X_train.merge(y_train, on='id')

We merged the X train and y train data to build a single dataframe that we could use for testing our different models. The source data set did not include a y test dataset, so we will split the merged dataset and create a hold-out group later.

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   date_recorded          59400 non-null  object 
 2   funder                 55765 non-null  object 
 3   gps_height             59400 non-null  int64  
 4   installer              55745 non-null  object 
 5   longitude              59400 non-null  float64
 6   latitude               59400 non-null  float64
 7   wpt_name               59400 non-null  object 
 8   num_private            59400 non-null  int64  
 9   basin                  59400 non-null  object 
 10  subvillage             59029 non-null  object 
 11  region                 59400 non-null  object 
 12  region_code            59400 non-null  int64  
 13  district_code          59400 non-null  int64  
 14  lga                    59400 non-null  object 
 15

# Data Cleaning

In [7]:
df['recorded_by'].value_counts()

GeoData Consultants Ltd    59400
Name: recorded_by, dtype: int64

In [8]:
# funder: 1897 distinct non-null values (1898 if missing is counted as a value); source of funding less relevant to pump functionality which should be better captured by installer

In [9]:
df.funder.nunique()

1897

In [10]:
df['funder'].value_counts()

Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Othod                        1
Pentecosta Seela             1
Yasini Selemani              1
Su-ki Jang                   1
Mwamvita Rajabu              1
Name: funder, Length: 1897, dtype: int64

In [11]:
print(df['extraction_type'].value_counts()) # The kind of extraction the waterpoint uses **KEEP**
print('----------------------------------------------')
print(df['extraction_type_group'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')
print(df['extraction_type_class'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')

gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80                        3670
mono                          2865
india mark ii                 2400
afridev                       1770
ksb                           1415
other - rope pump              451
other - swn 81                 229
windmill                       117
india mark iii                  98
cemo                            90
other - play pump               85
walimi                          48
climax                          32
other - mkulima/shinyanga        2
Name: extraction_type, dtype: int64
----------------------------------------------
gravity            26780
nira/tanira         8154
other               6430
submersible         6179
swn 80              3670
mono                2865
india mark ii       2400
afridev             1770
rope pump            451
other handpump       364
other motorpump      122
wind-powered

In [12]:
print(df['subvillage'].value_counts()) # The kind of extraction the waterpoint uses **KEEP**
print('----------------------------------------------')
print(df['region'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')
print(df['region_code'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')
print(df['district_code'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')
print(df['lga'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')
print(df['ward'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')

Madukani        508
Shuleni         506
Majengo         502
Kati            373
Mtakuja         262
               ... 
Lotima A          1
Kichiwa Kati      1
Komshasha         1
Ikunbirunde       1
Siki              1
Name: subvillage, Length: 19287, dtype: int64
----------------------------------------------
Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64
----------------------------------------------
11    5300
17    5011
12    4639
3     4379
5     4040
18    3324
19    3047
2     3024
16    2816
10    2640
4     2513
1     2201
13    2093
14    19

In [13]:
df['region'].nunique()

21

In [14]:
# amount_tsh - Total static head (amount water available to waterpoint)      **DROP** A majority of the rows for this feature do not have values. 

# date_recorded - The date the row was entered                               **DROP**

# funder - Who funded the well                                               **DROP**

# gps_height - Altitude of the well

# installer - Organization that installed the well                           **DROP**

# wpt_name - Name of the waterpoint if there is one                          **DROP**

# num_private -                                                              **DROP**

# -----------------------------------------------------------------------------------------------------------------------------------------------------------

# ***LOCATION METRICS***

# longitude - GPS coordinate
# latitude - GPS coordinate
# basin - Geographic water basin
# subvillage - Geographic location                                           **DROP**
# region - Geographic location
# region_code - Geographic location (coded)                                  **DROP** 'region' has more information
# district_code - Geographic location (coded)
# lga - Geographic location                                                  **DROP**
# ward - Geographic location                                                 **DROP**
# -----------------------------------------------------------------------------------------------------------------------------------------------------------

# population - Population around the well

# public_meeting - True/False

# recorded_by - Group entering this row of data                              **DROP**

# -----------------------------------------------------------------------------------------------------------------------------------------------------------
# scheme_management - Who operates the waterpoint
# scheme_name - Who operates the waterpoint                                  **DROP**

# permit - If the waterpoint is permitted

# construction_year - Year the waterpoint was constructed

# extraction_type - The kind of extraction the waterpoint uses
# extraction_type_group - The kind of extraction the waterpoint uses         **DROP**
# extraction_type_class - The kind of extraction the waterpoint uses         **DROP**

# management - How the waterpoint is managed
# management_group - How the waterpoint is managed                           **DROP**

# payment - What the water costs
# payment_type - What the water costs                                        **DROP**

# water_quality - The quality of the water
# quality_group - The quality of the water                                   **DROP**

# quantity - The quantity of water
# quantity_group - The quantity of water                                     **DROP**                              

# source - The source of the water
# source_type - The source of the water                                      **DROP**
# source_class - The source of the water                                     **DROP**

# waterpoint_type - The kind of waterpoint
# waterpoint_type_group - The kind of waterpoint                             **DROP**

In [15]:
# funder - Who funded the well
# gps_height - Altitude of the well
# installer - Organization that installed the well
# longitude - GPS coordinate
# latitude - GPS coordinate
# basin - Geographic water basin
# region - Geographic location
# district_code - Geographic location (coded)
# population - Population around the well
# public_meeting - True/False
# scheme_management - Who operates the waterpoint
# permit - If the waterpoint is permitted
# construction_year - Year the waterpoint was constructed
# extraction_type - The kind of extraction the waterpoint uses
# management - How the waterpoint is managed
# payment - What the water costs
# water_quality - The quality of the water
# quantity - The quantity of water
# source - The source of the water
# waterpoint_type - The kind of waterpoint

In [16]:
# Print the value distribution of every shortlisted feature, each followed by
# a divider line.
shortlisted_columns = [
    'funder', 'gps_height', 'installer', 'basin', 'region', 'district_code',
    'population', 'public_meeting', 'scheme_management', 'permit',
    'construction_year', 'extraction_type', 'management', 'payment',
    'water_quality', 'quantity', 'source', 'waterpoint_type',
]
for column_name in shortlisted_columns:
    print(df[column_name].value_counts())
    print('----------------------------------------------')

Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
                          ... 
Othod                        1
Pentecosta Seela             1
Yasini Selemani              1
Su-ki Jang                   1
Mwamvita Rajabu              1
Name: funder, Length: 1897, dtype: int64
----------------------------------------------
 0       20438
-15         60
-16         55
-13         55
-20         52
         ...  
 2285        1
 2424        1
 2552        1
 2413        1
 2385        1
Name: gps_height, Length: 2428, dtype: int64
----------------------------------------------
DWE                          17402
Government                    1825
RWE                           1206
Commu                         1060
DANIDA                        1050
                             ...  
SUMO                             1
COW                              1
NYAKILANGANI  CO                 1

In [17]:
df['waterpoint_type'].value_counts()

communal standpipe             28522
hand pump                      17488
other                           6380
communal standpipe multiple     6103
improved spring                  784
cattle trough                    116
dam                                7
Name: waterpoint_type, dtype: int64

In [18]:
## FEATURE ENGINEERING IDEAS

# water_quality by region: Type: Object
# water_quality by extraction type: Type: Object
# water_quality by waterpoint_type: Type: Object
# water_quality by source: Type: Object
# water_quality by scheme_management: Type: Object
# water_quality by permit: Type: Object

In [19]:
# Value distributions of water_quality and of each column it may be combined
# with; a divider is printed between listings but not after the last one.
interaction_columns = [
    'water_quality', 'region', 'extraction_type', 'waterpoint_type',
    'source', 'scheme_management', 'permit',
]
for position, column_name in enumerate(interaction_columns):
    if position > 0:
        print('----------------------------------------------')
    print(df[column_name].value_counts())

soft                  50818
salty                  4856
unknown                1876
milky                   804
coloured                490
salty abandoned         339
fluoride                200
fluoride abandoned       17
Name: water_quality, dtype: int64
----------------------------------------------
Iringa           5294
Shinyanga        4982
Mbeya            4639
Kilimanjaro      4379
Morogoro         4006
Arusha           3350
Kagera           3316
Mwanza           3102
Kigoma           2816
Ruvuma           2640
Pwani            2635
Tanga            2547
Dodoma           2201
Singida          2093
Mara             1969
Tabora           1959
Rukwa            1808
Mtwara           1730
Manyara          1583
Lindi            1546
Dar es Salaam     805
Name: region, dtype: int64
----------------------------------------------
gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80             

In [20]:
# Number of distinct (non-null) categories in each interaction candidate,
# printed one per line in the order listed.
for column_name in ('water_quality', 'region', 'extraction_type',
                    'waterpoint_type', 'source', 'scheme_management',
                    'permit'):
    print(df[column_name].nunique())

8
21
18
7
10
12
2


In [21]:
df['extraction_type'].value_counts()

gravity                      26780
nira/tanira                   8154
other                         6430
submersible                   4764
swn 80                        3670
mono                          2865
india mark ii                 2400
afridev                       1770
ksb                           1415
other - rope pump              451
other - swn 81                 229
windmill                       117
india mark iii                  98
cemo                            90
other - play pump               85
walimi                          48
climax                          32
other - mkulima/shinyanga        2
Name: extraction_type, dtype: int64

In [22]:
# converting existing categorical features to integer codes to be able to multiply them together

In [24]:
# Integer lookup tables for the categorical columns used in the interaction
# features. Within each table the codes follow the value_counts() ordering
# printed earlier in the notebook (0 = most frequent category).

# water_quality labels. The key must be spelled 'unknown' to match the data;
# the earlier 'unkown' spelling never matched a row.
tern_water_quality = {'soft': 0,
                      'salty': 1,
                      'unknown': 2,
                      'milky': 3,
                      'coloured': 4,
                      'salty abandoned': 5,
                      'fluoride': 6,
                      'fluoride abandoned': 7
}

# region labels (the stray first definition that reused the water-quality keys
# has been removed; it was overwritten immediately anyway).
tern_region = {'Iringa': 0,
               'Shinyanga': 1,
               'Mbeya': 2,
               'Kilimanjaro': 3,
               'Morogoro': 4,
               'Arusha': 5,
               'Kagera': 6,
               'Mwanza': 7,
               'Kigoma': 8,
               'Ruvuma': 9,
               'Pwani': 10,
               'Tanga': 11,
               'Dodoma': 12,
               'Singida': 13,
               'Mara': 14,
               'Tabora': 15,
               'Rukwa': 16,
               'Mtwara': 17,
               'Manyara': 18,
               'Lindi': 19,
               'Dar es Salaam': 20
}

# extraction_type labels. These 18 labels were previously stored under the
# name tern_waterpoint_type, although they are extraction_type values.
tern_extraction_type = {'gravity': 0,
                        'nira/tanira': 1,
                        'other': 2,
                        'submersible': 3,
                        'swn 80': 4,
                        'mono': 5,
                        'india mark ii': 6,
                        'afridev': 7,
                        'ksb': 8,
                        'other - rope pump': 9,
                        'other - swn 81': 10,
                        'windmill': 11,
                        'india mark iii': 12,
                        'cemo': 13,
                        'other - play pump': 14,
                        'walimi': 15,
                        'climax': 16,
                        'other - mkulima/shinyanga': 17
}

# waterpoint_type labels, taken from df['waterpoint_type'].value_counts().
tern_waterpoint_type = {'communal standpipe': 0,
                        'hand pump': 1,
                        'other': 2,
                        'communal standpipe multiple': 3,
                        'improved spring': 4,
                        'cattle trough': 5,
                        'dam': 6
}

# source labels.
tern_source = {'spring': 0,
               'shallow well': 1,
               'machine dbh': 2,
               'river': 3,
               'rainwater harvesting': 4,
               'hand dtw': 5,
               'lake': 6,
               'dam': 7,
               'other': 8,
               'unknown': 9
}

# scheme_management labels.
tern_scheme_management = {'VWC': 0,
                          'WUG': 1,
                          'Water authority': 2,
                          'WUA': 3,
                          'Water Board': 4,
                          'Parastatal': 5,
                          'Private operator': 6,
                          'Company': 7,
                          'Other': 8,
                          'SWC': 9,
                          'Trust': 10,
                          'None': 11
}

# permit flags. Boolean keys are needed for a column parsed as True/False
# values; the original string keys are kept so existing lookups still work.
# NOTE(review): confirm the dtype of df['permit'] after read_csv.
tern_permit = {True: 0,
               False: 1,
               'True': 0,
               'False': 1
}

In [30]:
# Integer codes for the two columns encoded in this cell (0 = most frequent
# category in the value_counts() listings above). They are defined before they
# are used, so the cell does not rely on state left behind by another cell.
tern_water_quality = {'soft': 0,
                      'salty': 1,
                      'unknown': 2,   # was misspelled 'unkown' -> KeyError on real rows
                      'milky': 3,
                      'coloured': 4,
                      'salty abandoned': 5,
                      'fluoride': 6,
                      'fluoride abandoned': 7
}

tern_region = {'Iringa': 0,
               'Shinyanga': 1,
               'Mbeya': 2,
               'Kilimanjaro': 3,
               'Morogoro': 4,
               'Arusha': 5,
               'Kagera': 6,
               'Mwanza': 7,
               'Kigoma': 8,
               'Ruvuma': 9,
               'Pwani': 10,
               'Tanga': 11,
               'Dodoma': 12,
               'Singida': 13,
               'Mara': 14,
               'Tabora': 15,
               'Rukwa': 16,
               'Mtwara': 17,
               'Manyara': 18,
               'Lindi': 19,
               'Dar es Salaam': 20
}


def encode_categories(series, codes):
    """Return `series` with every category label replaced by its integer code.

    Parameters
    ----------
    series : pandas.Series
        Column holding category labels (or codes from an earlier run).
    codes : dict
        Mapping of label -> integer code.

    Returns
    -------
    pandas.Series
        Encoded column. Values that already are one of the codes pass through
        unchanged, so running the cell twice is harmless. Missing values stay
        missing.

    Raises
    ------
    ValueError
        If the column contains a value that is neither a known label nor a
        known code; the message lists the offending values.
    """
    accepted = set(codes) | set(codes.values())
    unexpected = set(series.dropna().unique()) - accepted
    if unexpected:
        raise ValueError(
            f"unmapped values in {series.name!r}: {sorted(unexpected, key=str)}"
        )
    return series.map(lambda value: codes.get(value, value))


# Overwrite the two label columns with their integer codes.
df['water_quality'] = encode_categories(df['water_quality'], tern_water_quality)
df['region'] = encode_categories(df['region'], tern_region)

Index(['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward',
       'population', 'public_meeting', 'recorded_by', 'scheme_management',
       'scheme_name', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'status_group'],
      dtype='object')

In [31]:
df['region'].value_counts()

0     5294
1     4982
2     4639
3     4379
4     4006
5     3350
6     3316
7     3102
8     2816
9     2640
10    2635
11    2547
12    2201
13    2093
14    1969
15    1959
16    1808
17    1730
18    1583
19    1546
20     805
Name: region, dtype: int64

In [None]:
# Adding the new features to our dataframe

# Bracket assignment is required to create a column: `df.new_name = ...` only
# sets a Python attribute on the DataFrame object and adds nothing to the
# table. The lookup table is the water-quality one; the previous
# {'True': 0, 'False': 1} table matches no water_quality value and would have
# produced a column of NaN. Values that are already integer codes are passed
# through unchanged.
df['ternary_water_quality'] = df['water_quality'].map(
    lambda value: tern_water_quality.get(value, value)
)
df.head()


In [None]:
df.ternary_water_quality

In [None]:
### installer column - take top 20, keep those and categorize the rest as others


In [None]:
df['permit'].value_counts()

In [None]:
# FEATURE ENGINEERING IDEAS

# These are notes, not code: each line must be a comment, otherwise the cell
# fails with a SyntaxError.
# water_quality by region: Type: Object
# water_quality by extraction type: Type: Object
# water_quality by waterpoint_type: Type: Object
# water_quality by source: Type: Object
# water_quality by scheme_management: Type: Object
# water_quality by permit: Type: Object

In [None]:
df.info()

In [None]:
## ADDING COLUMNS FOR OUR NEW FEATURES
def frequency_codes(series):
    """Encode a categorical column as integers ranked by frequency.

    The most common category gets 0, the next 1, and so on, which is the
    convention of the hand-written tern_* tables. Columns that are already
    numeric (e.g. encoded by an earlier cell) are returned unchanged. Missing
    values stay missing.

    Parameters
    ----------
    series : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.Series
        Numeric column aligned on the same index.
    """
    already_numeric = (pd.api.types.is_numeric_dtype(series)
                       and not pd.api.types.is_bool_dtype(series))
    if already_numeric:
        return series
    ranking = {label: rank
               for rank, label in enumerate(series.value_counts().index)}
    return series.map(ranking)


# Multiplying integer codes by the raw text/boolean columns does not give a
# numeric feature (int * str repeats the string), so every partner column is
# converted to codes before the product is taken.
# NOTE(review): code 0 (the most frequent category) zeroes the product, so
# most rows get 0 for every interaction — confirm this is intended.
quality_codes = frequency_codes(df['water_quality'])
for partner in ['region', 'extraction_type', 'waterpoint_type', 'source',
                'scheme_management', 'permit']:
    df[f'water_quality_by_{partner}'] = quality_codes * frequency_codes(df[partner])

In [None]:
df['water_quality'].nunique()

In [None]:
df['region'].nunique()

In [None]:
print(df['extraction_type'].value_counts()) # The kind of extraction the waterpoint uses **KEEP**
print('----------------------------------------------')
print(df['extraction_type_group'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')
print(df['extraction_type_class'].value_counts()) # The kind of extraction the waterpoint uses **DROP**
print('----------------------------------------------')

In [None]:
# Compare the detailed waterpoint column with its coarser grouped version.
print(df['waterpoint_type'].value_counts()) # The kind of waterpoint **KEEP**
print('----------------------------------------------')
print(df['waterpoint_type_group'].value_counts()) # The kind of waterpoint **DROP**
print('----------------------------------------------')

In [None]:
# Compare the quantity column with its grouped version.
print(df['quantity'].value_counts()) # The quantity of water **KEEP**
print('----------------------------------------------')
print(df['quantity_group'].value_counts()) # The quantity of water **DROP**
print('----------------------------------------------')

In [None]:
# Compare the payment column with the payment_type column.
print(df['payment'].value_counts()) # What the water costs **KEEP**
print('----------------------------------------------')
print(df['payment_type'].value_counts()) # What the water costs **DROP**
print('----------------------------------------------')

In [None]:
# Compare the detailed source column with its two coarser versions.
print(df['source'].value_counts()) # The source of the water **KEEP**
print('----------------------------------------------')
print(df['source_type'].value_counts()) # The source of the water **DROP**
print('----------------------------------------------')
print(df['source_class'].value_counts()) # The source of the water **DROP**
print('----------------------------------------------')

In [None]:
df.isna().sum()

In [None]:
# Drop the redundant member of every feature family and keep the most detailed
# column, following the KEEP/DROP notes in the column summary cell. Per those
# notes the redundant columns are `scheme_name` (not `scheme_management`) and
# `extraction_type_group` (not `extraction_type`); the earlier list removed
# the two columns that were meant to be kept.
redundant_columns = [
    'management_group', 'scheme_name', 'quantity_group', 'source_class',
    'source_type', 'quality_group', 'payment_type', 'extraction_type_class',
    'extraction_type_group', 'waterpoint_type_group',
]
# Rebinding (instead of inplace=True) plus errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
df = df.drop(columns=redundant_columns, errors='ignore')

In [None]:
# df = df.drop(['scheme_name', 'date_recorded', 'wpt_name', 'subvillage', 'lga', 'ward', 'recorded_by', 
#               'quantity_group', 'payment_type', 'funder'], axis = 1)
# df = df.fillna('missing', axis = 1)

With so many unique values for some features, onehotencoding creates more than 60,000 columns (out of a dataframe with 59,400 entries), making our models computationally prohibitive.  We use common sense and topical knowledge to eliminate some features and drop those columns from our dataframe.  Here are the dropped columns and our reasoning for excluding them: <br><br>
`date_recorded`:  the age of the well is captured by the `construction_year` column <br>
`scheme_name`:  over 28,000 missing values <br>
`wpt_name`:  37,400 unique values <br>
`subvillage`: 19,287 distinct values and location is captured elsewhere by `latitude` and `longitude` <br>
`lga`:  125 unique values and location is captured elsewhere by `latitude` and `longitude` <br>
`ward`:  2092 unique values and location is captured elsewhere by `latitude` and `longitude` <br>
`recorded_by`:  all values are the same <br>
`quantity_group`:  same as `quantity` <br>
`payment_type`:  same as `payment` <br>
`funder`: 1897 distinct values; source of funding less relevant to pump functionality which should be better captured by `installer` <br>

We replaced NaN values with `missing` to keep the rows in our dataframe. We will onehotencode the data frame later.

In [None]:
df.info()

We are creating a hold out data set which we will test our final model on.

In [None]:
# Reserve 10% of the rows as the final hold-out set. The split is seeded so
# the same rows are held out on every run, and stratified on the target so the
# class proportions are the same in both parts.
df, holdout = train_test_split(
    df, test_size=0.1, stratify=df['status_group'], random_state=42
)

In [None]:
# Separate the label column from the predictors, then make a seeded split
# that is stratified on the label.
y = df['status_group']
X = df.drop(columns='status_group')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
X_train.head()

In [None]:
# Persist the train/test split.
X_train.to_csv('data/X_train.csv')
X_test.to_csv('data/X_test.csv')
y_train.to_csv('data/y_train.csv')
y_test.to_csv('data/y_test.csv')

# Persist the hold-out rows themselves. Previously `df` (the modelling set)
# was written to this file, so the saved "hold-out" was the training data.
holdout.to_csv('data/holdout_data.csv')

# Inferential Plots

In [None]:
# select_dtypes takes (include, exclude): passed positionally, 'float64' went
# to `exclude`, so float columns were silently left out. Both dtypes belong in
# `include`.
cols = df.select_dtypes(include=['int64', 'float64']).columns

# Size the grid from the number of columns so the loop cannot index past the
# last row of axes.
n_plot_cols = 3
n_plot_rows = max(1, -(-len(cols) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(ncols=n_plot_cols, nrows=n_plot_rows,
                         figsize=(12, 3 * n_plot_rows), squeeze=False)
fig.set_tight_layout(True)
for index, col in enumerate(cols):
    ax = axes[index // n_plot_cols][index % n_plot_cols]
    ax.scatter(df[col], df.status_group, alpha=0.2)
    ax.set_xlabel(col)
    ax.set_ylabel("Pump Status")

# Hide grid cells that have no column to show.
for spare_ax in axes.ravel()[len(cols):]:
    spare_ax.set_visible(False)

In [None]:
fig, ax = plt.subplots()
ax.scatter(df.amount_tsh, df.status_group)
ax.set_xlabel('Total Static Head')
ax.set_title('Total Static Head by Pump Status');

In [None]:
fig, ax = plt.subplots()
ax.scatter(df.construction_year.loc[df.construction_year != 0], 
           df.status_group.loc[df.construction_year !=0])
ax.set_xlabel('Construction Year')
ax.set_title('Construction Year by Pump Status');

In [None]:
fig, ax = plt.subplots()
ax.scatter(df.population, df.status_group)
ax.set_xlabel('Population Around the Well')
ax.set_title('Population by Pump Status');

# Baseline Dummy Model

In [None]:
dummy_model = DummyClassifier(strategy='most_frequent', random_state=42)
dummy_model.fit(X_train, y_train)

In [None]:
print('Accuracy Score Train:', dummy_model.score(X_train, y_train))
print('Accuracy Score Test:', dummy_model.score(X_test, y_test))

In [None]:
log_loss_dummy = cross_val_score(dummy_model, X_train, y_train, scoring='neg_log_loss')
log_loss_dummy = -log_loss_dummy.mean()
print('Log Loss:', log_loss_dummy)

# Model Building

In [None]:
df.status_group.value_counts().plot(kind='bar', color='red')
plt.title('Class Distribution', fontsize = 20)
plt.tight_layout()

In [None]:
df.columns

In [None]:
viz_df = df.copy(deep=True)

In [None]:
viz_df = viz_df.drop(viz_df[viz_df['longitude']==0].index)

In [None]:
viz_df[viz_df['longitude']==0]

In [None]:
# color palette as dictionary

palette = {"functional":"xkcd:soft green",
"non functional":"xkcd:light red",
"functional needs repair":"xkcd:cerulean"
}

# https://xkcd.com/color/rgb/ - color options

In [None]:
fig, ax = plt.subplots(figsize=(10,6))
sns.scatterplot(x='longitude', y='latitude', hue='status_group', palette=palette, sizes=(1, 8), data=viz_df, ax=ax)
plt.xlabel("Longitude", fontweight='bold', size=12)
plt.ylabel("Latitude", fontweight='bold', size=12)
plt.legend(bbox_to_anchor=(1.0, 1), borderaxespad=0)
plt.title('Tanzania Well Status by Coordinates',fontweight='bold', size=16) # change this title
plt.tight_layout()
# fig.savefig('./images/map.png')

# palette="ch: r=-.2, d=.3_r"


In [None]:
viz_df.info()

In [None]:
df['latitude'].value_counts()

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
# import category_encoders as ce
from sklearn.compose import ColumnTransformer

# plot_confusion_matrix is a handy visual tool, added in the latest version of scikit-learn
# if you are running an older version, comment out this line and just use confusion_matrix

from sklearn.metrics import plot_confusion_matrix, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report

In [None]:
df.columns

In [None]:
# identifying features and target
features = df.drop('status_group', axis=1)
target = df.status_group

# dummy the features
dummy_features = pd.get_dummies(features, drop_first=True)
dummy_features.head()

In [None]:
# Create Pipeline

from sklearn.pipeline import Pipeline
def machine_learn(model):
    """Fit `model` behind a StandardScaler and report its test-set performance.

    The function reads the notebook-level `X_train`, `y_train`, `X_test` and
    `y_test`, so the train/test split cell must have been run first.

    Parameters
    ----------
    model : estimator
        Unfitted classifier used as the last step of the pipeline.

    Returns
    -------
    Pipeline
        The fitted scaler + model pipeline, so it can be reused or inspected.
        Accuracy and the classification report are printed, and the confusion
        matrix is drawn as a heatmap.
    """
    model_pipeline = Pipeline([('ss', StandardScaler()), ('model', model)])
    fitted_model = model_pipeline.fit(X_train, y_train)
    print("Accuracy Score:", fitted_model.score(X_test, y_test))
    preds = fitted_model.predict(X_test)
    print(classification_report(y_test, preds))

    # Draw the heatmap on its own figure. Wrapping the call in print() only
    # added the Axes repr text to the output.
    class_labels = fitted_model.classes_
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, preds, labels=class_labels),
                annot=True, fmt='d',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    return fitted_model

In [None]:
# Split the data into training and test sets
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(dummy_features, target, test_size=0.2, random_state=123)

# SMOTE: oversample the minority class of the training split only.
# `fit_resample` is the supported method; the old `fit_sample` alias is no
# longer available in current imbalanced-learn releases.
# NOTE(review): machine_learn() fits on X_train / y_train, so the resampled
# data below is not used by it — confirm which set the models should see.
X_resampled, y_resampled = SMOTE(sampling_strategy='minority', random_state=123).fit_resample(X_train, y_train)

In [None]:
# # Build model
# tree = DecisionTreeClassifier(random_state=123)

# machine_learn(tree)

In [None]:
# # Build model
# forest = RandomForestClassifier(random_state=123, verbose=1)

# machine_learn(forest)

In [None]:
# parameters_forest = {
#     'max_depth': [15, 20, 25] ,
#     'n_estimators': [150, 200, 250, 300],
#     'criterion': ['gini', 'entropy']}

In [None]:
# grid_search_forest = GridSearchCV(
#     estimator=forest,
#     param_grid=parameters_forest,
#     n_jobs = -1,
#     cv = 5,
#     verbose=True
# )

In [None]:
# grid_search_forest.fit(X_train, y_train)

In [None]:
# xgb = XGBClassifier(learning_rate=0.05, max_depth=35, random_state=123, objective = 'multi:softprob', num_class=3, verbosity=1)
# machine_learn(xgb)

In [None]:
df['decade'] = df['construction_year']

In [None]:
# Series.replace returns a new Series; without assigning the result back the
# column was left unchanged. Build one year -> label table and apply it once.
century_labels = {year: '20th Century' for year in range(1960, 2000)}
century_labels.update({year: '21st Century' for year in range(2000, 2014)})
df['decade'] = df['decade'].replace(century_labels)
# NOTE(review): values outside 1960-2013 (such as the 0 that the
# construction-year plot filters out) are left as they are, so the column can
# mix numbers and labels — confirm how those rows should be labelled.

