In [1]:
import numpy as np
import pandas as pd

import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer, PolynomialFeatures
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.metrics import mean_squared_error, ConfusionMatrixDisplay, confusion_matrix, recall_score, \
    accuracy_score, precision_score, f1_score, plot_confusion_matrix, classification_report, roc_auc_score,\
    plot_roc_curve, plot_precision_recall_curve

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

pd.set_option('display.max_columns', None)

In [2]:
original_features = pd.read_csv('./Data/4910797b-ee55-40a7-8668-10efd5c1b960.csv')

In [3]:
original_features.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
columns = ['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'basin', 'subvillage', \
           'region_code', 'district_code','lga', 'population', 'public_meeting', 'scheme_management', 'permit',\
          'construction_year', 'extraction_type', 'management', 'management_group', 'payment_type', 'water_quality',\
           'quantity', 'source', 'source_class', 'waterpoint_type']
len(columns)

24

In [5]:
selected = original_features[columns]

In [6]:
selected.head(2)

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,basin,subvillage,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type
0,6000.0,2011-03-14,Roman,1390,Roman,Lake Nyasa,Mnyusi B,11,5,Ludewa,109,True,VWC,False,1999,gravity,vwc,user-group,annually,soft,enough,spring,groundwater,communal standpipe
1,0.0,2013-03-06,Grumeti,1399,GRUMETI,Lake Victoria,Nyamara,20,2,Serengeti,280,,Other,True,2010,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe


In [7]:
selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         59400 non-null  float64
 1   date_recorded      59400 non-null  object 
 2   funder             55765 non-null  object 
 3   gps_height         59400 non-null  int64  
 4   installer          55745 non-null  object 
 5   basin              59400 non-null  object 
 6   subvillage         59029 non-null  object 
 7   region_code        59400 non-null  int64  
 8   district_code      59400 non-null  int64  
 9   lga                59400 non-null  object 
 10  population         59400 non-null  int64  
 11  public_meeting     56066 non-null  object 
 12  scheme_management  55523 non-null  object 
 13  permit             56344 non-null  object 
 14  construction_year  59400 non-null  int64  
 15  extraction_type    59400 non-null  object 
 16  management         594

In [8]:
# region_code and district_code astype(str)
selected['region_code'] = selected['region_code'].astype(str)
selected['district_code'] = selected['district_code'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected['region_code'] = selected['region_code'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected['district_code'] = selected['district_code'].astype(str)


In [9]:
# datetime conversion
max_date = pd.to_datetime(selected['date_recorded']).max()
selected['dates_passed'] = abs((pd.to_datetime(selected['date_recorded']) - max_date).dt.days)
selected['dates_passed']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected['dates_passed'] = abs((pd.to_datetime(selected['date_recorded']) - max_date).dt.days)


0         995
1         272
2         281
3         309
4         874
         ... 
59395     214
59396     941
59397     967
59398    1001
59399     986
Name: dates_passed, Length: 59400, dtype: int64

In [10]:
selected.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,basin,subvillage,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type,dates_passed
0,6000.0,2011-03-14,Roman,1390,Roman,Lake Nyasa,Mnyusi B,11,5,Ludewa,109,True,VWC,False,1999,gravity,vwc,user-group,annually,soft,enough,spring,groundwater,communal standpipe,995
1,0.0,2013-03-06,Grumeti,1399,GRUMETI,Lake Victoria,Nyamara,20,2,Serengeti,280,,Other,True,2010,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,272
2,25.0,2013-02-25,Lottery Club,686,World vision,Pangani,Majengo,21,4,Simanjiro,250,True,VWC,True,2009,gravity,vwc,user-group,per bucket,soft,enough,dam,surface,communal standpipe multiple,281
3,0.0,2013-01-28,Unicef,263,UNICEF,Ruvuma / Southern Coast,Mahakamani,90,63,Nanyumbu,58,True,VWC,True,1986,submersible,vwc,user-group,never pay,soft,dry,machine dbh,groundwater,communal standpipe multiple,309
4,0.0,2011-07-13,Action In A,0,Artisan,Lake Victoria,Kyanyamisa,18,1,Karagwe,0,True,,True,0,gravity,other,other,never pay,soft,seasonal,rainwater harvesting,surface,communal standpipe,874


# Class definition

In [11]:
class Model():
    model_list = []
    model_df = pd.DataFrame({"Name": pd.Series(dtype='str'), "train_score": pd.Series(dtype='float64'),\
                             'train_log_loss': pd.Series(dtype='float64'), "test_score": pd.Series(dtype='float64'),
                             'test_log_loss': pd.Series(dtype='float64')})
    
    def __init__(self, name, model):
        self.name = name
        self.model = model
        self.params = model.get_params
        self.train_score = model.score(X_train, y_train)
        self.test_score = model.score(X_test, y_test)
        self.train_log_loss = -np.mean(cross_val_score(self.model, X_train, y_train, scoring='neg_log_loss'))
        self.test_log_loss = -np.mean(cross_val_score(self.model, X_test, y_test, scoring='neg_log_loss'))
                                        
        Model.model_list.append(self)
#         attributes = np.array([self.name, self.train_score, self.test_score])
        self.attributes = np.array([self.name, self.train_score, self.train_log_loss, self.test_score, self.test_log_loss])  # attributes
        
        self.attributes_df = pd.DataFrame([attributes], columns=['Name', 'train_score', 'train_log_loss', \
                            'test_score', 'test_log_loss'])
        Model.model_df = pd.concat([Model.model_df, self.attributes_df], axis=0)

    @classmethod
    def get_model_list(cls):
        return cls.model_list

# X and y assignment

In [12]:
X = selected.drop('date_recorded', axis=1)

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         59400 non-null  float64
 1   funder             55765 non-null  object 
 2   gps_height         59400 non-null  int64  
 3   installer          55745 non-null  object 
 4   basin              59400 non-null  object 
 5   subvillage         59029 non-null  object 
 6   region_code        59400 non-null  object 
 7   district_code      59400 non-null  object 
 8   lga                59400 non-null  object 
 9   population         59400 non-null  int64  
 10  public_meeting     56066 non-null  object 
 11  scheme_management  55523 non-null  object 
 12  permit             56344 non-null  object 
 13  construction_year  59400 non-null  int64  
 14  extraction_type    59400 non-null  object 
 15  management         59400 non-null  object 
 16  management_group   594

In [14]:
y_df = pd.read_csv("./data/logistic_model_features.csv")
y = y_df['status_group']

In [15]:
ohe = OneHotEncoder(sparse=False)
target = pd.DataFrame(ohe.fit_transform(y_df[['status_group']]), index=y_df.index, columns=[
    'Functional', "Needs_Repair", "Non-Functional"])

y_flat = np.argmax(np.array(target), axis=1)

y_flat
dbl_check = pd.Series(y_flat, name='Target')
y = dbl_check
y.shape

(59400,)

In [16]:
X.shape

(59400, 24)

In [17]:
combined = pd.concat([X, y], axis=1)

In [18]:
combined.head(2)

Unnamed: 0,amount_tsh,funder,gps_height,installer,basin,subvillage,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type,dates_passed,Target
0,6000.0,Roman,1390,Roman,Lake Nyasa,Mnyusi B,11,5,Ludewa,109,True,VWC,False,1999,gravity,vwc,user-group,annually,soft,enough,spring,groundwater,communal standpipe,995,0
1,0.0,Grumeti,1399,GRUMETI,Lake Victoria,Nyamara,20,2,Serengeti,280,,Other,True,2010,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,272,0


In [33]:
combined.loc[:, 'subvillage'].isna().sum()

371

In [26]:
combined.loc[:, 'subvillage'].value_counts()[55:60]

Mpakani     53
Dodoma      52
Msufini     52
Magomeni    51
Kaloleni    51
Name: subvillage, dtype: int64

In [38]:
# identify the index values of entries with fewer than 51 value counts.

to_replace = combined["subvillage"].value_counts()[combined["subvillage"].value_counts() < 51].index.values

check =  combined.replace(to_replace, value="Other") 
check['subvillage'].value_counts()

Other       51869
Madukani      508
Shuleni       506
Majengo       502
Kati          373
            ...  
Mpakani        53
Dodoma         52
Msufini        52
Magomeni       51
Kaloleni       51
Name: subvillage, Length: 61, dtype: int64

In [39]:
combined['subvillage'] = check['subvillage'].copy()

In [40]:
combined['subvillage'].value_counts()

Other       51869
Madukani      508
Shuleni       506
Majengo       502
Kati          373
            ...  
Mpakani        53
Dodoma         52
Msufini        52
Magomeni       51
Kaloleni       51
Name: subvillage, Length: 61, dtype: int64

In [41]:
combined.head(2)

Unnamed: 0,amount_tsh,funder,gps_height,installer,basin,subvillage,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type,dates_passed,Target
0,6000.0,Roman,1390,Roman,Lake Nyasa,Other,11,5,Ludewa,109,True,VWC,False,1999,gravity,vwc,user-group,annually,soft,enough,spring,groundwater,communal standpipe,995,0
1,0.0,Grumeti,1399,GRUMETI,Lake Victoria,Other,20,2,Serengeti,280,,Other,True,2010,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,272,0


In [None]:
# also need to replace the columns of:
# funder, installer, ward, scheme_name, 

In [42]:
slices = pd.read_csv("./data/logistic_model_features.csv")

In [43]:
# correcting the values of other columns
combined['funder'] = slices['funder'].copy()
combined['installer'] = slices['installer'].copy()
combined['ward'] = slices['ward'].copy()
combined['scheme_name'] = slices['scheme_name'].copy()

In [44]:
# Imputing null values for missing values using 0's

combined['amount_tsh'].replace(0, np.nan, inplace=True)
combined['population'].replace(0, np.nan, inplace=True)
combined['construction_year'].replace(0, np.nan, inplace=True)

In [45]:
combined.head(2)

Unnamed: 0,amount_tsh,funder,gps_height,installer,basin,subvillage,region_code,district_code,lga,population,public_meeting,scheme_management,permit,construction_year,extraction_type,management,management_group,payment_type,water_quality,quantity,source,source_class,waterpoint_type,dates_passed,Target,ward,scheme_name
0,6000.0,Roman,1390,Roman,Lake Nyasa,Other,11,5,Ludewa,109.0,True,VWC,False,1999.0,gravity,vwc,user-group,annually,soft,enough,spring,groundwater,communal standpipe,995,0,other,Roman
1,,other,1399,other,Lake Victoria,Other,20,2,Serengeti,280.0,,Other,True,2010.0,gravity,wug,user-group,never pay,soft,insufficient,rainwater harvesting,surface,communal standpipe,272,0,Natta,


In [51]:
# scheme_name/ward could be further reduced imo, but won't worry about it yet

In [52]:
combined['installer'].value_counts()

DWE                17402
other              15075
dif_other           6106
RWE                 1206
DANIDA              1050
                   ...  
Shipo                 86
AICT                  84
DA                    84
MDRDP                 84
Oikos E .Africa       80
Name: installer, Length: 80, dtype: int64

In [53]:
combined.to_csv('/Users/samalainabayeva/Desktop/Water Project CSVs/updated_features.csv')

# Train/Test Split

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Sub-Pipelines

In [65]:
numeric_pipe = Pipeline(steps=[
    ("s_i", SimpleImputer(strategy='mean', add_indicator=True, verbose=1)),
     ('ss', StandardScaler())
])
    
categorical_pipe = Pipeline(steps=[
    ('cat_imp', SimpleImputer(strategy="most_frequent", verbose=1, add_indicator=True)),
    ('ohe', OneHotEncoder(sparse=True, handle_unknown='ignore'))
])

In [67]:
CT = ColumnTransformer(transformers=[
    ('subpipe_numerics', numeric_pipe, selector(dtype_include=np.number)),
    ('subpipe_cat', categorical_pipe, selector(dtype_include=object))
], remainder='passthrough', n_jobs= -1)

# Final Pipelines for Fitting:

In [68]:
smote_pipe = ImPipeline(steps=[
    ("ct", CT),
    ('smote', SMOTE(n_jobs= -1)),
    ('model', LogisticRegression(n_jobs= -1))
])

In [76]:
baseline = Pipeline(steps=[
    ('CT', CT),
    ('model', LogisticRegression(n_jobs= -1))
])

In [None]:
baseline.fit(X_train, y_train)

region_code -> 6 more than region, 

In [44]:
original_features['water_quality'].nunique(), original_features['district_code'].isna().sum(), \
original_features['district_code'].value_counts()

(8,
 0,
 1     12203
 2     11173
 3      9998
 4      8999
 5      4356
 6      4074
 7      3343
 8      1043
 30      995
 33      874
 53      745
 43      505
 13      391
 23      293
 63      195
 62      109
 60       63
 0        23
 80       12
 67        6
 Name: district_code, dtype: int64)

In [11]:
pd.DataFrame(original_features.groupby(['waterpoint_type'])['amount_tsh']

waterpoint_type,cattle trough,cattle trough,cattle trough,cattle trough,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,communal standpipe multiple,dam,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,hand pump,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,improved spring,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other,other
amount_tsh,0.00,50.00,100.00,500.00,0.00,500.00,50.00,20.00,1000.00,200.00,10.00,30.00,100.00,300.00,250.00,5.00,2000.00,5000.00,25.00,3000.00,6.00,1200.00,1500.00,4000.00,2400.00,6000.00,2500.00,7.00,8000.00,12000.00,750.00,40.00,3600.00,2200.00,400.00,10000.00,4700.00,33.00,600.00,2800.00,450.00,7200.00,20000.00,15.00,1300.00,15000.00,60.00,6500.00,25000.00,7500.00,35.00,40000.00,117000.00,150.00,70.00,700.00,18000.00,2.00,550.00,7000.00,30000.00,45000.00,50000.00,1.00,520.00,800.00,11000.00,16000.00,16300.00,26000.00,0.20,0.25,12.00,53.00,59.00,220.00,306.00,900.00,1400.00,3500.00,4500.00,5500.00,6300.00,8500.00,13000.00,14000.00,38000.00,60000.00,70000.00,100000.00,120000.00,138000.00,170000.00,200000.00,250000.00,350000.00,0.00,50.00,20.00,500.00,250.00,100.00,30.00,600.00,10.00,25.00,1000.00,200.00,5000.00,20000.00,450.00,1200.00,3000.00,70.00,700.00,2000.00,40.00,6000.00,300.00,750.00,150.00,2.00,10000.00,60.00,3600.00,4000.00,15.00,350.00,12000.00,1500.00,15000.00,0.20,1.00,5.00,7.00,26.00,35.00,400.00,2500.00,8000.00,30000.00,117000.00,0.00,0.00,500.00,1000.00,200.00,2000.00,20.00,50.00,100.00,5000.00,300.00,3000.00,30.00,10.00,2500.00,1500.00,1200.00,25.00,4000.00,2400.00,10000.00,250.00,6000.00,600.00,400.00,3500.00,5500.00,7000.00,30000.00,150.00,14000.00,40.00,590.00,700.00,3600.00,8000.00,9000.00,100000.00,5.00,9.00,15.00,2200.00,4500.00,5400.00,12000.00,13000.00,15000.00,50000.00,0.00,5000.00,2000.00,250.00,500.00,50.00,15000.00,10.00,20.00,200.00,1000.00,3000.00,0.00,500.00,50.00,200.00,1000.00,20.00,100.00,300.00,2000.00,25.00,30.00,10.00,600.00,5000.00,250.00,1200.00,1500.00,5.00,3000.00,400.00,2500.00,3600.00,10000.00,0.20,6.00,2400.00,4000.00,8000.00,9000.00,15000.00
amount_tsh,111,2,2,1,17277,1880,1492,956,865,761,597,482,461,390,377,370,360,264,216,205,189,178,133,128,122,97,71,68,57,47,45,41,32,30,28,28,23,20,20,14,13,12,12,11,10,10,9,9,9,8,7,7,6,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3837,657,195,181,159,155,148,136,111,68,63,57,40,33,29,28,22,19,18,17,16,16,14,14,13,10,7,6,6,4,3,3,3,2,2,1,1,1,1,1,1,1,1,1,1,1,7,13795,948,514,352,285,272,254,162,124,118,102,93,80,65,57,56,50,23,22,20,17,12,11,10,4,4,4,4,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,730,14,12,11,8,2,2,1,1,1,1,1,5882,84,65,49,45,39,36,35,30,22,20,17,9,8,5,5,5,4,4,3,2,2,2,1,1,1,1,1,1,1


The group of features of (extraction_type, extraction_type_group, extraction_type_class), (payment, payment_type),
(water_quality, quality_group), (source, source_class), (subvillage, region, region_code, district_code, lga, ward), and (waterpoint_type, waterpoint_type_group) all contain similar representation of data in different grains. Hence, we risk overfitting our data during training by including all the features in our analysis, which can be dropped.

In [None]:
columns =