In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from numpy import asarray

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, confusion_matrix, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [15]:
pd.set_option("display.max_columns", None)

In [6]:
!ls ../../data

clean_wells.csv features.csv    target.csv


In [9]:
df = pd.read_csv('../../data/clean_wells.csv')
df

Unnamed: 0.1,Unnamed: 0,amount_tsh,gps_height,basin,region,population,construction_year,extraction_type_class,management,management_group,...,quality_group,quantity_group,source_type,source_class,waterpoint_type_group,status_group,near_pop,level,has_static,has_year
0,0,6000.0,1390,Lake Nyasa,Iringa,109,1999,gravity,vwc,user-group,...,good,enough,spring,groundwater,communal standpipe,functional,True,above,True,True
1,1,0.0,1399,Lake Victoria,Mara,280,2010,gravity,wug,user-group,...,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,True,above,False,True
2,2,25.0,686,Pangani,Manyara,250,2009,gravity,vwc,user-group,...,good,enough,dam,surface,communal standpipe,functional,True,above,True,True
3,3,0.0,263,Ruvuma / Southern Coast,Mtwara,58,1986,submersible,vwc,user-group,...,good,dry,borehole,groundwater,communal standpipe,non functional/needs repair,True,above,False,True
4,4,0.0,0,Lake Victoria,Kagera,0,0,gravity,other,other,...,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,False,level,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59395,59395,10.0,1210,Pangani,Kilimanjaro,125,1999,gravity,water board,user-group,...,good,enough,spring,groundwater,communal standpipe,functional,True,above,True,True
59396,59396,4700.0,1212,Rufiji,Iringa,56,1996,gravity,vwc,user-group,...,good,enough,river/lake,surface,communal standpipe,functional,True,above,True,True
59397,59397,0.0,0,Rufiji,Mbeya,0,0,handpump,vwc,user-group,...,fluoride,enough,borehole,groundwater,hand pump,functional,False,level,False,False
59398,59398,0.0,0,Rufiji,Dodoma,0,0,handpump,vwc,user-group,...,good,insufficient,shallow well,groundwater,hand pump,functional,False,level,False,False


In [25]:
final_df = df.drop(columns = ['Unnamed: 0','amount_tsh', 'gps_height', 'population', 'construction_year'])

## Model 1: KNN 

In [26]:
X = final_df.drop(columns = ['status_group'])
y = final_df['status_group']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

In [27]:
#https://stackoverflow.com/questions/61641852/what-is-the-valid-specification-of-the-columns-needed-for-sklearn-classifier-p

cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

In [28]:
num_trans = Pipeline(steps=[
    ('scaler', StandardScaler())
])
cat_trans = Pipeline(steps=[
    ('ohe', OneHotEncoder(drop='first',
                          sparse=False))
])

In [29]:
pp = ColumnTransformer(transformers=[
    ('num', num_trans, num_cols),
    ('cat', cat_trans, cat_cols)
])

In [30]:
pp.fit(X_train)
X_tr_pp = pp.transform(X_train)
X_te_pp = pp.transform(X_test)

In [31]:
knn = KNeighborsClassifier(3)

knn.fit(X_tr_pp, y_train)

KNeighborsClassifier(n_neighbors=3)

In [32]:
scores = cross_val_score(estimator=knn, X=X_tr_pp,
                y=y_train, cv=5)
print(f"Median score: {np.median(scores):.4f} (+/- {np.std(scores):.4f})")

Median score: 0.7441 (+/- 0.0064)


In [33]:
knn.score(X_te_pp, y_test)

0.7335353535353535

## Model 2: Decision Tree

In [34]:
dt = DecisionTreeClassifier(random_state=42)

dt.fit(X_tr_pp, y_train)

DecisionTreeClassifier(random_state=42)

In [35]:
scores = cross_val_score(estimator=dt, X=X_tr_pp,
                         y=y_train, cv=5)
print(f"Median score: {np.median(scores):.4f} (+/- {np.std(scores):.4f})")

Median score: 0.7773 (+/- 0.0056)


In [36]:
dt.score(X_te_pp, y_test)

0.7788552188552188

## Model 3: Logistic Regression

In [37]:
lr = LogisticRegression(max_iter=1000, random_state=42)

lr.fit(X_tr_pp, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [38]:
scores = cross_val_score(estimator=lr, X=X_tr_pp,
                         y=y_train, cv=5)
print(f"Median score: {np.median(scores):.4f} (+/- {np.std(scores):.4f})")

Median score: 0.7420 (+/- 0.0057)


In [39]:
lr.score(X_te_pp, y_test)

0.7423569023569023

## Model 4: Random Forest

In [40]:
rfc = RandomForestClassifier(random_state=42, max_depth=22, min_samples_leaf=14)

rfc.fit(X_tr_pp, y_train)

RandomForestClassifier(max_depth=22, min_samples_leaf=14, random_state=42)

In [41]:
scores = cross_val_score(estimator=rfc, X=X_tr_pp,
                         y=y_train, cv=9)
print(f"Median score: {np.median(scores):.4f} (+/- {np.std(scores):.4f})")

Median score: 0.7655 (+/- 0.0069)


In [42]:
rfc.score(X_te_pp, y_test)

0.7658585858585859

## Model 5: XG Boost

In [43]:
xgb_model = xgb.XGBClassifier(random_state=42, learning_rate=1, max_depth=6)

xgb_model.fit(X_tr_pp, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [44]:
print(f"Train Score: {xgb_model.score(X_tr_pp, y_train)}")
print(f"Test Score: {xgb_model.score(X_te_pp, y_test)}")

Train Score: 0.8163187429854096
Test Score: 0.7793265993265993


## Model 6: Naive Bayes (Bernoulli)

In [45]:
nb = BernoulliNB()

nb.fit(X_tr_pp,y_train)

train_preds = nb.predict(X_tr_pp)
y_preds = nb.predict(X_te_pp)


print(f'Naive Bayes Train Accuracy: {accuracy_score(y_train, train_preds):.4f}')
print(f'Naive Bayes Test Accuracy: {accuracy_score(y_test, y_preds):.4f}')


Naive Bayes Train Accuracy: 0.6694
Naive Bayes Test Accuracy: 0.6646


## Tuning with GridSearchCV

### Random Forest Model

In [46]:
rfc_cv = Pipeline(steps=[
    ('preprocessor', pp), 
    ('tree', DecisionTreeClassifier(max_depth=7, random_state=713))
])

In [47]:
param_grid = {
    'tree__max_depth':[5, 14, 22, 10], 
    'tree__min_samples_leaf':[5, 14, 22]
}

In [48]:
gs_rfc = GridSearchCV(rfc_cv, param_grid, cv=5, scoring='accuracy')


gs_rfc.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['near_pop', 'has_static', 'has_year'], dtype='object')),
                                                                        ('cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(drop='first',
                                                                                                        sparse=False))]),
                                                                         Index([

In [49]:
gs_rfc.best_params_

{'tree__max_depth': 22, 'tree__min_samples_leaf': 5}

### Naive Bayes (Bernoulli)

In [50]:
nb_cv = Pipeline(steps=[
    ('preprocessor', pp), 
    ('NB', BernoulliNB())
])

In [51]:
param_nb = {
    'NB__alpha':[0, 0.5, 1], 
    'NB__fit_prior':[True, False]
}

In [52]:
gs_nb = GridSearchCV(nb_cv, param_nb, cv=5, scoring='accuracy')


gs_nb.fit(X_train, y_train)



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['near_pop', 'has_static', 'has_year'], dtype='object')),
                                                                        ('cat',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(drop='first',
                                                                                                        sparse=False))]),
                                                                         Index([

In [53]:
gs_nb.best_params_

{'NB__alpha': 1, 'NB__fit_prior': True}