In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

In [2]:
# Load the pre-split feature/target tables; every file uses its `id` column as the index.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        'data/X_train.csv',
        'data/X_test.csv',
        'data/y_train.csv',
        'data/y_test.csv',
    )
]

In [3]:
# Summarise the training features: index range, column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40095 entries, 47807 to 3912
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             40095 non-null  float64
 1   gps_height             40095 non-null  int64  
 2   installer              40095 non-null  object 
 3   longitude              40095 non-null  float64
 4   latitude               40095 non-null  float64
 5   num_private            40095 non-null  int64  
 6   basin                  40095 non-null  object 
 7   region                 40095 non-null  object 
 8   region_code            40095 non-null  int64  
 9   district_code          40095 non-null  int64  
 10  population             40095 non-null  int64  
 11  public_meeting         40095 non-null  object 
 12  scheme_management      40095 non-null  object 
 13  permit                 40095 non-null  object 
 14  construction_year      40095 non-null  int64  
 15 

In [4]:
# Same summary for the test features, to compare against the training split's schema.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13365 entries, 42051 to 58131
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             13365 non-null  float64
 1   gps_height             13365 non-null  int64  
 2   installer              13365 non-null  object 
 3   longitude              13365 non-null  float64
 4   latitude               13365 non-null  float64
 5   num_private            13365 non-null  int64  
 6   basin                  13365 non-null  object 
 7   region                 13365 non-null  object 
 8   region_code            13365 non-null  int64  
 9   district_code          13365 non-null  int64  
 10  population             13365 non-null  int64  
 11  public_meeting         13365 non-null  object 
 12  scheme_management      13365 non-null  object 
 13  permit                 13365 non-null  object 
 14  construction_year      13365 non-null  int64  
 15

In [16]:
# Thirty most frequent `installer` labels in the training split.
X_train['installer'].value_counts().head(30)

DWE                   11783
missing                2429
Government             1237
RWE                     817
Commu                   712
DANIDA                  694
KKKT                    604
Hesawa                  580
0                       537
TCRS                    481
Central government      408
CES                     395
District Council        385
DANID                   377
Community               373
HESAWA                  367
LGA                     287
World vision            286
TASAF                   265
WEDECO                  261
Gover                   253
District council        247
AMREF                   226
TWESA                   203
WU                      197
Dmdd                    194
ACRA                    180
World Vision            175
SEMA                    173
DW                      166
Name: installer, dtype: int64

In [5]:
# Class proportions of the training target (fractions rather than raw counts).
y_train.value_counts(normalize=True)

status_group           
functional                 0.542786
non functional             0.384437
functional needs repair    0.072777
dtype: float64

In [6]:
# Class proportions of the test target, for comparison with the training split.
y_test.value_counts(normalize=True)

status_group           
functional                 0.542761
non functional             0.384437
functional needs repair    0.072802
dtype: float64

In [7]:
# Reduce the one-column target frame to a Series for the estimators.
# Guarded so the cell is idempotent: once `y_train` is already a Series,
# re-running this cell is a no-op instead of raising AttributeError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['status_group']

In [8]:
# Reduce the one-column target frame to a Series for scoring.
# Guarded so the cell is idempotent: once `y_test` is already a Series,
# re-running this cell is a no-op instead of raising AttributeError.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['status_group']

# Logistic Regression

In [9]:
# Split the feature columns by dtype so each group gets its own preprocessing.
X_train_cat = X_train.select_dtypes(include='object')
X_train_num = X_train.select_dtypes(include=['float64', 'int64'])

# Categorical columns are one-hot encoded to a dense array; numeric columns are standardised.
# NOTE(review): `sparse` was renamed `sparse_output` in newer scikit-learn releases —
# revisit this argument when upgrading.
cat_pipe = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
num_pipe = Pipeline(steps=[('ss', StandardScaler())])

transformer = ColumnTransformer(
    transformers=[
        ('categorical', cat_pipe, X_train_cat.columns),
        ('numerical', num_pipe, X_train_num.columns),
    ]
)

In [10]:
# Preprocess -> oversample with SMOTE -> logistic regression.
# The imblearn Pipeline is used (not sklearn's) because it accepts a sampler step.
logreg_steps = [
    ('trans', transformer),
    ('smote', SMOTE(random_state=42)),
    ('logreg', LogisticRegression(n_jobs=-1, random_state=42)),
]
logreg_pipe = imbPipeline(steps=logreg_steps)
logreg_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'water_qua...
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                           

In [11]:
# Accuracy of the fitted logistic-regression pipeline on each split.
logreg_train_acc = logreg_pipe.score(X_train, y_train)
logreg_test_acc = logreg_pipe.score(X_test, y_test)
print('Accuracy Score Train:', logreg_train_acc)
print('Accuracy Score Test:', logreg_test_acc)

Accuracy Score Train: 0.6571642349420127
Accuracy Score Test: 0.6474373363262252


In [12]:
# Disabled: 5-fold cross-validated log loss for the logistic-regression pipeline.
# NOTE(review): presumably switched off because of runtime — confirm, then re-enable or delete.
#log_loss_logreg = cross_val_score(logreg_pipe, X_train, y_train, scoring='neg_log_loss', cv=5)
#log_loss_logreg = -log_loss_logreg.mean()
#print('Log Loss:', log_loss_logreg)

# KNN (Default Parameters)

In [13]:
# Baseline KNN on the full feature set, using the classifier's default hyperparameters.
knn_steps = [
    ('trans', transformer),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
]
knn_pipe = Pipeline(steps=knn_steps)
knn_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'extraction_type_group',
       'extraction_type_class', 'management', 'management_group', 'payment',
       'water_qua...', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),


In [14]:
# Accuracy of the baseline KNN pipeline on each split.
knn_train_acc = knn_pipe.score(X_train, y_train)
knn_test_acc = knn_pipe.score(X_test, y_test)
print('Accuracy Score Train:', knn_train_acc)
print('Accuracy Score Test:', knn_test_acc)

Accuracy Score Train: 0.8371368000997631
Accuracy Score Test: 0.7751589973812196


# KNN (Drop `installer` to check for computational speed)

In [15]:
# Copies of both splits without the `installer` column; the original frames are left untouched.
X_train_minus, X_test_minus = (
    split.drop(columns='installer') for split in (X_train, X_test)
)

In [16]:
# Column groups recomputed on the installer-free frame.
X_train_cat_minus = X_train_minus.select_dtypes(include='object')
X_train_num_minus = X_train_minus.select_dtypes(include=['float64', 'int64'])


# This cell rebinds `cat_pipe`, `num_pipe` and `transformer`: every pipeline built
# after this point uses the installer-free column lists.
# NOTE(review): `sparse` was renamed `sparse_output` in newer scikit-learn releases —
# revisit this argument when upgrading.
cat_pipe = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
num_pipe = Pipeline(steps=[('ss', StandardScaler())])

transformer = ColumnTransformer(
    transformers=[
        ('categorical', cat_pipe, X_train_cat_minus.columns),
        ('numerical', num_pipe, X_train_num_minus.columns),
    ]
)

In [17]:
# Default-parameter KNN again, this time on the features without `installer`.
knn_minus_steps = [
    ('trans', transformer),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
]
knn_pipe_minus = Pipeline(steps=knn_minus_steps)
knn_pipe_minus.fit(X_train_minus, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'water_quality',
       'quali...', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),


In [18]:
# Accuracy of the installer-free KNN pipeline on each split.
knn_minus_train_acc = knn_pipe_minus.score(X_train_minus, y_train)
knn_minus_test_acc = knn_pipe_minus.score(X_test_minus, y_test)
print('Accuracy Score Train:', knn_minus_train_acc)
print('Accuracy Score Test:', knn_minus_test_acc)

Accuracy Score Train: 0.8345928420002494
Accuracy Score Test: 0.7713430602319491


In [28]:
# The 'neg_log_loss' scorer returns the negated loss per fold, so the sign is
# flipped after averaging to report a positive log loss.
knn_minus_fold_scores = cross_val_score(
    knn_pipe_minus,
    X_train_minus,
    y_train,
    scoring='neg_log_loss',
    cv=5,
)
log_loss_knn_pipe_minus = -knn_minus_fold_scores.mean()
print('Log Loss:', log_loss_knn_pipe_minus)

Log Loss: 2.480406761566641


Dropping `installer` decreases computational time for KNN with default hyperparameters from around 13 minutes to around 1 minute.  

# KNN with Gridsearch (Still without `installer`)

In [24]:
# Exhaustive search over neighbourhood size and Minkowski power `p`
# (3 x 3 = 9 candidates), using GridSearchCV's default CV and scoring.
knn_pipe_grid = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__p': [1, 2, 3],
}
gs_knn_pipe = GridSearchCV(knn_pipe_minus, knn_pipe_grid)
gs_knn_pipe.fit(X_train_minus, y_train)

GridSearchCV(estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['basin', 'region', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment'...
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                                        ('numerical',
                                                                         Pi

In [25]:
# Hyperparameter combination that scored best in the KNN grid search.
gs_knn_pipe.best_params_

{'knn__n_neighbors': 5, 'knn__p': 1}

In [26]:
# Accuracy of the fitted grid-search object on each split.
gs_knn_train_acc = gs_knn_pipe.score(X_train_minus, y_train)
gs_knn_test_acc = gs_knn_pipe.score(X_test_minus, y_test)
print('Accuracy Score Train:', gs_knn_train_acc)
print('Accuracy Score Test:', gs_knn_test_acc)

Accuracy Score Train: 0.8364883401920439
Accuracy Score Test: 0.773063973063973


In [None]:
# Disabled: cross-validated log loss of the grid-search object itself. This is nested
# cross-validation, since each of the 5 outer folds would re-run the whole grid search.
# NOTE(review): presumably switched off because of runtime — confirm, then re-enable or delete.
#log_loss_gs_knn = cross_val_score(gs_knn_pipe, X_train_minus, y_train, scoring='neg_log_loss', cv=5)
#log_loss_gs_knn = -log_loss_gs_knn.mean()
#print('Log Loss:', log_loss_gs_knn)

The grid search suggests keeping the default n_neighbors (5), but changing the p hyperparameter from the default (2) to 1.

# Run KNN again with new hyperparameter, p=1

In [31]:
# KNN refitted with the hyperparameters reported by the grid search (n_neighbors=5, p=1).
tuned_knn = KNeighborsClassifier(n_neighbors=5, p=1, n_jobs=-1)
knn_pipe_minus_after_gs = Pipeline(steps=[
    ('trans', transformer),
    ('knn', tuned_knn),
])
knn_pipe_minus_after_gs.fit(X_train_minus, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'water_quality',
       'quali...ource', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
     

In [32]:
# Accuracy of the tuned KNN pipeline on each split.
knn_tuned_train_acc = knn_pipe_minus_after_gs.score(X_train_minus, y_train)
knn_tuned_test_acc = knn_pipe_minus_after_gs.score(X_test_minus, y_test)
print('Accuracy Score Train:', knn_tuned_train_acc)
print('Accuracy Score Test:', knn_tuned_test_acc)

Accuracy Score Train: 0.8364883401920439
Accuracy Score Test: 0.773063973063973


In [33]:
# The 'neg_log_loss' scorer returns the negated loss per fold, so the sign is
# flipped after averaging to report a positive log loss for the tuned KNN.
knn_tuned_fold_scores = cross_val_score(
    knn_pipe_minus_after_gs,
    X_train_minus,
    y_train,
    scoring='neg_log_loss',
    cv=5,
)
log_loss_knn_after_gs = -knn_tuned_fold_scores.mean()
print('Log Loss:', log_loss_knn_after_gs)

Log Loss: 2.455056297350457


# Support Vector Machines (still without `installer`)

In [36]:
# Support-vector classifier with default settings on the installer-free features.
svm_pipe = Pipeline(steps=[
    ('trans', transformer),
    ('svm', SVC()),
])
svm_pipe.fit(X_train_minus, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'water_quality',
       'quality_group', 'quantity', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   Stan

In [37]:
# Accuracy of the default SVM pipeline on each split.
svm_train_acc = svm_pipe.score(X_train_minus, y_train)
svm_test_acc = svm_pipe.score(X_test_minus, y_test)
print('Accuracy Score Train:', svm_train_acc)
print('Accuracy Score Test:', svm_test_acc)

Accuracy Score Train: 0.7805711435341065
Accuracy Score Test: 0.7637860082304527


# SVM with GridSearch

In [None]:
# Disabled: grid search over the SVM regularisation strength C.
# Later SVM cells refer to `gs_svm_pipe`, which only exists once this cell is re-enabled and run.
# NOTE(review): presumably switched off because of runtime — confirm before re-enabling.
#svm_pipe_grid = {'svm__C': [1, 1e2, 1e4, 1e6]}
#gs_svm_pipe = GridSearchCV(estimator=svm_pipe, param_grid=svm_pipe_grid)
#gs_svm_pipe.fit(X_train_minus, y_train)

In [None]:
# Disabled: the SVM grid search that defines `gs_svm_pipe` is commented out, so this
# lookup raises NameError on a fresh "Restart & Run All". Re-enable it together with
# the grid-search cell.
#gs_svm_pipe.best_params_

# Run SVM with the new hyperparameter C (value pending the grid search above)

In [None]:
# The original `SVC(C=)` was a SyntaxError: no tuned value was ever filled in because
# the SVM grid search is disabled. Use an explicit placeholder so the cell is valid Python.
# 1.0 is scikit-learn's default for C, so until it is replaced this model matches `svm_pipe`.
# TODO: set this to gs_svm_pipe.best_params_['svm__C'] once the grid search has been run.
svm_c = 1.0
svm_pipe_after_gs = Pipeline([('trans', transformer), ('svm', SVC(C=svm_c))])
svm_pipe_after_gs.fit(X_train_minus, y_train)