In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

In [2]:
# Load the pre-split feature and target tables; every CSV uses its `id` column as the index.
X_train, X_test = (
    pd.read_csv(path, index_col='id')
    for path in ('data/X_train.csv', 'data/X_test.csv')
)
y_train, y_test = (
    pd.read_csv(path, index_col='id')
    for path in ('data/y_train.csv', 'data/y_test.csv')
)

In [3]:
# Summarise the training features: row count, column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40095 entries, 58975 to 23269
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gps_height         40095 non-null  int64  
 1   installer          40095 non-null  object 
 2   longitude          40095 non-null  float64
 3   latitude           40095 non-null  float64
 4   num_private        40095 non-null  int64  
 5   basin              40095 non-null  object 
 6   region             40095 non-null  object 
 7   region_code        40095 non-null  int64  
 8   district_code      40095 non-null  int64  
 9   population         40095 non-null  int64  
 10  public_meeting     40095 non-null  object 
 11  scheme_management  40095 non-null  object 
 12  permit             40095 non-null  object 
 13  construction_year  40095 non-null  int64  
 14  extraction_type    40095 non-null  object 
 15  management         40095 non-null  object 
 16  payment           

In [4]:
# Same summary for the test features, to confirm they share the training schema.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13365 entries, 33770 to 7443
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gps_height         13365 non-null  int64  
 1   installer          13365 non-null  object 
 2   longitude          13365 non-null  float64
 3   latitude           13365 non-null  float64
 4   num_private        13365 non-null  int64  
 5   basin              13365 non-null  object 
 6   region             13365 non-null  object 
 7   region_code        13365 non-null  int64  
 8   district_code      13365 non-null  int64  
 9   population         13365 non-null  int64  
 10  public_meeting     13365 non-null  object 
 11  scheme_management  13365 non-null  object 
 12  permit             13365 non-null  object 
 13  construction_year  13365 non-null  int64  
 14  extraction_type    13365 non-null  object 
 15  management         13365 non-null  object 
 16  payment            

In [5]:
# Class proportions of the training target (normalize=True returns fractions, not counts).
y_train.value_counts(normalize=True)

status_group           
functional                 0.543609
non functional             0.383589
functional needs repair    0.072802
dtype: float64

In [6]:
# Class proportions of the test target, for comparison with the training split.
y_test.value_counts(normalize=True)

status_group           
functional                 0.543659
non functional             0.383539
functional needs repair    0.072802
dtype: float64

In [7]:
# Reduce the target frame to its `status_group` Series.
# The isinstance guard keeps this cell idempotent: once `y_train` is already a
# Series, re-running leaves it unchanged instead of raising AttributeError.
y_train = y_train['status_group'] if isinstance(y_train, pd.DataFrame) else y_train

In [8]:
# Reduce the target frame to its `status_group` Series.
# The isinstance guard keeps this cell idempotent: once `y_test` is already a
# Series, re-running leaves it unchanged instead of raising AttributeError.
y_test = y_test['status_group'] if isinstance(y_test, pd.DataFrame) else y_test

In [9]:
def log_loss(model, X=X_train, y=y_train, scoring='neg_log_loss', cv=3):
    """Cross-validate `model`, print its mean log loss and return it.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline handed to `cross_val_score`.
    X, y : array-like
        Features and target. The defaults are the `X_train` / `y_train`
        objects that exist when this cell is executed (default arguments are
        bound at definition time), so re-run this cell if those are replaced.
    scoring : str
        Scorer name forwarded to `cross_val_score`. The mean score is negated
        before it is reported, which matches the 'neg_log_loss' default.
    cv : int
        Cross-validation setting forwarded to `cross_val_score`.

    Returns
    -------
    float
        The negated mean of the cross-validation scores.
    """
    fold_scores = cross_val_score(model, X, y, scoring=scoring, cv=cv)
    # The scorer yields negated log loss (higher is better); flip the sign back.
    mean_log_loss = -fold_scores.mean()
    print(f'Log loss: {mean_log_loss}')
    return mean_log_loss

# Logistic Regression

In [10]:
# Split the training columns by dtype so each group gets its own preprocessing.
X_train_num = X_train.select_dtypes(['float64', 'int64'])
X_train_cat = X_train.select_dtypes('object')

# Numeric columns are standardised; object columns are one-hot encoded, with
# categories unseen during fit ignored at transform time.
num_pipe = Pipeline(steps=[('ss', StandardScaler())])
cat_pipe = Pipeline(
    steps=[('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]
)

transformer = ColumnTransformer(
    transformers=[
        ('categorical', cat_pipe, X_train_cat.columns),
        ('numerical', num_pipe, X_train_num.columns),
    ]
)

In [11]:
# Baseline model: preprocess, oversample with SMOTE, then fit a logistic regression.
# The imblearn pipeline is used so the SMOTE sampler can sit between the steps.
logreg_pipe = imbPipeline(
    steps=[
        ('trans', transformer),
        ('smote', SMOTE(random_state=42)),
        ('logreg', LogisticRegression(random_state=42, n_jobs=-1)),
    ]
)
logreg_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'management', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'num_private', 'region_code',
 

In [12]:
# Accuracy on the data the pipeline was fitted on versus the held-out test split.
for split_label, features, target in (
    ('Accuracy Score Train:', X_train, y_train),
    ('Accuracy Score Test:', X_test, y_test),
):
    print(split_label, logreg_pipe.score(features, target))

Accuracy Score Train: 0.6367128070831775
Accuracy Score Test: 0.6333707444818556


In [13]:
# Cross-validated log loss of the logistic-regression pipeline on the training data.
log_loss(logreg_pipe, X_train, y_train)

Log loss: 0.8005327453806812


# Logistic Regression Grid Search

In [14]:
# Search over the inverse regularisation strength C of the 'logreg' step with 3-fold CV.
# No `scoring` is passed, so candidates are ranked by the pipeline's own `score` method.
logreg_pipe_grid = {
    'logreg__C': [1e-2, 1, 1e2],
}
gs_logreg_pipe = GridSearchCV(logreg_pipe, logreg_pipe_grid, cv=3)
gs_logreg_pipe.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'management', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type'],
      dtype='object')),
                                                                        ('numerical',
                                                                         Pipeline(steps=[('ss',
            

In [15]:
# Parameter combination that scored best in the logistic-regression grid search.
gs_logreg_pipe.best_params_

{'logreg__C': 1}

# KNN (Default Parameters)

In [16]:
# Same preprocessing and SMOTE steps as before, with a default k-nearest-neighbours classifier.
knn_pipe = imbPipeline(
    steps=[
        ('trans', transformer),
        ('smote', SMOTE(random_state=42)),
        ('knn', KNeighborsClassifier(n_jobs=-1)),
    ]
)
knn_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'management', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'num_private', 'region_code',
 

In [17]:
# Accuracy of the default KNN pipeline on the training and test splits.
for split_label, features, target in (
    ('Accuracy Score Train:', X_train, y_train),
    ('Accuracy Score Test:', X_test, y_test),
):
    print(split_label, knn_pipe.score(features, target))
# Results recorded from an earlier run, kept for comparison:
#(before further data cleaning and SMOTE) Accuracy Score Train: 0.8371368000997631
#(before further data cleaning and SMOTE) Accuracy Score Test: 0.7751589973812196

Accuracy Score Train: 0.8260880409028557
Accuracy Score Test: 0.722484100261878


In [18]:
# Cross-validated log loss of the default KNN pipeline on the training data.
log_loss(knn_pipe, X_train, y_train)

Log loss: 3.7789466887696026


# KNN (Drop `installer` to check for computational speed)

In [19]:
#X_train_minus = X_train.drop('installer', axis=1)
#X_test_minus = X_test.drop('installer', axis=1)

In [20]:
#X_train_cat_minus = X_train_minus.select_dtypes('object')
#X_train_num_minus = X_train_minus.select_dtypes(['float64', 'int64'])
#
#
#cat_pipe = Pipeline([('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])
#num_pipe = Pipeline([('ss', StandardScaler())])
#
#transformer = ColumnTransformer([
#    ('categorical', cat_pipe, X_train_cat_minus.columns), 
#    ('numerical', num_pipe, X_train_num_minus.columns)
#])

In [21]:
#knn_pipe_minus = Pipeline([('trans', transformer), ('knn', KNeighborsClassifier(n_jobs=-1))])
#knn_pipe_minus.fit(X_train_minus, y_train)

In [22]:
#print('Accuracy Score Train:', knn_pipe_minus.score(X_train_minus,y_train))
#print('Accuracy Score Test:', knn_pipe_minus.score(X_test_minus,y_test))

In [23]:
#log_loss_knn_pipe_minus = cross_val_score(knn_pipe_minus, X_train_minus, y_train, scoring='neg_log_loss', cv=5)
#log_loss_knn_pipe_minus = -log_loss_knn_pipe_minus.mean()
#print('Log Loss:', log_loss_knn_pipe_minus)

Dropping `installer` decreases computational time for KNN with default hyperparameters from around 13 minutes to around 1 minute.  

# KNN with Grid Search

In [24]:
# Search over the neighbour count and the Minkowski power `p` of the 'knn' step with 3-fold CV.
# No `scoring` is passed, so candidates are ranked by the pipeline's own `score` method.
knn_pipe_grid = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__p': [1, 2, 3],
}
gs_knn_pipe = GridSearchCV(knn_pipe, knn_pipe_grid, cv=3)
gs_knn_pipe.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(transformers=[('categorical',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncoder(handle_unknown='ignore',
                                                                                                        sparse=False))]),
                                                                         Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'management', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type'],
      dtype='object')),
                                                                        ('numerical',
                                                                         Pipeline(steps=[('ss',
            

In [25]:
# Parameter combination that scored best in the KNN grid search.
gs_knn_pipe.best_params_

{'knn__n_neighbors': 3, 'knn__p': 1}

In [26]:
# Accuracy of the fitted grid-search object on the training and test splits.
for split_label, features, target in (
    ('Accuracy Score Train:', X_train, y_train),
    ('Accuracy Score Test:', X_test, y_test),
):
    print(split_label, gs_knn_pipe.score(features, target))

Accuracy Score Train: 0.8800349170719541
Accuracy Score Test: 0.7503928170594837


The grid search selected `n_neighbors=3` (instead of the default of 5) and `p=1` (Manhattan distance) instead of the default `p=2` (Euclidean distance).

# Run KNN again with new hyperparameters: n_neighbors=3, p=1

In [27]:
# Rebuild the KNN pipeline with the hyperparameters chosen by the grid search
# (n_neighbors=3, p=1) so it can be scored and cross-validated on its own.
knn_pipe_after_gs = imbPipeline(
    steps=[
        ('trans', transformer),
        ('smote', SMOTE(random_state=42)),
        ('knn', KNeighborsClassifier(n_jobs=-1, p=1, n_neighbors=3)),
    ]
)
knn_pipe_after_gs.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['installer', 'basin', 'region', 'public_meeting', 'scheme_management',
       'permit', 'extraction_type', 'management', 'payment', 'water_quality',
       'quantity', 'source', 'waterpoint_type'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['gps_height', 'longitude', 'latitude', 'num_private', 'region_code',
 

In [28]:
# Accuracy of the re-fitted KNN pipeline on the training and test splits.
for split_label, features, target in (
    ('Accuracy Score Train:', X_train, y_train),
    ('Accuracy Score Test:', X_test, y_test),
):
    print(split_label, knn_pipe_after_gs.score(features, target))

Accuracy Score Train: 0.8800349170719541
Accuracy Score Test: 0.7503928170594837


In [29]:
# Cross-validated log loss of the KNN pipeline built with the grid-search hyperparameters.
log_loss(knn_pipe_after_gs, X_train, y_train)

Log loss: 4.639916739202775


# Support Vector Machines 

In [None]:
# Same preprocessing and SMOTE steps, with a support vector classifier left at its defaults.
svm_pipe = imbPipeline(
    steps=[
        ('trans', transformer),
        ('smote', SMOTE(random_state=42)),
        ('svm', SVC()),
    ]
)
svm_pipe.fit(X_train, y_train)

In [None]:
# Accuracy of the SVM pipeline on the training and test splits.
for split_label, features, target in (
    ('Accuracy Score Train:', X_train, y_train),
    ('Accuracy Score Test:', X_test, y_test),
):
    print(split_label, svm_pipe.score(features, target))

# SVM with GridSearch

In [None]:
#svm_pipe_grid = {'svm__C': [1, 1e2, 1e4, 1e6]}
#gs_svm_pipe = GridSearchCV(estimator=svm_pipe, param_grid=svm_pipe_grid)
#gs_svm_pipe.fit(X_train_minus, y_train)

In [None]:
#gs_svm_pipe.best_params_

# Run SVM with new hyperparameter, C=

In [None]:
#svm_pipe_after_gs = Pipeline([('trans', transformer), ('svm', SVC(C=))])
#svm_pipe_after_gs.fit(X_train_minus, y_train)