In [145]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [124]:
# Read the train/test feature and target tables; the `id` column of each CSV
# becomes the row index.
X_train, X_test = (
    pd.read_csv('data/X_train.csv', index_col='id'),
    pd.read_csv('data/X_test.csv', index_col='id'),
)
y_train, y_test = (
    pd.read_csv('data/y_train.csv', index_col='id'),
    pd.read_csv('data/y_test.csv', index_col='id'),
)

In [125]:
# Summary of the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40095 entries, 45522 to 9914
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             40095 non-null  float64
 1   funder                 40095 non-null  object 
 2   gps_height             40095 non-null  int64  
 3   installer              40095 non-null  object 
 4   longitude              40095 non-null  float64
 5   latitude               40095 non-null  float64
 6   num_private            40095 non-null  int64  
 7   basin                  40095 non-null  object 
 8   region                 40095 non-null  object 
 9   region_code            40095 non-null  int64  
 10  district_code          40095 non-null  int64  
 11  population             40095 non-null  int64  
 12  public_meeting         40095 non-null  object 
 13  scheme_management      40095 non-null  object 
 14  permit                 40095 non-null  object 
 15 

In [126]:
# Summary of the test features: column names, non-null counts and dtypes.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13365 entries, 70336 to 10514
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             13365 non-null  float64
 1   funder                 13365 non-null  object 
 2   gps_height             13365 non-null  int64  
 3   installer              13365 non-null  object 
 4   longitude              13365 non-null  float64
 5   latitude               13365 non-null  float64
 6   num_private            13365 non-null  int64  
 7   basin                  13365 non-null  object 
 8   region                 13365 non-null  object 
 9   region_code            13365 non-null  int64  
 10  district_code          13365 non-null  int64  
 11  population             13365 non-null  int64  
 12  public_meeting         13365 non-null  object 
 13  scheme_management      13365 non-null  object 
 14  permit                 13365 non-null  object 
 15

In [127]:
# Relative frequency of each `status_group` class in the training target.
y_train.value_counts(normalize=True)

status_group           
functional                 0.543010
non functional             0.384138
functional needs repair    0.072852
dtype: float64

In [128]:
# Relative frequency of each `status_group` class in the test target.
y_test.value_counts(normalize=True)

status_group           
functional                 0.542985
non functional             0.384138
functional needs repair    0.072877
dtype: float64

In [129]:
# Keep only the `status_group` column of the training target, as a Series.
# The DataFrame guard makes this cell safe to re-run: once `y_train` is already
# a Series, selecting `status_group` from it again would raise.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['status_group']

In [130]:
# Keep only the `status_group` column of the test target, as a Series.
# The DataFrame guard makes this cell safe to re-run: once `y_test` is already
# a Series, selecting `status_group` from it again would raise.
if isinstance(y_test, pd.DataFrame):
    y_test = y_test['status_group']

# Logistic Regression

In [134]:
# Partition the training columns by dtype so each group gets its own preprocessing.
X_train_cat = X_train.select_dtypes(include='object')
X_train_num = X_train.select_dtypes(include=['float64', 'int64'])

# Object columns: dense one-hot encoding; categories unseen at fit time are
# ignored (`handle_unknown='ignore'`).
cat_pipe = Pipeline(steps=[
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
# Numeric columns: standard scaling.
num_pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
])

# Route each column group through its own sub-pipeline.
transformer = ColumnTransformer(transformers=[
    ('categorical', cat_pipe, X_train_cat.columns),
    ('numerical', num_pipe, X_train_num.columns),
])

In [137]:
# Logistic-regression model: the column transformer followed by the classifier.
logreg_clf = LogisticRegression(max_iter=2000, n_jobs=-1, random_state=42)
logreg_pipe = Pipeline(steps=[('trans', transformer), ('logreg', logreg_clf)])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
logreg_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['funder', 'installer', 'basin', 'region', 'public_meeting',
       'scheme_management', 'permit', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', '...
       'quality_group', 'quantity', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                     

In [138]:
# Score the fitted logistic-regression pipeline on both splits, then report.
logreg_train_acc = logreg_pipe.score(X_train, y_train)
logreg_test_acc = logreg_pipe.score(X_test, y_test)
print('Accuracy Score Train:', logreg_train_acc)
print('Accuracy Score Test:', logreg_test_acc)

Accuracy Score Train: 0.7789250529991271
Accuracy Score Test: 0.7620650953984287


In [139]:
# NOTE(review): 5-fold cross-validated log loss for the logistic-regression
# pipeline, left disabled. The KeyboardInterrupt recorded as this cell's output
# suggests the run was stopped by hand — presumably because it took too long;
# TODO confirm before re-enabling.
#log_loss_logreg = cross_val_score(logreg_pipe, X_train, y_train, scoring='neg_log_loss', cv=5)
#log_loss_logreg = -log_loss_logreg.mean()
#print('Log Loss:', log_loss_logreg)

KeyboardInterrupt: 

# KNN (Default Parameters)

In [140]:
# K-nearest-neighbours model with default hyper-parameters (only `n_jobs` set),
# placed behind the column transformer.
knn_clf = KNeighborsClassifier(n_jobs=-1)
knn_pipe = Pipeline(steps=[('trans', transformer), ('knn', knn_clf)])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
knn_pipe.fit(X_train, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['funder', 'installer', 'basin', 'region', 'public_meeting',
       'scheme_management', 'permit', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', '...
       'quality_group', 'quantity', 'source', 'source_type', 'source_class',
       'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                     

In [141]:
# Score the fitted KNN pipeline on both splits, then report.
knn_train_acc = knn_pipe.score(X_train, y_train)
knn_test_acc = knn_pipe.score(X_test, y_test)
print('Accuracy Score Train:', knn_train_acc)
print('Accuracy Score Test:', knn_test_acc)

Accuracy Score Train: 0.8353410649706946
Accuracy Score Test: 0.7740366629255518


# KNN (Drop `funder` and `installer` to check for computational speed)

In [148]:
# Reduced feature frames without `funder` and `installer`; `drop` returns new
# frames, so X_train / X_test themselves are left as they were.
X_train_minus = X_train.drop(columns=['funder', 'installer'])
X_test_minus = X_test.drop(columns=['funder', 'installer'])

In [143]:
# Partition the reduced training columns by dtype.
X_train_cat_minus = X_train_minus.select_dtypes(include='object')
X_train_num_minus = X_train_minus.select_dtypes(include=['float64', 'int64'])

# NOTE: this cell rebinds `cat_pipe`, `num_pipe` and `transformer`; any pipeline
# constructed after it runs picks up the reduced column lists below.
cat_pipe = Pipeline(steps=[
    ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
])
num_pipe = Pipeline(steps=[
    ('ss', StandardScaler()),
])

transformer = ColumnTransformer(transformers=[
    ('categorical', cat_pipe, X_train_cat_minus.columns),
    ('numerical', num_pipe, X_train_num_minus.columns),
])

In [147]:
# Default KNN on the reduced feature set (`funder` / `installer` removed).
knn_pipe_minus = Pipeline(steps=[
    ('trans', transformer),
    ('knn', KNeighborsClassifier(n_jobs=-1)),
])

# Fit on the reduced training split; the fitted pipeline is the cell's displayed value.
knn_pipe_minus.fit(X_train_minus, y_train)

Pipeline(steps=[('trans',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  Index(['basin', 'region', 'public_meeting', 'scheme_management', 'permit',
       'extraction_type', 'extraction_type_group', 'extraction_type_class',
       'management', 'management_group', 'payment', 'payment_type',
       'water_...', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group'],
      dtype='object')),
                                                 ('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),


In [150]:
# Score the reduced-feature KNN pipeline on both splits, then report.
knn_minus_train_acc = knn_pipe_minus.score(X_train_minus, y_train)
knn_minus_test_acc = knn_pipe_minus.score(X_test_minus, y_test)
print('Accuracy Score Train:', knn_minus_train_acc)
print('Accuracy Score Test:', knn_minus_test_acc)

Accuracy Score Train: 0.8305773787255268
Accuracy Score Test: 0.7703703703703704


In [None]:
# 5-fold cross-validated log loss of the reduced-feature KNN pipeline.
# The result gets KNN-specific names: this cell evaluates `knn_pipe_minus`, so
# storing it as `log_loss_logreg` mislabelled it as a logistic-regression metric.
# The per-fold scores and their mean are also kept in separate variables rather
# than rebinding one name from an array to a scalar.
knn_minus_neg_log_loss = cross_val_score(
    knn_pipe_minus, X_train_minus, y_train, scoring='neg_log_loss', cv=5
)
# The 'neg_log_loss' scorer returns the negated loss, so flip the sign of the fold mean.
log_loss_knn_minus = -knn_minus_neg_log_loss.mean()
print('Log Loss:', log_loss_knn_minus)

# KNN with Gridsearch

In [None]:
# Candidate `n_neighbors` values, keyed for the pipeline step named 'knn'
# (presumably the parameter grid for the grid search this section introduces).
knn_pipe_grid = {
    'knn__n_neighbors': [10, 50, 100],
}