# modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
%time
import time
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.11 µs


### before feature engineering

In [2]:
X_train = pd.read_csv('data/X_train.csv', index_col='id')
X_test = pd.read_csv('data/X_test.csv', index_col='id')
y_train = pd.read_csv('data/y_train.csv', index_col='id')
y_test = pd.read_csv('data/y_test.csv', index_col='id')

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40095 entries, 73454 to 15434
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gps_height         40095 non-null  int64  
 1   longitude          40095 non-null  float64
 2   latitude           40095 non-null  float64
 3   basin              40095 non-null  object 
 4   region             40095 non-null  object 
 5   district_code      40095 non-null  int64  
 6   population         40095 non-null  int64  
 7   public_meeting     37825 non-null  object 
 8   scheme_management  37501 non-null  object 
 9   permit             38005 non-null  object 
 10  construction_year  40095 non-null  int64  
 11  extraction_type    40095 non-null  object 
 12  management         40095 non-null  object 
 13  payment            40095 non-null  object 
 14  water_quality      40095 non-null  object 
 15  quantity           40095 non-null  object 
 16  source            

In [4]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13365 entries, 26552 to 42035
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gps_height         13365 non-null  int64  
 1   longitude          13365 non-null  float64
 2   latitude           13365 non-null  float64
 3   basin              13365 non-null  object 
 4   region             13365 non-null  object 
 5   district_code      13365 non-null  int64  
 6   population         13365 non-null  int64  
 7   public_meeting     12624 non-null  object 
 8   scheme_management  12489 non-null  object 
 9   permit             12674 non-null  object 
 10  construction_year  13365 non-null  int64  
 11  extraction_type    13365 non-null  object 
 12  management         13365 non-null  object 
 13  payment            13365 non-null  object 
 14  water_quality      13365 non-null  object 
 15  quantity           13365 non-null  object 
 16  source            

In [5]:
y_train_array = y_train['status_group']

In [6]:
y_test_array = y_test['status_group']

In [7]:
rfc = RandomForestClassifier()
ss = StandardScaler()

X_train_nums = X_train.select_dtypes(exclude=object)
X_test_nums = X_test.select_dtypes(exclude=object)

X_train_nums_scaled = ss.fit_transform(X_train_nums)
X_test_nums_scaled = ss.transform(X_test_nums)

In [8]:
%time
rfc.fit(X_train_nums_scaled, y_train_array)

preds = rfc.predict(X_test_nums_scaled)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs


In [9]:
train_scores = rfc.score(X_train_nums_scaled, y_train_array)
test_scores = rfc.score(X_test_nums_scaled, y_test_array)
print('The Mode\'s accuracy on the training data is', round(train_scores, 4))
print('The Model\'s accuracy on the test data is', round(test_scores, 4))

The Mode's accuracy on the training data is 0.9859
The Model's accuracy on the test data is 0.703


In [10]:
log_loss_rfc = cross_val_score(rfc, X_train_nums_scaled, y_train_array, scoring='neg_log_loss', n_jobs = -2, cv = 2)
log_loss_rfc = -log_loss_rfc.mean()
print('Log Loss:', log_loss_rfc)

Log Loss: 1.0539624148602948


This inital model that only contains numerical data is severly overfitting the data. The next model will need to tune some hyperparameters to reduce the gap between the training and testing data. Our log loss has been decreased from the dummy classifer, but there is still room for improvement

### tuning some hyperparameters to see if the model can be improved

no categorical columns have yet been introduced.

In [19]:
pipeline_1 = Pipeline([('ss', StandardScaler()), ('rfc', RandomForestClassifier())])
pipeline_1.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('rfc', RandomForestClassifier())],
 'verbose': False,
 'ss': StandardScaler(),
 'rfc': RandomForestClassifier(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'rfc__bootstrap': True,
 'rfc__ccp_alpha': 0.0,
 'rfc__class_weight': None,
 'rfc__criterion': 'gini',
 'rfc__max_depth': None,
 'rfc__max_features': 'auto',
 'rfc__max_leaf_nodes': None,
 'rfc__max_samples': None,
 'rfc__min_impurity_decrease': 0.0,
 'rfc__min_impurity_split': None,
 'rfc__min_samples_leaf': 1,
 'rfc__min_samples_split': 2,
 'rfc__min_weight_fraction_leaf': 0.0,
 'rfc__n_estimators': 100,
 'rfc__n_jobs': None,
 'rfc__oob_score': False,
 'rfc__random_state': None,
 'rfc__verbose': 0,
 'rfc__warm_start': False}

In [21]:
params_1 = {
    'rfc__max_depth': list(range(5,50,5)),
    'rfc__min_samples_leaf': list(range(5,50,5)),
    'rfc__n_estimators': list(range(25,500,25)),
}

In [23]:
rs_1 = RandomizedSearchCV(pipeline_1, params_1, n_jobs=-2, verbose=3, n_iter=10)

rs_1.fit(X_train_nums, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  26 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-2)]: Done  50 out of  50 | elapsed:  2.3min finished
  self._final_estimator.fit(Xt, y, **fit_params_last_step)


RandomizedSearchCV(estimator=Pipeline(steps=[('ss', StandardScaler()),
                                             ('rfc',
                                              RandomForestClassifier())]),
                   n_jobs=-2,
                   param_distributions={'rfc__max_depth': [5, 10, 15, 20, 25,
                                                           30, 35, 40, 45],
                                        'rfc__min_samples_leaf': [5, 10, 15, 20,
                                                                  25, 30, 35,
                                                                  40, 45],
                                        'rfc__n_estimators': [25, 50, 75, 100,
                                                              125, 150, 175,
                                                              200, 225, 250,
                                                              275, 300, 325,
                                                              350, 375, 4

In [24]:
print('The train score for the random search with some hyperparameter tuning is',
      round(rs_1.score(X_train_nums, y_train), 4))
print('')
print('The test score for the random search with some hyperparameter tuning is',
      round(rs_1.score(X_test_nums, y_test), 4))

The train score for the random search with some hyperparameter tuning is 0.827

The test score for the random search with some hyperparameter tuning is 0.7113


In [26]:
rs_1.best_params_

{'rfc__n_estimators': 100, 'rfc__min_samples_leaf': 5, 'rfc__max_depth': 40}

In [27]:
log_loss_rfc = cross_val_score(rs_1, X_train_nums_scaled, y_train, scoring='neg_log_loss', n_jobs = -2, verbose=1, cv = 2)
log_loss_rfc = -log_loss_rfc.mean()
print('Log Loss:', log_loss_rfc)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


Log Loss: 0.7138834137178536


[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:  3.2min finished


### after feature engineering

In [11]:
X_train_eng = pd.read_csv('data/X_train_eng.csv', index_col='id')
X_test_eng = pd.read_csv('data/X_test_eng.csv', index_col='id')
y_train_eng = pd.read_csv('data/y_train_eng.csv', index_col='id')
y_test_eng = pd.read_csv('data/y_test_eng.csv', index_col='id')

In [12]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40095 entries, 73454 to 15434
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   status_group  40095 non-null  object
dtypes: object(1)
memory usage: 626.5+ KB


In [13]:
print(X_train.columns)
print('----------------------------------------------------------')
print(X_train_eng.columns)
print('----------------------------------------------------------')
print(X_test.columns)
print('----------------------------------------------------------')
print(X_test_eng.columns)
print('----------------------------------------------------------')
print(y_train.columns)
print('----------------------------------------------------------')
print(y_train_eng.columns)
print('----------------------------------------------------------')
print(y_test.columns)
print('----------------------------------------------------------')
print(y_test_eng.columns)
print('----------------------------------------------------------')

Index(['gps_height', 'longitude', 'latitude', 'basin', 'region',
       'district_code', 'population', 'public_meeting', 'scheme_management',
       'permit', 'construction_year', 'extraction_type', 'management',
       'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type'],
      dtype='object')
----------------------------------------------------------
Index(['gps_height', 'longitude', 'latitude', 'basin', 'region',
       'district_code', 'population', 'public_meeting', 'scheme_management',
       'permit', 'construction_year', 'extraction_type', 'management',
       'payment', 'water_quality', 'quantity', 'source', 'waterpoint_type'],
      dtype='object')
----------------------------------------------------------
Index(['gps_height', 'longitude', 'latitude', 'basin', 'region',
       'district_code', 'population', 'public_meeting', 'scheme_management',
       'permit', 'construction_year', 'extraction_type', 'management',
       'payment', 'water_quality', 'quantity'

In [14]:
y_train_eng_array = y_train_eng['status_group']
y_test_eng_array = y_test_eng['status_group']

In [15]:
rfc = RandomForestClassifier()
ss = StandardScaler()

X_train_nums = X_train_eng.select_dtypes(exclude=object)
X_test_nums = X_test_eng.select_dtypes(exclude=object)

X_train_nums_scaled = ss.fit_transform(X_train_nums)
X_test_nums_scaled = ss.transform(X_test_nums)

In [16]:
%time
rfc.fit(X_train_nums_scaled, y_train_eng_array)

preds = rfc.predict(X_test_nums_scaled)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 9.06 µs


In [17]:
train_scores = rfc.score(X_train_nums_scaled, y_train_eng_array)
test_scores = rfc.score(X_test_nums_scaled, y_test_eng_array)
print('The Mode\'s accuracy on the training data is', round(train_scores, 4))
print('The Model\'s accuracy on the test data is', round(test_scores, 4))

The Mode's accuracy on the training data is 0.9859
The Model's accuracy on the test data is 0.7018


In [18]:
log_loss_rfc = cross_val_score(rfc, X_train_nums_scaled, y_train_eng_array, scoring='neg_log_loss', n_jobs = -2, cv = 2)
log_loss_rfc = -log_loss_rfc.mean()
print('Log Loss:', log_loss_rfc)

Log Loss: 1.0217877104878517
