In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
%time
import time
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbPipeline

Wall time: 0 ns


In [2]:
# Import the presplit data
X_train = pd.read_csv('data/X_train.csv', index_col='id')
X_test = pd.read_csv('data/X_test.csv', index_col='id')
y_train = pd.read_csv('data/y_train.csv', index_col='id')
y_test = pd.read_csv('data/y_test.csv', index_col='id')

In [3]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40095 entries, 45522 to 9914
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             40095 non-null  float64
 1   funder                 40095 non-null  object 
 2   gps_height             40095 non-null  int64  
 3   installer              40095 non-null  object 
 4   longitude              40095 non-null  float64
 5   latitude               40095 non-null  float64
 6   num_private            40095 non-null  int64  
 7   basin                  40095 non-null  object 
 8   region                 40095 non-null  object 
 9   region_code            40095 non-null  int64  
 10  district_code          40095 non-null  int64  
 11  population             40095 non-null  int64  
 12  public_meeting         40095 non-null  object 
 13  scheme_management      40095 non-null  object 
 14  permit                 40095 non-null  object 
 15 

In [4]:
# X_train = pd.get_dummies(X_train, drop_first=True)

# X_test = pd.get_dummies(X_test, drop_first=True)

In [5]:
y_train = np.array(y_train)
y_train = y_train.reshape(40095, )

In [6]:
y_test = np.array(y_test)
y_test = y_test.reshape(13365, )

In [7]:
rfc = RandomForestClassifier()
ss = StandardScaler()

X_train_nums = X_train.select_dtypes(exclude=object)
X_test_nums = X_test.select_dtypes(exclude=object)

X_train_nums_scaled = ss.fit_transform(X_train_nums)
X_test_nums_scaled = ss.transform(X_test_nums)

In [8]:
%time
rfc.fit(X_train_nums_scaled, y_train)

preds = rfc.predict(X_test_nums_scaled)

Wall time: 0 ns


In [9]:
train_scores = rfc.score(X_train_nums_scaled, y_train)
test_scores = rfc.score(X_test_nums_scaled, y_test)
print('The Mode\'s accuracy on the training data is', round(train_scores, 4))
print('The Model\'s accuracy on the test data is', round(test_scores, 4))

The Mode's accuracy on the training data is 0.9858
The Model's accuracy on the test data is 0.7167


In [10]:
log_loss_rfc = cross_val_score(rfc, X_train_nums_scaled, y_train, scoring='neg_log_loss', n_jobs = -2, cv = 2)
log_loss_rfc = -log_loss_rfc.mean()
print('Log Loss:', log_loss_rfc)

Log Loss: 1.0189239606183915


This inital model that only contains numerical data is severly overfitting the data. The next model will need to tune some hyperparameters to reduce the gap between the training and testing data. Our log loss has been decreased from the dummy classifer, but there is still room for improvement

### Tuning some hyperparameters to see if the model can be improved

No categorial colums have yet been introduced.

In [11]:
pipline_1 = Pipeline([('ss', StandardScaler()), ('rfc', RandomForestClassifier())])

In [12]:
pipline_1.get_params()

{'memory': None,
 'steps': [('ss', StandardScaler()), ('rfc', RandomForestClassifier())],
 'verbose': False,
 'ss': StandardScaler(),
 'rfc': RandomForestClassifier(),
 'ss__copy': True,
 'ss__with_mean': True,
 'ss__with_std': True,
 'rfc__bootstrap': True,
 'rfc__ccp_alpha': 0.0,
 'rfc__class_weight': None,
 'rfc__criterion': 'gini',
 'rfc__max_depth': None,
 'rfc__max_features': 'auto',
 'rfc__max_leaf_nodes': None,
 'rfc__max_samples': None,
 'rfc__min_impurity_decrease': 0.0,
 'rfc__min_impurity_split': None,
 'rfc__min_samples_leaf': 1,
 'rfc__min_samples_split': 2,
 'rfc__min_weight_fraction_leaf': 0.0,
 'rfc__n_estimators': 100,
 'rfc__n_jobs': None,
 'rfc__oob_score': False,
 'rfc__random_state': None,
 'rfc__verbose': 0,
 'rfc__warm_start': False}

In [13]:
params_1 = {
    'rfc__max_depth': list(range(5,50,5)),
    'rfc__min_samples_leaf': list(range(5,50,5)),
    'rfc__n_estimators': list(range(25,500,25)),
}

In [14]:
rs_1 = RandomizedSearchCV(pipline_1, params_1, n_jobs=-2, verbose=3, n_iter=10)

rs_1.fit(X_train_nums, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-2)]: Done  50 out of  50 | elapsed:  2.9min finished


RandomizedSearchCV(estimator=Pipeline(steps=[('ss', StandardScaler()),
                                             ('rfc',
                                              RandomForestClassifier())]),
                   n_jobs=-2,
                   param_distributions={'rfc__max_depth': [5, 10, 15, 20, 25,
                                                           30, 35, 40, 45],
                                        'rfc__min_samples_leaf': [5, 10, 15, 20,
                                                                  25, 30, 35,
                                                                  40, 45],
                                        'rfc__n_estimators': [25, 50, 75, 100,
                                                              125, 150, 175,
                                                              200, 225, 250,
                                                              275, 300, 325,
                                                              350, 375, 4

In [15]:
print('The train score for the random search with some hyperparameter tuning is',
      round(rs_1.score(X_train_nums, y_train), 4))
print('')
print('The test score for the random search with some hyperparameter tuning is',
      round(rs_1.score(X_test_nums, y_test), 4))

The train score for the random search with some hyperparameter tuning is 0.8274

The test score for the random search with some hyperparameter tuning is 0.7212


In [16]:
rs_1.best_params_

{'rfc__n_estimators': 375, 'rfc__min_samples_leaf': 5, 'rfc__max_depth': 30}

In [17]:
log_loss_rfc = cross_val_score(rs_1, X_train_nums_scaled, y_train, scoring='neg_log_loss', n_jobs = -2, verbose=1, cv = 2)
log_loss_rfc = -log_loss_rfc.mean()
print('Log Loss:', log_loss_rfc)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


Log Loss: 0.677179717281616


[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:  3.3min finished


# Model 1

This is he first model that will contain the categorical colums, there will be no hyperparameter tuning done yet. 

In [18]:
X_train_nums = X_train.select_dtypes(exclude = object)

X_train_cat = X_train.select_dtypes('object')

In [19]:
numerical_pipeline = Pipeline(steps=[
    ('ss', StandardScaler())
])
                
categorical_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
])


In [20]:
ohe = OneHotEncoder(handle_unknown='ignore')

ohe.fit(X_train)

OneHotEncoder(handle_unknown='ignore')

In [21]:
trans = ColumnTransformer(transformers=[
    ('numerical', numerical_pipeline, X_train_nums.columns),
    ('categorical', categorical_pipeline, X_train_cat.columns)
], verbose=True, n_jobs=-1)

In [22]:
model_pipe = imbPipeline(steps=[
    ('trans', trans),
    ('smote', SMOTE(random_state=42)),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

In [23]:
model_pipe.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:   31.2s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:  1.3min finished


Pipeline(steps=[('trans',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('numerical',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
       'region_code', 'district_code', 'population', 'construction_year'],
      dtype='object')),
                                                 ('categorical',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))...
       'scheme_management', 'permit', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantit

In [24]:
model_pipe.score(X_train, y_train)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.1s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    0.3s finished


0.9963586482105

In [25]:
model_pipe.score(X_test, y_test)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7873550317994762

In [26]:
cv_1 = cross_val_score(model_pipe, X_train, y_train, scoring='neg_log_loss', n_jobs = -2, verbose = 3, cv = 2)
-cv_1.mean()

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   49.1s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   49.1s finished


0.9206636703988862

In [27]:
cv_1 = cross_val_score(model_pipe, X_test, y_test, scoring='neg_log_loss', n_jobs = -2, verbose = 3, cv = 2)
-cv_1.mean()

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   10.0s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   10.0s finished


0.8820308158165975

This first model is a significant improvement from the first Random Forst Classifer model. The train score increase by $0.0736$. However, the model is still significanly overfitting and that will need to be reduced by hyperparameter tuning. 

The log loss also improved from $1.005$ with the first model with only numeric colums to $0.8861$ with all colums included. This was an improvement of $0.1188$.

# Hyperparameter Tuning

## The first model

Since the training scores are severly overfitting the test scores in the data I will first limit the max depth of the tree. This should certainly help to limit overfitting. I will leave other hyperparamters in their default state for now.

In [28]:
model_pipe_1 = Pipeline(steps=[
    ('trans', trans),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

In [29]:
model_pipe_1.get_params()

{'memory': None,
 'steps': [('trans', ColumnTransformer(n_jobs=-1,
                     transformers=[('numerical',
                                    Pipeline(steps=[('ss', StandardScaler())]),
                                    Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
          'region_code', 'district_code', 'population', 'construction_year'],
         dtype='object')),
                                   ('categorical',
                                    Pipeline(steps=[('ohe',
                                                     OneHotEncoder(handle_unknown='ignore'))]),
                                    Index(['funder', 'installer', 'basin', 'region', 'public_meeting',
          'scheme_management', 'permit', 'extraction_type',
          'extraction_type_group', 'extraction_type_class', 'management',
          'management_group', 'payment', 'payment_type', 'water_quality',
          'quality_group', 'quantity', 'source', 'source_type', 'sourc

In [30]:
params = {
    'rfc__max_depth': list(range(10,100,10))
}

In [31]:
gs_1 = GridSearchCV(model_pipe_1, params, n_jobs=-2, verbose=3, cv = 2)
gs_1.fit(X_train, y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  12 out of  18 | elapsed:  2.0min remaining:  1.0min
[Parallel(n_jobs=-2)]: Done  18 out of  18 | elapsed:  2.7min finished
[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   21.1s finished


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('trans',
                                        ColumnTransformer(n_jobs=-1,
                                                          transformers=[('numerical',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         Index(['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private',
       'region_code', 'district_code', 'population', 'construction_year'],
      dtype='object')),
                                                                        ('categorical',
                                                                         Pipeline(steps=[('ohe',
                                                                                          OneHotEncod...
       'extraction_type_group', '

In [32]:
gs_1.score(X_train, y_train)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    0.2s finished


0.937797730390323

In [33]:
gs_1.score(X_test, y_test)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 100 out of 100 | elapsed:    0.0s finished


0.8055368499812944

In [34]:
cv_1 = cross_val_score(model_pipe_1, X_train, y_train, scoring='neg_log_loss', n_jobs = -2, verbose = 3, cv = 2)
-cv_1.mean()

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   25.7s remaining:    0.0s
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:   25.7s finished


0.8668872820124996

In [35]:
cv_1 = cross_val_score(model_pipe_1, X_test, y_test, scoring='neg_log_loss', n_jobs = -2, verbose = 1, cv = 2)
-cv_1.mean()

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   2 out of   2 | elapsed:    4.4s finished


0.9492371018382766

In [36]:
gs_1.best_params_

{'rfc__max_depth': 30}

## Model 2

In this second iteration I will still change max depth, but will also change the number of estimators to see how a change in the number of trees would effect the accuracy score. I will also use criterion to see if it has any effect on the score of the model.

In [37]:
model_pipe_2 = Pipeline(steps=[
    ('trans', trans),
    ('rfc', RandomForestClassifier(verbose=1, n_jobs=-2))
])

In [38]:
params = {
    'rfc__max_depth': list(range(10,100,10)),
    'rfc__criterion': ['gini', 'entropy'],
    'rfc__n_estimators': list(range(50,250,50))   
}

In [39]:
gs_2 = GridSearchCV(model_pipe_2, params, n_jobs=-2, verbose=3, cv = 2)
gs_2.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 2 folds for each of 72 candidates, totalling 144 fits


[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed:  1.1min


KeyboardInterrupt: 

In [None]:
gs_2.score(X_train, y_train)

In [None]:
gs_2.score(X_test, y_test)

In [None]:
cv_1 = cross_val_score(model_pipe_2, X_train, y_train, scoring='neg_log_loss', n_jobs = -2, verbose = 1, cv = 2)
-cv_1.mean()

In [None]:
cv_1 = cross_val_score(model_pipe_2, X_test, y_test, scoring='neg_log_loss', n_jobs = -2, verbose = 1, cv = 2)
-cv_1.mean()

In [None]:
gs_1.best_params_