# Water Pumps: Modeling

In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## Load Train and Test Sets

In [2]:
def load_train_test(data_dir='../data/clean'):
    """Load the pickled train/test splits from disk.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the pickle files named ``X_train``, ``X_test``,
        ``y_train`` and ``y_test`` (no file extension). Defaults to the
        project's clean-data folder.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, in that order.

    Raises
    ------
    FileNotFoundError
        If any of the four files is missing from ``data_dir``.
    """
    file_list = ['X_train', 'X_test', 'y_train', 'y_test']
    data_sets = []
    for filename in file_list:
        # A context manager closes each handle as soon as it has been read.
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(f'{data_dir}/{filename}', 'rb') as handle:
            data_sets.append(pickle.load(handle))
    return tuple(data_sets)

In [3]:
# Unpack the four data sets in the order load_train_test returns them.
X_train, X_test, y_train, y_test = load_train_test()

In [4]:
# Confirm the dimensions of the training features.
X_train.shape

(21296, 230)

Load predictions from baseline model.

In [5]:
# Predictions from the baseline model. The context manager closes the file
# handle once the pickle has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('../data/clean/y_test_base', 'rb') as base_file:
    y_test_base = pickle.load(base_file)

In [6]:
# Confirm the number of baseline predictions.
y_test_base.shape

(9127,)

## Rescaling
Rescale the features to values between 0 and 1. Since the categorical variables are one-hot-encoded, this will ensure that the continuous variables are on the same scale.

In [7]:
def rescale_features(feature_list):
    """Min-max rescale several feature sets using one shared scaler.

    The scaler is fit on the FIRST element of ``feature_list`` only (the
    training features) and then applied to every element. Fitting a separate
    scaler on each set would map the same raw value to different scaled
    values in train and test, and would use test-set statistics.

    Because the minimum and maximum come from the first set, values in the
    other sets may fall slightly outside the range [0, 1].

    Parameters
    ----------
    feature_list : iterable of array-like
        Feature matrices with the same columns; the first one must be the
        training set.

    Returns
    -------
    tuple
        The rescaled arrays, in the same order as the input. An empty input
        yields an empty tuple.
    """
    feature_list = list(feature_list)
    if not feature_list:
        return ()
    # Learn the per-column min/max from the training features only.
    scaler = MinMaxScaler().fit(feature_list[0])
    return tuple(scaler.transform(features) for features in feature_list)

In [9]:
# Rescale the training and test features; results come back in the same order.
X_train_rescaled, X_test_rescaled = rescale_features([X_train, X_test])

In [10]:
# Sanity check: overall value range of the rescaled training features.
X_train_rescaled.min(), X_train_rescaled.max()

(0.0, 1.0)

In [11]:
# Sanity check: overall value range of the rescaled test features.
X_test_rescaled.min(), X_test_rescaled.max()

(0.0, 1.0)

In [12]:
# Shapes of the rescaled training and test features.
X_train_rescaled.shape, X_test_rescaled.shape

((21296, 230), (9127, 230))

## Resampling
Counter the class imbalance in the data set by resampling the training data. I will consider both over sampling and under sampling.

### Over Sampling

In [None]:
# Seed SMOTE so the synthetic samples, and every score computed from them, are
# reproducible; 42 matches the random_state used elsewhere in this notebook.
X_train_over, y_train_over = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Class counts after over sampling.
print(pd.Series(y_train_over).value_counts())

### Under Sampling

In [None]:
# Under sample the training set; random_state is fixed so the draw is repeatable.
X_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Class counts after under sampling.
print(pd.Series(y_train_under).value_counts())

## Modeling
I will create two sets of models, one for over sampled training sets, and another for under sampled training sets. For each set of models, I will consider the following models:
* Logistic Regression.
* Random Forest.
* XGBoost.

### Models with Over Sampling
#### Logistic Regression

In [None]:
# Baseline logistic regression on the over sampled training data.
# random_state is fixed because the 'saga' solver shuffles the data, so an
# unseeded fit can give slightly different scores on every run.
params_logreg_over = [
    ('min_max', MinMaxScaler()),
    ('log_reg', LogisticRegression(solver='saga', multi_class='multinomial', random_state=42)),
]
pipe_logreg_over = Pipeline(params_logreg_over)
pipe_logreg_over.fit(X_train_over, y_train_over)
# The training score is measured on the resampled data; the test score uses
# the test split, which is not resampled.
print(f'The training score is: {pipe_logreg_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_logreg_over.score(X_test, y_test):0.3%}.')

In [None]:
# Read back the regularization parameter C of the 'log_reg' pipeline step.
C = pipe_logreg_over.get_params()['log_reg__C']
print(f'The regularization parameter is {C:0.3f}.')

**Observations:**
The roughly 5% drop in performance between the training and test scores indicates that overfitting could be a problem. I will try to address this using cross-validation. I will also use a grid search to identify the best regularization parameter, which could reduce the overfitting.

In [None]:
# Tune the regularization strength C with a cross-validated grid search.
# NOTE(review): X_train_over was over sampled before the search, so a validation
# fold can contain synthetic points interpolated from rows in the training folds;
# the cross-validation scores are therefore likely optimistic. Resampling inside
# the pipeline (imblearn.pipeline.Pipeline) would avoid this - left as a follow-up.
params_logreg_over_gs = [
    ('min_max', MinMaxScaler()),
    # random_state makes the stochastic 'saga' solver repeatable across runs.
    ('log_reg', LogisticRegression(solver='saga', multi_class='multinomial',
                                   max_iter=10000, random_state=42)),
]
pipe_logreg_over_gs = Pipeline(params_logreg_over_gs)
gs_logreg_params = {'log_reg__C': np.logspace(-1, 1, num=5)}
gs_logreg = GridSearchCV(pipe_logreg_over_gs, gs_logreg_params, n_jobs=-1)
gs_logreg.fit(X_train_over, y_train_over)
# best_params_ holds the winning grid values directly.
best_c = gs_logreg.best_params_['log_reg__C']
print(f'The best value for C is {best_c:0.3f}.')

In [None]:
# Refit with the tuned C. max_iter matches the grid-search pipeline so the final
# model is trained with the same iteration budget that was used to select best_c;
# random_state keeps the stochastic 'saga' solver repeatable.
params_logreg_over = [
    ('min_max', MinMaxScaler()),
    ('log_reg', LogisticRegression(solver='saga', multi_class='multinomial',
                                   C=best_c, max_iter=10000, random_state=42)),
]
pipe_logreg_over = Pipeline(params_logreg_over)
pipe_logreg_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_logreg_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_logreg_over.score(X_test, y_test):0.3%}.')

**Observations:** The grid search found a regularization parameter that gave a better accuracy, but did not improve overfitting.

In [None]:
# Predictions on the over sampled training data and on the test split,
# used by the classification reports that follow.
y_pred_train_logreg_over = pipe_logreg_over.predict(X_train_over)
y_pred_logreg_over = pipe_logreg_over.predict(X_test)

In [None]:
# Per-class metrics for the logistic regression on the over sampled training data.
print(classification_report(y_train_over, y_pred_train_logreg_over))

In [None]:
# Per-class metrics for the logistic regression on the test split.
print(classification_report(y_test, y_pred_logreg_over))

**Observations:**
The recall value is pretty mediocre across the board.

#### Random Forest

In [None]:
# First random forest attempt: 100 trees, no other tuning.
params_rf_over = [
    ('min_max', MinMaxScaler()),
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
]
pipe_rf_over = Pipeline(steps=params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
# Compare accuracy on the data the model was fit on with accuracy on the test split.
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

**Observations:** Without doing any hyperparameter tuning, this model overfit a lot on the first try.

In [None]:
# Second attempt: set max_features explicitly.
# NOTE(review): 'sqrt' looks like the library default for RandomForestClassifier,
# in which case this fit reproduces the previous cell - confirm against the
# installed scikit-learn version.
params_rf_over = [
    ('min_max', MinMaxScaler()),
    ('random_forest', RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_features='sqrt',
        n_jobs=-1,
    )),
]
pipe_rf_over = Pipeline(steps=params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

In [None]:
# Third attempt: also set max_depth and min_samples_split explicitly.
# NOTE(review): max_depth=None and min_samples_split=2 look like the library
# defaults, so this model is presumably identical to the previous one - confirm.
params_rf_over = [
    ('min_max', MinMaxScaler()),
    ('random_forest', RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_features='sqrt',
        max_depth=None,
        min_samples_split=2,
        n_jobs=-1,
    )),
]
pipe_rf_over = Pipeline(steps=params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

**Observations:** After adjusting the parameters using the [suggestions](https://scikit-learn.org/stable/modules/ensemble.html#random-forest-parameters) from scikit-learn, there is still a lot of overfitting. I will need to do a grid search with cross validation to find the best hyperparameters.

In [None]:
# Unfitted pipeline used as the estimator for the hyperparameter searches.
# The forest step is named 'rf', which is the prefix the search grids use.
params_rf_over = [
    ('min_max', MinMaxScaler()),
    ('rf', RandomForestClassifier(random_state=42, n_jobs=-1)),
]
pipe_rf_over = Pipeline(steps=params_rf_over)

In [None]:
# Candidate tree depths: 10, 20, ..., 100, plus None as the final entry.
max_depth_list = [*np.arange(10, 110, 10), None]

In [None]:
# Search space for the randomized search over the 'rf' pipeline step.
rs_params_rf_over = {
    'rf__bootstrap': [True, False],
    'rf__max_depth': max_depth_list,
    # 'auto' is an alias of 'sqrt' for classifiers, so listing both only
    # duplicated one candidate; 'log2' gives the search a different option.
    'rf__max_features': ['sqrt', 'log2'],
    'rf__min_samples_leaf': list(np.arange(1, 11, 1)),
    # Start at 2: scikit-learn rejects an integer min_samples_split of 1, so
    # any sampled candidate using it would fail to fit and waste an iteration.
    'rf__min_samples_split': list(np.arange(2, 11, 1)),
    'rf__n_estimators': list(np.arange(200, 2200, 200))
}

In [None]:
# List the parameter names the pipeline exposes (the keys a search grid may use).
list(pipe_rf_over.get_params().keys())

In [None]:
# Randomized, cross-validated search over rs_params_rf_over; seeded so the same
# candidates are drawn on every run.
rs_rf_over = RandomizedSearchCV(
    estimator=pipe_rf_over,
    param_distributions=rs_params_rf_over,
    random_state=42,
    n_jobs=-1,
)
rs_rf_over.fit(X_train_over, y_train_over)

In [None]:
# Best hyperparameter combination found by the randomized search.
rs_rf_over.best_params_

In [None]:
# Preview the n_estimators candidates used by gs_params_rf_over.
np.arange(100, 350, 50)

In [None]:
# Narrower grid for an exhaustive search over the 'rf' pipeline step.
gs_params_rf_over = {
    'rf__bootstrap': [False],
    'rf__max_depth': list(np.arange(90, 115, 5)),
    # 'auto' is an alias of 'sqrt' for classifiers; listing both doubled the
    # number of grid candidates without testing anything new.
    'rf__max_features': ['sqrt'],
    'rf__min_samples_leaf': list(np.arange(1, 5, 1)),
    'rf__min_samples_split': list(np.arange(8, 13, 1)),
    'rf__n_estimators': list(np.arange(100, 350, 50))
}

In [None]:
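# NOTE(review): the exhaustive grid search is left disabled - presumably because
# of its run time (confirm); the next cell hard-codes its hyperparameters instead.
# gs_rf_over = GridSearchCV(pipe_rf_over, gs_params_rf_over, n_jobs=-1)
# gs_rf_over.fit(X_train_over, y_train_over)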

In [None]:
# Random forest with hand-entered hyperparameters.
# NOTE(review): n_estimators=800 lies outside the 100-300 range listed in
# gs_params_rf_over; presumably these values come from the randomized search
# result - TODO confirm and record the source.
params_rf_over = [
    ('min_max', MinMaxScaler()),
    ('random_forest', RandomForestClassifier(
        n_estimators=800,
        random_state=42,
        max_features='sqrt',
        max_depth=100,
        min_samples_split=10,
        min_samples_leaf=1,
        bootstrap=False,
        n_jobs=-1,
    )),
]
pipe_rf_over = Pipeline(steps=params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

In [None]:
# Predictions on the over sampled training data and on the test split,
# used by the classification reports that follow.
y_pred_train_rf_over = pipe_rf_over.predict(X_train_over)
y_pred_rf_over = pipe_rf_over.predict(X_test)

In [None]:
# Per-class metrics for the random forest on the over sampled training data.
print(classification_report(y_train_over, y_pred_train_rf_over))

In [None]:
# Per-class metrics for the random forest on the test split.
print(classification_report(y_test, y_pred_rf_over))