# Water Pumps: Modeling

In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

## Load Train and Test Sets

In [2]:
def load_train_test():
    file_list = ['X_train', 'X_test', 'y_train', 'y_test']
    data_sets = []
    for filename in file_list:
        data_sets.append(pickle.load(open(f'../data/clean/{filename}', 'rb')))
    return tuple(data_sets)

In [3]:
X_train, X_test, y_train, y_test = load_train_test()

In [4]:
X_train.shape

(21296, 230)

## Resampling
Counter the class imbalanced data set by performing resampling. I will consider both over sampling and under sampling.

### Over Sampling

In [5]:
X_train_over, y_train_over = SMOTE().fit_resample(X_train, y_train)

In [6]:
print(pd.Series(y_train_over).value_counts())

non functional             12482
functional needs repair    12482
functional                 12482
dtype: int64


### Under Sampling

In [7]:
X_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train)

In [8]:
print(pd.Series(y_train_under).value_counts())

non functional             1505
functional                 1505
functional needs repair    1505
dtype: int64


## Modeling
I will create two sets of models, one for over sampled training sets, and another for under sampled training sets. For each set of models, I will consider the following models:
* Logistic Regression.
* Random Forrest.
* XGBoost.

### Models with Over Sampling
#### Logistic Regression

In [9]:
params_logreg_over = [('min_max', MinMaxScaler()), ('log_reg', LogisticRegression(solver='saga', multi_class='multinomial'))]
pipe_logreg_over = Pipeline(params_logreg_over)
pipe_logreg_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_logreg_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_logreg_over.score(X_test, y_test):0.3%}.')

The training score is: 71.890%.
The test score is: 67.196%.


In [10]:
C = pipe_logreg_over.get_params()['log_reg__C']
print(f'The regularization parameter is {C:0.3f}.')

The regularization parameter is 1.000.


**Observations:**
The roughly 5% drop in performance between the training and test scores indicates that over fitting could be a problem. I will try to address this using cross-validation. I will also use a grid search to identify the best regularization parameter, which could reduce the over fitting.

In [11]:
params_logreg_over_gs = [('min_max', MinMaxScaler()), ('log_reg', LogisticRegression(solver='saga', multi_class='multinomial', max_iter=10000))]
pipe_logreg_over_gs = Pipeline(params_logreg_over_gs)
gs_logreg_params = {'log_reg__C': np.logspace(-1, 1, num=5)}
gs_logreg = GridSearchCV(pipe_logreg_over_gs, gs_logreg_params, n_jobs=-1)
gs_logreg.fit(X_train_over, y_train_over)
best_c = gs_logreg.best_estimator_.get_params()['log_reg__C']
print(f'The best value for C is {best_c:0.3f}.')

The best value for C is 10.000.


In [12]:
params_logreg_over = [('min_max', MinMaxScaler()), ('log_reg', LogisticRegression(solver='saga', multi_class='multinomial', C=best_c))]
pipe_logreg_over = Pipeline(params_logreg_over)
pipe_logreg_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_logreg_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_logreg_over.score(X_test, y_test):0.3%}.')

The training score is: 71.936%.
The test score is: 67.240%.




**Observations:** The grid search found a regularization parameter that gave a better accuracy, but did not improve overfitting.

In [13]:
y_pred_train_logreg_over = pipe_logreg_over.predict(X_train_over)
y_pred_logreg_over = pipe_logreg_over.predict(X_test)

In [14]:
print(classification_report(y_train_over, y_pred_train_logreg_over))

                         precision    recall  f1-score   support

             functional       0.69      0.67      0.68     12482
functional needs repair       0.71      0.78      0.74     12482
         non functional       0.76      0.71      0.73     12482

               accuracy                           0.72     37446
              macro avg       0.72      0.72      0.72     37446
           weighted avg       0.72      0.72      0.72     37446



In [15]:
print(classification_report(y_test, y_pred_logreg_over))

                         precision    recall  f1-score   support

             functional       0.84      0.67      0.74      5349
functional needs repair       0.22      0.70      0.34       645
         non functional       0.74      0.68      0.71      3133

               accuracy                           0.67      9127
              macro avg       0.60      0.68      0.60      9127
           weighted avg       0.76      0.67      0.70      9127



**Observations:**
The recall value is pretty mediocre across the board.

#### Random Forest

In [16]:
params_rf_over = [('min_max', MinMaxScaler()), ('random_forest', RandomForestClassifier(n_estimators=100, random_state = 42, n_jobs=-1))]
pipe_rf_over = Pipeline(params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

The training score is: 99.997%.
The test score is: 80.607%.


**Observations:** Without doing any hyperparameter tuning, this model overfit a lot on the first try.

In [17]:
params_rf_over = [('min_max', MinMaxScaler()), ('random_forest', RandomForestClassifier(n_estimators=100, 
                                                                                        random_state = 42, 
                                                                                        max_features='sqrt', 
                                                                                        n_jobs=-1))]
pipe_rf_over = Pipeline(params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

The training score is: 99.997%.
The test score is: 80.607%.


In [18]:
params_rf_over = [('min_max', MinMaxScaler()), ('random_forest', RandomForestClassifier(n_estimators=100, 
                                                                                        random_state = 42, 
                                                                                        max_features='sqrt', 
                                                                                        max_depth=None, 
                                                                                        min_samples_split=2,
                                                                                        n_jobs=-1))]
pipe_rf_over = Pipeline(params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

The training score is: 99.997%.
The test score is: 80.607%.


**Observations:** After adjusting the parameters using the [suggestions](https://scikit-learn.org/stable/modules/ensemble.html#random-forest-parameters) from scikit-learn, there is still a lot of overfitting. I will need to do a grid search with cross validation to find the best hyperparameters.

In [19]:
params_rf_over = [('min_max', MinMaxScaler()), ('rf', RandomForestClassifier(random_state = 42, n_jobs=-1))]
pipe_rf_over = Pipeline(params_rf_over)

In [20]:
max_depth_list = list(np.arange(10, 110, 10))
max_depth_list.append(None)

In [21]:
rs_params_rf_over = {
    'rf__bootstrap': [True, False],
    'rf__max_depth': max_depth_list,
    'rf__max_features': ['auto', 'sqrt'],
    'rf__min_samples_leaf': list(np.arange(1, 11, 1)),
    'rf__min_samples_split': list(np.arange(1, 11, 1)),
    'rf__n_estimators': list(np.arange(200, 2200, 200))
}

In [22]:
list(pipe_rf_over.get_params().keys())

['memory',
 'steps',
 'verbose',
 'min_max',
 'rf',
 'min_max__clip',
 'min_max__copy',
 'min_max__feature_range',
 'rf__bootstrap',
 'rf__ccp_alpha',
 'rf__class_weight',
 'rf__criterion',
 'rf__max_depth',
 'rf__max_features',
 'rf__max_leaf_nodes',
 'rf__max_samples',
 'rf__min_impurity_decrease',
 'rf__min_impurity_split',
 'rf__min_samples_leaf',
 'rf__min_samples_split',
 'rf__min_weight_fraction_leaf',
 'rf__n_estimators',
 'rf__n_jobs',
 'rf__oob_score',
 'rf__random_state',
 'rf__verbose',
 'rf__warm_start']

In [23]:
rs_rf_over = RandomizedSearchCV(pipe_rf_over, rs_params_rf_over, random_state=42, n_jobs=-1)
rs_rf_over.fit(X_train_over, y_train_over)

RandomizedSearchCV(estimator=Pipeline(steps=[('min_max', MinMaxScaler()),
                                             ('rf',
                                              RandomForestClassifier(n_jobs=-1,
                                                                     random_state=42))]),
                   n_jobs=-1,
                   param_distributions={'rf__bootstrap': [True, False],
                                        'rf__max_depth': [10, 20, 30, 40, 50,
                                                          60, 70, 80, 90, 100,
                                                          None],
                                        'rf__max_features': ['auto', 'sqrt'],
                                        'rf__min_samples_leaf': [1, 2, 3, 4, 5,
                                                                 6, 7, 8, 9,
                                                                 10],
                                        'rf__min_samples_split': [1, 2, 3, 

In [24]:
rs_rf_over.best_params_

{'rf__n_estimators': 800,
 'rf__min_samples_split': 3,
 'rf__min_samples_leaf': 1,
 'rf__max_features': 'auto',
 'rf__max_depth': 90,
 'rf__bootstrap': True}

In [25]:
np.arange(100, 350, 50)

array([100, 150, 200, 250, 300])

In [26]:
gs_params_rf_over = {
    'rf__bootstrap': [False],
    'rf__max_depth': list(np.arange(90, 115, 5)),
    'rf__max_features': ['auto', 'sqrt'],
    'rf__min_samples_leaf': list(np.arange(1, 5, 1)),
    'rf__min_samples_split': list(np.arange(8, 13, 1)),
    'rf__n_estimators': list(np.arange(100, 350, 50))
}

In [27]:
# gs_rf_over = GridSearchCV(pipe_rf_over, gs_params_rf_over, n_jobs=-1)
# gs_rf_over.fit(X_train_over, y_train_over)

In [28]:
params_rf_over = [('min_max', MinMaxScaler()), ('random_forest', RandomForestClassifier(n_estimators=800, 
                                                                                        random_state = 42, 
                                                                                        max_features='sqrt', 
                                                                                        max_depth=100, 
                                                                                        min_samples_split=10,
                                                                                        min_samples_leaf=1,
                                                                                        bootstrap=False,
                                                                                        n_jobs=-1))]
pipe_rf_over = Pipeline(params_rf_over)
pipe_rf_over.fit(X_train_over, y_train_over)
print(f'The training score is: {pipe_rf_over.score(X_train_over, y_train_over):0.3%}.')
print(f'The test score is: {pipe_rf_over.score(X_test, y_test):0.3%}.')

The training score is: 97.749%.
The test score is: 81.221%.


In [29]:
y_pred_train_rf_over = pipe_rf_over.predict(X_train_over)
y_pred_rf_over = pipe_rf_over.predict(X_test)

In [30]:
print(classification_report(y_train_over, y_pred_train_rf_over))

                         precision    recall  f1-score   support

             functional       0.95      0.99      0.97     12482
functional needs repair       0.99      0.98      0.99     12482
         non functional       0.99      0.97      0.98     12482

               accuracy                           0.98     37446
              macro avg       0.98      0.98      0.98     37446
           weighted avg       0.98      0.98      0.98     37446



In [31]:
print(classification_report(y_test, y_pred_rf_over))

                         precision    recall  f1-score   support

             functional       0.82      0.90      0.86      5349
functional needs repair       0.56      0.33      0.42       645
         non functional       0.83      0.76      0.79      3133

               accuracy                           0.81      9127
              macro avg       0.73      0.67      0.69      9127
           weighted avg       0.80      0.81      0.81      9127

