# Water Pumps - Modeling Pipeline

**Plan:**
1. Load training and testing data.
2. Role resampling, encoding, and scaling into single function.
3. Develop out each model.
    1. Include hyperparameter tuning, saving best parameters.
    2. Run model with best parameters.
    3. Collect results from classification reports to a data frame.

## Import Libraries

In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

## Load Train and Test Sets

In [2]:
def load_train_test():
    file_list = ['X_train', 'X_test', 'y_train', 'y_test']
    data_sets = []
    for filename in file_list:
        data_sets.append(pickle.load(open(f'../data/clean/{filename}', 'rb')))
    return tuple(data_sets)

In [3]:
X_train, X_test, y_train, y_test = load_train_test()

Load predictions from baseline model.

In [11]:
y_pred_base = pickle.load(open(f'../data/clean/y_pred_base', 'rb'))

## Prepare Training Data

In [21]:
def prepare_data(X_train, X_test, y_train, y_test, sampling_type):
    scaler = MinMaxScaler().fit(X_train)
    X_train_rescaled = scaler.transform(X_train)
    X_test_pipe = scaler.transform(X_test)
    
    if sampling_type == 'over':
        X_train_pipe, y_train_pipe = SMOTE().fit_resample(X_train_rescaled, y_train)
    elif sampling_type == 'under':
        X_train_pipe, y_train_pipe = RandomUnderSampler(random_state=42).fit_resample(X_train_rescaled, y_train)
    else:
        raise Exception("sampling_type must be 'over' or 'under'. Please try again.")
    
    y_test_pipe = y_test
    
    return X_train_pipe, X_test_pipe, y_train_pipe, y_test_pipe

In [8]:
sampling_type = 'over'

In [22]:
X_train_pipe, X_test_pipe, y_train_pipe, y_test_pipe = prepare_data(X_train, X_test, y_train, y_test, sampling_type)

## Modeling

In [15]:
def store_results(y_test, y_pred, model_type, df=None):
    results = classification_report(y_test, y_pred, output_dict=True)
    df_results = pd.DataFrame(results).T
    df_results.drop(columns=['f1-score', 'support'], inplace=True)
    df_results.drop(['accuracy', 'macro avg', 'weighted avg'], inplace=True)
    
    multi_columns = [(model_type, x) for x in df_results.columns]
    df_results.columns = pd.MultiIndex.from_tuples(multi_columns)
    
    if df is None:
        return df_results
    else:
        return pd.concat([df, df_results], axis=1)

### Baseline Model
Set up baseline model results and store in data frame.

In [18]:
model_type = 'base_line'

In [19]:
df_results = store_results(y_test, y_pred_base, model_type, df=None)

In [20]:
df_results

Unnamed: 0_level_0,base_line,base_line
Unnamed: 0_level_1,precision,recall
functional,0.767559,0.911198
functional needs repair,0.632258,0.151938
non functional,0.787948,0.659432


### Logistic Regression

In [23]:
model_type = f'logreg_{sampling_type}'

In [25]:
logreg_rs = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=10000)
rs_logreg_params = {'C': np.arange(0.2, 2.4, 0.4), 'penalty': ['l1', 'l2']}
rs_logreg = RandomizedSearchCV(logreg_rs, rs_logreg_params, random_state=42, n_jobs=-1)
rs_logreg.fit(X_train_pipe, y_train_pipe)

RandomizedSearchCV(estimator=LogisticRegression(max_iter=10000,
                                                multi_class='multinomial',
                                                solver='saga'),
                   n_jobs=-1,
                   param_distributions={'C': array([0.2, 0.6, 1. , 1.4, 1.8, 2.2]),
                                        'penalty': ['l1', 'l2']},
                   random_state=42)

In [26]:
best_C = rs_logreg.best_estimator_.get_params()['C']
best_penalty = rs_logreg.best_estimator_.get_params()['penalty']
print(f'The best value for C is {best_C:0.3f}.')
print(f'The best penalty is {best_penalty}.')

The best value for C is 2.200.
The best penalty is l1.


In [27]:
logreg_best = LogisticRegression(solver='saga', multi_class='multinomial', C=best_C, penalty=best_penalty, max_iter=10000)
logreg_best.fit(X_train_pipe, y_train_pipe)

LogisticRegression(C=2.2000000000000006, max_iter=10000,
                   multi_class='multinomial', penalty='l1', solver='saga')

In [28]:
y_pred_logreg_best = logreg_best.predict(X_test_pipe)

In [29]:
df_results = store_results(y_test, y_pred_logreg_best, model_type, df=df_results)

In [30]:
df_results

Unnamed: 0_level_0,base_line,base_line,logreg_over,logreg_over
Unnamed: 0_level_1,precision,recall,precision,recall
functional,0.767559,0.911198,0.846024,0.656384
functional needs repair,0.632258,0.151938,0.224869,0.731783
non functional,0.787948,0.659432,0.736275,0.676349


### Random Forest

In [31]:
model_type = f'rf_{sampling_type}'

In [32]:
rf_rs = RandomForestClassifier(random_state = 42, n_jobs=-1)

In [33]:
max_depth_list = list(np.arange(10, 110, 10))
max_depth_list.append(None)

In [34]:
rs_rf_params = {
    'bootstrap': [True, False],
    'max_depth': max_depth_list,
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': list(np.arange(1, 11, 1)),
    'min_samples_split': list(np.arange(1, 11, 1)),
    'n_estimators': list(np.arange(200, 2200, 200))
}

In [35]:
rs_rf = RandomizedSearchCV(rf_rs, rs_rf_params, random_state=42, n_jobs=-1)
rs_rf.fit(X_train_pipe, y_train_pipe)

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42)

In [36]:
print('The best hyperparameters for random forest are:')
print(rs_rf.best_params_)

The best hyperparameters for random forest are:
{'n_estimators': 800, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}


In [38]:
rf_best = RandomForestClassifier(
    n_estimators = rs_rf.best_params_['n_estimators']*2,
    min_samples_split = rs_rf.best_params_['min_samples_split'],
    min_samples_leaf = rs_rf.best_params_['min_samples_leaf'],
    max_features = rs_rf.best_params_['max_features'],
    max_depth = rs_rf.best_params_['max_depth'],
    bootstrap = rs_rf.best_params_['bootstrap'],
    random_state = 42, 
    n_jobs=-1
)

In [39]:
rf_best.fit(X_train_pipe, y_train_pipe)

RandomForestClassifier(max_depth=90, min_samples_split=3, n_estimators=1600,
                       n_jobs=-1, random_state=42)

In [40]:
y_pred_rf = rf_best.predict(X_test_pipe)

In [41]:
df_results = store_results(y_test, y_pred_logreg_best, model_type, df=df_results)

In [42]:
df_results

Unnamed: 0_level_0,base_line,base_line,logreg_over,logreg_over,rf_over,rf_over
Unnamed: 0_level_1,precision,recall,precision,recall,precision,recall
functional,0.767559,0.911198,0.846024,0.656384,0.846024,0.656384
functional needs repair,0.632258,0.151938,0.224869,0.731783,0.224869,0.731783
non functional,0.787948,0.659432,0.736275,0.676349,0.736275,0.676349
