# Water Pumps - Modeling Pipeline

**Plan:**
1. Load training and testing data.
2. Role resampling, encoding, and scaling into single function.
3. Develop out each model.
    1. Include hyperparameter tuning, saving best parameters.
    2. Run model with best parameters.
    3. Collect results from classification reports to a data frame.

## Import Libraries

In [44]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

## Load Train and Test Sets

In [2]:
def load_train_test():
    file_list = ['X_train', 'X_test', 'y_train', 'y_test']
    data_sets = []
    for filename in file_list:
        data_sets.append(pickle.load(open(f'../data/clean/{filename}', 'rb')))
    return tuple(data_sets)

In [3]:
X_train, X_test, y_train, y_test = load_train_test()

Load predictions from baseline model.

In [4]:
y_pred_base = pickle.load(open(f'../data/clean/y_pred_base', 'rb'))

## Prepare Training Data

In [5]:
def prepare_data(X_train, X_test, y_train, y_test, sampling_type):
    scaler = MinMaxScaler().fit(X_train)
    X_train_rescaled = scaler.transform(X_train)
    X_test_pipe = scaler.transform(X_test)
    
    if sampling_type == 'over':
        X_train_pipe, y_train_pipe = SMOTE().fit_resample(X_train_rescaled, y_train)
    elif sampling_type == 'under':
        X_train_pipe, y_train_pipe = RandomUnderSampler(random_state=42).fit_resample(X_train_rescaled, y_train)
    else:
        raise Exception("sampling_type must be 'over' or 'under'. Please try again.")
    
    y_test_pipe = y_test
    
    return X_train_pipe, X_test_pipe, y_train_pipe, y_test_pipe

In [6]:
# sampling_type = 'over'
sampling_type = 'under'

In [7]:
X_train_pipe, X_test_pipe, y_train_pipe, y_test_pipe = prepare_data(X_train, X_test, y_train, y_test, sampling_type)

## Modeling

### Load Saved Results
If a results file exists, load it, otherwise set it to `None`.

In [45]:
results_filename = 'pipeline_results.csv'
results_filepath = f'../results/{results_filename}'

In [None]:
if os.path.isfile(results_filepath):
    df_results = pd.read_csv(results_filepath)
else:
    df_results = None

In [8]:
def store_results(y_test, y_pred, model_type, df=df_results):
    results = classification_report(y_test, y_pred, output_dict=True)
    df_results = pd.DataFrame(results).T
    df_results.drop(columns=['f1-score', 'support'], inplace=True)
    df_results.drop(['accuracy', 'macro avg', 'weighted avg'], inplace=True)
    
    multi_columns = [(model_type, x) for x in df_results.columns]
    df_results.columns = pd.MultiIndex.from_tuples(multi_columns)
    
    if df is None:
        return df_results
    else:
        return pd.concat([df, df_results], axis=1)

### Baseline Model
Set up baseline model results and store in data frame.

In [9]:
model_type = 'base_line'

In [10]:
df_results = store_results(y_test, y_pred_base, model_type, df=None)

In [11]:
df_results

Unnamed: 0_level_0,base_line,base_line
Unnamed: 0_level_1,precision,recall
functional,0.767559,0.911198
functional needs repair,0.632258,0.151938
non functional,0.787948,0.659432


### Logistic Regression

In [12]:
model_type = f'logreg_{sampling_type}'

In [13]:
logreg_rs = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=10000)
rs_logreg_params = {'C': np.arange(0.2, 2.4, 0.4), 'penalty': ['l1', 'l2']}
rs_logreg = RandomizedSearchCV(logreg_rs, rs_logreg_params, random_state=42, n_jobs=-1)
rs_logreg.fit(X_train_pipe, y_train_pipe)

RandomizedSearchCV(estimator=LogisticRegression(max_iter=10000,
                                                multi_class='multinomial',
                                                solver='saga'),
                   n_jobs=-1,
                   param_distributions={'C': array([0.2, 0.6, 1. , 1.4, 1.8, 2.2]),
                                        'penalty': ['l1', 'l2']},
                   random_state=42)

In [14]:
best_C = rs_logreg.best_estimator_.get_params()['C']
best_penalty = rs_logreg.best_estimator_.get_params()['penalty']
print(f'The best value for C is {best_C:0.3f}.')
print(f'The best penalty is {best_penalty}.')

The best value for C is 2.200.
The best penalty is l2.


In [15]:
logreg_best = LogisticRegression(solver='saga', multi_class='multinomial', C=best_C, penalty=best_penalty, max_iter=10000)
logreg_best.fit(X_train_pipe, y_train_pipe)

LogisticRegression(C=2.2000000000000006, max_iter=10000,
                   multi_class='multinomial', solver='saga')

In [16]:
y_pred_logreg_best = logreg_best.predict(X_test_pipe)

In [17]:
df_results = store_results(y_test, y_pred_logreg_best, model_type, df=df_results)

In [18]:
df_results

Unnamed: 0_level_0,base_line,base_line,logreg_under,logreg_under
Unnamed: 0_level_1,precision,recall,precision,recall
functional,0.767559,0.911198,0.838948,0.637876
functional needs repair,0.632258,0.151938,0.218284,0.725581
non functional,0.787948,0.659432,0.717764,0.66805


### Random Forest

In [19]:
model_type = f'rf_{sampling_type}'

In [20]:
rf_rs = RandomForestClassifier(random_state = 42, n_jobs=-1)

In [21]:
max_depth_list = list(np.arange(10, 110, 10))
max_depth_list.append(None)

In [22]:
rs_rf_params = {
    'bootstrap': [True, False],
    'max_depth': max_depth_list,
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': list(np.arange(1, 11, 1)),
    'min_samples_split': list(np.arange(1, 11, 1)),
    'n_estimators': list(np.arange(200, 2200, 200))
}

In [23]:
rs_rf = RandomizedSearchCV(rf_rs, rs_rf_params, random_state=42, n_jobs=-1)
rs_rf.fit(X_train_pipe, y_train_pipe)

RandomizedSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                              7, 8, 9, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42)

In [24]:
print('The best hyperparameters for random forest are:')
print(rs_rf.best_params_)

The best hyperparameters for random forest are:
{'n_estimators': 800, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 90, 'bootstrap': True}


In [25]:
rf_best = RandomForestClassifier(
    n_estimators = rs_rf.best_params_['n_estimators']*2,
    min_samples_split = rs_rf.best_params_['min_samples_split'],
    min_samples_leaf = rs_rf.best_params_['min_samples_leaf'],
    max_features = rs_rf.best_params_['max_features'],
    max_depth = rs_rf.best_params_['max_depth'],
    bootstrap = rs_rf.best_params_['bootstrap'],
    random_state = 42, 
    n_jobs=-1
)

In [26]:
rf_best.fit(X_train_pipe, y_train_pipe)

RandomForestClassifier(max_depth=90, min_samples_split=3, n_estimators=1600,
                       n_jobs=-1, random_state=42)

In [27]:
y_pred_rf = rf_best.predict(X_test_pipe)

In [28]:
df_results = store_results(y_test, y_pred_rf, model_type, df=df_results)

In [29]:
df_results

Unnamed: 0_level_0,base_line,base_line,logreg_under,logreg_under,rf_under,rf_under
Unnamed: 0_level_1,precision,recall,precision,recall,precision,recall
functional,0.767559,0.911198,0.838948,0.637876,0.851116,0.648719
functional needs repair,0.632258,0.151938,0.218284,0.725581,0.259851,0.756589
non functional,0.787948,0.659432,0.717764,0.66805,0.715637,0.724545


### XGBoost

In [30]:
model_type = f'xgb_{sampling_type}'

In [31]:
class_mapping = {
    'functional': 0,
    'functional needs repair': 1,
    'non functional': 2
}

In [32]:
y_train_encoded = pd.Series(y_train_pipe).replace(class_mapping).values
y_test_encoded = pd.Series(y_test_pipe).replace(class_mapping).values

In [33]:
xgb_rs = XGBClassifier(n_estimators=1000)

In [34]:
rs_params_xgb_over = {
    'max_depth': list(np.arange(1, 7, 2)),
    'min_child_weight': list(np.arange(1, 7, 2)),
    'gamma': [0, 1, 5]
}

In [35]:
rs_xgb = RandomizedSearchCV(xgb_rs, rs_params_xgb_over, random_state=42, n_jobs=-1, n_iter=100)
rs_xgb.fit(X_train_pipe, y_train_encoded)





RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=1000, n_jobs=None,
                                           num_parallel_tree=None,
                                           random_state=None, reg_alpha=None,
                                           reg_lam

In [36]:
print('The best parameters for XGBoost are:')
print(rs_xgb.best_params_)

The best parameters for XGBoost are:
{'min_child_weight': 1, 'max_depth': 5, 'gamma': 1}


In [37]:
xgb_best = XGBClassifier(
    n_estimators=1000,
    max_depth=rs_xgb.best_params_['max_depth'],
    min_child_weight=rs_xgb.best_params_['min_child_weight'],
    gamma=rs_xgb.best_params_['gamma']
)

In [38]:
xgb_best.fit(X_train_pipe, y_train_encoded)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [39]:
y_pred_encoded = rf_best.predict(X_test_pipe)

In [40]:
reverse_mapping = {
    0:'functional',
    1:'functional needs repair',
    2:'non functional'
}

In [41]:
y_pred_xgb = pd.Series(y_pred_encoded).replace(reverse_mapping).values

In [42]:
df_results = store_results(y_test, y_pred_xgb, model_type, df=df_results)

In [43]:
df_results

Unnamed: 0_level_0,base_line,base_line,logreg_under,logreg_under,rf_under,rf_under,xgb_under,xgb_under
Unnamed: 0_level_1,precision,recall,precision,recall,precision,recall,precision,recall
functional,0.767559,0.911198,0.838948,0.637876,0.851116,0.648719,0.851116,0.648719
functional needs repair,0.632258,0.151938,0.218284,0.725581,0.259851,0.756589,0.259851,0.756589
non functional,0.787948,0.659432,0.717764,0.66805,0.715637,0.724545,0.715637,0.724545
