### ML Models when applying Upsampling

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [5]:
# Read cleaned_df.csv from the project's data folder into a DataFrame.
cleaned_df = pd.read_csv(filepath_or_buffer='../data/clean/cleaned_df.csv')

# Preview the first five rows, then print the frame's shape.
display(cleaned_df.head(n=5))
print(cleaned_df.shape)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
1,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0
2,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
3,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,2.672989,0
4,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425,0


(2011, 10)


### Dataset Split

In [7]:
# X holds every column except 'Potability'; y is the 'Potability' column itself.
X = cleaned_df.drop(columns=['Potability'])
y = cleaned_df.loc[:, 'Potability']

### Train-Test Split

In [8]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps
# the shuffle (and therefore the split) the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Balance Target Column

In [17]:
# Put the training features and the training target side by side in one frame
# (rows are aligned on their shared index).
trainset = pd.concat(objs=[X_train, y_train], axis='columns')
trainset

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
557,5.949519,160.442631,16898.808297,6.045906,367.328542,451.012788,16.359951,62.368234,4.072198,0
1063,5.345345,238.510230,10315.353973,7.822601,281.771707,496.350875,12.624613,87.342833,2.362886,0
1872,5.039407,194.404170,19336.608073,7.194765,339.232126,515.807182,10.728669,61.129517,3.126956,0
1355,8.801934,225.895468,23659.211520,2.458609,408.417866,446.992465,14.340818,42.800911,3.769832,0
1309,7.191962,228.409943,15395.469082,7.398603,334.777619,294.915584,13.932293,50.748365,4.102719,0
...,...,...,...,...,...,...,...,...,...,...
1130,7.965337,151.541889,25274.610305,7.106043,352.316182,527.688242,15.792625,52.268490,3.390982,0
1294,6.519848,183.225998,14284.647917,8.389078,373.091575,495.417954,11.638475,58.661649,3.272286,0
860,7.615557,204.869974,8403.284552,6.012316,418.309431,311.178955,12.742329,67.282328,4.243134,0
1459,7.899452,210.734124,15896.365937,6.907203,319.886957,448.666423,18.169921,124.000000,2.853767,1


In [18]:
# Number of training rows per class in the target column
trainset.loc[:, 'Potability'].value_counts()

Potability
0    845
1    562
Name: count, dtype: int64

In [22]:
# Random oversampling: draw minority-class rows (Potability == 1) with
# replacement until that class has as many training rows as the majority.
from sklearn.utils import resample

df_majority = trainset.loc[trainset['Potability'].eq(0)]
df_minority = trainset.loc[trainset['Potability'].eq(1)]

df_minority_upsampled = resample(
    df_minority,
    replace=True,                    # draw with replacement
    n_samples=df_majority.shape[0],  # grow to the majority-class row count
    random_state=123,                # fixed seed so the draw is repeatable
)

# Stack the majority rows on top of the upsampled minority rows.
trainset1 = pd.concat([df_majority, df_minority_upsampled], axis='index')

# Split the balanced frame back into target and features.
y_train1 = trainset1['Potability']
X_train1 = trainset1.drop(columns=['Potability'])

In [None]:
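# TODO: also try imblearn's random over-sampler and SMOTE as alternatives to sklearn's resample
# Planned variable names for random upsampling: x_upsampled, y_upsampled
# Planned variable names for SMOTE: x_smote, y_smote
# Expected size of the balanced training set: 845 * 2 = 1690 rows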

In [23]:
# Number of training rows per class once the minority class has been upsampled
trainset1.loc[:, 'Potability'].value_counts()

Potability
0    845
1    845
Name: count, dtype: int64

### Scale Columns

In [24]:
# Standardise the feature columns with StandardScaler.
scaler = StandardScaler()

# Fit the scaler on the (upsampled) TRAIN features only, so that no
# statistics from the test set are used.
scaler.fit(X_train1)

# Apply the fitted scaler to both sets. transform() returns np.arrays,
# so the column names and the row index are lost at this point.
X_train_scaled = scaler.transform(X_train1)
X_test_scaled = scaler.transform(X_test)

# Wrap the arrays back into DataFrames.
# The training frame must take its labels from X_train1 (the upsampled set that
# was actually scaled), not X_train: X_train has fewer rows, so passing its
# index raises a shape-mismatch ValueError. The index contains repeated labels
# because upsampled rows keep their original index values.
X_train_scaled_df = pd.DataFrame(X_train_scaled,
                                 columns=X_train1.columns,
                                 index=X_train1.index)

X_test_scaled_df = pd.DataFrame(X_test_scaled,
                                columns=X_test.columns,
                                index=X_test.index)

### Model Selection

In [25]:
# Candidate classifiers, all with default hyper-parameters.
# random_state is fixed on the estimators that draw random numbers so the
# comparison gives the same scores on every run.
model1 = LogisticRegression()
model2 = RandomForestClassifier(random_state=42)
model3 = KNeighborsClassifier()
model4 = AdaBoostClassifier(random_state=42)
model5 = GradientBoostingClassifier(random_state=42)


model_pipeline = [model1, model2, model3, model4, model5]
model_names = ['Logistic Regression', 'Random Forest Classifier', 'KNN', 'AdaBoostClassifier', 'GradientBoostingClassifier']

# Mean 5-fold cross-validation score of each model on the upsampled train set.
# NOTE: the minority class was upsampled with replacement BEFORE this split,
# so copies of the same row can sit in both the fitting and the validation
# folds. These scores are therefore optimistic; trust the held-out test set.
scores = {}

for model, model_name in zip(model_pipeline, model_names):
    fold_scores = cross_val_score(model, X_train_scaled_df, y_train1, cv=5)
    scores[model_name] = fold_scores.mean()

print(scores)

{'Logistic Regression': 0.5130177514792899, 'Random Forest Classifier': 0.8366863905325443, 'KNN': 0.6467455621301775, 'AdaBoostClassifier': 0.6088757396449704, 'GradientBoostingClassifier': 0.7307692307692307}


### Playing with the parameters

In [26]:
from sklearn.datasets import make_classification
from sklearn.metrics import make_scorer, cohen_kappa_score

# Hyper-parameter grid for the random forest
# (4 * 2 * 2 * 2 * 3 * 2 * 3 = 576 combinations, each fitted on 5 folds).
param_grid = {
    'n_estimators': [10, 50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf' : [10, 20],
    'max_depth':[5, 10, 20],
}

# Evaluate every combination on several metrics at once.
scoring = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Kappa': make_scorer(cohen_kappa_score)
}

# refit=False: no single "best" model is refitted; the winner for each metric
# is looked up in cv_results_ below. The forest is seeded so the search is
# repeatable.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, scoring=scoring, refit=False, cv=5)
# Fit against y_train1, the target that matches the upsampled feature frame;
# y_train has a different number of rows than X_train_scaled_df.
grid_search.fit(X_train_scaled_df, y_train1)

# Retrieve best parameters and best score for each scoring metric
for metric_name in scoring.keys():
    print(f"Best parameters for {metric_name}:")
    # Rank 1 is the best candidate, so argmin() finds the first top-ranked row.
    index = grid_search.cv_results_['rank_test_' + metric_name].argmin()
    params = grid_search.cv_results_['params'][index]
    best_score = grid_search.cv_results_['mean_test_' + metric_name][index]
    print("Parameters:", params)
    print(f"Score: {best_score:.4f}")
    print("------")


Best parameters for Accuracy:
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 15, 'n_estimators': 50}
Score: 0.8124
------
Best parameters for Precision:
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 15, 'n_estimators': 50}
Score: 0.8099
------
Best parameters for Kappa:
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 15, 'n_estimators': 50}
Score: 0.6249
------


In [29]:
# Train the random forest with the parameters selected by the grid search.
# random_state is fixed so the fitted forest and its scores are repeatable.
clf = RandomForestClassifier(max_depth=20,
                             min_samples_split=15,
                             min_samples_leaf=10,
                             n_estimators=50,
                             bootstrap=False,
                             max_features='sqrt',
                             criterion='entropy',
                             random_state=42)

clf.fit(X_train_scaled_df, y_train1)

# Train accuracy must be scored against y_train1, the labels of the upsampled
# frame the model was fitted on; y_train belongs to the pre-upsampling rows.
print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train_scaled_df, y_train1)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf.score(X_test_scaled_df, y_test)))

# Predict on the held-out test set, then show the true class counts next to
# the confusion matrix (rows = true class, columns = predicted class).
y_pred = clf.predict(X_test_scaled_df)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

The Accuracy for the Random Forest in the TRAIN set is 0.97
The Accuracy for the Random Forest in the TEST  set is 0.67


Potability
0    355
1    249
Name: count, dtype: int64

array([[293,  62],
       [135, 114]])

[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=150; total time=   0.4s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=sqr

In [30]:
# Per-class precision, recall and F1 score for the test-set predictions
from sklearn.metrics import classification_report
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.68      0.83      0.75       355
           1       0.65      0.46      0.54       249

    accuracy                           0.67       604
   macro avg       0.67      0.64      0.64       604
weighted avg       0.67      0.67      0.66       604



### Feature Importance Rank

In [31]:
# Pair each importance with the name of the feature it belongs to.
# The names are taken from the frame the forest was trained on;
# cleaned_df.columns also contains the 'Potability' target and only lined up
# because zip() stops at the shorter input and the target is the last column.
rf_importances = sorted(
    zip(clf.feature_importances_, X_train_scaled_df.columns),
    reverse=True,
)
# Show the feature ranking, most important first
rf_importances

[(0.18344757380320836, 'ph'),
 (0.18249748374570443, 'Sulfate'),
 (0.11524349147034664, 'Chloramines'),
 (0.1125413907524365, 'Solids'),
 (0.11101466433648101, 'Hardness'),
 (0.08041233639528743, 'Turbidity'),
 (0.07711787542002556, 'Organic_carbon'),
 (0.07379749099019108, 'Trihalomethanes'),
 (0.06392769308631907, 'Conductivity')]