### ML Models with KNNImputer 

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

### Load Dataset

In [2]:
# Load raw file
data1 = pd.read_csv('../data/raw/water_potability.csv')

In [3]:
print(data1.shape)
display(data1.head())

(3276, 10)


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [22]:
data1.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


### Dealing with Missing Values

In [4]:
# Check null values
data1.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [15]:
from sklearn.impute import KNNImputer

# Create the imputer object with a k of neighbors parameter of 5
imputer = KNNImputer(n_neighbors=5)

# Apply the imputer to our data
imputed_data = imputer.fit_transform(data1)

# Convert the result back to dataframe
dataset_imputed = pd.DataFrame(imputed_data, columns=data1.columns)

In [17]:
dataset_imputed.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [18]:
dataset_imputed.shape

(3276, 10)

In [21]:
dataset_imputed.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,7.078167,196.369496,22014.092526,7.122277,333.605767,426.205111,14.28497,66.403218,3.966786,0.39011
std,1.497228,32.879761,8768.570828,1.583085,37.453055,80.824064,3.308162,15.837754,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.187017,176.850538,15666.690297,6.127421,312.389295,365.734414,12.065801,56.409254,3.439711,0.0
50%,7.052221,196.967627,20927.833607,7.130299,333.255192,421.884968,14.218338,66.603114,3.955028,0.0
75%,7.94857,216.667456,27332.762127,8.114887,355.013586,481.792304,16.557652,76.843337,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


### Split Features/ Target

In [23]:
# Separate the features from the target
y = dataset_imputed['Potability']
X = dataset_imputed.drop(['Potability'], axis=1)

### Train-Test Split

In [24]:
# Generate the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Scale Columns

In [25]:
# Select StandardScaler
scaler = StandardScaler()

# Fit scaler with TRAIN data
scaler.fit(X_train)

# Scale X_train_num_transformed with fitted scaler. Output is a np.array.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add columns to np.array to create a DataFrame
X_train_scaled_df = pd.DataFrame(X_train_scaled, 
                                 columns=X_train.columns, 
                                 index=X_train.index)

X_test_scaled_df = pd.DataFrame(X_test_scaled, 
                                columns=X_test.columns,
                                index=X_test.index)

### Model Selection

In [26]:
# Create models
model1 = LogisticRegression()
model2 = RandomForestClassifier()
model3 = KNeighborsClassifier()
model4 = AdaBoostClassifier()
model5 = GradientBoostingClassifier()


model_pipeline = [model1, model2, model3, model4, model5]
model_names = ['Logistic Regression', 'Random Forest Classifier', 'KNN', 'AdaBoostClassifier', 'GradientBoostingClassifier']

scores = {}

for model, model_name in zip(model_pipeline, model_names):
    mean_score = np.mean(cross_val_score(model, X_train_scaled_df, y_train, cv=5))
    scores[model_name] = mean_score

print(scores)

{'Logistic Regression': 0.6013966188124934, 'Random Forest Classifier': 0.6576742681546175, 'KNN': 0.6205858568560855, 'AdaBoostClassifier': 0.5878842366640981, 'GradientBoostingClassifier': 0.6275737077946172}


Again Random Forest Classifier can be the best performing model in this case.

### Model Building

In [27]:
# Initialize the RandomForest Classifier
clf = RandomForestClassifier()

# Define the Parameter Grid for GridSearch
param_grid = {
    'n_estimators': [10, 50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf' : [10, 20],
    'max_depth':[5, 10, 20],
}
# Initialize and Fit GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_scaled_df, y_train)

# Get the best estimator
grid_search.best_params_


Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[CV] END bootstrap=True, criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=5, n_estimators=150; total time=   0.3s
[CV] END bootstrap=True, criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=10, n_estimators=150; total time=   0.3s
[CV] END bootstrap=True, criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=50; total time=   0.1s
[CV] END bootstrap=True, criterion=gini, max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=15, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 20,
 'max_features': 'log2',
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 150}

In [28]:
#get predictions
clf = RandomForestClassifier(max_depth=20,
                             min_samples_split=10,
                             min_samples_leaf =10,
                             n_estimators=150,
                            bootstrap=False,
                            max_features='log2',
                            criterion='gini')

clf.fit(X_train_scaled_df, y_train)

print("The Accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train_scaled_df, y_train)))
print("The Accuracy for the Random Forest in the TEST  set is {:.2f}".format(clf.score(X_test_scaled_df, y_test)))

y_pred = clf.predict(X_test_scaled_df)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

The Accuracy for the Random Forest in the TRAIN set is 0.93
The Accuracy for the Random Forest in the TEST  set is 0.67


Potability
0.0    617
1.0    366
Name: count, dtype: int64

array([[541,  76],
       [249, 117]])

In [29]:
grid_search.best_score_

0.6642083131166101

[CV] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=20, min_samples_split=10, n_estimators=100; total time=   0.6s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=20, min_samples_split=15, n_estimators=10; total time=   0.1s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=20, min_samples_split=15, n_estimators=10; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=20, min_samples_split=15, n_estimators=50; total time=   0.3s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=20, min_samples_split=15, n_estimators=100; total time=   0.5s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, max_features=sqrt, min_samples_leaf=20, min_samples_split=15, n_estimators=150; total time=   0.8s
[CV] END bootstrap=False, criterion=entropy, max_depth=

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.68      0.88      0.77       617
         1.0       0.61      0.32      0.42       366

    accuracy                           0.67       983
   macro avg       0.65      0.60      0.59       983
weighted avg       0.66      0.67      0.64       983



### Feature Importance Rank

In [31]:
# Get the feature importances
rf_importances = list(zip(clf.feature_importances_, data1.columns))
rf_importances.sort(reverse=True)
# Print the feature rankings
rf_importances

[(0.16925131315709122, 'Sulfate'),
 (0.16058729025462418, 'ph'),
 (0.13501443034133198, 'Hardness'),
 (0.12192992938975937, 'Chloramines'),
 (0.11619249292315212, 'Solids'),
 (0.0881679637573793, 'Organic_carbon'),
 (0.07812397006881013, 'Conductivity'),
 (0.06560960494952661, 'Trihalomethanes'),
 (0.06512300515832496, 'Turbidity')]

In [32]:
y_pred = clf.predict(X_test_scaled_df)
y_pred_df = pd.DataFrame(y_pred, columns=['Potability'], index=X_test_scaled_df.index)
y_pred_df

Unnamed: 0,Potability
2947,0.0
2782,0.0
1644,0.0
70,1.0
2045,0.0
...,...
542,0.0
3189,0.0
1921,0.0
2839,0.0
