# Model Training
1.1 Import Data and Required Packages

Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [1]:
#Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import confusion_matrix,precision_score,recall_score
from sklearn.metrics import classification_report, accuracy_score,f1_score,roc_curve,roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings

In [2]:
df = pd.read_csv(r"F:\water_potability\water_potability.csv")

In [3]:
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


# Filling up the null values

In [4]:
def find_nan(df,variable,median):
    df[variable]=df[variable].fillna(median)

In [5]:
median = df.Trihalomethanes.median()
find_nan(df,'Trihalomethanes',median)

In [6]:
median = df.ph.median()
find_nan(df,'ph',median)

In [7]:
median = df.Sulfate.median()
find_nan(df,'Sulfate',median)

In [8]:
df.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               3276 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          3276 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3276 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


# Preparing X and Y

In [10]:
x = df.iloc[:,0:9]
y = df.iloc[:,9]

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2020)
x_train.shape,x_test.shape

((2293, 9), (983, 9))


#  Model Training

In [28]:
models={
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classsifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Naives Bayes": GaussianNB(),
    "Gradient Boosting Classifier":GradientBoostingClassifier(),
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Suppport Vector Machine": SVC()
}

model_list =[]
accuracy_list = []
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)  #Train Model
    # make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    #Trainning set performance
    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    model_train_f1 = f1_score(y_train,y_train_pred,average = 'weighted')
    model_train_precision = precision_score(y_train,y_train_pred)
    model_train_recall = recall_score(y_train,y_train_pred)
    model_train_rocauc_score = roc_auc_score(y_train,y_train_pred)
    #Test set performance
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    model_test_f1 = f1_score(y_test,y_test_pred,average = 'weighted')
    model_test_precision = precision_score(y_test,y_test_pred)
    model_test_recall = recall_score(y_test,y_test_pred)
    model_test_rocauc_score = roc_auc_score(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print('Model Performance of the training Set')
    print('- Accuracy: {:.4f}'.format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- ROC Auc Score: {:.4f}'.format(model_train_rocauc_score))

    print('-----------------------------------------------------------------')

    print('Model Performance of the test Set')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- ROC Auc Score: {:.4f}'.format(model_test_rocauc_score))

    accuracy_list.append(model_test_accuracy)
    print("="*35)
    print('\n')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression
Model Performance of the training Set
- Accuracy: 0.6167
- F1 score: 0.4704
- Precision: 0.0000
- Recall: 0.0000
- ROC Auc Score: 0.5000
-----------------------------------------------------------------
Model Performance of the test Set
- Accuracy: 0.5941
- F1 score: 0.4428
- Precision: 0.0000
- Recall: 0.0000
- ROC Auc Score: 0.5000


K-Neighbors Classsifier
Model Performance of the training Set
- Accuracy: 0.7191
- F1 score: 0.7079
- Precision: 0.6839
- Recall: 0.4972
- ROC Auc Score: 0.6771
-----------------------------------------------------------------
Model Performance of the test Set
- Accuracy: 0.5290
- F1 score: 0.5070
- Precision: 0.3815
- Recall: 0.2581
- ROC Auc Score: 0.4861


Decision Tree
Model Performance of the training Set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC Auc Score: 1.0000
-----------------------------------------------------------------
Model Performance of the test Set
- Accuracy: 0.5473
- F1 score

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy_Score']).sort_values(by=["Accuracy_Score"],ascending=False)

Unnamed: 0,Model Name,Accuracy_Score
6,CatBoosting Classifier,0.655137
3,Random Forest Classifier,0.638861
5,Gradient Boosting Classifier,0.606307
0,Logistic Regression,0.5941
8,Suppport Vector Machine,0.5941
4,Naives Bayes,0.586979
7,AdaBoost Classifier,0.570702
2,Decision Tree,0.547304
1,K-Neighbors Classsifier,0.528993
