In [1]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder
from sklearn.metrics import classification_report
from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv("water_potability.csv")

In [3]:
X=df.drop(['Potability'],axis=1)
y=df.filter(['Potability'])

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.25,random_state=42)

In [6]:
numeric_pipa=Pipeline([
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ])

preposecor=ColumnTransformer([
        ("numeric",numeric_pipa,['ph','Hardness','Sulfate','Conductivity','Chloramines','Solids','Organic_carbon','Trihalomethanes','Turbidity']),
    ])



In [12]:
# random forest
pipaRF=Pipeline([
        ("prep",preposecor),
        ("rf",RandomForestClassifier())
    ])

parameterRF={
        "rf__n_estimators" : [500],
        "rf__min_samples_leaf":[2],
        "rf__max_depth":range(1,50),
        "rf__criterion":["gini","entropy"],
        "rf__max_features":["auto","sqrt","log2"],
    }

# Decision tree
pipaDT=Pipeline([
        ("prep",preposecor),
        ("dt",DecisionTreeClassifier())
    ])

parameterDT={
        "dt__min_samples_leaf":[2],
        "dt__max_depth":range(1,50),
        "dt__criterion":["gini","entropy"],
    }

# KNN
pipaKNN=Pipeline([
        ("prep",preposecor),
        ("knn",KNeighborsClassifier())
    ])

parameterKNN={
        "knn__n_neighbors" : range(1,50),
        "knn__algorithm" : ['auto', 'ball_tree', 'kd_tree', 'brute'],
        "knn__p" : range(1,50),
    }

# SVC
pipaSVC=Pipeline([
        ("prep",preposecor),
        ("svc",SVC())
    ])

parameterSVC={
        "svc__kernel" : ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
        "svc__degree" : range(1,50),
    }


In [None]:
# simple model Random Forest
modelRF = GridSearchCV(pipaRF,parameterRF,cv=5,n_jobs=-1,verbose=1)
modelRF.fit(X_train,y_train.values.ravel())

tuning_terbaikRF=modelRF.best_params_

latihan_akurasiRF=modelRF.score(X_train,y_train)
ujian_akurasiRF=modelRF.score(X_test,y_test)
predRF=modelRF.predict(X_test)
reportRF = classification_report(y_test,predRF)


Fitting 5 folds for each of 294 candidates, totalling 1470 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  9.7min


In [None]:
# simple model Decision Tree
modelDT = GridSearchCV(pipaDT,parameterDT,cv=5,n_jobs=-1,verbose=1)
modelDT.fit(X_train,y_train.values.ravel())

tuning_terbaikDT=modelDT.best_params_

latihan_akurasiDT=modelDT.score(X_train,y_train)
ujian_akurasiDT=modelDT.score(X_test,y_test)
predDT=modelDT.predict(X_test)
reportDT = classification_report(y_test,predDT)

In [None]:
# simple model KNN
modelKNN = GridSearchCV(pipaKNN,parameterKNN,cv=5,n_jobs=-1,verbose=1)
modelKNN.fit(X_train,y_train.values.ravel())

tuning_terbaikKNN=modelKNN.best_params_

latihan_akurasiKNN=modelKNN.score(X_train,y_train)
ujian_akurasiKNN=modelKNN.score(X_test,y_test)
predKNN=modelKNN.predict(X_test)
reportKNN = classification_report(y_test,predKNN)


In [None]:
# simple model SVC
modelSVC = GridSearchCV(pipaSVC,parameterSVC,cv=5,n_jobs=-1,verbose=1)
modelSVC.fit(X_train,y_train.values.ravel())

tuning_terbaikSVC=modelSVC.best_params_

latihan_akurasiSVC=modelSVC.score(X_train,y_train)
ujian_akurasiSVC=modelSVC.score(X_test,y_test)
predSVC=modelSVC.predict(X_test)
reportSVC = classification_report(y_test,predSVC)

In [9]:
print("")
print("Random Forest")
print('ini adalah tuning terbaik :', tuning_terbaikRF)
print('ini adalah hasil testing',ujian_akurasiRF)
print('ini adalah hasil training', latihan_akurasiRF)
print(reportRF)
print('==============================================')

print("")
print("Decision Tree")
print('ini adalah tuning terbaik :', tuning_terbaikDT)
print('ini adalah hasil testing',ujian_akurasiDT)
print('ini adalah hasil training', latihan_akurasiDT)
print(reportDT)
print('==============================================')

print("")
print("KNN")
print('ini adalah tuning terbaik :', tuning_terbaikKNN)
print('ini adalah hasil testing',ujian_akurasiKNN)
print('ini adalah hasil training', latihan_akurasiKNN)
print(reportKNN)
print('==============================================')

print("")
print("SVC")
print('ini adalah tuning terbaik :', tuning_terbaikSVC)
print('ini adalah hasil testing',ujian_akurasiSVC)
print('ini adalah hasil training', latihan_akurasiSVC)
print(reportSVC)
print('==============================================')

              precision    recall  f1-score   support

           0       0.72      0.87      0.79       429
           1       0.60      0.36      0.45       226

    accuracy                           0.69       655
   macro avg       0.66      0.62      0.62       655
weighted avg       0.68      0.69      0.67       655

{'a__criterion': 'gini', 'a__max_depth': 22, 'a__max_features': 'auto', 'a__min_samples_leaf': 2, 'a__n_estimators': 500}
1.0


In [None]:

numeric_pipa=Pipeline([
        ("imputer",SimpleImputer(strategy="mean")),
        ("scaler",MinMaxScaler())
    ])

preposecor=ColumnTransformer([
        ("numeric",numeric_pipa,['ph','Hardness','Sulfate','Conductivity','Chloramines','Solids','Organic_carbon','Trihalomethanes','Turbidity']),
    ])

pipa=Pipeline([
        ("prep",preposecor),
        ("a",RandomForestClassifier())
    ])

parameter={
        "a__n_estimators" : [500],
        "a__min_samples_leaf":[2],
        "a__max_depth":range(1,50),
        "a__criterion":["gini","entropy"],
        "a__max_features":["auto","sqrt","log2"],
    }