In [1]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df=pd.read_csv("water_potability.csv").dropna()
# df.drop(labels=3116,axis=0,inplace=True)
# print(df.shape)

In [None]:
df.Conductivity = pd.cut(df.Conductivity,[0,400,1500], labels = ['layak','tinggi'])
df.Sulfate = pd.cut(df.Sulfate,[0,250,1000], labels = ['layak','tinggi'])
df.Chloramines = pd.cut(df.Chloramines,[0,5,100], labels = ['layak','tinggi'])
# df.Solids = pd.cut(df.Solids,[500,1000,100000], labels = ['layak','tinggi'])
df.Hardness = pd.cut(df.Hardness,[0,500,1000], labels = ['layak','tinggi'])
df.ph = pd.cut(df.ph,[0,6.5,8.5,20], labels = ['basa','netral','asam'])

In [None]:
print(df.Hardness)

In [7]:
X=df.drop(['Potability'],axis=1)
y=df.filter(['Potability'])

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.25,random_state=42)

In [8]:
numeric_pipa=Pipeline([
        ("imputer",SimpleImputer(strategy="mean")),
        ("scaler",MinMaxScaler())
    ])

catagori_pipa=Pipeline([
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("onehot",OneHotEncoder())
    ])

preposecor=ColumnTransformer([
        ("numeric",numeric_pipa,['ph','Hardness','Sulfate','Conductivity','Chloramines','Solids','Organic_carbon','Trihalomethanes','Turbidity']),
#         ("catagoric",catagori_pipa,['ph','Hardness','Sulfate','Conductivity','Chloramines'])
    ])

pipa=Pipeline([
        ("prep",preposecor),
        ("a",RandomForestClassifier())
    ])

# print(pipa.get_params())

parameter={
    "a__n_estimators" : [500],
#     decision tree
        "a__min_samples_leaf":[2],
        "a__max_depth":range(1,50),
#         "a__random_state":range(1,2,50),
        "a__criterion":["gini","entropy"],
#         "a__splitter":["best", "random"],
        "a__max_features":["auto","sqrt","log2"],
    }

In [None]:
print(X_test)

In [None]:
data = [
    ["asam",185.9267231,31548.00646,7.079462297,333.6395113,342.3556975,18.24836789,62.18868705,5.100857854],
]

data_test = pd.DataFrame(data,columns = X.columns)

In [9]:
# simple model
model=GridSearchCV(pipa,parameter,cv=3,n_jobs=-1,verbose=1)
model.fit(X_train,y_train.values.ravel())
tuning_terbaik=model.best_params_
latihan_akurasi=model.score(X_train,y_train)
ujian_akurasi=model.score(X_test,y_test)
pred=model.predict(X_test)
print(classification_report(y_test,pred))
print(tuning_terbaik)
print(ujian_akurasi)


Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed: 10.9min finished


              precision    recall  f1-score   support

           0       0.65      0.89      0.75       231
           1       0.71      0.37      0.48       171

    accuracy                           0.67       402
   macro avg       0.68      0.63      0.62       402
weighted avg       0.68      0.67      0.64       402

{'a__criterion': 'gini', 'a__max_depth': 45, 'a__max_features': 'log2', 'a__min_samples_leaf': 2, 'a__n_estimators': 500}
0.6666666666666666


In [None]:
print(pred)

In [10]:
yhat = model.predict(X_test)
yhat[0:50]
print("Train set Accuracy: ", metrics.accuracy_score(y_train, model.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

Train set Accuracy:  1.0
Test set Accuracy:  0.6666666666666666


In [None]:

for x in df:
    print(pd.crosstab(df[x],df.Potability,normalize="index",))
    print("============================================================")
    plt.figure(figsize=(12,10))
    sns.histplot(df[x][df.Potability==0],color="r",label="tidak layak")
    sns.histplot(df[x][df.Potability==1],color="g",label="layak")
    plt.legend()
    plt.show()