In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score
import seaborn as sns

  from numpy.core.umath_tests import inner1d


In [2]:
df = pd.read_csv('water_potability.csv')

In [3]:
df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2785.0,3276.0,3276.0,3276.0,2495.0,3276.0,3276.0,3114.0,3276.0,3276.0
mean,7.080795,196.369496,22014.092526,7.122277,333.775777,426.205111,14.28497,66.396293,3.966786,0.39011
std,1.59432,32.879761,8768.570828,1.583085,41.41684,80.824064,3.308162,16.175008,0.780382,0.487849
min,0.0,47.432,320.942611,0.352,129.0,181.483754,2.2,0.738,1.45,0.0
25%,6.093092,176.850538,15666.690297,6.127421,307.699498,365.734414,12.065801,55.844536,3.439711,0.0
50%,7.036752,196.967627,20927.833607,7.130299,333.073546,421.884968,14.218338,66.622485,3.955028,0.0
75%,8.062066,216.667456,27332.762127,8.114887,359.95017,481.792304,16.557652,77.337473,4.50032,1.0
max,14.0,323.124,61227.196008,13.127,481.030642,753.34262,28.3,124.0,6.739,1.0


In [4]:
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [5]:
df.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [6]:
df['ph'].fillna(df['ph'].mean(), inplace = True)
df['Sulfate'].fillna(df['Sulfate'].median(), inplace = True)
df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean(), inplace = True)

In [7]:
df.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [8]:
predictors = df.drop('Potability',axis=1)
y = df['Potability'].values

In [9]:
predictors.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135
1,3.71608,129.422921,18630.057858,6.635246,333.073546,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,333.073546,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075


In [10]:
X_train,X_test,y_train,y_test = train_test_split(predictors,y, random_state =42, test_size = 0.2)

scaler = StandardScaler()

training = scaler.fit_transform(X_train) 
testing = scaler.transform(X_test)

In [11]:
def print_results(results):
    print('Best params : {}'.format(results.best_params_))
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean,3), round(std*2,3), params))

# Testing LogsticRegression

In [12]:
lr = LogisticRegression()

In [13]:
lr

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
params = {'C':[0.001,0.01,0.1,1,10,100],
         'penalty':['l1','l2']}

In [15]:
lrCV = GridSearchCV(lr,params, cv = 5)
print_results(lrCV.fit(training, y_train))

Best params : {'C': 1, 'penalty': 'l1'}
0.605 (+/-0.001) for {'C': 0.001, 'penalty': 'l1'}
0.605 (+/-0.003) for {'C': 0.001, 'penalty': 'l2'}
0.605 (+/-0.001) for {'C': 0.01, 'penalty': 'l1'}
0.605 (+/-0.003) for {'C': 0.01, 'penalty': 'l2'}
0.605 (+/-0.001) for {'C': 0.1, 'penalty': 'l1'}
0.605 (+/-0.003) for {'C': 0.1, 'penalty': 'l2'}
0.606 (+/-0.005) for {'C': 1, 'penalty': 'l1'}
0.605 (+/-0.003) for {'C': 1, 'penalty': 'l2'}
0.606 (+/-0.003) for {'C': 10, 'penalty': 'l1'}
0.605 (+/-0.003) for {'C': 10, 'penalty': 'l2'}
0.605 (+/-0.003) for {'C': 100, 'penalty': 'l1'}
0.605 (+/-0.003) for {'C': 100, 'penalty': 'l2'}


In [16]:
joblib.dump(lrCV.best_estimator_,'./waterpotability_LR.pkl')

['./waterpotability_LR.pkl']

# Training RandomForest

In [17]:
rfC = RandomForestClassifier()
params = {"n_estimators":[1,3,5,10,20,30,100,200],
         "max_depth":[3,6,9,13,100,250]}
grfC = GridSearchCV(rfC, params,cv=5)
print_results(grfC.fit(training,y_train))

Best params : {'max_depth': 250, 'n_estimators': 200}
0.607 (+/-0.023) for {'max_depth': 3, 'n_estimators': 1}
0.617 (+/-0.03) for {'max_depth': 3, 'n_estimators': 3}
0.613 (+/-0.015) for {'max_depth': 3, 'n_estimators': 5}
0.615 (+/-0.007) for {'max_depth': 3, 'n_estimators': 10}
0.624 (+/-0.018) for {'max_depth': 3, 'n_estimators': 20}
0.625 (+/-0.015) for {'max_depth': 3, 'n_estimators': 30}
0.623 (+/-0.008) for {'max_depth': 3, 'n_estimators': 100}
0.624 (+/-0.015) for {'max_depth': 3, 'n_estimators': 200}
0.594 (+/-0.035) for {'max_depth': 6, 'n_estimators': 1}
0.617 (+/-0.026) for {'max_depth': 6, 'n_estimators': 3}
0.626 (+/-0.009) for {'max_depth': 6, 'n_estimators': 5}
0.642 (+/-0.02) for {'max_depth': 6, 'n_estimators': 10}
0.655 (+/-0.018) for {'max_depth': 6, 'n_estimators': 20}
0.64 (+/-0.013) for {'max_depth': 6, 'n_estimators': 30}
0.643 (+/-0.011) for {'max_depth': 6, 'n_estimators': 100}
0.654 (+/-0.026) for {'max_depth': 6, 'n_estimators': 200}
0.591 (+/-0.02) for {'m

In [18]:
joblib.dump(grfC.best_estimator_,'./waterpotability_RF.pkl')

['./waterpotability_RF.pkl']

# Training MLP

In [19]:
mlp = MLPClassifier()

In [20]:
mlp

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [21]:
params = {"activation":['relu','logistic','tanh'],
         "hidden_layer_sizes":[(100,2),(150,3),(200,4)],
         "learning_rate":['constant','invscaling','adaptive']}
mlpGS = GridSearchCV(mlp,params, cv = 5)
print_results(mlpGS.fit(training, y_train))





Best params : {'activation': 'tanh', 'hidden_layer_sizes': (200, 4), 'learning_rate': 'invscaling'}
0.658 (+/-0.047) for {'activation': 'relu', 'hidden_layer_sizes': (100, 2), 'learning_rate': 'constant'}
0.648 (+/-0.02) for {'activation': 'relu', 'hidden_layer_sizes': (100, 2), 'learning_rate': 'invscaling'}
0.647 (+/-0.015) for {'activation': 'relu', 'hidden_layer_sizes': (100, 2), 'learning_rate': 'adaptive'}
0.67 (+/-0.033) for {'activation': 'relu', 'hidden_layer_sizes': (150, 3), 'learning_rate': 'constant'}
0.653 (+/-0.033) for {'activation': 'relu', 'hidden_layer_sizes': (150, 3), 'learning_rate': 'invscaling'}
0.661 (+/-0.035) for {'activation': 'relu', 'hidden_layer_sizes': (150, 3), 'learning_rate': 'adaptive'}
0.659 (+/-0.033) for {'activation': 'relu', 'hidden_layer_sizes': (200, 4), 'learning_rate': 'constant'}
0.652 (+/-0.015) for {'activation': 'relu', 'hidden_layer_sizes': (200, 4), 'learning_rate': 'invscaling'}
0.658 (+/-0.039) for {'activation': 'relu', 'hidden_laye

In [22]:
joblib.dump(mlpGS.best_estimator_,'./waterpotability_MLP.pkl')

['./waterpotability_MLP.pkl']

# Training SVM

In [23]:
svc = SVC(cache_size = 100)
params = {'C':[0.001,0.01,0.1,1.0,10.0],
         'kernel':['linear','rbf']}
gsvm = GridSearchCV(svc,params, cv =5)
print_results(gsvm.fit(training,y_train))

Best params : {'C': 1.0, 'kernel': 'rbf'}
0.605 (+/-0.001) for {'C': 0.001, 'kernel': 'linear'}
0.605 (+/-0.001) for {'C': 0.001, 'kernel': 'rbf'}
0.605 (+/-0.001) for {'C': 0.01, 'kernel': 'linear'}
0.605 (+/-0.001) for {'C': 0.01, 'kernel': 'rbf'}
0.605 (+/-0.001) for {'C': 0.1, 'kernel': 'linear'}
0.606 (+/-0.002) for {'C': 0.1, 'kernel': 'rbf'}
0.605 (+/-0.001) for {'C': 1.0, 'kernel': 'linear'}
0.671 (+/-0.016) for {'C': 1.0, 'kernel': 'rbf'}
0.605 (+/-0.001) for {'C': 10.0, 'kernel': 'linear'}
0.668 (+/-0.04) for {'C': 10.0, 'kernel': 'rbf'}


In [24]:
joblib.dump(gsvm.best_estimator_,'./waterpotability_SVC.pkl')

['./waterpotability_SVC.pkl']

# Testing GradientBoostingClassifier

In [25]:
gb = GradientBoostingClassifier()

In [26]:
gb

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [27]:
params = {'criterion':['friedman_mse','mse','mae'],
         'learning_rate':[0.001,0.01,0.1,1.0,10],
         'max_depth':[3,6,9],
         'n_estimators':[1,3,5,10,20,30]}

ggb = GridSearchCV(gb, params, cv = 5)
print_results(ggb.fit(training, y_train))

Best params : {'criterion': 'mse', 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 30}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 1}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 3}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 5}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 10}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 20}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 30}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 6, 'n_estimators': 1}
0.605 (+/-0.001) for {'criterion': 'friedman_mse', 'learning_rate': 0.001, 'max_depth': 6, 'n_estimators': 3}
0.605 (+/-0.001) for {'c

In [28]:
joblib.dump(ggb.best_estimator_,'./waterpotability_GB.pkl')

['./waterpotability_GB.pkl']

# Training KNearest Neighbor

In [29]:
knn = KNeighborsClassifier()
params = {'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
         'n_neighbors':[10,20,25,30,35,40,45,50],
          'leaf_size':[35,40,45,50,55,60],
         'weights':['uniform','distance']}

gknn = GridSearchCV(knn,params, cv = 5)
print_results(gknn.fit(training, y_train))

In [34]:
joblib.dump(gknn.best_estimator_,'./waterpotability_KNN.pkl')

['./waterpotability_KNN.pkl']

# Testing

In [12]:
def results(model, labels, predictors):
    #caling/loafding the model
    model = joblib.load('./'+model)
    predictions = model.predict(predictors)
    
    accu = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average='micro')
    precision = precision_score(labels,predictions, average='micro')
    print('Acuracy : {}, Recall: {}, Precision_score: {}'.format(accu,recall,precision))

models = ['waterpotability_LR.pkl','waterpotability_RF.pkl','waterpotability_MLP.pkl','waterpotability_SVC.pkl',\
          'waterpotability_GB.pkl','waterpotability_KNN.pkl']
for m in models:
    results(m,y_test, testing)

Acuracy : 0.6280487804878049, Recall: 0.6280487804878049, Precision_score: 0.6280487804878049
Acuracy : 0.6692073170731707, Recall: 0.6692073170731707, Precision_score: 0.6692073170731707
Acuracy : 0.6814024390243902, Recall: 0.6814024390243902, Precision_score: 0.6814024390243902
Acuracy : 0.6951219512195121, Recall: 0.6951219512195121, Precision_score: 0.6951219512195121
Acuracy : 0.6844512195121951, Recall: 0.6844512195121951, Precision_score: 0.6844512195121951
Acuracy : 0.6615853658536586, Recall: 0.6615853658536586, Precision_score: 0.6615853658536586
