# Project Outline - Water Quality

- Water Quality Dataset from Kaggel - https://www.kaggle.com/adityakadiwal/water-potability

- Predicting podability based on existing features without potability feature

- Testing our predictions on original database potability feature

- Create a feature that is MORE then binary for potability

In [1]:
import pandas            as pd
import numpy             as np
import matplotlib.pyplot as plt
import seaborn           as sns
import pydotplus

from six                  import StringIO
from IPython.display      import Image
from data.ds_models       import ds_models

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline                import Pipeline
from sklearn.impute                  import SimpleImputer, KNNImputer
from sklearn.preprocessing           import StandardScaler, OneHotEncoder, normalize
from sklearn.model_selection         import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics                 import mean_squared_error, mean_squared_log_error, accuracy_score
from sklearn.metrics                 import plot_confusion_matrix, classification_report

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors    import KNeighborsClassifier
from sklearn.naive_bayes  import GaussianNB
from sklearn.tree         import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble     import BaggingClassifier, RandomForestClassifier
from sklearn.ensemble     import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm          import SVC

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'data'

# Data Exploration and Cleaning

In [None]:
df = pd.read_csv('data/water_potability.csv')

In [None]:
df.head()

### Raw Potability Histogram

In [None]:
plt.hist(df['Potability'])

### Cleaning NaN values

In [None]:
df.isna().sum()

In [None]:
df_scrub = df.dropna()
df_scrub.isna().sum()

In [None]:
sns.heatmap(df_scrub.corr(), annot=True);

In [None]:
df_scrub.corr()

In [None]:
df_scrub.info()

In [None]:
pd.plotting.scatter_matrix(df_scrub);

In [None]:
X = df_scrub.drop(columns=['Potability'], axis=1)

## Removing Outliers

In [None]:
#Q1 = df_scrub.quantile(0.25)
#Q3 = df_scrub.quantile(0.75)
#IQR = df_scrub.apply(stats.iqr)
#df_out = df_scrub[~((X < (Q1-1.5*IQR)) | (X > Q3+1.5*IQR)).any(axis=1)]
#df_out

In [None]:
#df_out.corr()

### Clean Potability Histogram

In [None]:
#plt.hist(df1['Potability'])

# Data Instantiation

### Raw Data

In [None]:
X = df.drop(columns=['Potability'], axis=1)
y = df['Potability']

### Dropping NaNs Data

In [None]:
df_dn = df.dropna()
X_dn  = df_dn.drop(columns=['Potability'], axis=1)
y_dn  = df_dn['Potability']

### Dropping Outliers Data

In [None]:



#X_do = df.drop(columns=['Potability'], axis=1)
#y_do = df['Potability']

# Model Instantiations

## Gaussian Naive Bayes

In [None]:
#model = GaussianNB()
#model = BaggingClassifier(GaussianNB())
#model = AdaBoostClassifier(GaussianNB())

## KNeighbors Classifier

In [None]:
#model = KNeighborsClassifier()
#knc_grid   = {'n_neighbors': [1, 2, 3, 4, 5], 'weights': ['uniform', 'distance']}
#model      = GridSearchCV(KNeighborsClassifier(), knc_grid, scoring='accuracy', verbose=1)
#model      = AdaBoostClassifier(KNeighborsClassifier())

## Logistic Regression

In [None]:
#model = LogisticRegression(fit_intercept = False, C = 1e12, solver='lbfgs', multi_class='auto')
#logreg_grid = {'C': [1, 2, 3], 'solver': [‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’]}
#model       = GridSearchCV(LogisticRegression(), logreg_grid, scoring='accuracy', verbose=1)

## Decision Tree Classifier

In [None]:
#model = DecisionTreeClassifier()
#dtc_grid = {'criterion': ['gini', 'entropy'], 'max_depth':[1, 2, 3, 4, 5]}
#model    = GridSearchCV(DecisionTreeClassifier(), dtc_grid, scoring='accuracy', verbose=1)

## Bagging Classifier

In [None]:
#model = BaggingClassifier()
#bc_grid = {'n_estimators': [10, 20, 30, 40, 50], 'oob_score':['True', 'False']}
#model   = GridSearchCV(BaggingClassifier(), bc_grid, scoring='accuracy', verbose=1)

## Random Forest Classifier

In [None]:
#model = RandomForestClassifier()
#rfc_grid = {'n_estimators': [100, 200], 'criterion':['gini', 'entropy'], 
#            'oob_score':['True', 'False'], 'max_depth': [3, 4, 5, 6, 7, 8]}
#model    = GridSearchCV(RandomForestClassifier(), rfc_grid, scoring='accuracy', verbose=1)

## Adaptive Boosting Classifier

In [None]:
#model = AdaBoostClassifier()

## Gradient Boosting Classifier

In [None]:
#model = GradientBoostingClassifier()

## Support Vector Classification

In [None]:
#model = SVC()
#svc_grid = {'C': [1, 2, 3], 'kernel': [‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’]}
#model    = GridSearchCV(SVC(), svc_grid, scoring='accuracy', verbose=1)

# Model Testing 

### Gaussian Naive Bayes - NaNs Filled w/KNN - 61% Test Accuracy

In [None]:
model = GaussianNB()

gnb   = ds_models(model, X, y, model_name='Gaussian Naive Bayes - NaNs Filled w/KNN',
                  output='class', imputer=True)


In [None]:
gnb

### Gaussian Naive Bayes - NaNs Filled w/KNN - Bagging Classifier - 62% Test Accuracy

In [None]:
model  = BaggingClassifier(GaussianNB())

gnb_bc = ds_models(model, X, y, model_name='Gaussian Naive Bayes - NaNs w/KNN - Bagging Classifier',
                   output='class', imputer=True)


In [None]:
gnb_bc

### Gaussian Naive Bayes - NaNs Filled w/KNN - Adaptive Boosting Classifier - 52% Test Accuracy

In [None]:
model   = AdaBoostClassifier(GaussianNB())

gnb_abc = ds_models(model, X, y, model_name='Gaussian Naive Bayes - NaNs w/KNN - Adaptive Boosting Classifier',
                    output='class', imputer=True)


In [None]:
gnb_abc

### Gaussian Naive Bayes - Dropped NaNs - 62% Test Accuracy

In [None]:
model  = GaussianNB()

gnb_dn = ds_models(model, X_dn, y_dn, model_name='Gaussian Naive Bayes - Dropped NaNs',
                   output='class')


In [None]:
gnb_dn

### Gaussian Naive Bayes - Dropped NaNs - Bagging Classifier - 61% Test Accuracy

In [None]:
model     = BaggingClassifier(GaussianNB())

gnb_dn_bc = ds_models(model, X_dn, y_dn, model_name='Gaussian Naive Bayes - Dropped NaNs - Bagging Classifier',
                      output='class')


In [None]:
gnb_dn_bc

### Gaussian Naive Bayes - Dropped NaNs - Adaptive Boosting Classifier - 61% Test Accuracy

In [None]:
model      = BaggingClassifier(GaussianNB())

gnb_dn_abc = ds_models(model, X_dn, y_dn, model_name='Gaussian Naive Bayes - Dropped NaNs - Adaptive Boosting Classifier',
                       output='class')


In [None]:
gnb_dn_abc

### Decision Tree Classifier - NaNs Filled w/KNN - 59% Test Accuracy

In [None]:
model = DecisionTreeClassifier()

dtc   = ds_models(model, X, y, model_name='Decision Tree Classifier - NaNs Filled w/KNN',
                  output='class', imputer=True, fi=True)


In [None]:
dtc

### Decision Tree Classifier - NaNs Filled w/KNN - Gridsearch - 62% Test Accuracy

In [None]:
dtc_grid = {'criterion': ['gini', 'entropy'], 'max_depth':[1, 2, 3, 4, 5]}
model    = GridSearchCV(DecisionTreeClassifier(), dtc_grid, scoring='accuracy', verbose=1)

dtc_gs   = ds_models(model, X, y, model_name='Decision Tree Classifier - NaNs w/KNN - Gridsearch',
                     output='class', imputer=True)


In [None]:
dtc_gs

### Decision Tree Classifier - Dropped NaNs - 64% Test Accuracy

In [None]:
model  = DecisionTreeClassifier()

dtc_dn = ds_models(model, X_dn, y_dn, model_name='Decision Tree Classifier - Dropped NaNs',
                   output='class', fi=True)


In [None]:
dtc_dn

### Decision Tree Classifier - Dropped NaNs - Gridsearch - 62% Test Accuracy

In [None]:
dtc_grid  = {'criterion': ['gini', 'entropy'], 'max_depth':[1, 2, 3, 4, 5]}
model     = GridSearchCV(DecisionTreeClassifier(), dtc_grid, scoring='accuracy', verbose=1)

dtc_dn_gs = ds_models(model, X_dn, y_dn, model_name='Decision Tree Classifier - Dropped NaNs - Gridsearch',
                      output='class')


In [None]:
dtc_dn_gs

### Bagging Classifier - NaNs Filled w/KNN - 63% Test Accuracy

In [None]:
model = BaggingClassifier()

bc    = ds_models(model, X, y, model_name='Bagging Classifier - NaNs Filled w/KNN',
                  output='class', imputer=True)


In [None]:
bc

### Bagging Classifier - NaNs Filled w/KNN - Gridsearch - 64% Test Accuracy

In [None]:
bc_grid = {'n_estimators': [10, 20, 30, 40, 50], 'oob_score':['True', 'False']}
model   = GridSearchCV(BaggingClassifier(), bc_grid, scoring='accuracy', verbose=1)

bc_gs   = ds_models(model, X, y, model_name='Bagging Classifier - NaNs w/KNN- Gridsearch',
                    output='class', imputer=True)


In [None]:
bc_gs

### Bagging Classifier - Dropped NaNs - 66% Test Accuracy

In [None]:
model = BaggingClassifier()

bc_dn = ds_models(model, X_dn, y_dn, model_name='Bagging Classifier - Dropped NaNs',
                  output='class')


In [None]:
bc_dn

### Bagging Classifier - Dropped NaNs - Gridsearch - 70% Test Accuracy

In [None]:
bc_grid  = {'n_estimators': [10, 20, 30, 40, 50], 'oob_score':['True', 'False']}
model    = GridSearchCV(BaggingClassifier(), bc_grid, scoring='accuracy', verbose=1)

bc_dn_gs = ds_models(model, X_dn, y_dn, model_name='Bagging Classifier - Dropped NaNs - Gridsearch',
                     output='class')


In [None]:
bc_dn_gs

### Random Forest Classifier - NaNs Filled w/KNN - 65% Test Accuracy

In [None]:
model = RandomForestClassifier()

rfc   = ds_models(model, X, y, model_name='Randmon Forest Classifier - NaNs Filled w/KNN',
                  output='class', imputer=True, fi=True)


In [None]:
rfc

### Random Forest Classifier - NaNs Filled w/KNN - Gridsearch - 63% Test Accuracy

In [None]:
rfc_grid = {'n_estimators': [100, 200], 'criterion':['gini', 'entropy'], 
            'oob_score':['True', 'False'], 'max_depth': [3, 4, 5, 6, 7, 8]}
model    = GridSearchCV(RandomForestClassifier(), rfc_grid, scoring='accuracy', verbose=1)

rfc_gs   = ds_models(model, X, y, model_name='Randmon Forest Classifier - NaNs w/KNN - Gridsearch', 
                     output='class', imputer=True)


In [None]:
rfc_gs

### Random Forest Classifier - Dropped NaNs - 67% Test Accuracy

In [None]:
model  = RandomForestClassifier()

rfc_dn = ds_models(model, X_dn, y_dn, model_name='Randmon Forest Classifier - Dropped NaNs',
                   output='class')

In [None]:
rfc_dn

### Random Forest Classifier - Dropped NaNs - Gridsearch - 68% Test Accuracy

In [None]:
rfc_grid  = {'n_estimators': [100, 200], 'criterion':['gini', 'entropy'], 
            'oob_score':['True', 'False'], 'max_depth': [3, 4, 5, 6, 7, 8]}
model     = GridSearchCV(RandomForestClassifier(), rfc_grid, scoring='accuracy', verbose=1)

rfc_dn_gs = ds_models(model, X_dn, y_dn, model_name='Randmon Forest Classifier - Dropped NaNs - Gridsearch', 
                      output='class')

In [None]:
rfc_dn_gs

### Gradient Boosting Classifier - NaNs Filled w/KNN - 63% Test Accuracy

In [None]:
model = GradientBoostingClassifier()

gbc   = ds_models(model, X, y, model_name='Gradient Boosting Classifier - NaNs Filled w/KNN',
                  output='class', imputer=True, fi=True)


In [None]:
gbc

### Gradient Boosting Classifier - Dropped NaNs - 64% Test Accuracy

In [None]:
model  = GradientBoostingClassifier()

gbc_dn = ds_models(model, X_dn, y_dn, model_name='Gradient Boosting Classifier - Dropped NaNs',
                   output='class', fi=True)


In [None]:
gbc_dn