## Import the Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Import the Dataset

In [2]:
# Location of the raw water-potability CSV, relative to this notebook.
DATA_PATH = '../../data/water_quality/water_potability.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check the column names and spot missing values.
df.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


## Data Cleaning

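From the data source: "WHO has recommended maximum permissible limit of pH from 6.5 to 8.5." Since the WHO treats this as the acceptable pH range for drinking water, we keep only the samples whose pH lies between 6.5 and 8.5 (inclusive) and drop the rest. Note that this filter also removes rows where `ph` is missing, because a comparison against NaN evaluates to False.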

In [4]:
# Keep only samples whose pH is inside the WHO-recommended 6.5-8.5 band (inclusive).
# Rows with a missing pH fail the comparison and are therefore dropped as well.
ph_in_who_range = df['ph'].between(6.5, 8.5)
potable_df = df.loc[ph_in_who_range]

In [5]:
# Count the missing values remaining in each column after the pH filter.
potable_df.isna().sum()

ph                   0
Hardness             0
Solids               0
Chloramines          0
Sulfate            322
Conductivity         0
Organic_carbon       0
Trihalomethanes     57
Turbidity            0
Potability           0
dtype: int64

In [6]:
# Summary statistics of the pH-filtered data; a `count` below the row total marks a column with missing values.
potable_df.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,1328.0,1328.0,1328.0,1328.0,1006.0,1328.0,1328.0,1271.0,1328.0,1328.0
mean,7.408354,198.166651,21817.531494,7.097904,332.185338,427.285294,14.356944,66.681124,3.958251,0.439006
std,0.552018,28.396601,8147.877918,1.51534,40.049799,81.199131,3.288237,16.38345,0.774506,0.496453
min,6.503638,98.452931,320.942611,1.683993,187.170714,181.483754,4.371899,0.738,1.492207,0.0
25%,6.9225,180.783954,15903.918838,6.124763,306.644348,367.467364,12.171123,56.436566,3.429591,0.0
50%,7.376743,198.925461,20840.341037,7.110798,333.389426,423.870142,14.373258,66.983589,3.947721,0.0
75%,7.866354,215.988821,27026.940788,8.035063,357.313002,482.068467,16.701148,77.999644,4.496852,1.0
max,8.490572,306.627481,56320.586979,12.580026,476.539717,666.690618,23.399516,124.0,6.494749,1.0


To keep the data easy to work with, we drop every row that still contains a missing value (this reduces the data from 1,328 to 961 rows). Alternatively, the missing values could be imputed, for example with the mean of each column.

In [7]:
# Preview the summary statistics we would get after removing every row that has
# a missing value (potable_df itself is not modified by this cell).
potable_df.dropna(axis=0, how='any').describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,961.0,961.0,961.0,961.0,961.0,961.0,961.0,961.0,961.0,961.0
mean,7.390524,198.555083,21788.952388,7.117027,332.349945,424.813252,14.402646,66.886486,3.96624,0.444329
std,0.548535,28.233402,8058.331211,1.491998,39.839651,81.237471,3.289782,16.400821,0.766117,0.49715
min,6.503638,98.452931,320.942611,2.397985,187.170714,201.619737,4.371899,14.343161,1.492207,0.0
25%,6.917858,181.434419,15979.06027,6.179496,306.685455,363.803476,12.18631,56.884528,3.443741,0.0
50%,7.345138,199.255322,21043.626929,7.117859,333.339282,420.394223,14.391606,66.983589,3.958609,0.0
75%,7.833971,216.116319,26999.382076,8.038872,357.265732,479.758924,16.848512,78.076738,4.497731,1.0
max,8.490572,306.627481,50793.898917,12.580026,476.539717,666.690618,23.399516,124.0,6.494749,1.0


In [8]:
# Commit the clean-up: discard each row with at least one missing value.
potable_df = potable_df.dropna(axis=0, how='any')

## Creating the Training Set and the Test Set

In [9]:
# Split the frame into a feature matrix (every column except the target)
# and the target vector (`Potability`, which is the last column).
target_col = 'Potability'
X = potable_df.drop(columns=target_col).values
y = potable_df[target_col].values

In [10]:
# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [11]:
# Standardise the features. The scaler is fitted on the training split only and
# that same fitted transformation is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Using the Random Forest Classification Model

In [12]:
# Train a random forest of 10 trees using the entropy split criterion on the
# scaled training data; the fixed random_state keeps the fitted model reproducible.
rf_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
rf_classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)

In [14]:
# Predict the potability label of every held-out test sample.
rf_y_pred = rf_classifier.predict(X=X_test)

In [16]:
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=rf_y_pred)

0.6265560165975104