## Packages

In [1]:
# Read in packages
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [23]:
# Install and read in hyperopt
!pip install hyperopt --quiet
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

## Data

In [5]:
# View our data
data = pd.read_csv("water_potability.csv")
data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [24]:
# Remove missing values
data = data.dropna()
data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
5,5.584087,188.313324,28748.687739,7.544869,326.678363,280.467916,8.399735,54.917862,2.559708,0
6,10.223862,248.071735,28749.716544,7.513408,393.663396,283.651634,13.789695,84.603556,2.672989,0
7,8.635849,203.361523,13672.091764,4.563009,303.309771,474.607645,12.363817,62.798309,4.401425,0
...,...,...,...,...,...,...,...,...,...,...
3267,8.989900,215.047358,15921.412018,6.297312,312.931022,390.410231,9.899115,55.069304,4.613843,1
3268,6.702547,207.321086,17246.920347,7.708117,304.510230,329.266002,16.217303,28.878601,3.442983,1
3269,11.491011,94.812545,37188.826022,9.263166,258.930600,439.893618,16.172755,41.558501,4.369264,1
3270,6.069616,186.659040,26138.780191,7.747547,345.700257,415.886955,12.067620,60.419921,3.669712,1


## Initial Model

In [7]:
# Split to train and test
X = data.drop(["Potability"], axis=1)
y = data["Potability"]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=200
)

In [9]:
# Build initial model using a random forest
model = RandomForestClassifier(n_estimators=300, max_features="sqrt", random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [10]:
# View the performance
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.6724565756823822


## Hyperopt One Feature

In [11]:
# Define the function we want to minimise
def objective(n_estimators):
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_features="sqrt", random_state=42
    )
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {"loss": -accuracy, "status": STATUS_OK}

In [12]:
# Set the hyperparam tuning algorithm
algorithm = tpe.suggest

In [13]:
# Define the values to search over for n_estimators
search_space = hp.randint("n_estimators", 200, 1000)

In [15]:
# Run the tuning process
best_params = fmin(fn=objective, space=search_space, algo=algorithm, max_evals=10)

100%|████████| 10/10 [00:19<00:00,  1.95s/trial, best loss: -0.6799007444168734]


In [17]:
# View the best n_estimator value
best_params

{'n_estimators': 814}

## Hyperopt Multiple Features

In [18]:
# Redefine the function to include wider range of hyperparameters
def objective(search_space):
    model = RandomForestClassifier(**search_space, random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {"loss": -accuracy, "status": STATUS_OK}

In [19]:
# Define the search space ove
search_space = {
    "n_estimators": hp.randint("n_estimators", 200, 1000),
    "max_depth": hp.randint("max_depth", 10, 200),
    "min_samples_split": hp.uniform("min_samples_split", 0, 1),
    "min_samples_leaf": hp.randint("min_samples_leaf", 1, 10),
    "criterion": hp.choice("criterion", ["gini", "entropy"]),
    "max_features": hp.choice("max_features", ["sqrt", "log2"]),
}

In [21]:
best_params = fmin(fn=objective, space=search_space, algo=algorithm, max_evals=10)

100%|████████| 10/10 [00:08<00:00,  1.18trial/s, best loss: -0.6451612903225806]


In [22]:
space_eval(search_space, best_params)

{'criterion': 'gini',
 'max_depth': 147,
 'max_features': 'sqrt',
 'min_samples_leaf': 7,
 'min_samples_split': 0.11585243299745185,
 'n_estimators': 643}