In [19]:
# read in packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [5]:
# import the hyperopt search functions and status constant
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

In [13]:
# load the water potability CSV and print a summary of the frame
DATA_PATH = "../input/water-potability/water_potability.csv"
data = pd.read_csv(DATA_PATH)
data.info()

In [14]:
# drop every row that has at least one missing value, then re-check the summary
data = data.dropna(axis=0, how="any")
data.info()

In [15]:
# separate the target column from the features, then hold out 20% of the rows
# as a test split (fixed seed so the split is repeatable)
y = data["Potability"]
X = data.drop(columns="Potability")
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=200,
    test_size=0.2,
)

In [16]:
# fit a baseline random forest on the training split and predict the test rows
model = RandomForestClassifier(
    random_state=42,
    max_features="sqrt",
    n_estimators=300,
).fit(x_train, y_train)
y_pred = model.predict(x_test)

In [20]:
# report the baseline model's accuracy on the test split
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)

In [21]:
# define the function we want to minimise
def objective(n_estimators):
    """Hyperopt objective: negated cross-validated accuracy of a random forest.

    The score comes from 5-fold cross-validation on the training split only.
    Scoring candidates on the test split would let the search pick
    hyperparameters that happen to suit those particular rows, which biases
    any accuracy later reported on that split.

    Parameters
    ----------
    n_estimators : int
        Number of trees, as sampled by hyperopt from the search space.

    Returns
    -------
    dict
        ``{"loss": -mean_cv_accuracy, "status": STATUS_OK}``. The accuracy is
        negated because ``fmin`` minimises the loss.
    """
    model = RandomForestClassifier(
        # the sampled value may arrive as a NumPy integer type; pass a plain int
        n_estimators=int(n_estimators),
        max_features="sqrt",
        random_state=42,
        # use all cores to offset the cost of fitting five forests per trial
        n_jobs=-1,
    )
    cv_accuracy = cross_val_score(
        model, x_train, y_train, cv=5, scoring="accuracy"
    ).mean()
    return {"loss": -float(cv_accuracy), "status": STATUS_OK}

In [22]:
# set the hyperparam tuning algorithm: hyperopt's Tree-structured Parzen
# Estimator (TPE) suggestion function, handed to fmin as `algo`
algorithm = tpe.suggest

In [23]:
# define the values to search over for n_estimators: a random integer drawn
# between the bounds 200 and 1000
# NOTE(review): hyperopt documents hp.randint's upper bound as exclusive, so
# 1000 itself is presumably never sampled — confirm
search_space = hp.randint("n_estimators", 200, 1000)

In [24]:
# run the search over n_estimators; seeding rstate makes the sequence of
# sampled trials repeatable when the notebook is re-run from a fresh kernel
# NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator here; older
# releases expect np.random.RandomState(42) instead — confirm installed version
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=algorithm,
    max_evals=200,
    rstate=np.random.default_rng(42),
)

In [25]:
# display the best point returned by the n_estimators search
best_params

In [26]:
# redefine the function using a wider range of hyperparameters
def objective(search_space):
    """Hyperopt objective: negated cross-validated accuracy of a random forest.

    The score comes from 5-fold cross-validation on the training split only.
    Scoring candidates on the test split would let the search pick
    hyperparameters that happen to suit those particular rows, which biases
    any accuracy later reported on that split.

    Parameters
    ----------
    search_space : dict
        One sampled configuration. Despite the name, hyperopt passes a
        concrete point drawn from the space, not the space definition. Its
        keys are forwarded as ``RandomForestClassifier`` keyword arguments.

    Returns
    -------
    dict
        ``{"loss": -mean_cv_accuracy, "status": STATUS_OK}``. The accuracy is
        negated because ``fmin`` minimises the loss.
    """
    model = RandomForestClassifier(
        **search_space,
        random_state=42,
        # use all cores to offset the cost of fitting five forests per trial
        n_jobs=-1,
    )
    cv_accuracy = cross_val_score(
        model, x_train, y_train, cv=5, scoring="accuracy"
    ).mean()
    return {"loss": -float(cv_accuracy), "status": STATUS_OK}

In [27]:
# joint search space: one hyperopt distribution per random forest argument
search_space = dict(
    n_estimators=hp.randint("n_estimators", 200, 1000),
    max_depth=hp.randint("max_depth", 10, 200),
    # NOTE(review): scikit-learn reads a float min_samples_split as a fraction
    # of the samples, so this draws fractions across (0, 1) — confirm that is
    # intended rather than an integer count
    min_samples_split=hp.uniform("min_samples_split", 0, 1),
    min_samples_leaf=hp.randint("min_samples_leaf", 1, 10),
    criterion=hp.choice("criterion", ["gini", "entropy"]),
    max_features=hp.choice("max_features", ["sqrt", "log2"]),
)

In [28]:
# run the joint search over all six hyperparameters; seeding rstate makes the
# sequence of sampled trials repeatable when the notebook is re-run
# NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator here; older
# releases expect np.random.RandomState(42) instead — confirm installed version
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=algorithm,
    max_evals=200,
    rstate=np.random.default_rng(42),
)

In [31]:
# map fmin's result back onto the search space to show the actual
# hyperparameter values (fmin reports hp.choice entries as option indices
# rather than the options themselves)
space_eval(search_space, best_params)