# Wine Classifier


In [None]:
import numpy as np

import pandas as pd

import joblib

from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Seed NumPy's global random number generator so a fresh top-to-bottom run
# starts from the same random state.
# NOTE(review): no call below passes an explicit random_state, so the shuffle,
# the splits and the forests presumably all draw from this global state —
# confirm, and re-run this cell before re-running later ones.
np.random.seed(42)

In [None]:
def train_valid_test_split(X, y, *, train_end=0.7, valid_end=0.85):
    """Split ``X`` and ``y`` positionally into train, validation and test parts.

    No shuffling happens here: the leading rows become the training set, the
    following rows the validation set, and the remaining rows the test set.
    Shuffle the data beforehand if its order is meaningful.

    Args:
        X: Samples; any object supporting ``len()`` and ``[start:stop]``
            slicing.
        y: Targets, sliceable the same way and of the same length as ``X``.
        train_end: Cumulative fraction of rows at which the training set ends.
        valid_end: Cumulative fraction of rows at which the validation set
            ends; everything after it is the test set.

    Returns:
        The tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or if the fractions
            do not satisfy ``0 <= train_end <= valid_end <= 1``.
    """
    n = len(X)
    if n != len(y):
        # Previously a mismatch silently produced empty train/valid sets and
        # put everything into the test set; fail loudly instead.
        raise ValueError(
            f"X and y must have the same length, got {n} and {len(y)}"
        )
    if not 0 <= train_end <= valid_end <= 1:
        raise ValueError(
            "expected 0 <= train_end <= valid_end <= 1, "
            f"got train_end={train_end!r}, valid_end={valid_end!r}"
        )

    # Cut points are row counts, rounded to the nearest integer.
    threshold_1 = round(train_end * n)
    threshold_2 = round(valid_end * n)

    X_train, y_train = X[:threshold_1], y[:threshold_1]
    X_valid, y_valid = X[threshold_1:threshold_2], y[threshold_1:threshold_2]
    X_test, y_test = X[threshold_2:], y[threshold_2:]

    return (X_train, y_train, X_valid, y_valid, X_test, y_test)


def evaluate_model(y_true, y_pred):
    """Print and return classification metrics for a set of predictions.

    Precision, recall and F1 are computed with ``average="weighted"``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"`` holding the corresponding scores, so callers can compare
        models programmatically instead of reading the printed output.
    """
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average="weighted"),
        "recall": recall_score(y_true, y_pred, average="weighted"),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

    print(f"Accuracy: {metrics['accuracy']:.2%}")
    print(f"Precision: {metrics['precision']:.2%}")
    print(f"Recall: {metrics['recall']:.2%}")
    print(f"F1 Score: {metrics['f1']:.2%}")

    # The scores used to be discarded after printing; hand them back.
    return metrics

In [None]:
# Fetch the wine dataset bundled with scikit-learn.
data = load_wine()

# One frame holding the feature columns plus a "target" label column;
# sample(frac=1) then draws every row, which puts the rows in random order.
df = (
    pd.DataFrame(data["data"], columns=data["feature_names"])
    .assign(target=data["target"])
    .sample(frac=1)
)

In [None]:
# Separate the label column from the feature columns.
y = df["target"]
X = df.drop(columns="target")

## 1. Manual Use of Validation Sets


In [None]:
# Positional three-way split of the already-shuffled data into train,
# validation and test parts (cut points at 70% and 85% of the rows).
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(X, y)

In [None]:
# Candidate 1: random forest with 5 trees.
model_1 = RandomForestClassifier(n_estimators=5)
model_1.fit(X_train, y_train)
# Compare candidates on the validation split; scoring them on the test split
# would leak it into model selection and leave nothing unseen for a final check.
evaluate_model(y_valid, model_1.predict(X_valid))

In [None]:
# Candidate 2: random forest with 10 trees.
model_2 = RandomForestClassifier(n_estimators=10)
model_2.fit(X_train, y_train)
# Compare candidates on the validation split; scoring them on the test split
# would leak it into model selection and leave nothing unseen for a final check.
evaluate_model(y_valid, model_2.predict(X_valid))

In [None]:
# Candidate 3: random forest with 20 trees.
model_3 = RandomForestClassifier(n_estimators=20)
model_3.fit(X_train, y_train)
# Compare candidates on the validation split; scoring them on the test split
# would leak it into model selection and leave nothing unseen for a final check.
evaluate_model(y_valid, model_3.predict(X_valid))

## 2. GridSearchCV


In [None]:
# Fresh train/test split for the grid search, holding out 20% of the rows as
# the test set. This rebinds X_train, X_test, y_train and y_test from
# section 1; no separate validation split is made here because the search
# below is run with cv=5.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Base model whose hyperparameters are searched over.
estimator = RandomForestClassifier()

# Candidate values per hyperparameter: 3 * 3 * 3 * 2 * 2 = 108 combinations.
param_distributions = dict(
    n_estimators=[5, 25, 100],
    max_depth=[None, 10, 50],
    max_features=[None, "sqrt", "log2"],
    min_samples_split=[2, 4],
    min_samples_leaf=[1, 2],
)

In [None]:
# Search the grid in `param_distributions` with 5-fold cross-validation on
# the training split; fit() returns the fitted search object itself.
rscv_model = GridSearchCV(
    estimator=estimator,
    param_grid=param_distributions,
    cv=5,
).fit(X_train, y_train)

# Kept as the cell's final expression so the selected parameters are shown.
rscv_model.best_params_

In [None]:
# Score the fitted search object's predictions on this section's held-out
# test split, which the search itself was not fitted on.
evaluate_model(y_test, rscv_model.predict(X_test))

In [None]:
import os

# Persist the fitted search object. The target directory is created first
# because writing the file fails if "./models" does not exist yet.
model_path = "./models/wine-classifier.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(rscv_model, model_path)