## Data Exploration

In [None]:
# Use Jupyter Black for cell formatting
import jupyter_black

# load() is invoked before the remaining imports of this cell are executed.
jupyter_black.load()

# Required Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    precision_recall_fscore_support,
    f1_score,
)
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read in the red wine and white wine data and concatenate together
df_red = pd.read_csv("./data/winequality-red.csv", sep=";")
df_red["color"] = "red"

df_white = pd.read_csv("./data/winequality-white.csv", sep=";")
df_white["color"] = "white"

# Each CSV is read with its own 0-based index. ignore_index=True gives the combined
# frame a unique RangeIndex, so label-based operations on `df` (alignment, plotting
# with hue) never see duplicate index labels.
df = pd.concat([df_red, df_white], ignore_index=True)

In [None]:
# Encode the wine color as a nullable integer label: white -> 0, red -> 1.
df["color"] = (
    df["color"]
    .replace({"white": "0", "red": "1"})
    .astype("Int64")
)

In [None]:
# The data type of each column
print(df.dtypes.to_markdown())

In [None]:
# Describe the data: summary statistics, transposed so each column of `df` is a row,
# rounded to 2 decimals and printed as a markdown table
print(df.describe().T.round(2).to_markdown())

In [None]:
# Verify that there are no null values (prints the null count of every column)
print(df.isnull().sum().to_markdown())

In [None]:
# Pairwise plots of every column of `df`, with points colored by the wine color label.
sns.set_context(context="talk")
sns.set_style(style="white")
sns.pairplot(data=df, hue="color")

In [None]:
# Pearson correlation of every column except "color" (the last column) with the
# color label, sorted ascending and shown as a bar chart.
fields = df.columns[:-1].tolist()
correlations = df[fields].corrwith(df["color"]).sort_values()
ax = correlations.plot(kind="bar")
ax.set(ylim=[-1, 1], ylabel="Pearson Correlation");

In [None]:
def minmax_scale_columns(df: pd.DataFrame, columns: list = None) -> pd.DataFrame:
    """Return a copy of ``df`` with the selected columns min-max scaled.

    Args:
        df: Input frame; it is copied, never modified.
        columns: Column labels to scale. When ``None``, every numeric column
            (as selected by ``select_dtypes(include=["number"])``) is scaled.

    Returns:
        The scaled copy. If scaling raises ``KeyError`` or ``ValueError`` the
        error is printed and the copy is returned as it was at that point.
    """
    scaled = df.copy()

    # Default to all numeric columns when the caller does not pick any.
    target_cols = (
        scaled.select_dtypes(include=["number"]).columns.tolist()
        if columns is None
        else columns
    )

    scaler = MinMaxScaler()
    try:
        scaled[target_cols] = scaler.fit_transform(scaled[target_cols])
    except KeyError as err:
        print(f"Error: One or more specified columns not found. {err}")
    except ValueError as err:
        print(f"Error during scaling. Check if columns contain numeric data: {err}")

    return scaled


# Scale every column except the last one ("color"), then reset to a fresh 0..n-1 index.
# NOTE(review): the scaler is fit on the full dataset before the train/test split
# further down, so test rows influence the scaling of the training features —
# consider fitting the scaler on the training split only.
df_norm = minmax_scale_columns(df, df.columns[:-1]).reset_index(drop=True)

In [None]:
# Summary statistics of the scaled frame (one row per column, rounded to 2 decimals)
print(df_norm.describe().round(2).T.to_markdown())

In [None]:
# Check class balance: fraction of rows carrying each "color" label
print(df_norm["color"].value_counts(normalize=True).to_markdown())

In [None]:
# Create the train / test data: a single 70/30 split stratified on the color label
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so take the only (train, test) pair instead of looping
train_index, test_index = next(split.split(df_norm, df_norm["color"]))

# split() yields positional row numbers, so select with .iloc; .loc would only be
# correct while df_norm happens to carry a default 0..n-1 index
df_train = df_norm.iloc[train_index]
df_test = df_norm.iloc[test_index]

# Features are every column except the last one ("color"); the target is "color"
X_train = df_train[df_train.columns[:-1]]
X_test = df_test[df_test.columns[:-1]]

y_train = df_train["color"]
y_test = df_test["color"]

In [None]:
# Class proportions of the training labels, then of the test labels
print(y_train.value_counts(normalize=True).to_markdown())
print(y_test.value_counts(normalize=True).to_markdown())

---

# Logistic Regression

In [None]:
# All three models are seeded: the liblinear solver shuffles the data internally,
# and every other estimator in this notebook already uses random_state=42.

# Standard logistic regression
lr = LogisticRegression(solver="liblinear", random_state=42).fit(X_train, y_train)

# L1 regularized logistic regression (C chosen by 4-fold cross-validation over 10 values)
lr_l1 = LogisticRegressionCV(
    Cs=10, cv=4, penalty="l1", solver="liblinear", random_state=42
).fit(X_train, y_train)

# L2 regularized logistic regression (C chosen by 4-fold cross-validation over 10 values)
lr_l2 = LogisticRegressionCV(
    Cs=10, cv=4, penalty="l2", solver="liblinear", random_state=42
).fit(X_train, y_train)

In [None]:
# Predict the class for each model; one column of test-set predictions per model
coeff_labels = ["lr", "l1", "l2"]
coeff_models = [lr, lr_l1, lr_l2]

y_pred = pd.concat(
    [
        pd.Series(mod.predict(X_test), name=lab)
        for lab, mod in zip(coeff_labels, coeff_models)
    ],
    axis=1,
)

y_pred.head()

In [None]:
metrics = list()
cm = dict()

for lab, mod in zip(coeff_labels, coeff_models):

    # Precision, recall, f-score of the positive class (1 = red). average="binary"
    # matches evaluate_metrics(), which scores the other models appended to
    # df_metrics, so all rows of the comparison table use the same definition.
    precision, recall, fscore, _ = score(y_test, y_pred[lab], average="binary")

    # The usual way to calculate accuracy
    accuracy = accuracy_score(y_test, y_pred[lab])

    # ROC-AUC ranks samples by a continuous score, so feed it the predicted
    # probability of class 1 rather than the hard 0/1 predictions
    auc = roc_auc_score(y_test, mod.predict_proba(X_test)[:, 1])

    # Last, the confusion matrix
    cm[lab] = confusion_matrix(y_test, y_pred[lab])

    metrics.append(
        pd.Series(
            {
                "precision": precision,
                "recall": recall,
                "accuracy": accuracy,
                "fscore": fscore,
                "auc": auc,
            },
            name=lab,
        )
    )

# One column per model, one row per metric
df_metrics = pd.concat(metrics, axis=1)
# The models live on the column axis here, so that is the axis named "model"
df_metrics.columns.name = "model"

In [None]:
# Swap rows and columns of the metrics table.
# NOTE: this rebinds df_metrics to its own transpose, so running the cell a second
# time flips the table back.
df_metrics = df_metrics.T
df_metrics

In [None]:
# Metrics table rounded to 5 decimals, printed as markdown
print(df_metrics.round(5).to_markdown())

In [None]:
# Display or plot the confusion matrix for each model.
# One panel per model on a single row; squeeze=False keeps the axes in an array
# even if only one model is listed.
fig, axList = plt.subplots(nrows=1, ncols=len(coeff_labels), squeeze=False)
axList = axList.flatten()
fig.set_size_inches(12, 4)

for ax, lab in zip(axList, coeff_labels):
    sns.heatmap(cm[lab], ax=ax, annot=True, fmt="d")
    # confusion_matrix(y_true, y_pred) puts true labels on rows, predictions on columns
    ax.set(title=lab, xlabel="Predicted", ylabel="True")

plt.tight_layout()

---

# Support Vector Machines

In [None]:
# utility method to evaluate the model performance.
def evaluate_metrics(yt, yp, model_name):
    """Summarize classification performance as a flat dict.

    Args:
        yt: True labels.
        yp: Predicted labels.
        model_name: Value stored under the "model" key of the result.

    Returns:
        dict with keys "model", "precision", "recall", "accuracy" and "fscore";
        precision, recall and fscore come from
        ``precision_recall_fscore_support(..., average="binary")``.
    """
    precision, recall, f_beta, _ = precision_recall_fscore_support(
        yt, yp, average="binary"
    )
    return {
        "model": model_name,
        "precision": float(precision),
        "recall": float(recall),
        "accuracy": accuracy_score(yt, yp),
        "fscore": float(f_beta),
    }

In [None]:
# Create the SVC (only the seed is set) and train it on the training data
model = SVC(random_state=42).fit(X_train, y_train.values.ravel())

# Predict the test-set labels
svm_y_pred = model.predict(X_test)

In [None]:
# Precision / recall / accuracy / F-score of the SVC test predictions, labelled "svm"
svm_perf = evaluate_metrics(y_test, svm_y_pred, "svm")

In [None]:
# With Probability for AUC
model_prob = SVC(probability=True, random_state=42)
model_prob.fit(X_train, y_train.values.ravel())
# Hard label predictions of the probability-enabled model (kept for inspection)
svm_prob_y_pred = model_prob.predict(X_test)

# Calculate AUC-ROC and add to results.
# ROC-AUC needs a continuous score per sample; passing hard 0/1 predictions would
# collapse the curve to a single threshold, so use the probability of class 1.
svm_prob_scores = model_prob.predict_proba(X_test)[:, 1]
svm_perf["auc"] = roc_auc_score(y_test, svm_prob_scores)

In [None]:
# Append the SVM results as a new row (indexed by its "model" value) to the metrics table.
# NOTE: re-running this cell appends the same row again.
df_metrics = pd.concat(
    objs=[df_metrics, pd.DataFrame(data=[svm_perf]).set_index(keys="model")],
    axis=0,
)

In [None]:
# Metrics table including the SVM row, printed as markdown (unrounded)
print(df_metrics.to_markdown())

In [None]:
# Next, let's try `GridSearchCV` to find the optimized `C` and `kernel` combination:
params_grid = dict(
    C=[0.1, 1, 10, 100, 500],
    kernel=["poly", "rbf", "sigmoid"],
)
opto_model = SVC(random_state=42)

# 5-fold cross-validated search over the grid, ranked by F1
grid_search = GridSearchCV(
    opto_model,
    params_grid,
    scoring="f1",
    cv=5,
    verbose=1,
)

# Run the search on the training data only and keep the winning combination
grid_search.fit(X_train, y_train.values.ravel())
best_params = grid_search.best_params_

best_params

In [None]:
# Display the selected hyper-parameters (same value the previous cell already shows)
best_params

In [None]:
# Create the model from the grid-search winner instead of hard-coded values, so this
# cell stays in sync with `best_params` whenever the grid or the data changes
best_model = SVC(random_state=42, **best_params)

# Train the model with training dataset:
best_model.fit(X_train, y_train.values.ravel())

# Make the predictions
best_model_y_pred = best_model.predict(X_test)

best_model_perf = evaluate_metrics(y_test, best_model_y_pred, "svm best")

# With Probability for AUC
best_model_prob = SVC(probability=True, random_state=42, **best_params)
best_model_prob.fit(X_train, y_train.values.ravel())
# Hard label predictions of the probability-enabled model (kept for inspection)
best_model_prob_y_pred = best_model_prob.predict(X_test)

# Calculate AUC-ROC and add to results.
# ROC-AUC needs a continuous score per sample, so use the probability of class 1
# rather than the hard 0/1 predictions.
best_model_prob_scores = best_model_prob.predict_proba(X_test)[:, 1]
best_model_perf["auc"] = roc_auc_score(y_test, best_model_prob_scores)

# Add the model performance to the metrics dataframe
df_metrics = pd.concat([df_metrics, pd.DataFrame([best_model_perf]).set_index("model")])

In [None]:
# Metrics table including the tuned SVM row, rounded to 4 decimals
print(df_metrics.round(4).to_markdown())

In [None]:
# Confusion matrix of the first SVC's test predictions
print(confusion_matrix(y_test, svm_y_pred))

# Confusion matrix of the "svm best" model's test predictions
print(confusion_matrix(y_test, best_model_y_pred))

---

# K Nearest Neighbours

In [None]:
# Try K from 1 to 50
max_k = 50

# (K, F1) pairs, one per fitted classifier
f1_scores = []

# Train one KNN classifier per K in 1..max_k and score it on the test set.
# NOTE(review): K is being compared on the test set; a validation split or
# cross-validation on the training data would avoid tuning on the test data.
for k in range(1, max_k + 1):
    # Create a KNN classifier
    knn = KNeighborsClassifier(n_neighbors=k)
    # Train the classifier
    knn = knn.fit(X_train, y_train.values.ravel())
    preds = knn.predict(X_test)
    # Evaluate the classifier with F1 (true labels first, predictions second)
    f1_scores.append((k, round(f1_score(y_test, preds), 4)))

# Convert the score list to a dataframe indexed by K. set_index returns a new frame,
# so its result has to be kept for K to become the x axis of the plot.
f1_results = pd.DataFrame(f1_scores, columns=["K", "F1 Score"]).set_index("K")

# This is a long list and difficult to analyze, so let's visualize it using a line chart.
# Plot F1 results
ax = f1_results.plot(figsize=(6, 4))
ax.set(xlabel="Num of Neighbors", ylabel="F1 Score")
ax.set_xticks(range(1, max_k, 5))
plt.ylim((0.96, 1))
plt.title("KNN F1 Score");

In [None]:
# Use the K with the highest F1 from the sweep (the smallest such K on ties) instead
# of the loop variable `k`, which is simply the last value the sweep tried.
best_k = max(f1_scores, key=lambda pair: pair[1])[0]

knn = KNeighborsClassifier(n_neighbors=best_k)
# Train the classifier
knn = knn.fit(X_train, y_train.values.ravel())
knn_preds = knn.predict(X_test)

knn_perf = evaluate_metrics(y_test, knn_preds, "knn")

# Get predicted probabilities for the positive class (class 1)
knn_prob = knn.predict_proba(X_test)[:, 1]

# Calculate the AUC-ROC score from those probabilities
knn_perf["auc"] = roc_auc_score(y_test, knn_prob)

# Add the model performance to the metrics dataframe
df_metrics = pd.concat([df_metrics, pd.DataFrame([knn_perf]).set_index("model")])

In [None]:
# Metrics table including the KNN row, rounded to 4 decimals
print(df_metrics.round(4).to_markdown())

In [None]:
# Confusion matrix of the KNN test predictions
print(confusion_matrix(y_test, knn_preds))

--- 

# Decision Trees

In [None]:
# Decision tree with only the seed set. DecisionTreeClassifier is imported together
# with the other libraries in the notebook's import cell.
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)

# Score the tree on the test set
dt_preds = dt.predict(X_test)
dt_perf = evaluate_metrics(y_test, dt_preds, "dt")
# AUC is not computed for this model; NaN fills the "auc" column of df_metrics
dt_perf["auc"] = np.nan

# Add the model performance to the metrics dataframe
df_metrics = pd.concat([df_metrics, pd.DataFrame([dt_perf]).set_index("model")])

In [None]:
# Size of the fitted tree (node count, max depth), then its test-set confusion matrix
print(dt.tree_.node_count, dt.tree_.max_depth)
print(confusion_matrix(y_test, dt_preds))

In [None]:
# Search space bounded by the baseline tree `dt`: depths up to its depth (step 2)
# and feature counts up to the number of features it was trained on.
df_params_grid = dict(
    max_depth=range(1, dt.tree_.max_depth + 1, 2),
    max_features=range(1, len(dt.feature_importances_) + 1),
    min_samples_leaf=[1, 2, 5],
    criterion=["gini", "entropy"],
)

# Accuracy-ranked search using all available cores
df_grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=df_params_grid,
    scoring="accuracy",
    n_jobs=-1,
)

# df_best is the fitted search object; its best_estimator_ holds the winning tree
df_best = df_grid_search.fit(X_train, y_train)

df_best_preds = df_best.predict(X_test)
df_best_perf = evaluate_metrics(y_test, df_best_preds, "dt best")
# AUC is not computed for this model; NaN fills the "auc" column of df_metrics
df_best_perf["auc"] = np.nan

df_metrics = pd.concat(
    [df_metrics, pd.DataFrame([df_best_perf]).set_index("model")]
)

In [None]:
# Metrics table including both decision-tree rows, rounded to 4 decimals
print(df_metrics.round(4).to_markdown())

In [None]:
# Size of the tree selected by the grid search (node count, max depth), then the
# test-set confusion matrix of its predictions
print(df_best.best_estimator_.tree_.node_count, df_best.best_estimator_.tree_.max_depth)
print(confusion_matrix(y_test, df_best_preds))