In [17]:
from ucimlrepo import fetch_ucirepo
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

wine_quality = fetch_ucirepo(id=186)

In [18]:
# Pre-processing
df = wine_quality.data.original

def categorize_quality(quality):
    if quality < 7:
        return 'low'
    else:
        return 'high'


df['quality_category'] = df['quality'].apply(categorize_quality)

red_wine_df = df[df['color'] == 'red']
white_wine_df = df[df['color'] == 'white']

df_array = [red_wine_df, white_wine_df, df]

## Possible models:
SVM | Decision Tree | Random Forest | KNN | QDA | (Possible) Lasso

In [24]:
# KNN

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

df_num = 0
for df in df_array:
    print(df["color"].unique(), end=" | ")
    X = red_wine_df.drop(columns=['quality', 'quality_category', 'color'])
    y = red_wine_df['quality_category']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # find optimized K value
    param_grid = {'n_neighbors': range(1, 20)}
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_['n_neighbors'], end=" | ")

    knn = KNeighborsClassifier(n_neighbors=grid_search.best_params_['n_neighbors'])
    knn.fit(X_train, y_train)

    accuracy = knn.score(X_test, y_test)
    print(f'Accuracy of KNN on {df_num} wine data: {accuracy:.2f}')
    df_num += 1

['red'] | 15 | Accuracy of KNN on 0 wine data: 0.86
['white'] | 15 | Accuracy of KNN on 1 wine data: 0.86
['red' 'white'] | 15 | Accuracy of KNN on 2 wine data: 0.86


In [16]:
# confusion matrix
y_pred = knn.predict(X_test)
confusion_matrix(y_test, y_pred, labels=knn.classes_)

array([[ 13,  54],
       [ 14, 399]])