# Setup

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Default figure size (width, height in inches) for figures created without an explicit figsize.
# NOTE(review): (2, 2) is very small; the plotting cells visible in this notebook all pass
# their own figsize to plt.figure, so this default rarely applies — confirm it is intended.
plt.rcParams['figure.figsize'] = (2,2)

In [None]:
# Apply seaborn's 'darkgrid' theme to all plots drawn after this cell
sns.set_theme(style='darkgrid')

# Data Reading

Dataset is downloaded from UC Irvine Machine Learning Repository.
URL: https://archive.ics.uci.edu/dataset/109/wine

In [None]:
# Column labels for the dataset; they are supplied explicitly via `names=`,
# with the class label ('Wine Type') in the first column.
col_names = [
    'Wine Type',
    'Alcohol',
    'Malic Acid',
    'Ash',
    'Alkalinity of Ash',
    'Magnesium',
    'Total Phenols',
    'Flavonoids',
    'Nonflavonoid Phenols',
    'Proanthocyanins',
    'Color Intensity',
    'Hue',
    'OD280/OD315',
    'Proline',
]
df = pd.read_csv('wine.data', names=col_names)
df

In [None]:
# Preview the first 10 rows of the raw dataset
df.head(10)

# Dataset Splitting

Dataset splitting is done at the beginning to prevent data leakage when processing the data.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Target: the wine class label; features: every remaining column
y = df['Wine Type']
x = df.drop(columns='Wine Type')

In [None]:
# Display the feature matrix (all columns except 'Wine Type')
x

75% of the dataset is used and processed for model training. The remaining 25% is used for testing the model.

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Display the training features
x_train

In [None]:
# Display the training labels
y_train

# Exploratory Data Analysis

In [None]:
# Print the dtype and the number of non-null values for each training feature
x_train.info()

In [None]:
# Count the training samples in each wine class to check the class balance
y_train.value_counts()

In [None]:
# (rows, columns) of the training feature matrix
x_train.shape

In [None]:
# Descriptive statistics of the training features, transposed so each feature is one row
x_train.describe().T

In [None]:
# Number of fully duplicated rows in the training features
x_train.duplicated().sum()

In [None]:
# Number of missing values in each training feature
x_train.isna().sum()

In this step we see the distribution of each feature.

In [None]:
# Feature column names; reused by the plotting loops and when rebuilding scaled DataFrames
x_cols = x_train.columns

In [None]:
# Histogram + KDE for every training feature, laid out on a 5x3 grid.
plt.figure(figsize=(10,12))
for position, feature in enumerate(x_cols, start=1):
    plt.subplot(5, 3, position)
    plt.title(f'Distribution of {feature}')
    # Refer to the column by name so `data` and `x` describe the same frame
    sns.histplot(data=x_train, x=feature, kde=True, color='#004b66')
# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()

After seeing the distribution of each feature, we check the relationship of each feature to the target variable.

In [None]:
# Re-attach the target to the training features for the plots and outlier handling below.
# Concatenating without ignore_index keeps the existing column names (features first,
# then 'Wine Type'), so the names do not need to be typed out a second time.
train_data = pd.concat([x_train, y_train], axis=1)
train_data

In [None]:
# Scatter plot of every feature against the wine class on a 4x4 grid.
# train_data is already built by the preceding cell, so it is not rebuilt here.
plt.figure(figsize=(10,10))
for position, feature in enumerate(x_cols, start=1):
    plt.subplot(4, 4, position)
    sns.scatterplot(data=train_data, x='Wine Type', y=feature, color='#6600cc')
# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()

After checking the relationship, we identify outliers using boxplot visualization.

In [None]:
# One boxplot per column of train_data, used to spot values beyond the whiskers.
# NOTE(review): col_names also contains the target 'Wine Type', so its boxplot is drawn too.
plt.figure(figsize=(10,10))
for position, name in enumerate(col_names, start=1):
    plt.subplot(4, 4, position)
    plt.title(f'Boxplot for {name}')
    sns.boxplot(train_data[name], color='#00ff99')
# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()

Some outliers were detected in the following features:
Ash, Alkalinity of Ash, Magnesium, Proanthocyanins, Color Intensity, Hue

# Data Preprocessing

### Outlier Removal

In this problem, we remove the outliers using the IQR (Interquartile Range) method because some of the features are not normally distributed.

In [None]:
# IQR outlier removal, pass 1: drop every row in which at least one FEATURE lies
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# The target column is excluded from the computation: a class label is not a
# measurement and must never cause a row to be flagged as an outlier.
feature_cols = train_data.columns.drop('Wine Type')
features = train_data[feature_cols]
Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (features < (Q1 - 1.5 * IQR)) | (features > (Q3 + 1.5 * IQR))
# Keep only the rows with no flagged feature; the target column stays attached
train_data_1 = train_data[~is_outlier.any(axis=1)]
train_data_1

In [None]:
# Boxplot AFTER outlier removal #1: one boxplot per column of train_data_1
plt.figure(figsize=(10,10))
for position, name in enumerate(col_names, start=1):
    plt.subplot(4, 4, position)
    plt.title(f'Boxplot for {name}')
    sns.boxplot(train_data_1[name], color='#00ff99')
# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()

There are still some data points outside the whiskers of some boxplots, so I decided to repeat this method to ensure there are no outliers left.

In [None]:
# IQR outlier removal, pass 2: the quartiles are recomputed on the already-filtered
# data, then the same 1.5*IQR rule is applied again.
# As in any outlier check, only the feature columns are inspected; the target
# 'Wine Type' is a class label and is left out of the computation.
feature_cols_1 = train_data_1.columns.drop('Wine Type')
features_1 = train_data_1[feature_cols_1]
Q1_1 = features_1.quantile(0.25)
Q3_1 = features_1.quantile(0.75)
IQR_1 = Q3_1 - Q1_1
is_outlier_1 = (features_1 < (Q1_1 - 1.5 * IQR_1)) | (features_1 > (Q3_1 + 1.5 * IQR_1))
# Keep only the rows with no flagged feature; the target column stays attached
train_data_2 = train_data_1[~is_outlier_1.any(axis=1)]
train_data_2

In [None]:
# Boxplot AFTER outlier removal #2: one boxplot per column of train_data_2
plt.figure(figsize=(10,10))
for position, name in enumerate(col_names, start=1):
    plt.subplot(4, 4, position)
    plt.title(f'Boxplot for {name}')
    sns.boxplot(train_data_2[name], color='#00ff99')
# Adjust the spacing once, after every subplot has been drawn
plt.tight_layout()

There are no more data points outside the whiskers of the boxplot so we can conclude that all the outliers were removed successfully.

Here we split the cleaned training data back into the variables x_train and y_train.

In [None]:
# Separate the cleaned training frame back into labels and features
y_train = train_data_2['Wine Type']
x_train = train_data_2.drop(columns='Wine Type')

### Data Scaling

Data scaling is necessary to ensure all of the values are in the same scale and modelling can be done optimally.

Here we use the MinMax scaling method to scale the data.

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Create the min-max scaler and fit it on the training features only
scaler = MinMaxScaler()
scaler.fit(x_train)

We fit the scaler only to the training data so there is no data leakage.

In [None]:
# The scaled values are wrapped back into a DataFrame that carries the feature
# names AND the row index of x_train, so x_train_scaled stays label-aligned with y_train
# (without `index=` the frame would get a fresh 0..n-1 index that no longer matches).
x_train_scaled = pd.DataFrame(
    scaler.transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
x_train_scaled

# Model Training
(Hyperparameter Tuning is done simultaneously)

### K-Nearest Neighbors (KNN)

We use the K-Nearest Neighbors algorithm because this is a multiclass classification problem and the dataset is small.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Base estimator whose hyperparameters will be tuned
knn = KNeighborsClassifier()

# Candidate values for each hyperparameter, keyed by the estimator's parameter name
param_grid = dict(
    n_neighbors=range(1,10),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

Hyperparameter tuning is done using Grid Search and Cross Validation

In [None]:
# Search every combination in param_grid using 3-fold cross-validation on the training data.
# NOTE(review): x_train_scaled comes from a scaler fitted on the whole training set, so the
# validation part of each fold has influenced the scaling — consider a Pipeline if this matters.
grid_search = GridSearchCV(knn, param_grid, cv=3)
grid_search.fit(x_train_scaled, y_train)

In [None]:
# Keep the estimator built with the best parameter combination found by the grid search
best_knn = grid_search.best_estimator_

In [None]:
# Show the winning hyperparameter combination
print(f'Best Hyperparameters: {grid_search.best_params_}')

We first evaluate the model on the training dataset.

In [None]:
# Predict on the same (scaled) data the model was trained on
y_train_pred = best_knn.predict(x_train_scaled)

In [None]:
# Confusion matrix on the training data, drawn as an annotated heatmap
class_labels = ['Wine 1', 'Wine 2', 'Wine 3']
train_matrix = confusion_matrix(y_train, y_train_pred)

plt.figure(figsize=(5,4))
plt.title('Confusion Matrix for Training Dataset')
sns.heatmap(
    train_matrix,
    annot=True,
    cmap='viridis',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted Value')
plt.ylabel('Actual Value')
plt.show()

In [None]:
# Accuracy and per-class report for the predictions on the training data
train_report = classification_report(y_train, y_train_pred)
train_accu = accuracy_score(y_train, y_train_pred)

print(f'Accuracy Score for Training Dataset: {train_accu:.2%}\n')
print(train_report)

After using the model on the training data, we can see that we obtained a pretty high score, with 97.48% accuracy.

Next, we will test the model on the testing data.

# Model Testing

In [None]:
# Preview the first 10 rows of the unscaled test features
x_test.head(10)

We also have to scale our testing data so the model's performance is consistent. We will use the scaler that we used previously to scale the training data.

In [None]:
# We MUST NOT fit the scaler to the testing data as it will cause data leakage.
# Only use the transform method from the previously fitted scaler.
# The result is wrapped in a DataFrame that keeps x_test's row index, so the scaled
# features stay label-aligned with y_test.

x_test_scaled = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)
x_test_scaled.head(10)

Here we use the previous model 'best_knn'.

In [None]:
# Predict the wine class for the held-out test features
y_test_pred = best_knn.predict(x_test_scaled)

# Side-by-side table of actual vs. predicted class
comparison = {'y_test': y_test, 'y_test_pred': y_test_pred}
test_pred_df = pd.DataFrame(comparison)

In [None]:
# Display actual vs. predicted wine type for the test samples
test_pred_df

## Model Evaluation

In [None]:
# Fraction of test samples whose predicted class matches the actual class
test_accu = accuracy_score(y_test, y_test_pred)
print(f'Accuracy Score for Testing Dataset: {test_accu:.2%}')

In [None]:
# Confusion matrix on the TEST data, drawn as an annotated heatmap.
# The title names the testing dataset, since the matrix is built from y_test / y_test_pred.
test_matrix = confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(5,4))
plt.title('Confusion Matrix for Testing Dataset')
sns.heatmap(test_matrix, annot=True, cmap='viridis',
            xticklabels=['Wine 1', 'Wine 2', 'Wine 3'], yticklabels=['Wine 1', 'Wine 2', 'Wine 3'])
plt.xlabel('Predicted Value')
plt.ylabel('Actual Value')
plt.show()

In [None]:
# Text classification report comparing y_test with the test predictions
test_report = classification_report(y_test, y_test_pred)
print(test_report)

##### We can see that the model works great on the testing data, acquiring 97.78% accuracy. We can also see that the model only incorrectly classified 1 data point out of 45 data points.