 # Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Load the dataset

In [2]:
# Column headers for the wine data: the class label first, then the 13
# measured attributes, in the order they appear in the file.
columns = [
    'Class', 'Alcohol', 'Malic_acid', 'Ash',
    'Alcalinity_of_ash', 'Magnesium', 'Total_phenols',
    'Flavanoids', 'Nonflavanoid_phenols', 'Proanthocyanins',
    'Color_intensity', 'Hue', 'OD280/OD315_of_diluted_wines',
    'Proline',
]

# The file has no header row, so read it raw and attach the labels afterwards.
df = pd.read_csv('wine.data', header=None)
df.columns = columns
print(df)

     Class  Alcohol  Malic_acid   Ash  Alcalinity_of_ash  Magnesium  \
0        1    14.23        1.71  2.43               15.6        127   
1        1    13.20        1.78  2.14               11.2        100   
2        1    13.16        2.36  2.67               18.6        101   
3        1    14.37        1.95  2.50               16.8        113   
4        1    13.24        2.59  2.87               21.0        118   
..     ...      ...         ...   ...                ...        ...   
173      3    13.71        5.65  2.45               20.5         95   
174      3    13.40        3.91  2.48               23.0        102   
175      3    13.27        4.28  2.26               20.0        120   
176      3    13.17        2.59  2.37               20.0        120   
177      3    14.13        4.10  2.74               24.5         96   

     Total_phenols  Flavanoids  Nonflavanoid_phenols  Proanthocyanins  \
0             2.80        3.06                  0.28             2.29   
1

# For each class, compute the number of training samples (80% of that class's rows)


In [3]:
# Count the rows belonging to each class label (1, 2 and 3); the label is
# stored in the first column of df.
class_column = df.iloc[:, 0]
class_counts = [(class_column == label).sum() for label in range(1, 4)]

# Training share per class: 80% of its row count, truncated to a whole number.
class_filtered = list(map(lambda n_rows: int(0.8 * n_rows), class_counts))
print(class_filtered)

[47, 56, 38]


# Separate the data into train and test

In [4]:
# Split every class into a training part (its first class_filtered[i] rows)
# and a test part (its remaining rows).
#
# Each class is selected with a boolean mask on the label column instead of
# positional slices, so the split does not depend on the rows of df being
# grouped by class in ascending order. Within a class the original row order
# is kept, so for a file that IS sorted by class the result is unchanged.
class_column = df.iloc[:, 0]
class1_rows, class2_rows, class3_rows = (
    df[class_column == label] for label in range(1, 4)
)

# Training rows: the leading 80% of each class
row1 = class1_rows.iloc[:class_filtered[0]]
row2 = class2_rows.iloc[:class_filtered[1]]
row3 = class3_rows.iloc[:class_filtered[2]]

train_datas = pd.concat([row1, row2, row3], ignore_index=True)

# Test rows: whatever is left of each class
r1 = class1_rows.iloc[class_filtered[0]:]
r2 = class2_rows.iloc[class_filtered[1]:]
r3 = class3_rows.iloc[class_filtered[2]:]

test_datas = pd.concat([r1, r2, r3], ignore_index=True)

# Remove the class label (column 0) so only the feature columns remain
train_features = train_datas.iloc[:, 1:]
test_features = test_datas.iloc[:, 1:]

# Normalize the test and train data according to the mean and std of train data


In [5]:
# Per-column statistics are taken from the training features only, and the
# same values are then applied to the test features.
train_mean, train_std = train_features.mean(), train_features.std()

# Standardize in two steps: centre on the training mean, then scale by the
# training standard deviation.
centered_train = train_features - train_mean
centered_test = test_features - train_mean

normalized_train_data = centered_train / train_std
normalized_test_data = centered_test / train_std
print(normalized_train_data)

      Alcohol  Malic_acid       Ash  Alcalinity_of_ash  Magnesium  \
0    1.544045   -0.504827  0.262782          -1.147671   1.842491   
1    0.241375   -0.439240 -0.788604          -2.516667   0.008191   
2    0.190786    0.104195  1.132895          -0.214264   0.076128   
3    1.721107   -0.279957  0.516565          -0.774308   0.891373   
4    0.291964    0.319695  1.857988           0.532461   1.231058   
..        ...         ...       ...                ...        ...   
136 -0.062160    1.125478 -0.027255          -0.245378   0.415813   
137  0.974917    0.478978 -0.208529           0.843596  -0.671180   
138  0.911681    1.978108 -0.353547           0.999164  -0.807054   
139  0.557557    1.359717  0.879112           1.154732   0.755499   
140 -0.239222    1.050521 -0.208529           0.065758  -0.807054   

     Total_phenols  Flavanoids  Nonflavanoid_phenols  Proanthocyanins  \
0         0.845939    1.133225             -0.637055         1.253503   
1         0.605622    0.8

# Add class label to the train data

In [6]:
# Training class labels as an (n, 1) column vector
class_label = train_datas.iloc[:, 0].to_numpy().reshape(-1, 1)

# Put the label in column 0, followed by the normalized feature columns
labeled_norm_train = np.concatenate(
    (class_label, normalized_train_data.to_numpy()),
    axis=1,
)

# Select k

In [7]:
# Number of nearest training samples that vote on each test sample's class
# (the classification loop keeps the first k entries of the distance ranking).
k = 7

# Begin Classification

In [8]:
# k-nearest-neighbours classification: each test sample gets the class that
# is most frequent among its k closest training samples (Euclidean distance
# in the normalized feature space).

# Convert to plain NumPy arrays once, outside the loop, so every iteration is
# pure array arithmetic with no per-row pandas index alignment.
train_matrix = normalized_train_data.to_numpy()
test_matrix = normalized_test_data.to_numpy()
train_labels = labeled_norm_train[:, 0]  # column 0 holds the class label

# Predicted class for every test sample, one row per sample
estimated_clas = np.zeros((test_matrix.shape[0], 1))

for i, test in enumerate(test_matrix):
    # Euclidean distance from this test sample to every training sample
    cost = (train_matrix - test) ** 2
    distances = np.sqrt(np.sum(cost, axis=1))

    # Training-row indices ordered from nearest to farthest
    sorted_dist = np.argsort(distances)

    # Keep the k nearest neighbours
    closest_k_indices = sorted_dist[:k]

    # Class labels of those neighbours
    closest_class_label = train_labels[closest_k_indices]

    # Majority vote. np.unique returns the labels in ascending order and
    # argmax picks the first maximum, so a tie goes to the smallest label
    # (the same tie-break scipy.stats.mode uses). Unlike mode(...).mode, the
    # result is always a scalar, whatever the installed SciPy version.
    vote_labels, vote_counts = np.unique(closest_class_label, return_counts=True)
    appended_clas = vote_labels[np.argmax(vote_counts)]

    # Assign the winning class to this test sample
    estimated_clas[i, 0] = appended_clas

print(estimated_clas)

[[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [2.]
 [3.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [2.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]
 [3.]]


# Select the original class of the test data

In [9]:
# Ground-truth labels of the test rows (column 0), as an (n, 1) column vector
true_labels = test_datas.iloc[:, 0].to_numpy()
original_class = true_labels.reshape(-1, 1)
print(original_class)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [2]
 [3]
 [3]
 [3]
 [3]
 [3]
 [3]
 [3]
 [3]
 [3]
 [3]]


# Print the confusion matrix and classification report

In [10]:
# Compare the true test labels with the KNN predictions: first the confusion
# matrix, then the per-class precision / recall / F1 summary.
conf_matrix = confusion_matrix(y_true=original_class, y_pred=estimated_clas)
report = classification_report(y_true=original_class, y_pred=estimated_clas)

print(conf_matrix)
print(report)

[[12  0  0]
 [ 0 14  1]
 [ 0  0 10]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      0.93      0.97        15
           3       0.91      1.00      0.95        10

    accuracy                           0.97        37
   macro avg       0.97      0.98      0.97        37
weighted avg       0.98      0.97      0.97        37

