In [1]:
import random, warnings
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
# SettingWithCopyWarning is importable from different places depending on the
# pandas version: the public pandas.errors module (pandas >= 1.5) or the
# private pandas.core.common module (older releases). Try both so this cell
# does not fail with ImportError on either.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # this pandas build does not define the warning, so there is nothing to silence
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action = "ignore", category = SettingWithCopyWarning)

In [2]:
# load the feature table from the csv file in the working directory
features = pd.read_csv("features.csv")

In [3]:
# keep only the last three columns of the feature table (the formant data)
formant_columns = slice(-3, None)
data_matrix = features.iloc[:, formant_columns]

In [4]:
# Series holding the class number of every row (column at position 4)
class_column_position = 4
data_labels = features.iloc[:, class_column_position]

0      1
1      2
2      3
3      1
4      2
      ..
145    2
146    3
147    1
148    2
149    3
Name: class_number, Length: 150, dtype: int64


In [None]:
kList = [1, 3, 5, 7, 10] # k values that will be tried
# NOTE(review): KNeighborsClassifier is used with its default p, so "minkowski"
# is expected to give the same distances as "euclidean" - confirm whether a
# different p was intended for that entry
distMetricList = ["euclidean", "manhattan", "chebyshev", "minkowski"]
n = 1000 # number of repetitions (random train/test splits) per configuration
seed = 0 # fixed seed so that re-running the cell reproduces the same graphs

# labels for graphs; phoneme_labels[j] is the name of class number class_numbers[j]
phoneme_labels = ["IY", "AE", "ER"]
class_numbers = np.array([1, 2, 3])

for distMetric in distMetricList:
    for k in kList:
        # running totals, row = predicted class, column = actual class
        confusionArray = np.zeros((len(class_numbers), len(class_numbers)))

        # restarting the generator for every configuration means each
        # (metric, k) pair is evaluated on the same sequence of n random splits
        split_rng = np.random.RandomState(seed)

        for _ in range(n):
            # splitting the training and test data with a 3:1 split
            X_train, X_test, y_train, y_test = train_test_split(
                data_matrix, data_labels, test_size = 0.25, random_state = split_rng)

            # training the model
            knn_model = KNeighborsClassifier(n_neighbors = k, metric = distMetric)
            knn_model.fit(X_train, y_train)

            # getting array of predicted results (same row order as X_test / y_test)
            test_preds = knn_model.predict(X_test)

            # one boolean column per class: does the sample have that predicted / actual class?
            # y_test is already aligned with X_test, so no lookup back into `features` is needed
            predicted_is_class = np.asarray(test_preds)[:, None] == class_numbers
            actual_is_class = np.asarray(y_test)[:, None] == class_numbers

            # entry [p][a] of the product counts the samples predicted as class p
            # whose actual class is a; samples with any other class number count nowhere
            confusionArray += predicted_is_class.T.astype(int) @ actual_is_class.astype(int)

        # getting average count per test split before plotting graph
        confusionArray /= n

        dataFrameGraph = pd.DataFrame(confusionArray, index = phoneme_labels,
                                      columns = phoneme_labels)
        fig, ax = plt.subplots(figsize = (4, 4))
        sn.heatmap(dataFrameGraph, annot = True, ax = ax)
        # axis labels are set after drawing because heatmap applies its own
        # axis labels and would overwrite anything set beforehand
        ax.set_xlabel("Actual Phoneme")
        ax.set_ylabel("Predicted Phoneme")
        ax.set_title("Confusion Array for k = " + str(k) + "\nwith distance metric: " + distMetric)
        # saving all graphs as images
#         plt.savefig("k_" + str(k) + "-dist_" + distMetric + ".png", transparent = True)
        