# KNN

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from matplotlib.colors import ListedColormap

In [2]:
# "e"-prefixed frames: train / validation / test CSVs read from data/.
# NOTE(review): the "e" prefix presumably marks an encoded variant of the
# plain splits loaded below — confirm against the preprocessing notebook.
etrain_data = pd.read_csv("data/etrain.csv")
eval_data = pd.read_csv("data/evalidation.csv")
etest_data = pd.read_csv("data/etest.csv")

# Plain train / validation / test CSVs from the same directory.
train_data = pd.read_csv("data/train.csv")
val_data = pd.read_csv("data/validation.csv")
test_data = pd.read_csv("data/test.csv")

# Array of column names used below to select the numeric feature columns.
numeric_features = np.load("numeric_features.npy")

In [3]:
# Columns of the etest frame that are neither listed in numeric_features
# nor the 'target' label; used below to select the non-numeric features.
not_num = [
    column
    for column in etest_data.columns
    if not (column in numeric_features or column == 'target')
]

In [4]:
# Feature matrices: every column of each "e" frame except the label.
etrain_data_t = etrain_data.drop(columns="target")
eval_data_t = eval_data.drop(columns="target")
etest_data_t = etest_data.drop(columns="target")

# Same frames restricted to the columns named in numeric_features.
n_etrain_data_t = etrain_data_t.loc[:, numeric_features]
n_eval_data_t = eval_data_t.loc[:, numeric_features]
n_etest_data_t = etest_data_t.loc[:, numeric_features]

# Same frames restricted to the remaining (non-numeric) columns.
nn_etrain_data_t = etrain_data_t.loc[:, not_num]
nn_eval_data_t = eval_data_t.loc[:, not_num]
nn_etest_data_t = etest_data_t.loc[:, not_num]

# Labels are kept as single-column DataFrames; the fitting cells flatten
# them with .values.ravel() before passing them to the estimators.
etrain_data_target = etrain_data.loc[:, ["target"]]
etest_data_target = etest_data.loc[:, ["target"]]

Mixed features (numeric and non-numeric columns together)

In [5]:
# 4-nearest-neighbour classifier fitted on all feature columns of the
# etrain frame, then evaluated on the etest frame.
classifier = KNeighborsClassifier(n_neighbors=4).fit(
    etrain_data_t,
    etrain_data_target.values.ravel(),  # flatten (n, 1) labels to 1-D
)
y_pred = classifier.predict(etest_data_t)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(etest_data_target, y_pred),
    classification_report(etest_data_target, y_pred),
    sep="\n",
)

[[1322  482  196   51]
 [ 750  775  448  117]
 [ 395  577  775  363]
 [  96  201  617 1144]]
              precision    recall  f1-score   support

           0       0.52      0.64      0.57      2051
           1       0.38      0.37      0.38      2090
           2       0.38      0.37      0.37      2110
           3       0.68      0.56      0.61      2058

    accuracy                           0.48      8309
   macro avg       0.49      0.48      0.48      8309
weighted avg       0.49      0.48      0.48      8309



Only use numeric features

In [6]:
# 4-nearest-neighbour classifier fitted only on the columns listed in
# numeric_features, then evaluated on the matching etest columns.
classifier = KNeighborsClassifier(n_neighbors=4).fit(
    n_etrain_data_t,
    etrain_data_target.values.ravel(),  # flatten (n, 1) labels to 1-D
)
y_pred = classifier.predict(n_etest_data_t)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(etest_data_target, y_pred),
    classification_report(etest_data_target, y_pred),
    sep="\n",
)

[[1600  364   80    7]
 [ 860  854  344   32]
 [ 314  710  887  199]
 [  43  157  643 1215]]
              precision    recall  f1-score   support

           0       0.57      0.78      0.66      2051
           1       0.41      0.41      0.41      2090
           2       0.45      0.42      0.44      2110
           3       0.84      0.59      0.69      2058

    accuracy                           0.55      8309
   macro avg       0.57      0.55      0.55      8309
weighted avg       0.57      0.55      0.55      8309



Only use categorical features (every column not listed in `numeric_features`)

In [7]:
# 4-nearest-neighbour classifier fitted only on the not_num columns
# (those absent from numeric_features), evaluated on the etest frame.
classifier = KNeighborsClassifier(n_neighbors=4).fit(
    nn_etrain_data_t,
    etrain_data_target.values.ravel(),  # flatten (n, 1) labels to 1-D
)
y_pred = classifier.predict(nn_etest_data_t)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(etest_data_target, y_pred),
    classification_report(etest_data_target, y_pred),
    sep="\n",
)

[[1144  487  282  138]
 [ 667  652  514  257]
 [ 377  521  695  517]
 [ 129  225  635 1069]]
              precision    recall  f1-score   support

           0       0.49      0.56      0.52      2051
           1       0.35      0.31      0.33      2090
           2       0.33      0.33      0.33      2110
           3       0.54      0.52      0.53      2058

    accuracy                           0.43      8309
   macro avg       0.43      0.43      0.43      8309
weighted avg       0.43      0.43      0.43      8309



## MDC (minimum-distance classifier, implemented with `NearestCentroid`)

In [8]:
from sklearn.neighbors import NearestCentroid 

Mixed features (numeric and non-numeric columns together)

In [9]:
# Nearest-centroid classifier (default settings) fitted on all feature
# columns of the etrain frame, then evaluated on the etest frame.
model = NearestCentroid().fit(
    etrain_data_t,
    etrain_data_target.values.ravel(),  # flatten (n, 1) labels to 1-D
)
y_pred = model.predict(etest_data_t)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(etest_data_target, y_pred),
    classification_report(etest_data_target, y_pred),
    sep="\n",
)

[[1132  492  260  167]
 [ 620  628  561  281]
 [ 247  448  735  680]
 [  41  126  372 1519]]
              precision    recall  f1-score   support

           0       0.55      0.55      0.55      2051
           1       0.37      0.30      0.33      2090
           2       0.38      0.35      0.36      2110
           3       0.57      0.74      0.65      2058

    accuracy                           0.48      8309
   macro avg       0.47      0.48      0.47      8309
weighted avg       0.47      0.48      0.47      8309



Only use numeric features

In [10]:
# Nearest-centroid classifier (default settings) fitted only on the columns
# listed in numeric_features, evaluated on the matching etest columns.
model = NearestCentroid().fit(
    n_etrain_data_t,
    etrain_data_target.values.ravel(),  # flatten (n, 1) labels to 1-D
)
y_pred = model.predict(n_etest_data_t)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(etest_data_target, y_pred),
    classification_report(etest_data_target, y_pred),
    sep="\n",
)

[[1369  567  111    4]
 [ 680  890  484   36]
 [ 310  497  967  336]
 [  83   66  451 1458]]
              precision    recall  f1-score   support

           0       0.56      0.67      0.61      2051
           1       0.44      0.43      0.43      2090
           2       0.48      0.46      0.47      2110
           3       0.79      0.71      0.75      2058

    accuracy                           0.56      8309
   macro avg       0.57      0.57      0.57      8309
weighted avg       0.57      0.56      0.56      8309



Only use categorical features (every column not listed in `numeric_features`)

In [11]:
# Nearest-centroid classifier (default settings) fitted only on the not_num
# columns (those absent from numeric_features), evaluated on the etest frame.
model = NearestCentroid().fit(
    nn_etrain_data_t,
    etrain_data_target.values.ravel(),  # flatten (n, 1) labels to 1-D
)
y_pred = model.predict(nn_etest_data_t)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(etest_data_target, y_pred),
    classification_report(etest_data_target, y_pred),
    sep="\n",
)

[[1116  466  239  230]
 [ 630  604  488  368]
 [ 263  434  648  765]
 [  50  153  405 1450]]
              precision    recall  f1-score   support

           0       0.54      0.54      0.54      2051
           1       0.36      0.29      0.32      2090
           2       0.36      0.31      0.33      2110
           3       0.52      0.70      0.60      2058

    accuracy                           0.46      8309
   macro avg       0.45      0.46      0.45      8309
weighted avg       0.45      0.46      0.45      8309

