# KNN (k-Nearest Neighbors)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from matplotlib.colors import ListedColormap

In [2]:
# Splits without the "e" prefix. They are loaded here but not referenced by the
# cells visible in this notebook section.
train_data = pd.read_csv("data/train.csv")
val_data = pd.read_csv("data/validation.csv")
test_data = pd.read_csv("data/test.csv")

# "e"-prefixed splits (presumably an encoded variant -- confirm with the
# preprocessing notebook). Later cells read a 'target' column from these frames.
etrain_data = pd.read_csv("data/etrain.csv")
eval_data = pd.read_csv("data/evalidation.csv")
etest_data = pd.read_csv("data/etest.csv")

# Arrays of column names used below to slice the "e" frames into feature subsets.
numeric_features = np.load('numeric_features.npy')
# NOTE: allow_pickle=True unpickles objects stored in the file, which can run
# arbitrary code -- only load .npy files that come from a trusted source.
selected_features = np.load('selected_features.npy', allow_pickle=True)

In [3]:
# Every column of the encoded test frame that is neither listed in
# numeric_features nor the 'target' label, in original column order.
not_num = [
    column
    for column in etest_data.columns
    if column not in numeric_features and column != 'target'
]

In [4]:
# Full feature matrices: each encoded split with the 'target' label removed.
etrain_data_t = etrain_data.drop(columns=['target'])
etest_data_t = etest_data.drop(columns=['target'])
eval_data_t = eval_data.drop(columns=['target'])

# Column subsets of the feature matrices above.
#   n_*  -> columns listed in numeric_features
#   nn_* -> columns listed in not_num (everything not in numeric_features)
#   sf_* -> columns listed in selected_features
n_etrain_data_t = etrain_data_t.loc[:, numeric_features]
n_etest_data_t = etest_data_t.loc[:, numeric_features]
n_eval_data_t = eval_data_t.loc[:, numeric_features]

nn_etrain_data_t = etrain_data_t.loc[:, not_num]
nn_etest_data_t = etest_data_t.loc[:, not_num]
nn_eval_data_t = eval_data_t.loc[:, not_num]

sf_etrain_data_t = etrain_data_t.loc[:, selected_features]
sf_etest_data_t = etest_data_t.loc[:, selected_features]
sf_eval_data_t = eval_data_t.loc[:, selected_features]

# Labels are kept as single-column DataFrames (double brackets); the model
# cells flatten the training labels with .values.ravel() before fitting.
etrain_data_target = etrain_data[["target"]]
etest_data_target = etest_data[["target"]]

In [5]:
# Experiment set 1: parallel lists of display label / training frame / test frame.
# NOTE(review): there are three labels but four frames in each data list. The
# model loops iterate over len(dataset_name_1), so the trailing sf_* entry is
# never evaluated through this set -- confirm whether that is intended.
dataset_name_1 = [
    "Mix feature",
    "Only use numeric features",
    "Only use category features",
]
train_data_name_1 = [
    etrain_data_t,
    n_etrain_data_t,
    nn_etrain_data_t,
    sf_etrain_data_t,
]
test_data_name_1 = [
    etest_data_t,
    n_etest_data_t,
    nn_etest_data_t,
    sf_etest_data_t,
]

# Experiment set 2: same layout, with the selected-feature frames in first place.
dataset_name_2 = [
    "Selected mix feature",
    "Only use numeric features",
    "Only use category features",
]
train_data_name_2 = [
    sf_etrain_data_t,
    n_etrain_data_t,
    nn_etrain_data_t,
]
test_data_name_2 = [
    sf_etest_data_t,
    n_etest_data_t,
    nn_etest_data_t,
]

In [6]:
def cal(i, result):
    """Combine classification reports into one support-weighted F1 and accuracy.

    Each report contributes its weighted-average F1 score and its accuracy,
    weighted by that report's share of the total support. The total is taken
    from the reports themselves, so the function does not depend on any
    notebook-level variable and the weights always sum to 1.

    Parameters
    ----------
    i : any
        Unused. Kept so existing ``cal(i, result)`` call sites keep working.
    result : iterable of dict
        Dicts shaped like ``classification_report(..., output_dict=True)``:
        each must provide ``['accuracy']`` and a ``['weighted avg']`` entry
        with ``'f1-score'`` and ``'support'``.

    Returns
    -------
    tuple
        ``(f_score, accuracy)``. ``(0, 0)`` when ``result`` is empty or the
        combined support is zero.

    Raises
    ------
    KeyError
        If a report lacks one of the keys listed above.
    """
    reports = list(result)
    total = sum(report['weighted avg']['support'] for report in reports)
    if not total:
        # Nothing to average (no reports, or no samples behind them); also
        # avoids dividing by zero below.
        return 0, 0

    f_score = 0
    accuracy = 0
    for report in reports:
        weight = report['weighted avg']['support'] / total
        f_score += report['weighted avg']['f1-score'] * weight
        accuracy += report['accuracy'] * weight
    return f_score, accuracy

In [7]:
# Evaluate a 4-nearest-neighbour classifier on each labelled feature view of
# experiment set 1. The same estimator object is re-fitted for every view.
classifier = KNeighborsClassifier(n_neighbors=4)
train_labels = etrain_data_target.values.ravel()  # flatten (n, 1) labels to 1-D for fit()
for i, view_name in enumerate(dataset_name_1):
    print(view_name)
    classifier.fit(train_data_name_1[i], train_labels)
    y_pred = classifier.predict(test_data_name_1[i])
    # Human-readable report first, then the dict form that cal() consumes.
    print(classification_report(etest_data_target, y_pred))
    result = [
        classification_report(etest_data_target, y_pred, zero_division=True, output_dict=True)
    ]
    print("f_score: %f  accuracy: %f " % cal(i, result))
    print('==========================================================')

Mix feature
              precision    recall  f1-score   support

           0       0.52      0.64      0.57      2051
           1       0.38      0.37      0.38      2090
           2       0.38      0.37      0.37      2110
           3       0.68      0.56      0.61      2058

    accuracy                           0.48      8309
   macro avg       0.49      0.48      0.48      8309
weighted avg       0.49      0.48      0.48      8309

f_score: 0.482710  accuracy: 0.483331 
Only use numeric features
              precision    recall  f1-score   support

           0       0.57      0.78      0.66      2051
           1       0.41      0.41      0.41      2090
           2       0.45      0.42      0.44      2110
           3       0.84      0.59      0.69      2058

    accuracy                           0.55      8309
   macro avg       0.57      0.55      0.55      8309
weighted avg       0.57      0.55      0.55      8309

f_score: 0.547439  accuracy: 0.548321 
Only use categ

In [8]:
# Evaluate a 4-nearest-neighbour classifier on each labelled feature view of
# experiment set 2 (selected features first). The estimator is re-fitted per view.
classifier = KNeighborsClassifier(n_neighbors=4)
train_labels = etrain_data_target.values.ravel()  # flatten (n, 1) labels to 1-D for fit()
for i, view_name in enumerate(dataset_name_2):
    print(view_name)
    classifier.fit(train_data_name_2[i], train_labels)
    y_pred = classifier.predict(test_data_name_2[i])
    # Human-readable report first, then the dict form that cal() consumes.
    print(classification_report(etest_data_target, y_pred))
    result = [
        classification_report(etest_data_target, y_pred, zero_division=True, output_dict=True)
    ]
    print("f_score: %f  accuracy: %f " % cal(i, result))
    print('==========================================================')

Selected mix feature
              precision    recall  f1-score   support

           0       0.50      0.71      0.59      2051
           1       0.38      0.37      0.37      2090
           2       0.40      0.37      0.38      2110
           3       0.75      0.53      0.63      2058

    accuracy                           0.49      8309
   macro avg       0.51      0.50      0.49      8309
weighted avg       0.51      0.49      0.49      8309

f_score: 0.491831  accuracy: 0.493682 
Only use numeric features
              precision    recall  f1-score   support

           0       0.57      0.78      0.66      2051
           1       0.41      0.41      0.41      2090
           2       0.45      0.42      0.44      2110
           3       0.84      0.59      0.69      2058

    accuracy                           0.55      8309
   macro avg       0.57      0.55      0.55      8309
weighted avg       0.57      0.55      0.55      8309

f_score: 0.547439  accuracy: 0.548321 
Only 

## MDC (Minimum Distance Classifier, implemented with NearestCentroid)

In [9]:
from sklearn.neighbors import NearestCentroid 

Mix feature

In [10]:
# Evaluate a nearest-centroid classifier on each labelled feature view of
# experiment set 1. The same estimator object is re-fitted for every view.
model = NearestCentroid()
train_labels = etrain_data_target.values.ravel()  # flatten (n, 1) labels to 1-D for fit()
for i, view_name in enumerate(dataset_name_1):
    print(view_name)
    model.fit(train_data_name_1[i], train_labels)
    y_pred = model.predict(test_data_name_1[i])
    # Human-readable report first, then the dict form that cal() consumes.
    print(classification_report(etest_data_target, y_pred))
    result = [
        classification_report(etest_data_target, y_pred, zero_division=True, output_dict=True)
    ]
    print("f_score: %f  accuracy: %f " % cal(i, result))
    print('==========================================================')

Mix feature
              precision    recall  f1-score   support

           0       0.55      0.55      0.55      2051
           1       0.37      0.30      0.33      2090
           2       0.38      0.35      0.36      2110
           3       0.57      0.74      0.65      2058

    accuracy                           0.48      8309
   macro avg       0.47      0.48      0.47      8309
weighted avg       0.47      0.48      0.47      8309

f_score: 0.472468  accuracy: 0.483091 
Only use numeric features
              precision    recall  f1-score   support

           0       0.56      0.67      0.61      2051
           1       0.44      0.43      0.43      2090
           2       0.48      0.46      0.47      2110
           3       0.79      0.71      0.75      2058

    accuracy                           0.56      8309
   macro avg       0.57      0.57      0.57      8309
weighted avg       0.57      0.56      0.56      8309

f_score: 0.564049  accuracy: 0.563726 
Only use categ

In [11]:
# Evaluate a nearest-centroid classifier on each labelled feature view of
# experiment set 2 (selected features first). The estimator is re-fitted per view.
model = NearestCentroid()
train_labels = etrain_data_target.values.ravel()  # flatten (n, 1) labels to 1-D for fit()
for i, view_name in enumerate(dataset_name_2):
    print(view_name)
    model.fit(train_data_name_2[i], train_labels)
    y_pred = model.predict(test_data_name_2[i])
    # Human-readable report first, then the dict form that cal() consumes.
    print(classification_report(etest_data_target, y_pred))
    result = [
        classification_report(etest_data_target, y_pred, zero_division=True, output_dict=True)
    ]
    print("f_score: %f  accuracy: %f " % cal(i, result))
    print('==========================================================')

Selected mix feature
              precision    recall  f1-score   support

           0       0.55      0.57      0.56      2051
           1       0.37      0.26      0.31      2090
           2       0.37      0.36      0.36      2110
           3       0.58      0.75      0.66      2058

    accuracy                           0.48      8309
   macro avg       0.47      0.48      0.47      8309
weighted avg       0.47      0.48      0.47      8309

f_score: 0.469644  accuracy: 0.483211 
Only use numeric features
              precision    recall  f1-score   support

           0       0.56      0.67      0.61      2051
           1       0.44      0.43      0.43      2090
           2       0.48      0.46      0.47      2110
           3       0.79      0.71      0.75      2058

    accuracy                           0.56      8309
   macro avg       0.57      0.57      0.57      8309
weighted avg       0.57      0.56      0.56      8309

f_score: 0.564049  accuracy: 0.563726 
Only 