In [8]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from __future__ import division
from scipy.io import arff


In [9]:
def df_normalize(DF):
    """Return a z-score standardised version of ``DF``.

    Every column is shifted by its own mean and divided by its own sample
    standard deviation (the pandas default for ``std()``).  ``DF`` itself is
    left untouched; a new frame is returned.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame to standardise.  The columns must support ``mean()`` and
        ``std()``, i.e. they are expected to be numeric.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and column labels as ``DF`` holding the
        standardised values.  A column whose standard deviation is exactly
        zero (all values identical) is returned as all zeros rather than NaN.
    """
    colMean = DF.mean()
    colStd = DF.std()
    # A constant column has zero spread, so (x - mean) / std would be 0 / 0,
    # i.e. NaN in every row.  Dividing by 1 instead keeps the centred value 0.
    colStd[colStd == 0] = 1.0
    # Frame-with-Series arithmetic lines the Series up with the column labels,
    # so each column is paired with its own mean and deviation.
    return (DF - colMean) / colStd

In [10]:
def PCC(df_x , df_y, SumofYSqrd, mean_y):
    """Pearson correlation coefficient between ``df_x`` and ``df_y``.

    The coefficient is built from raw moments: each population variance is
    ``E[v**2] - E[v]**2`` and the covariance is ``E[x*y] - E[x]*E[y]``.

    Parameters
    ----------
    df_x : array-like
        Values of the first variable.
    df_y : array-like
        Values of the second variable.
    SumofYSqrd : number
        Sum of the squared ``df_y`` values, supplied by the caller.
    mean_y : number
        Mean of ``df_y``, supplied by the caller.

    Returns
    -------
    number
        Covariance divided by the product of the two population standard
        deviations.
    """
    count_x = float(len(df_x))
    count_y = len(df_y)
    mean_x = np.mean(df_x)

    # Population variances from the raw second moments.
    variance_x = (np.sum(np.square(df_x)) / count_x) - (mean_x**2)
    variance_y = (SumofYSqrd / float(count_y)) - (mean_y**2)

    # Population covariance from the mean of the element-wise products.
    covariance = (np.sum(df_x * df_y) / count_y) - (mean_x * mean_y)

    return covariance / (np.sqrt(variance_x) * np.sqrt(variance_y))

In [11]:
def KNN_predicted_class(df_training , test_row, df_training_label, value_of_k=7):
    """Predict the class of ``test_row`` by majority vote of its nearest neighbours.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Feature rows to search for neighbours.
    test_row : sequence
        Feature values of the sample to classify, one per column of
        ``df_training``.
    df_training_label : pandas.DataFrame
        Frame with a ``'CLASS'`` column.  It is looked up by the index labels
        of ``df_training``, so it must contain a row for each of them.
    value_of_k : int, optional
        Number of nearest neighbours that vote.  Defaults to 7.

    Returns
    -------
    The ``'CLASS'`` value that occurs most often among the neighbours.

    Raises
    ------
    ValueError
        If ``value_of_k`` is smaller than 1.
    """
    if value_of_k < 1:
        raise ValueError("value_of_k must be at least 1")

    # Distances come back sorted ascending, so the first k entries are the
    # nearest neighbours; their index carries the training-row labels.
    df_distance = calculateEculidDist(df_training , test_row)
    neighbour_labels = df_distance.index[:value_of_k]

    # These are index *labels*, not positions, so select with .loc.  A
    # positional lookup only agrees with this when the label frame happens to
    # be indexed 0..n-1.
    votes = df_training_label.loc[neighbour_labels, 'CLASS'].value_counts()
    return votes.idxmax()

def calculateEculidDist(df_training , test_row):
    """Euclidean distance from ``test_row`` to every row of ``df_training``.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Feature rows to measure against.
    test_row : sequence
        Feature values subtracted from each row, matched up with the columns
        of ``df_training``.

    Returns
    -------
    pandas.Series
        One distance per training row, keeping the training index labels and
        sorted from the smallest distance to the largest.
    """
    # Per-feature differences between each training row and the test row.
    offsets = df_training.sub(test_row, axis=1)
    # Square, add up across the features of each row, then take the root.
    squared_lengths = (offsets ** 2).sum(axis=1)
    distances = squared_lengths ** 0.5
    return distances.sort_values(axis=0, ascending=True)

In [12]:
# Read the ARFF file; element 0 of the result holds the data records.
input = arff.loadarff("veh-prime.arff")
df_training = pd.DataFrame(input[0])

# Split the class column off into its own one-column frame, encoded as
# 0 for b'noncar' and 1 for any other value, then remove it from the features.
df_training_label = pd.DataFrame(
    {'CLASS': np.where(df_training['CLASS'] == b'noncar', 0, 1)},
    index=df_training.index,
)
df_training = df_training.drop('CLASS', axis=1)

df_training_normalized = df_normalize(df_training)

# The label-side terms passed to PCC are the same for every feature, so they
# are computed once up front.
SumofYSqrd = np.sum(np.square(df_training_label['CLASS']))
mean_y = np.mean(df_training_label['CLASS'])

list_pcc = []
list_pcc_abs = []
list_feature = []

# Correlate each (un-normalised) feature column with the class label and
# record the feature name, the signed coefficient and its magnitude.
for counter, feature_name in enumerate(df_training.columns):
    pcc = PCC(df_training[feature_name], df_training_label['CLASS'], SumofYSqrd, mean_y)
    list_feature.append(feature_name)
    list_pcc.append(pcc)
    list_pcc_abs.append(np.abs(pcc))
    

In [15]:
# Rank the features by the magnitude of their correlation with the class
# label, strongest first.
dict_temp = {'feature' : list_feature , 'pcc' : list_pcc, 'abspcc' : list_pcc_abs}
df_pcc = pd.DataFrame(dict_temp).sort_values(['abspcc'], ascending=False)

list_feature_ranked = df_pcc['feature'].tolist()

# The accuracy denominator does not change between feature sets.
total_rows = len(df_training_normalized)

# Grow the feature set one ranked feature at a time and score every set with
# leave-one-out classification: each row is predicted from all the other rows.
for counter in range(len(list_feature_ranked)):
    selected_features = list_feature_ranked[:counter+1]
    print("The Following Selected Feature Set = ", selected_features)
    df_training_temp = df_training_normalized[selected_features]
    count_accuracy = 0
    # Walk the index labels alongside the rows so the held-out row is dropped
    # and its true class is looked up by its actual label.  A hand-kept
    # 0, 1, 2, ... counter only matches the labels of a default index.
    for index, row in zip(df_training_temp.index, df_training_temp.itertuples(index=False)):
        df_temp = df_training_temp.drop(index)
        predicted_class = KNN_predicted_class(df_temp, row, df_training_label)
        if predicted_class == df_training_label.loc[index, 'CLASS']:
            count_accuracy += 1

    print("Has an Accuracy Percentage of = ", round((count_accuracy / total_rows)*100, 2))
    print("Has an Accuracy Count of = ", count_accuracy)
    print("\n")

The Following Selected Feature Set =  ['f4']
Has an Accuracy Percentage of =  69.74
Has an Accuracy Count of =  590


The Following Selected Feature Set =  ['f4', 'f13']
Has an Accuracy Percentage of =  79.79
Has an Accuracy Count of =  675


The Following Selected Feature Set =  ['f4', 'f13', 'f14']
Has an Accuracy Percentage of =  82.15
Has an Accuracy Count of =  695


The Following Selected Feature Set =  ['f4', 'f13', 'f14', 'f16']
Has an Accuracy Percentage of =  83.1
Has an Accuracy Count of =  703


The Following Selected Feature Set =  ['f4', 'f13', 'f14', 'f16', 'f7']
Has an Accuracy Percentage of =  83.57
Has an Accuracy Count of =  707


The Following Selected Feature Set =  ['f4', 'f13', 'f14', 'f16', 'f7', 'f22']
Has an Accuracy Percentage of =  83.57
Has an Accuracy Count of =  707


The Following Selected Feature Set =  ['f4', 'f13', 'f14', 'f16', 'f7', 'f22', 'f26']
Has an Accuracy Percentage of =  86.17
Has an Accuracy Count of =  729


The Following Selected Feature 