In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from __future__ import division
from scipy.io import arff

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [3]:
def df_normalize(DF):
    df_normalized = DF.copy()
    colList = list(DF.columns)
    for col in range(len(colList)):
        colMean = DF[colList[col]].mean()
        colStd = DF[colList[col]].std()
        df_normalized[colList[col]] = (DF[colList[col]] - colMean)/colStd
    return df_normalized

In [4]:
def KNN_predicted_class(df_training , test_row, df_training_label):
    value_of_k = 7
    df_distance = calculateEculidDist(df_training , test_row)
    rows_k = df_distance.iloc[:value_of_k]
    tmp = df_training_label.iloc[rows_k.index.tolist()]['CLASS'].value_counts()
    return tmp.idxmax()

def calculateEculidDist(df_training , test_row):
    tmp = (((df_training.sub( test_row, axis=1))**2).sum(axis=1))**0.5
    tmp.sort_values(axis=0, ascending=True, inplace=True)
    return tmp

In [16]:
input = arff.loadarff("veh-prime.arff")
df_training = pd.DataFrame(input[0])
df_training_label = df_training[['CLASS']].copy()
df_training.drop('CLASS' , axis=1, inplace=True)
df_training_label['CLASS'] = np.where(df_training_label['CLASS'] == b'noncar', 0, 1)

df_training_normalized = df_normalize(df_training) 

feature_list = df_training.columns.tolist()
feature_list_remaining = df_training.columns.tolist()
print(feature_list)

feature_list_selected = []
accuracy = 0
iteration = 1

print("Wrapper Method")
print("\n")

while (len(feature_list_remaining) > 0):  
    print("Iteration = ", iteration)
    iteration += 1
    list_temp_accuracy = []
    
    for counter in range(len(feature_list_remaining)):
        list_feature_temp = feature_list_selected + [feature_list_remaining[counter]]
        df_train_temp = df_training_normalized[list_feature_temp]
     
        index = 0
        count_accuracy = 0
        list_predicted_class = []

        for row in df_train_temp.itertuples(index=False):
            df_temp = df_train_temp.drop(index)
            predicted_class = KNN_predicted_class(df_temp, row, df_training_label) 
            list_predicted_class.append(predicted_class)
        
        df_test_label_predicted = pd.DataFrame({"CLASS" : list_predicted_class})
        difference_label = df_training_label.sub(df_test_label_predicted , axis=1)
        count_accuracy = len(difference_label[ difference_label['CLASS'] ==0 ])
        list_temp_accuracy.append(round(((count_accuracy/len(df_training_normalized))*100),2))
    
    print("Features    = ",feature_list_remaining )
    print("Accuracies  = ",list_temp_accuracy )  
    
    max_accuracy = max(list_temp_accuracy)
    max_accuracy_index = list_temp_accuracy.index(max(list_temp_accuracy))
    max_accuracy_feature = feature_list_remaining[max_accuracy_index]
        
    print("The Maximum Accuracy achieved is ",max_accuracy, "%, with feature ",max_accuracy_feature)
    if(max_accuracy >= accuracy):
        feature_list_selected.append(max_accuracy_feature)
        feature_list_remaining.remove(max_accuracy_feature)
        accuracy = max_accuracy
        print("The New Selected Feature Subset is ",feature_list_selected)
    else:
        print("LOOVC Accuracy did not increase from the previous iteration", accuracy)
        break
    print("\n")

print("\n")
print("Final Selected Feature set is ,", feature_list_selected)
print("Final Accuracy with above feature set is ", accuracy)
    

['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
Wrapper Method


Iteration =  1
Features    =  ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
Accuracies  =  [65.13, 65.6, 67.61, 64.42, 72.81, 65.48, 62.17, 71.75, 56.62, 64.18, 67.26, 63.24, 64.18, 76.24, 77.3, 65.72, 72.58, 65.72, 64.66, 69.27, 79.55, 66.9, 78.84, 64.54, 65.6, 72.58, 65.6, 62.53, 57.68, 65.13, 64.3, 61.23, 67.73, 66.55, 62.41, 62.65]
The Maximum Accuracy achieved is  79.55 %, with feature  f20
The New Selected Feature Subset is  ['f20']


Iteration =  2
Features    =  ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 