<h4>Importing required Packages and loading the training and testing data</h4>

In [41]:
from scipy.io import arff
import pandas as pd
import math
import operator
import matplotlib.pyplot as plt

train_data = arff.loadarff('trainProdSelection.arff')
training_set = pd.DataFrame(train_data[0])

test_data = arff.loadarff('testProdSelection.arff')
testing_set = pd.DataFrame(test_data[0])

<h4>Printing the training data</h4>

In [42]:
training_set.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend>saving',6.0,40.0,13.62,3.2804,b'C1'
1,b'student',b'spend>saving',11.0,21.0,15.32,2.0232,b'C1'
2,b'student',b'spend>saving',7.0,64.0,16.55,3.1202,b'C1'
3,b'student',b'spend>saving',3.0,47.0,15.71,3.4022,b'C1'
4,b'student',b'spend>saving',15.0,10.0,16.96,2.2825,b'C1'


<h4>Printing the testing data</h4>

In [43]:
testing_set.head()

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,b'student',b'spend<saving',12.0,19.0,14.79,3.7697,b'C1'
1,b'student',b'spend>>saving',29.0,10.0,16.19,2.4839,b'C1'
2,b'student',b'spend<<saving',28.0,60.0,15.46,1.1885,b'C1'
3,b'engineer',b'spend>saving',15.0,41.0,21.26,1.4379,b'C1'
4,b'librarian',b'spend<saving',2.0,9.0,19.7207,0.6913,b'C1'


<h4>Checking the datatype for every column</h4>

In [44]:
pd.DataFrame(train_data[0]).dtypes

Type          object
LifeStyle     object
Vacation     float64
eCredit      float64
salary       float64
property     float64
label         object
dtype: object

<h1>Training set pre-processing</h1>

In [45]:
training_set.Type = training_set.Type.str.decode("UTF-8")
training_set.LifeStyle = training_set.LifeStyle.str.decode("UTF-8")
training_set.label = training_set.label.str.decode("UTF-8")

In [46]:
minValue = training_set.Vacation.min()
maxValue = training_set.Vacation.max()
training_set.Vacation = training_set.Vacation.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.eCredit.min()
maxValue = training_set.eCredit.max()
training_set.eCredit = training_set.eCredit.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.salary.min()
maxValue = training_set.salary.max()
training_set.salary = training_set.salary.apply(lambda x:(x-minValue)/(maxValue-minValue))

minValue = training_set.property.min()
maxValue = training_set.property.max()
training_set.property = training_set.property.apply(lambda x:(x-minValue)/(maxValue-minValue))
training_data=training_set.drop(['Type','LifeStyle'],axis=1)
training_data.head()

Unnamed: 0,Vacation,eCredit,salary,property,label
0,0.079365,0.107558,0.21996,0.183167,C1
1,0.15873,0.052326,0.293102,0.112797,C1
2,0.095238,0.177326,0.346023,0.1742,C1
3,0.031746,0.127907,0.309882,0.189984,C1
4,0.222222,0.020349,0.363663,0.127311,C1


<h1>Training set pre-processing done</h1>

<h1>Testing set pre-processing</h1>

In [47]:
testing_set.Type=testing_set.Type.str.decode("UTF-8")
testing_set.LifeStyle=testing_set.LifeStyle.str.decode("UTF-8")
testing_set.label=testing_set.label.str.decode("UTF-8")
testing_set

Unnamed: 0,Type,LifeStyle,Vacation,eCredit,salary,property,label
0,student,spend<saving,12.0,19.0,14.79,3.7697,C1
1,student,spend>>saving,29.0,10.0,16.19,2.4839,C1
2,student,spend<<saving,28.0,60.0,15.46,1.1885,C1
3,engineer,spend>saving,15.0,41.0,21.26,1.4379,C1
4,librarian,spend<saving,2.0,9.0,19.7207,0.6913,C1
5,librarian,spend>saving,7.0,9.0,12.7098,1.4728,C1
6,professor,spend>saving,5.0,10.0,20.883,1.3131,C1
7,professor,spend<saving,3.0,15.0,16.5711,0.4792,C1
8,student,spend<saving,9.0,71.0,25.7,2.0947,C1
9,student,spend>saving,10.0,67.0,27.11,3.8391,C1


In [56]:
minValue = training_set.Vacation.min()
maxValue = training_set.Vacation.max()
testing_set.Vacation = testing_set.Vacation.apply(lambda x:(x-minValue)/((maxValue-minValue)*100))

minValue = training_set.eCredit.min()
maxValue = training_set.eCredit.max()
testing_set.eCredit = testing_set.eCredit.apply(lambda x:(x-minValue)/((maxValue-minValue)*100))

minValue = training_set.salary.min()
maxValue = training_set.salary.max()
testing_set.salary = testing_set.salary.apply(lambda x:(x-minValue)/((maxValue-minValue)*100))

minValue = training_set.property.min()
maxValue = training_set.property.max()
testing_set.property = testing_set.property.apply(lambda x:(x-minValue)/((maxValue-minValue)*100))
testing_set
testing_data=testing_set.drop(['Type','LifeStyle'],axis=1)
testing_data.head()

Unnamed: 0,Vacation,eCredit,salary,property,label
0,0.12,0.19,0.1479,0.037697,C1
1,0.29,0.1,0.1619,0.024839,C1
2,0.28,0.6,0.1546,0.011885,C1
3,0.15,0.41,0.2126,0.014379,C1
4,0.02,0.09,0.197207,0.006913,C1


<h1>Testing set pre-processing done</h1>

<h3>KNN function</h3>

In [57]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)
 
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
 
def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]
 
def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0

In [58]:
def knn(i):
    predictions=[]
    k = i
    for x in range(len(testing_data)):
        neighbors = getNeighbors(training_data.values, testing_data.values[x], k)
    
        result = getResponse(neighbors)
        predictions.append(result)

    accuracy = getAccuracy(testing_data.values, predictions)
    return accuracy

<h4>Finding the more accurate value</h4>

In [59]:
knn(5)

71.42857142857143

In [60]:
acc={}
for i in range(1,30,2):
    acc[i]=knn(i)
acc

{1: 66.66666666666666,
 3: 71.42857142857143,
 5: 71.42857142857143,
 7: 66.66666666666666,
 9: 66.66666666666666,
 11: 61.904761904761905,
 13: 66.66666666666666,
 15: 61.904761904761905,
 17: 61.904761904761905,
 19: 57.14285714285714,
 21: 52.38095238095239,
 23: 42.857142857142854,
 25: 38.095238095238095,
 27: 38.095238095238095,
 29: 38.095238095238095}

# maximum accuracy 

In [61]:
print( "Maximum accuracy is : ",max(acc.items(), key=operator.itemgetter(1))[1], " at k = ",max(acc.items(), key=operator.itemgetter(1))[0])

Maximum accuracy is :  71.42857142857143  at k =  3
