In [1]:
import xlwt
import pandas as pd
import numpy as np

In [2]:
# Download the shared Google Drive file into the working directory; the cell
# output shows it being saved as traintest.xlsx.
# NOTE(review): newer gdown releases reportedly deprecate `--id` and accept the
# bare file ID instead - confirm against the installed gdown version.
!gdown --id 1679x_njEI5pgCouNxmPAUCZC9c_yOYh4

Downloading...
From: https://drive.google.com/uc?id=1679x_njEI5pgCouNxmPAUCZC9c_yOYh4
To: /content/traintest.xlsx
100% 17.7k/17.7k [00:00<00:00, 24.7MB/s]


Membaca Data Latih dan Uji

In [3]:
# Open the workbook once and read both sheets from it. The context manager
# closes the underlying file handle as soon as the sheets are loaded, instead
# of leaving it open for the lifetime of the kernel.
with pd.ExcelFile('traintest.xlsx') as xls:
  data = pd.read_excel(xls, sheet_name='train')  # labelled rows: id, x1, x2, x3, y
  test = pd.read_excel(xls, sheet_name='test')   # rows to classify; y holds '?'

In [4]:
# Preview the first rows of the training sheet (columns: id, x1, x2, x3, y).
data.head()

Unnamed: 0,id,x1,x2,x3,y
0,1,60,64,0,1
1,2,54,60,11,0
2,3,65,62,22,0
3,4,34,60,0,1
4,5,38,69,21,0


In [5]:
# Preview the first rows of the test sheet; its y column is the '?' placeholder.
test.head()

Unnamed: 0,id,x1,x2,x3,y
0,297,43,59,2,?
1,298,67,66,0,?
2,299,58,60,3,?
3,300,49,63,3,?
4,301,45,60,0,?


Modelling

In [6]:
def euclidean_distance(x, y):
  """Return the Euclidean (L2) distance between two rows.

  Only positions 1 .. len(y)-2 take part: the first element and the last
  element of ``y`` are skipped. In this notebook rows are laid out as
  [id, x1, x2, x3, y], so that leaves the three feature values.

  Args:
    x: Indexable row of numbers (at least as long as ``y`` minus one).
    y: Indexable row whose length defines which positions are compared.

  Returns:
    The square root of the summed squared differences (0.0 when no
    positions are compared).
  """
  squared_gaps = ((x[pos] - y[pos]) ** 2 for pos in range(1, len(y) - 1))
  return sum(squared_gaps) ** 0.5

def manhattan_distance(x, y):
  """Return the Manhattan (L1) distance between two rows.

  The first element and the last element of ``y`` are excluded, so only
  positions 1 .. len(y)-2 are compared.

  Args:
    x: Indexable row of numbers.
    y: Indexable row whose length defines which positions are compared.

  Returns:
    The sum of absolute differences over the compared positions (0 when
    there are none).
  """
  compared = range(1, len(y) - 1)
  return sum(abs(x[pos] - y[pos]) for pos in compared)

def minkowski_distance(x, y, p=2):
  """Return the Minkowski distance of order ``p`` between two rows.

  The first element and the last element of ``y`` are excluded, so only
  positions 1 .. len(y)-2 are compared.

  Args:
    x: Indexable row of numbers.
    y: Indexable row whose length defines which positions are compared.
    p: Order of the norm; 1 gives Manhattan, 2 (the default) Euclidean.

  Returns:
    (sum of |x[i] - y[i]| ** p) ** (1 / p) over the compared positions.
  """
  powered_gaps = (abs(x[pos] - y[pos]) ** p for pos in range(1, len(y) - 1))
  total = sum(powered_gaps)
  return total ** (1 / p)

def supremum_distance(x, y):
  """Return the supremum (Chebyshev, L-infinity) distance between two rows.

  The first element and the last element of ``y`` are excluded, so only
  positions 1 .. len(y)-2 are compared.

  Args:
    x: Indexable row of numbers.
    y: Indexable row whose length defines which positions are compared.

  Returns:
    The largest absolute difference over the compared positions.

  Raises:
    ValueError: If no positions are compared (``len(y)`` is 2 or less),
      because ``max`` receives an empty sequence.
  """
  return max(abs(x[pos] - y[pos]) for pos in range(1, len(y) - 1))

In [7]:
def get_distance(x, y, dist_metric, p=2):
  """Compute the distance from one query row to every row of a frame.

  Args:
    x: DataFrame of reference rows. It must have 'id' and 'y' columns; each
      row is handed to the metric as a plain list (``x.iloc[i].to_list()``).
    y: Query row as a list laid out like a row of ``x``.
    dist_metric: One of 'euclidean', 'manhattan', 'minkowski', 'supremum'.
    p: Order used when ``dist_metric`` is 'minkowski' (ignored otherwise).
      The default of 2 matches the previously hard-wired behaviour.

  Returns:
    A list with one ``[distance, id, label]`` entry per row of ``x``, in
    row order.

  Raises:
    ValueError: If ``dist_metric`` is not a supported name. Previously the
      function fell through and returned None, which only failed later in
      the caller with an unrelated error.
  """
  if dist_metric == 'euclidean':
    metric = euclidean_distance
  elif dist_metric == 'manhattan':
    metric = manhattan_distance
  elif dist_metric == 'minkowski':
    # Forward the order so callers are not locked to p=2.
    metric = lambda row, query: minkowski_distance(row, query, p)
  elif dist_metric == 'supremum':
    metric = supremum_distance
  else:
    raise ValueError(
        "Unknown dist_metric %r; expected 'euclidean', 'manhattan', "
        "'minkowski' or 'supremum'" % (dist_metric,))

  # Nothing to measure against (also covers a column-less empty frame).
  if len(x) == 0:
    return []

  # Read ids/labels by position so the result does not depend on the frame
  # having a default 0..n-1 index (x['id'][i] is a label lookup).
  ids = x['id'].to_numpy()
  labels = x['y'].to_numpy()
  return [[metric(x.iloc[i].to_list(), y), ids[i], labels[i]]
          for i in range(len(x))]

In [8]:
def knn(x, y, dist_metric, k):
  """Classify one query row by majority vote among its k nearest neighbours.

  Args:
    x: DataFrame of labelled reference rows (passed on to ``get_distance``).
    y: Query row as a list.
    dist_metric: Name of the distance metric understood by ``get_distance``.
    k: Number of neighbours that vote.

  Returns:
    A tuple ``(predicted_class, neighbours)`` where ``neighbours`` is the
    list of the k closest ``[distance, id, label]`` entries in ascending
    order (ties on distance are ordered by id, then label).
  """
  # Sorting the [distance, id, label] lists orders by distance first.
  neighbours = sorted(get_distance(x, y, dist_metric))[:k]

  # Count the votes; labels other than 0 and 1 are ignored.
  votes_for_1 = sum(1 for entry in neighbours if entry[-1] == 1)
  votes_for_0 = sum(1 for entry in neighbours
                    if not (entry[-1] == 1) and entry[-1] == 0)

  # Class 0 needs a strict majority; a tie goes to class 1.
  predicted_class = 0 if votes_for_0 > votes_for_1 else 1
  return predicted_class, neighbours

In [9]:
def knnAllTest(x, y, dist_metric, k, verbose=None):
  """Run ``knn`` for every row of ``y`` and collect the predictions.

  Args:
    x: DataFrame of labelled reference rows.
    y: DataFrame of rows to classify; the first column is printed as the
      row id when neighbours are reported.
    dist_metric: Name of the distance metric understood by ``get_distance``.
    k: Number of neighbours that vote.
    verbose: Whether to print each row's nearest neighbours. The default
      (None) keeps the original rule: print only when ``y`` has exactly 10
      rows (the size of this notebook's test sheet), so cross-validation
      folds stay silent.

  Returns:
    List of predicted classes, one per row of ``y``, in row order.
  """
  if verbose is None:
    verbose = len(y) == 10

  y_pred = []
  for i in range(len(y)):
    row = y.iloc[i]
    prediction, dist = knn(x, row.to_list(), dist_metric, k)
    if verbose:
      # .iloc[0] reads the first column by position; plain row[0] on a
      # string-labelled Series is deprecated positional access in pandas 2.x.
      print('Data test dengan id: ', row.iloc[0], ', memiliki Nearest Neighbors:', sep='')
      for neighbor in dist:
        print('Distance = ', neighbor[0], ', id = ', neighbor[1], ', y = ', neighbor[2], sep='')
      print()
    y_pred.append(prediction)
  return y_pred

In [10]:
# Accuracy = correct predictions / total predictions
def accuracy(df, y_pred):
  """Return the fraction of predictions that match the true labels.

  Args:
    df: DataFrame with a 'y' column of true labels; row i (by position)
      corresponds to ``y_pred[i]``.
    y_pred: Sequence of predicted labels.

  Returns:
    Number of matches divided by ``len(y_pred)``, as a float in [0, 1].

  Raises:
    ZeroDivisionError: If ``y_pred`` is empty.
    IndexError: If ``df`` has fewer rows than ``y_pred``.
  """
  # Compare by position: df['y'][i] is a label lookup and fails (or picks the
  # wrong row) when the frame's index is not the default 0..n-1.
  actual = df['y'].to_numpy()
  correct = sum(1 for i in range(len(y_pred)) if actual[i] == y_pred[i])
  return correct / len(y_pred)

In [11]:
def kFoldCrossValidation(n_folds=4):
  """Evaluate every (k, distance metric) pair with contiguous k-fold CV.

  The global ``data`` frame is cut into ``n_folds`` contiguous blocks. Each
  block serves once as the validation set while the remaining rows form the
  training set. For the 296-row training sheet and the default of 4 folds
  the boundaries are rows 0-73, 74-147, 148-221 and 222-295, i.e. the same
  split that used to be hard-coded.

  Folds are built by slicing and ``pd.concat`` rather than row-by-row
  ``DataFrame.append``, which was removed in pandas 2.0 and was quadratic.

  Args:
    n_folds: Number of folds (default 4).

  Returns:
    List of ``[fold, k, dist_metric, accuracy_percent]`` entries, ordered by
    fold (1-based), then k (1..10), then metric.

  Raises:
    ValueError: If ``n_folds`` is smaller than 1.
  """
  if n_folds < 1:
    raise ValueError('n_folds must be at least 1, got %r' % (n_folds,))

  metrics = ['euclidean', 'manhattan', 'minkowski', 'supremum']
  n_rows = len(data)
  # Fold f covers rows bounds[f-1] .. bounds[f]-1.
  bounds = [fold * n_rows // n_folds for fold in range(n_folds + 1)]

  acc_list = []  # [fold, k, distance metric, accuracy in percent]
  for fold in range(1, n_folds + 1):
    start, stop = bounds[fold - 1], bounds[fold]
    # Fresh 0..n-1 indexes keep label-based lookups downstream aligned.
    df_val = data.iloc[start:stop].reset_index(drop=True)
    df_train = pd.concat([data.iloc[:start], data.iloc[stop:]],
                         ignore_index=True)
    for k in range(1, 11):
      for dist_metric in metrics:
        y_pred = knnAllTest(df_train, df_val, dist_metric, k)
        acc = accuracy(df_val, y_pred) * 100
        acc_list.append([fold, k, dist_metric, acc])
  return acc_list

In [12]:
def bestHyperparameter():
  """Pick the (k, distance metric) pair with the best mean CV accuracy.

  Runs ``kFoldCrossValidation``, averages the per-fold accuracies of every
  (k, metric) combination, prints the five best combinations and returns
  the winner. Ties keep the evaluation order (lower k first, then the
  metric order euclidean, manhattan, minkowski, supremum) because the
  descending sort is stable.

  Returns:
    Tuple ``(best_k, best_dist_metric)``.
  """
  acc_list = kFoldCrossValidation()
  avg_acc = []
  for k in range(1, 11):
    for dist_metric in ['euclidean', 'manhattan', 'minkowski', 'supremum']:
      fold_accs = [entry[3] for entry in acc_list
                   if entry[1] == k and entry[2] == dist_metric]
      # Average over the folds actually reported instead of assuming 4.
      avg_acc.append([k, dist_metric, sum(fold_accs) / len(fold_accs)])
  best_acc = sorted(avg_acc, key=lambda entry: entry[2], reverse=True)
  # Show the five best (k, distance metric) pairs.
  print('Top 5 best hyperparameter:')
  for entry in best_acc[:5]:
    print(entry)
  print()
  return best_acc[0][0], best_acc[0][1]

Testing

In [13]:
# Select the (k, metric) pair with the best mean cross-validation accuracy.
best_k, best_dist_metric = bestHyperparameter()

# Classify every test row against the full training set with that pair.
y_pred = knnAllTest(data, test, best_dist_metric, best_k)
print('y:', y_pred)

Top 5 best hyperparameter:
[10, 'manhattan', 73.98648648648648]
[10, 'supremum', 73.98648648648648]
[8, 'euclidean', 73.64864864864865]
[8, 'minkowski', 73.64864864864865]
[9, 'supremum', 73.64864864864865]

Data test dengan id: 297, memiliki Nearest Neighbors:
Distance = 1, id = 177, y = 1
Distance = 3, id = 173, y = 1
Distance = 3, id = 251, y = 1
Distance = 3, id = 263, y = 0
Distance = 4, id = 58, y = 1
Distance = 4, id = 137, y = 0
Distance = 4, id = 139, y = 1
Distance = 4, id = 234, y = 1
Distance = 4, id = 277, y = 1
Distance = 4, id = 282, y = 1

Data test dengan id: 298, memiliki Nearest Neighbors:
Distance = 0, id = 149, y = 1
Distance = 1, id = 245, y = 1
Distance = 2, id = 16, y = 1
Distance = 2, id = 168, y = 1
Distance = 3, id = 68, y = 1
Distance = 3, id = 70, y = 1
Distance = 3, id = 105, y = 1
Distance = 3, id = 108, y = 1
Distance = 3, id = 293, y = 1
Distance = 4, id = 31, y = 1

Data test dengan id: 299, memiliki Nearest Neighbors:
Distance = 2, id = 151, y = 1
Dis

Menyimpan Output ke File

In [14]:
# Write the test rows together with their predicted labels to test.xls.
workbook = xlwt.Workbook()
worksheet = workbook.add_sheet('test')

# Header row: the copied input columns followed by the predicted label.
input_columns = ['id', 'x1', 'x2', 'x3']
for col_idx, col_name in enumerate(input_columns + ['y']):
  worksheet.write(0, col_idx, col_name)

# One sheet row per prediction. Iterating over y_pred (instead of a fixed
# range(10)) keeps the export correct if the test sheet changes size.
for row_idx, prediction in enumerate(y_pred):
  for col_idx, col_name in enumerate(input_columns):
    # int(...) turns the numpy integer into a plain int before writing;
    # .iloc reads by position so the frame's index labels do not matter.
    worksheet.write(row_idx + 1, col_idx, int(test[col_name].iloc[row_idx]))
  worksheet.write(row_idx + 1, len(input_columns), prediction)

workbook.save('test.xls')