<a href="https://colab.research.google.com/github/cookiecereal/Machine-learning-proj/blob/main/kNN_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from typing_extensions import dataclass_transform
import pandas as pd
from sklearn.model_selection import train_test_split
import math
from collections import Counter
import numpy as np
from tabulate import tabulate

# 1. Load dataset
def load_dataset(filename):
    """Read the CSV at ``filename`` and return it as a pandas DataFrame.

    ``filename`` is handed to ``pandas.read_csv`` unchanged, so anything that
    function accepts (a path, a URL, an open file-like object) works here.
    """
    return pd.read_csv(filename)

data = load_dataset('student-mat.csv')

# Encode the categorical columns as 0/1 so they can take part in the Euclidean
# distance. The two sexes must map to *different* values: giving both the same
# number turns the column into a constant that adds nothing to any distance.
data['sex'] = data['sex'].map({'F': 1, 'M': 0})

# Every remaining binary column uses the same yes/no vocabulary.
yes_no = {'yes': 1, 'no': 0}
for column in ('schoolsup', 'famsup', 'paid', 'higher', 'internet'):
    data[column] = data[column].map(yes_no)

# Features are every column except the target; the target is the final grade.
X = data.drop(columns=['Finalgrade']).values
y = data['Finalgrade'].values

# 2. Split dataset into training and test sets using scikit-learn
# random_state is fixed so the 50/50 split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


# 3. Define the kNN algorithm
def euclidean_distance(instance1, instance2):
    """Calculate the Euclidean distance between two instances.

    Args:
        instance1: Sized sequence of numeric coordinates.
        instance2: Sized sequence of numeric coordinates with the same length
            as ``instance1``.

    Returns:
        The square root of the summed squared coordinate differences, as a
        float. Two empty instances are at distance ``0.0``.

    Raises:
        ValueError: If the two instances have different lengths. Without this
            check, extra coordinates in a longer ``instance2`` would be
            silently ignored and the distance would be wrong.
    """
    if len(instance1) != len(instance2):
        raise ValueError(
            "instances must have the same number of coordinates: "
            f"{len(instance1)} != {len(instance2)}"
        )

    # Use the builtin sum() rather than a local accumulator that shadows it.
    squared_total = sum((b - a) ** 2 for a, b in zip(instance1, instance2))
    return math.sqrt(squared_total)

def get_neighbors(X_train, test_instance, k):
    """Return the k training rows that lie closest to ``test_instance``.

    Each entry of the result is a ``[distance, index]`` pair, where ``index``
    is the row's position in ``X_train``. Pairs come back ordered by ascending
    distance; rows at equal distance are ordered by ascending index.
    """
    # Pair each distance with its row position so the label can be looked up
    # later from the position alone.
    ranked = [
        [euclidean_distance(test_instance, row), position]
        for position, row in enumerate(X_train)
    ]
    ranked.sort()
    return ranked[:k]

def get_response(neighbors, y_train):
    """Pick the label that occurs most often among the given neighbors.

    ``neighbors`` is a sequence of entries whose second element is a position
    in ``y_train``. The label at each of those positions is one vote. When
    several labels tie for the top count, the one seen first in ``neighbors``
    wins.
    """
    votes = Counter(y_train[neighbor[1]] for neighbor in neighbors)
    # most_common(1) yields a single (label, count) pair; only the label is needed.
    return votes.most_common(1)[0][0]

# 4. Use the kNN algorithm to predict the class labels of the test set
k = 3
predictions = []
for current_instance in X_test:
    # Classify each test row from the labels of its k nearest training rows.
    neighbors = get_neighbors(X_train, current_instance, k)
    prediction = get_response(neighbors, y_train)
    predictions.append(prediction)

# 5. Calculate the accuracy of the predictions

# Build a side-by-side preview of the first few predictions. Slicing (rather
# than indexing a fixed range) keeps this safe when the test set has fewer
# rows than the preview size.
preview_rows = 19
mydata = [['Predicted', 'Actual']]
for predicted, actual in zip(predictions[:preview_rows], y_test[:preview_rows]):
    mydata.append([predicted, actual])

# display table
print(tabulate(mydata))

# Accuracy is the percentage of test rows whose predicted label matches the
# true label exactly; an empty test set is reported as 0% instead of dividing
# by zero.
correct = sum(y_true == y_pred for y_true, y_pred in zip(y_test, predictions))
accuracy = (correct / len(y_test)) * 100.0 if len(y_test) else 0.0
print(f"Accuracy: {accuracy:.2f}%")
#'''

Training set size: 197
Test set size: 198
---------  ------
Predicted  Actual
8          10
8          12
0          5
13         10
7          9
9          13
0          18
9          6
19         0
0          14
16         15
14         7
11         15
12         10
11         14
9          8
13         8
9          11
15         15
---------  ------
Accuracy: 15.15%
