# K-Nearest Neighbors Model
This notebook is to experiment with the KNN model to see if we can find students in our dataset that are the most similar to our user. We hope that this can help us get the most out of our dataset.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

## Data

In [2]:
data = pd.read_csv("database.csv") # change path for your env
data.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [4]:
# Data Splitting & Normalization
scaler = StandardScaler()

# Handle missing values and duplicates
data = data.interpolate(method='linear', limit_direction='forward')
data.drop_duplicates(inplace=True)

# Select relevant columns
columns_to_keep = ['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
                   'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
                   'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA', 'GradeClass']
data = data[columns_to_keep]

# Create the neighborhood DataFrame by dropping specific columns
neighborhood = data.drop(columns=['StudentID', 'Ethnicity', 'StudyTimeWeekly', 'Absences', 
                                   'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 
                                   'Volunteering', 'GradeClass'])
print(neighborhood.head())

   Age  Gender  ParentalEducation       GPA
0   17       1                  2  2.929196
1   18       0                  1  3.042915
2   15       0                  3  0.112602
3   17       1                  3  2.054218
4   17       1                  2  1.288061


In [5]:
# collect user info
age = 17
gender = 0
study_time_weekly = 6
absences = 10
extracurricular = 0
sports = 0
music = 0
volunteering = 0
parental_education = 3
parental_support = 3
tutoring = 1
desired_GPA = 3.5

user_data = {
    'Age': [age],
    'Gender': [gender],
    'ParentalEducation': [parental_education],
    'StudyTimeWeekly': [study_time_weekly],
    'Absences': [absences],
    'Tutoring': [tutoring],
    'ParentalSupport': [parental_support],
    'Extracurricular': [extracurricular],
    'Sports': [sports],
    'Music': [music],
    'Volunteering': [volunteering],
    'GPA': [desired_GPA]
}

user_df = pd.DataFrame(user_data)
print(user_df)

   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   17       0                  3                6        10         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  GPA  
0                3                0       0      0             0  3.5  


In [6]:
# find 5 nearest neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# dropping all columns that the user can change
user_query = user_df.drop(columns=['StudyTimeWeekly', 'Absences', 
                                   'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 
                                   'Volunteering'])

print(user_query)

# Normalize the data
scaler = StandardScaler()
X_data = scaler.fit_transform(neighborhood)
X_user = scaler.transform(user_query)
print("Normalized user data:\n", X_user)

nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(X_data)
distances, indices = nbrs.kneighbors(user_query)
print("Indices of nearest neighbors:", indices)

# find nearest neighbors in original data
nearest_neighbors = data.iloc[indices.flatten()]
print("Nearest neighbors data:\n", nearest_neighbors)


   Age  Gender  ParentalEducation  GPA
0   17       0                  3  3.5
Normalized user data:
 [[ 0.47291901 -1.02198065  1.25350942  1.74194047]]
Indices of nearest neighbors: [[1333 1227 2387 1477 1447]]
Nearest neighbors data:
       StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
1333       2334   18       1          3                  4         5.758682   
1227       2228   18       0          0                  3        10.767684   
2387       3388   18       1          0                  3        10.680555   
1477       2478   18       1          0                  3        11.677165   
1447       2448   18       1          1                  3        16.968355   

      Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
1333         4         0                2                1       1      0   
1227         4         0                3                1       1      1   
2387         2         0                4                

