# K-Nearest Neighbors Model
This notebook is to experiment with the KNN model to see if we can find students in our dataset that are the most similar to our user. We hope that this can help us get the most out of our dataset.

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

## Data

In [2]:
data = pd.read_csv("database.csv") # change path for your env
data.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [3]:
# Data Splitting & Normalization
scaler = StandardScaler()

# Handle missing values and duplicates
data = data.interpolate(method='linear', limit_direction='forward')
data.drop_duplicates(inplace=True)

# Select relevant columns
columns_to_keep = ['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
                   'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
                   'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA', 'GradeClass']
data = data[columns_to_keep]

# Create the neighborhood DataFrame by dropping specific columns
neighborhood = data.drop(columns=['StudentID', 'Ethnicity', 'StudyTimeWeekly', 'Absences', 
                                   'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 
                                   'Volunteering', 'GradeClass'])
print(neighborhood.head())

   Age  Gender  ParentalEducation       GPA
0   17       1                  2  2.929196
1   18       0                  1  3.042915
2   15       0                  3  0.112602
3   17       1                  3  2.054218
4   17       1                  2  1.288061


In [4]:
# collect user info
age = 17
gender = 0
study_time_weekly = 6
absences = 10
extracurricular = 0
sports = 0
music = 0
volunteering = 0
parental_education = 3
parental_support = 3
tutoring = 1
desired_GPA = 3.5

user_data = {
    'Age': [age],
    'Gender': [gender],
    'ParentalEducation': [parental_education],
    'StudyTimeWeekly': [study_time_weekly],
    'Absences': [absences],
    'Tutoring': [tutoring],
    'ParentalSupport': [parental_support],
    'Extracurricular': [extracurricular],
    'Sports': [sports],
    'Music': [music],
    'Volunteering': [volunteering],
    'GPA': [desired_GPA]
}

user_df = pd.DataFrame(user_data)
print(user_df)

   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   17       0                  3                6        10         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  GPA  
0                3                0       0      0             0  3.5  


## Try without normalizing data

In [5]:
# find 5 nearest neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# dropping all columns that the user can change
user_query = user_df.drop(columns=['StudyTimeWeekly', 'Absences', 
                                   'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 
                                   'Volunteering'])

print(user_query)

nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(neighborhood)
distances, indices = nbrs.kneighbors(user_query)
print("Indices of nearest neighbors:", indices)

# find nearest neighbors in original data
nearest_neighbors = data.iloc[indices.flatten()]
print("Nearest neighbors data:\n", nearest_neighbors)


   Age  Gender  ParentalEducation  GPA
0   17       0                  3  3.5
Indices of nearest neighbors: [[1480 2108  459 1952 1959]]
Nearest neighbors data:
       StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
1480       2481   17       0          0                  3        10.743386   
2108       3109   17       0          1                  3        16.926360   
459        1460   17       0          0                  3         0.876937   
1952       2953   17       0          1                  3        14.196094   
1959       2960   17       0          1                  3        11.044446   

      Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
1480         0         0                1                1       0      0   
2108         6         0                2                1       1      0   
459          3         1                2                0       1      1   
1952         3         0                2              

## Try with normalized data

In [6]:
# find 5 nearest neighbors
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# dropping all columns that the user can change
user_query = user_df.drop(columns=['StudyTimeWeekly', 'Absences', 
                                   'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports', 'Music', 
                                   'Volunteering'])

print(user_query)

# Normalize the data
scaler = StandardScaler()
norm_neighborhood = scaler.fit_transform(neighborhood)
norm_user_query = scaler.transform(user_query)
print("Normalized user data:\n", norm_user_query)

nbrs_norm = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(norm_neighborhood)
distances2, indices2 = nbrs.kneighbors(norm_user_query)
print("Indices of nearest neighbors:", indices2)

# find nearest neighbors in original data
nearest_neighbors2 = data.iloc[indices.flatten()]
print("Nearest neighbors data:\n", nearest_neighbors2)


   Age  Gender  ParentalEducation  GPA
0   17       0                  3  3.5
Normalized user data:
 [[ 0.47291901 -1.02198065  1.25350942  1.74194047]]
Indices of nearest neighbors: [[ 812 1040 1187 1712  384]]
Nearest neighbors data:
       StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
1480       2481   17       0          0                  3        10.743386   
2108       3109   17       0          1                  3        16.926360   
459        1460   17       0          0                  3         0.876937   
1952       2953   17       0          1                  3        14.196094   
1959       2960   17       0          1                  3        11.044446   

      Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
1480         0         0                1                1       0      0   
2108         6         0                2                1       1      0   
459          3         1                2                



## weighted neighborhood (best results)

In [12]:
# Define weights for each feature
weights = {
    'Age': 1.0,  # No change
    'Gender': 2.0,  # Double the importance
    'ParentalEducation': 1.0,  # Half the importance
    'GPA': 100.0,  # Ten times the importance
    # Add weights for all features
}

# Apply weights to the neighborhood and user query
weighted_neighborhood = neighborhood.copy()
weighted_user_query = user_query.copy()

for feature, weight in weights.items():
    weighted_neighborhood[feature] *= weight
    weighted_user_query[feature] *= weight

# Normalize the weighted data
# scaler_weighted = StandardScaler()
# weighted_neighborhood = scaler_weighted.fit_transform(weighted_neighborhood)
# weighted_user_query = scaler_weighted.transform(weighted_user_query)

# Perform nearest neighbors search
nbrs_norm = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(weighted_neighborhood)
distances3, indices3 = nbrs_norm.kneighbors(weighted_user_query)

print("Indices of nearest neighbors:", indices3)

# find nearest neighbors in the original data
nearest_neighbors = data.iloc[indices3.flatten()]
print("Nearest neighbors data:\n", neighborhood.iloc[indices3.flatten()])
print("Nearest neighbors based on weighted features:\n", nearest_neighbors)

Indices of nearest neighbors: [[ 918  716 2352  124 1176]]
Nearest neighbors data:
       Age  Gender  ParentalEducation       GPA
918    18       0                  2  3.491327
716    17       0                  2  3.516941
2352   17       1                  3  3.495950
124    15       0                  2  3.498257
1176   17       1                  2  3.496777
Nearest neighbors based on weighted features:
       StudentID  Age  Gender  Ethnicity  ParentalEducation  StudyTimeWeekly  \
918        1919   18       0          2                  2        10.646087   
716        1717   17       0          1                  2        16.032852   
2352       3353   17       1          0                  3        19.777514   
124        1125   15       0          0                  2         2.580728   
1176       2177   17       1          2                  2        17.048778   

      Absences  Tutoring  ParentalSupport  Extracurricular  Sports  Music  \
918          3         1           