In [1]:
# # only run this for google colab
# !pip install tabpfn
# !pip install bayesian-optimization
# !pip install sklearn
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

## Setup

In [3]:
# Loading Dataset
data = pd.read_csv("dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                2392 non-null   int64  
 1   Gender             2392 non-null   int64  
 2   StudyTimeWeekly    2392 non-null   float64
 3   Absences           2392 non-null   int64  
 4   Extracurricular    2392 non-null   int64  
 5   Sports             2392 non-null   int64  
 6   Music              2392 non-null   int64  
 7   Volunteering       2392 non-null   int64  
 8   GPA                2392 non-null   float64
 9   ParentalInfluence  2392 non-null   int64  
 10  TutoringEffect     2392 non-null   float64
dtypes: float64(3), int64(8)
memory usage: 205.7 KB


In [4]:
# Data Processing
scaler = StandardScaler()
input = scaler.fit_transform(data.drop('GPA', axis=1))

labels = data['GPA']

X_train, X_test, Y_train, Y_test = train_test_split(input, labels)

In [5]:
from tabpfn import TabPFNRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Train and predict TabPFN
reg = TabPFNRegressor(random_state=42)
reg.fit(X_train, Y_train)
Y3_pred = reg.predict(X_test)

# evaluation
print('Mean Squared Error:', mean_squared_error(Y_test, Y3_pred))
print('Mean Absolute Error:', mean_absolute_error(Y_test, Y3_pred))

Mean Squared Error: 0.05396969091151965
Mean Absolute Error: 0.18508261340456786


# 1. Coordinate Descent

In [6]:

age = 15
gender = 0
study_time_weekly = 4.2
absences = 10
extracurricular = 1
sports = 0
music = 0
volunteering = 0
parental_education = 1
parental_support = 1
tutoring = 1

user_data = {
    'Age': [age],
    'Gender': [gender],
    'ParentalEducation': [parental_education],
    'StudyTimeWeekly': [study_time_weekly],
    'Absences': [absences],
    'Tutoring': [tutoring],
    'ParentalSupport': [parental_support],
    'Extracurricular': [extracurricular],
    'Sports': [sports],
    'Music': [music],
    'Volunteering': [volunteering]
}

user_df = pd.DataFrame(user_data)
print(user_df)

#process data
user_input = scaler.transform(user_df)

# predict grade
pred_grade = reg.predict([user_input[0]])
print("Predicted Grade:", pred_grade)



   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   15       0                  1              4.2        10         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                1                1       0      0             0  


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- ParentalEducation
- ParentalSupport
- Tutoring
Feature names seen at fit time, yet now missing:
- ParentalInfluence
- TutoringEffect


In [None]:
# iteratively improve parameters
params_to_change = ['Absences', 'StudyTimeWeekly', 'Tutoring', 'Sports', 'Extracurricular', 'Music', 'Volunteering']
values = {'Absences': np.linspace(0, 29, 30), 'StudyTimeWeekly': np.linspace(0,20,21), 'Tutoring': [0, 1], 'Sports': [0, 1], 'Extracurricular': [0, 1], 'Music': [0, 1], 'Volunteering': [0, 1]}

# iteratively change study habits using coordinate descent (actual model can be more complex and exhaustive)
best_grade = pred_grade
best_params = user_df.copy()
for param in params_to_change:
  value_array = values[param]
  for value in value_array:
    user_df_copy = best_params.copy()
    user_df_copy[param] = value
    user_input_copy = scaler.transform(user_df_copy)
    pred_grade = reg.predict([user_input_copy[0]])
    print(f"Parameter: {param}, Value: {value}, Predicted Grade: {pred_grade[0]}")
    if pred_grade[0] > best_grade:
      best_grade = pred_grade[0]
      best_params = user_df_copy.copy()

print("----------------\nBest Grade:", best_grade)
print("Best Parameters:", best_params)

# skopt approach (best performance)
To be more effective, and perhaps more accurate, we could try a bayesian optimization approach.

In [None]:
# !pip install scikit-optimize

In [None]:
from skopt import gp_minimize
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args

# user input
age = 15
gender = 0
study_time_weekly = 4.2
absences = 10
extracurricular = 1
sports = 0
music = 0
volunteering = 0
parental_education = 1
parental_support = 1
tutoring = 1
desired_grade = 4.0


space = [
    Real(0.0, 20.0, name='StudyTimeWeekly'),  # Continuous variable
    Integer(0, 29, name='Absences'),  # Integer variable
    Categorical([0, 1], name='Tutoring'),  # Categorical variable
    Integer(0, 4, name='ParentalSupport'),  # Integer variable
    Categorical([0, 1], name='Extracurricular'),  # Categorical variable
    Categorical([0, 1], name='Sports'),  # Categorical variable
    Categorical([0, 1], name='Music'),  # Categorical variable
    Categorical([0, 1], name='Volunteering')  # Categorical variable
  ]

@use_named_args(space)
def objective(**params):
    user_data = {
        'Age': age,
        'Gender': gender,
        'ParentalEducation': parental_education,
        'StudyTimeWeekly': params['StudyTimeWeekly'],
        'Absences': params['Absences'],
        'Tutoring': params['Tutoring'],
        'ParentalSupport': params['ParentalSupport'],
        'Extracurricular': params['Extracurricular'],
        'Sports': params['Sports'],
        'Music': params['Music'],
        'Volunteering': params['Volunteering']
    }

    user_df = pd.DataFrame(user_data, index=[0])
    print(user_df)

    #process data
    user_input = scaler.transform(user_df)

    # predict grade
    pred_grade = reg.predict([user_input[0]])
    print(pred_grade)
    score = desired_grade - pred_grade[0] # rating based on how close it can get to 4.0

    return score

res = gp_minimize(objective, space, n_calls=50, random_state=0)

print("Best score: ", res.fun)
print("Best parameters: ", res.x)