In [None]:
# only run this for google colab
!pip install tabpfn
!pip install bayesian-optimization
!pip install sklearn
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

Collecting tabpfn
  Downloading tabpfn-2.0.7-py3-none-any.whl.metadata (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.1->tabpfn)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch<3,>=2.1->tabpfn)
  Downloadin

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [3]:
# Loading Dataset
data = pd.read_csv("dataset.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                2392 non-null   int64  
 1   Gender             2392 non-null   int64  
 2   StudyTimeWeekly    2392 non-null   float64
 3   Absences           2392 non-null   int64  
 4   Extracurricular    2392 non-null   int64  
 5   Sports             2392 non-null   int64  
 6   Music              2392 non-null   int64  
 7   Volunteering       2392 non-null   int64  
 8   GPA                2392 non-null   float64
 9   ParentalInfluence  2392 non-null   int64  
 10  TutoringEffect     2392 non-null   float64
dtypes: float64(3), int64(8)
memory usage: 205.7 KB


In [5]:
# Data Processing
scaler = StandardScaler()
input = scaler.fit_transform(data.drop('GPA', axis=1))

labels = data['GPA']

X_train, X_test, Y_train, Y_test = train_test_split(input, labels)

In [6]:
from tabpfn import TabPFNRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Train and predict TabPFN
reg = TabPFNRegressor(random_state=42)
reg.fit(X_train, Y_train)
Y3_pred = reg.predict(X_test)

# evaluation
print('Mean Squared Error:', mean_squared_error(Y_test, Y3_pred))
print('Mean Absolute Error:', mean_absolute_error(Y_test, Y3_pred))

Mean Squared Error: 0.05345235072048487
Mean Absolute Error: 0.1870511221712904


# Baseline Habit Optimization Model

In [9]:
# Prompting User Input
# age = int(input("Enter Age: "))
# gender = int(input("Enter Gender (0 or 1): "))
# study_time_weekly = float(input("Enter Study Time Weekly: "))
# absences = int(input("Enter Absences: "))
# extracurricular = int(input("Enter Extracurricular Activities: "))
# sports = int(input("Enter Sports Participation: "))
# music = int(input("Enter Music Participation: "))
# volunteering = int(input("Enter Volunteering Participation: "))
# parental_support = int(input("Enter Parental Support: "))
# parental_education = int(input("Enter Parental Education: "))
# tutoring = int(input("Enter Tutoring: "))

age = 15
gender = 0
study_time_weekly = 4.2
absences = 10
extracurricular = 1
sports = 0
music = 0
volunteering = 0
parental_education = 1
parental_support = 1
tutoring = 1

user_data = {
    'Age': [age],
    'Gender': [gender],
    'ParentalEducation': [parental_education],
    'StudyTimeWeekly': [study_time_weekly],
    'Absences': [absences],
    'Tutoring': [tutoring],
    'ParentalSupport': [parental_support],
    'Extracurricular': [extracurricular],
    'Sports': [sports],
    'Music': [music],
    'Volunteering': [volunteering]
}

user_df = pd.DataFrame(user_data)
print(user_df)

#process data
user_input = scaler.transform(user_df)

# predict grade
pred_grade = reg.predict([user_input[0]])
print("Predicted Grade:", pred_grade)



   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   15       0                  1              4.2        10         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                1                1       0      0             0  
Predicted Grade: [2.2323787]


In [10]:
# iteratively improve parameters
params_to_change = ['Absences', 'StudyTimeWeekly', 'Tutoring', 'Sports', 'Extracurricular', 'Music', 'Volunteering']
values = {'Absences': np.linspace(0, 29, 30), 'StudyTimeWeekly': np.linspace(0,20,21), 'Tutoring': [0, 1], 'Sports': [0, 1], 'Extracurricular': [0, 1], 'Music': [0, 1], 'Volunteering': [0, 1]}

# iteratively change study habits using coordinate descent (actual model can be more complex and exhaustive)
best_grade = pred_grade
best_params = user_df.copy()
for param in params_to_change:
  value_array = values[param]
  for value in value_array:
    user_df_copy = best_params.copy()
    user_df_copy[param] = value
    user_input_copy = scaler.transform(user_df_copy)
    pred_grade = reg.predict([user_input_copy[0]])
    print(f"Parameter: {param}, Value: {value}, Predicted Grade: {pred_grade[0]}")
    if pred_grade[0] > best_grade:
      best_grade = pred_grade[0]
      best_params = user_df_copy.copy()

print("----------------\nBest Grade:", best_grade)
print("Best Parameters:", best_params)

Parameter: Absences, Value: 0.0, Predicted Grade: 3.2683465480804443
Parameter: Absences, Value: 1.0, Predicted Grade: 3.1467881202697754
Parameter: Absences, Value: 2.0, Predicted Grade: 3.042768955230713
Parameter: Absences, Value: 3.0, Predicted Grade: 2.940708637237549
Parameter: Absences, Value: 4.0, Predicted Grade: 2.8352184295654297
Parameter: Absences, Value: 5.0, Predicted Grade: 2.7326059341430664
Parameter: Absences, Value: 6.0, Predicted Grade: 2.6293506622314453
Parameter: Absences, Value: 7.0, Predicted Grade: 2.528118848800659
Parameter: Absences, Value: 8.0, Predicted Grade: 2.4211363792419434
Parameter: Absences, Value: 9.0, Predicted Grade: 2.3223652839660645
Parameter: Absences, Value: 10.0, Predicted Grade: 2.2323787212371826
Parameter: Absences, Value: 11.0, Predicted Grade: 2.1343131065368652
Parameter: Absences, Value: 12.0, Predicted Grade: 2.031100273132324
Parameter: Absences, Value: 13.0, Predicted Grade: 1.9284096956253052
Parameter: Absences, Value: 14.0, 

# Bayesian Approach
To be more effective, and perhaps more accurate, we could try a bayesian optimization approach. (This is a work in progress).

In [18]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.1.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.1.0 scikit-optimize-0.10.2


In [24]:
from skopt import gp_minimize
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args

# user input
age = 15
gender = 0
study_time_weekly = 4.2
absences = 10
extracurricular = 1
sports = 0
music = 0
volunteering = 0
parental_education = 1
parental_support = 1
tutoring = 1


space = [
    Real(0.0, 20.0, name='StudyTimeWeekly'),  # Continuous variable
    Integer(0, 29, name='Absences'),  # Integer variable
    Categorical([0, 1], name='Tutoring'),  # Categorical variable
    Integer(0, 4, name='ParentalSupport'),  # Integer variable
    Categorical([0, 1], name='Extracurricular'),  # Categorical variable
    Categorical([0, 1], name='Sports'),  # Categorical variable
    Categorical([0, 1], name='Music'),  # Categorical variable
    Categorical([0, 1], name='Volunteering')  # Categorical variable
  ]

@use_named_args(space)
def objective(**params):
    user_data = {
        'Age': age,
        'Gender': gender,
        'ParentalEducation': parental_education,
        'StudyTimeWeekly': params['StudyTimeWeekly'],
        'Absences': params['Absences'],
        'Tutoring': params['Tutoring'],
        'ParentalSupport': params['ParentalSupport'],
        'Extracurricular': params['Extracurricular'],
        'Sports': params['Sports'],
        'Music': params['Music'],
        'Volunteering': params['Volunteering']
    }

    user_df = pd.DataFrame(user_data, index=[0])
    print(user_df)

    #process data
    user_input = scaler.transform(user_df)

    # predict grade
    pred_grade = reg.predict([user_input[0]])
    print(pred_grade)
    score = 4 - pred_grade[0] # rating based on how close it can get to 4.0

    return score

res = gp_minimize(objective, space, n_calls=50, random_state=0)

print("Best score: ", res.fun)
print("Best parameters: ", res.x)

   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   15       0                  1        11.856892        24         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                3                1       0      0             0  
[1.341822]
   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   15       0                  1         5.453126        14         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                2                0       1      0             1  
[2.0053701]
   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   15       0                  1         7.364831        28         0   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                3                0       1      1             1  
[0.68739164]
   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   15       0                  1        