In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

In [5]:
# Loading Dataset
data = pd.read_csv("database.csv") 
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


(2392, 15)

# Data Processing

In [6]:
# Data Processing
data=data.interpolate(method ='linear', limit_direction ='forward')
data.duplicated().sum()
data.drop_duplicates(inplace=True)
data = data[['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
             'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
             'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA', 'GradeClass']]

input = data.drop(columns=['StudentID', 'Ethnicity', 'GradeClass', 'GPA'], errors='ignore')
scaler = StandardScaler()
print(input.head())
input = scaler.fit_transform(input)

labels = data['GPA']

X_train, X_test, Y_train, Y_test = train_test_split(input, labels)

   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   17       1                  2        19.833723         7         1   
1   18       0                  1        15.408756         0         0   
2   15       0                  3         4.210570        26         0   
3   17       1                  3        10.028829        14         0   
4   17       1                  2         4.672495        17         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                2                0       0      1             0  
1                1                0       0      0             0  
2                2                0       0      0             0  
3                3                1       0      0             0  
4                3                0       0      0             0  


# Baseline Grade Prediction Model

In [7]:
# Random Forest Regression
model_1 = RandomForestRegressor(n_estimators=20, random_state=42)
model_1.fit(X_train, Y_train)
Y_pred = model_1.predict(X_test)
print('Mean Squared Error:', mean_squared_error(Y_test, Y_pred))
print('Mean Absolute Error:', mean_absolute_error(Y_test, Y_pred))

Mean Squared Error: 0.06607083976758066
Mean Absolute Error: 0.20366609614450795


# Baseline Habit Optimization Model

In [11]:
# Prompting User Input
# age = int(input("Enter Age: "))
# gender = int(input("Enter Gender (0 or 1): "))
# study_time_weekly = float(input("Enter Study Time Weekly: "))
# absences = int(input("Enter Absences: "))
# extracurricular = int(input("Enter Extracurricular Activities: "))
# sports = int(input("Enter Sports Participation: "))
# music = int(input("Enter Music Participation: "))
# volunteering = int(input("Enter Volunteering Participation: "))
# parental_support = int(input("Enter Parental Support: "))
# parental_education = int(input("Enter Parental Education: "))
# tutoring = int(input("Enter Tutoring: "))

age = 17
gender = 0
study_time_weekly = 6
absences = 10
extracurricular = 0
sports = 0
music = 0
volunteering = 0
parental_education = 3
parental_support = 1
tutoring = 1

user_data = {
    'Age': [age],
    'Gender': [gender],
    'ParentalEducation': [parental_education],
    'StudyTimeWeekly': [study_time_weekly],
    'Absences': [absences],
    'Tutoring': [tutoring],
    'ParentalSupport': [parental_support],
    'Extracurricular': [extracurricular],
    'Sports': [sports],
    'Music': [music],
    'Volunteering': [volunteering]
}

user_df = pd.DataFrame(user_data)
print(user_df)

   Age  Gender  ParentalEducation  StudyTimeWeekly  Absences  Tutoring  \
0   17       0                  3                6        10         1   

   ParentalSupport  Extracurricular  Sports  Music  Volunteering  
0                1                0       0      0             0  


In [16]:
#process data
user_input = scaler.transform(user_df)

# predict grade
pred_grade = model_1.predict([user_input[0]])
print("Predicted Grade:", pred_grade)

# iteratively improve parameters
params_to_change = ['Absences', 'StudyTimeWeekly', 'Tutoring', 'Sports', 'Extracurricular', 'Music', 'Volunteering']
values = {'Absences': [0, 5, 10, 20], 'StudyTimeWeekly': [5, 10, 20, 30], 'Tutoring': [0, 3], 'Sports': [0, 1], 'Extracurricular': [0, 1], 'Music': [0, 1], 'Volunteering': [0, 1]}

Predicted Grade: [2.22642743]


In [17]:
# iteratively change study habits using coordinate descent (actual model can be more complex and exhaustive)
best_grade = pred_grade
best_params = user_df.copy()
for param in params_to_change:
  value_array = values[param]
  for value in value_array:
    user_df_copy = best_params.copy()
    user_df_copy[param] = value
    user_input_copy = scaler.transform(user_df_copy)
    pred_grade = model_1.predict([user_input_copy[0]])
    print(f"Parameter: {param}, Value: {value}, Predicted Grade: {pred_grade[0]}")
    if pred_grade[0] > best_grade:
      best_grade = pred_grade[0]
      best_params = user_df_copy.copy()

print("----------------\nBest Grade:", best_grade)
print("Best Parameters:", best_params)

Parameter: Absences, Value: 0, Predicted Grade: 3.1653164454777394
Parameter: Absences, Value: 5, Predicted Grade: 2.629985433801923
Parameter: Absences, Value: 10, Predicted Grade: 2.2264274302654323
Parameter: Absences, Value: 20, Predicted Grade: 1.105848333031902
Parameter: StudyTimeWeekly, Value: 5, Predicted Grade: 3.134207446643018
Parameter: StudyTimeWeekly, Value: 10, Predicted Grade: 3.360647912311511
Parameter: StudyTimeWeekly, Value: 20, Predicted Grade: 3.5690404446399184
Parameter: StudyTimeWeekly, Value: 30, Predicted Grade: 3.5690404446399184
Parameter: Tutoring, Value: 0, Predicted Grade: 3.43854561820759
Parameter: Tutoring, Value: 3, Predicted Grade: 3.5690404446399184
Parameter: Sports, Value: 0, Predicted Grade: 3.5690404446399184
Parameter: Sports, Value: 1, Predicted Grade: 3.672331082301015
Parameter: Extracurricular, Value: 0, Predicted Grade: 3.672331082301015
Parameter: Extracurricular, Value: 1, Predicted Grade: 3.717197450371861
Parameter: Music, Value: 0, 