In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score



In [5]:
# Loading Dataset
# Read the raw student records into `data`, print the per-column summary,
# and leave the (rows, columns) shape as the cell's displayed value.
csv_path = "database.csv"
data = pd.read_csv(csv_path)
data.info()
data.shape





In [18]:
# Data Processing
# Fill missing values by linear interpolation (forward direction only), then
# drop exact duplicate rows. Everything is reassigned rather than mutated in
# place so the cell can be re-run safely.
data = data.interpolate(method='linear', limit_direction='forward')
n_duplicates = data.duplicated().sum()
print("Duplicate rows removed:", n_duplicates)
data = data.drop_duplicates()
data = data[['StudentID', 'Age', 'Gender', 'Ethnicity', 'ParentalEducation',
             'StudyTimeWeekly', 'Absences', 'Tutoring', 'ParentalSupport',
             'Extracurricular', 'Sports', 'Music', 'Volunteering', 'GPA', 'GradeClass']]

# Predictor columns. StudentID, Ethnicity and GradeClass are not used as
# predictors; GPA is the regression target. The frame is deliberately NOT
# named `input`, which would shadow the builtin used by the interactive
# prompts further down the notebook.
features = data.drop(columns=['StudentID', 'Ethnicity', 'GradeClass', 'GPA'], errors='ignore')
print(features.head())

labels = data['GPA']

# Split first, with a fixed seed so the reported metrics are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, random_state=42)

# Fit the scaler on the training rows only: fitting on the full dataset would
# leak the test rows' mean/variance into training. The same fitted `scaler` is
# reused later to transform user-supplied rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [10]:
# Model 1: Random Forest
# Train a 20-tree random forest regressor (fixed seed) on the training split,
# predict the held-out split, and print both error metrics.
model_1 = RandomForestRegressor(random_state=42, n_estimators=20).fit(X_train, Y_train)
Y_pred = model_1.predict(X_test)

for metric_label, metric_fn in (
    ('Mean Squared Error:', mean_squared_error),
    ('Mean Absolute Error:', mean_absolute_error),
):
    print(metric_label, metric_fn(Y_test, Y_pred))



# Baseline Habit Optimization Model

In [None]:
# Prompting User Input
# The interactive prompts are disabled; uncomment them to enter a profile by hand.
# age = int(input("Enter Age: "))
# gender = int(input("Enter Gender (0 or 1): "))
# study_time_weekly = float(input("Enter Study Time Weekly: "))
# absences = int(input("Enter Absences: "))
# extracurricular = int(input("Enter Extracurricular Activities: "))
# sports = int(input("Enter Sports Participation: "))
# music = int(input("Enter Music Participation: "))
# volunteering = int(input("Enter Volunteering Participation: "))
# parental_support = int(input("Enter Parental Support: "))
# parental_education = int(input("Enter Parental Education: "))
# tutoring = int(input("Enter Tutoring: "))

# Hard-coded sample student used in place of the prompts above.
age, gender = 15, 0
study_time_weekly = 4.2
absences = 10
# NOTE(review): the search grid later in this notebook only tries 0 and 1 for
# Extracurricular, so 12 looks out of range -- confirm against the dataset.
extracurricular = 12
sports, music, volunteering = 0, 0, 0
# NOTE(review): valid range of ParentalEducation is not visible here -- confirm 6 is allowed.
parental_education, parental_support = 6, 1
tutoring = 1

# (column, value) pairs, listed in the same order as the feature columns that
# remain after the drop in the Data Processing cell.
profile = [
    ('Age', age),
    ('Gender', gender),
    ('ParentalEducation', parental_education),
    ('StudyTimeWeekly', study_time_weekly),
    ('Absences', absences),
    ('Tutoring', tutoring),
    ('ParentalSupport', parental_support),
    ('Extracurricular', extracurricular),
    ('Sports', sports),
    ('Music', music),
    ('Volunteering', volunteering),
]

# Single-row frame: each column holds a one-element list.
user_data = {column: [value] for column, value in profile}
user_df = pd.DataFrame(user_data)
print(user_df)



In [None]:
# Process data: scale the user's row with the scaler fitted during Data Processing.
user_input = scaler.transform(user_df)

# Predict the grade for the unmodified profile; this is the baseline to beat.
pred_grade = model_1.predict(user_input)
print("Predicted Grade:", pred_grade)

# Habits the student can change, and the candidate values tried for each.
# The third argument of np.linspace is the NUMBER of samples, not the step, so
# linspace(0, 29, 1) would only ever try the single value 0. Use explicit grids.
params_to_change = ['Absences', 'StudyTimeWeekly', 'Tutoring', 'Sports', 'Extracurricular', 'Music', 'Volunteering']
values = {
    'Absences': np.arange(0, 30),               # 0, 1, ..., 29
    'StudyTimeWeekly': np.linspace(0, 20, 21),  # 0, 1, ..., 20 hours
    'Tutoring': [0, 1],
    'Sports': [0, 1],
    'Extracurricular': [0, 1],
    'Music': [0, 1],
    'Volunteering': [0, 1],
}

# Coordinate descent: optimise one habit at a time, keeping the best profile
# found so far as the starting point for the next habit (the actual model can
# be more complex and exhaustive).
best_grade = pred_grade[0]  # scalar, so comparisons and the final print are consistent
best_params = user_df.copy()
for param in params_to_change:
    for value in values[param]:
        candidate = best_params.copy()
        candidate[param] = value
        candidate_grade = model_1.predict(scaler.transform(candidate))[0]
        print(f"Parameter: {param}, Value: {value}, Predicted Grade: {candidate_grade}")
        # Strict '>' keeps the earlier profile when a change does not help.
        if candidate_grade > best_grade:
            best_grade = candidate_grade
            best_params = candidate

print("----------------\nBest Grade:", best_grade)
print("Best Parameters:", best_params)



# Bayesian Approach
To be more effective, and perhaps more accurate, we could try a Bayesian optimization approach. (This is a work in progress.)

In [None]:
from bayes_opt import BayesianOptimization

# Search bounds for the habits the student can change. The ranges mirror the
# grid used by the coordinate-descent cell (absences 0-29, weekly study time
# 0-20, binary flags 0/1).
pbounds = {
    'Absences': (0, 29),
    'StudyTimeWeekly': (0, 20),
    'Tutoring': (0, 1),
    'Sports': (0, 1),
    'Extracurricular': (0, 1),
    'Music': (0, 1),
    'Volunteering': (0, 1),
}

# The optimizer proposes continuous values; these habits are rounded to the
# nearest integer before the model sees them. StudyTimeWeekly stays continuous.
INTEGER_HABITS = {'Absences', 'Tutoring', 'Sports', 'Extracurricular', 'Music', 'Volunteering'}


def _snap_habit(name, value):
    """Round `value` to an int when `name` is an integer-valued habit."""
    return int(round(value)) if name in INTEGER_HABITS else value


def objective_function(**habits):
    """Return the GPA predicted by `model_1` for the user with `habits` applied.

    Parameters
    ----------
    **habits : float
        One keyword per key in `pbounds` (the optimizer passes them by name).
        Integer-valued habits are rounded before prediction.

    Returns
    -------
    float
        Predicted GPA for the modified profile; this is the value to maximize.

    Notes
    -----
    The already-trained model is only queried here; it is not modified or
    refit. Fixed attributes (age, gender, parental fields) come from `user_df`.
    """
    candidate = user_df.copy()
    for name, value in habits.items():
        candidate[name] = _snap_habit(name, value)
    return float(model_1.predict(scaler.transform(candidate))[0])


# Set up Bayesian Optimization over the habit space.
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=pbounds,
    random_state=42,
)

# Launch the optimization.
optimizer.maximize(
    init_points=5,  # Number of random initialization points
    n_iter=15       # Number of iterations for Bayesian Optimization
)

# optimizer.max holds the raw (continuous) proposal; report the habits the
# model actually evaluated, i.e. after rounding.
best_habits = {name: _snap_habit(name, value) for name, value in optimizer.max['params'].items()}
print("Best parameters found:", best_habits)
print("Predicted GPA:", optimizer.max['target'])