# Regression Modeling for GPA

## Imports

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = [7, 6]

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline

## Data Import

In [40]:
# Load the cleaned dataset from the local data directory into a DataFrame.
# NOTE(review): the filename spells "lifetyle" (not "lifestyle") — confirm this
# matches the file on disk before renaming anything.
df = pd.read_csv('./data/cleaned_student_lifetyle_dataset.csv')

In [41]:
df.head()

Unnamed: 0,Student_ID,Study_Hours,Extracurricular_Hours,Sleep_Hours,Social_Hours,Physical_Activity_Hours,GPA,Stress_Level
0,1,6.9,3.8,8.7,2.8,1.8,2.99,Moderate
1,2,5.3,3.5,8.0,4.2,3.0,2.75,Low
2,3,5.1,3.9,9.2,1.2,4.6,2.67,Low
3,4,6.5,2.1,7.2,1.7,6.5,2.88,Moderate
4,5,8.1,0.6,6.5,2.2,6.6,3.51,High


In [42]:
df = df.drop(columns=['Stress_Level', 'Student_ID'])
# Stress_Level is excluded as a GPA predictor because it is reserved for the
# classification task.
# Student_ID is only an identifier and is irrelevant for model training.

In [43]:
df.head()

Unnamed: 0,Study_Hours,Extracurricular_Hours,Sleep_Hours,Social_Hours,Physical_Activity_Hours,GPA
0,6.9,3.8,8.7,2.8,1.8,2.99
1,5.3,3.5,8.0,4.2,3.0,2.75
2,5.1,3.9,9.2,1.2,4.6,2.67
3,6.5,2.1,7.2,1.7,6.5,2.88
4,8.1,0.6,6.5,2.2,6.6,3.51


## Model Training

In [44]:
# Features: every remaining column except the target.
X = df.drop(columns='GPA')
# Regression target.
y = df['GPA']

In [45]:
# Hold out 22% of the rows as the test set; the fixed seed keeps the split
# identical across runs so the scores below are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.22)

In [46]:
# Linear regression with default settings; used for the cross-validation
# checks and the held-out evaluation that follow.
model = LinearRegression()

In [47]:
# 5-fold cross-validated RMSE on the training split only (the test set is not
# touched). The scorer returns *negated* RMSE, so multiply by -1 to get
# positive error values. An integer `cv` uses consecutive, unshuffled folds.
a = -1 * cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
print(a)  # per-fold RMSE
print(a.mean())  # mean RMSE across the folds

[0.21047349 0.18969167 0.18501526 0.19638406 0.20400056]
0.19711300899843592


In [48]:
# Shuffled 5-fold splitter with a fixed seed, to compare against the
# unshuffled folds produced by passing `cv=5` directly.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [49]:
# Same cross-validated RMSE as before, but using the shuffled `kfold` splitter.
# The sign is flipped because the scorer reports negated RMSE.
b = -1 * cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_root_mean_squared_error')
print(b)  # per-fold RMSE
print(b.mean())  # mean RMSE across the folds

[0.18994145 0.19133847 0.19803867 0.20735208 0.19394716]
0.19612356737433329


In [50]:
from sklearn.ensemble import AdaBoostRegressor

In [51]:
# Fit on the full training split (cross_val_score above worked on clones and
# left `model` itself unfitted).
model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [52]:
# R^2 on the held-out test set (the default `score` metric for regressors).
model.score(X_test, y_test)

0.5608684699787283

In [53]:
# Fresh, unfitted linear regression to serve as the AdaBoost base estimator.
model_2 = LinearRegression()

In [63]:
# AdaBoost ensemble with linear regression as the base learner; all other
# hyperparameters are left at their defaults and the seed is fixed.
ada_reg = AdaBoostRegressor(estimator=model_2, random_state=42)

In [64]:
ada_reg.fit(X_train, y_train)

0,1,2
,estimator,LinearRegression()
,n_estimators,50
,learning_rate,1.0
,loss,'linear'
,random_state,42

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [65]:
# R^2 of the default-parameter AdaBoost ensemble on the held-out test set.
ada_reg.score(X_test, y_test)

0.5479898778315093

In [66]:
# Search space for tuning AdaBoost: 4 ensemble sizes x 4 learning rates
# (16 candidate combinations).
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.25, 0.5, 0.75, 1.0]
}

In [67]:
# New base learner and AdaBoost instance dedicated to the grid search, kept
# separate from the already-fitted `ada_reg`.
model_3 = LinearRegression()
ada_reg_2 = AdaBoostRegressor(estimator=model_3, random_state=42)

In [68]:
# Exhaustive search over `param_grid`. No `scoring` or `cv` is passed, so the
# scikit-learn defaults apply: the estimator's own `score` (R^2 for
# regressors) with 5-fold cross-validation.
ada_grid = GridSearchCV(ada_reg_2, param_grid=param_grid)

In [69]:
# Run the search on the training split only; with the default refit=True the
# best configuration is refitted on all of X_train afterwards.
ada_grid.fit(X_train, y_train)

0,1,2
,estimator,AdaBoostRegre...ndom_state=42)
,param_grid,"{'learning_rate': [0.25, 0.5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [70]:
# Held-out R^2 of the refitted best AdaBoost configuration.
ada_grid.score(X_test, y_test)

0.550601323167587

In [71]:
ada_grid.best_params_

{'learning_rate': 0.25, 'n_estimators': 50}

In [81]:
# Gradient-boosted trees with default hyperparameters and a fixed seed; tuned
# by the grid search that follows.
gbt = GradientBoostingRegressor(random_state=42)

In [82]:
# Rebinds `param_grid` (previously the AdaBoost grid) to the gradient-boosting
# search space: 4 ensemble sizes x 5 tree depths (20 candidate combinations).
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [1, 2, 3, 4, 5]
}

In [83]:
# Grid search for gradient boosting using scikit-learn's default scoring
# (the estimator's R^2) and default cross-validation.
gbt_grid = GridSearchCV(gbt, param_grid)

In [84]:
gbt_grid.fit(X_train, y_train)

0,1,2
,estimator,GradientBoost...ndom_state=42)
,param_grid,"{'max_depth': [1, 2, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,learning_rate,0.1
,n_estimators,50
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,1
,min_impurity_decrease,0.0


In [85]:
# Held-out R^2 of the refitted best gradient-boosting configuration.
gbt_grid.score(X_test, y_test)

0.545791946002827

In [87]:
# Test-set predictions from the best gradient-boosting model, kept so RMSE can
# be computed alongside R^2.
gbt_pred = gbt_grid.predict(X_test)

In [88]:
# Test RMSE for tuned gradient boosting, in GPA units.
root_mean_squared_error(y_test, gbt_pred)

0.18935060057964162

In [89]:
# Test-set predictions from the tuned AdaBoost (linear base learner) model.
ada_pred = ada_grid.predict(X_test)

In [90]:
# Test RMSE for tuned AdaBoost, in GPA units.
root_mean_squared_error(y_test, ada_pred)

0.18834546436684277

In [91]:
gbt_grid.best_estimator_.feature_names_in_

array(['Study_Hours', 'Extracurricular_Hours', 'Sleep_Hours',
       'Social_Hours', 'Physical_Activity_Hours'], dtype=object)

In [93]:
scaler = StandardScaler()

In [94]:
# Learn the scaling statistics from the training split only, then standardize it.
X_train_scaled = scaler.fit_transform(X_train)

In [95]:
# Apply the training-set statistics to the test split (transform only, no
# refit) so no test information leaks into preprocessing.
X_test_scaled = scaler.transform(X_test)

In [96]:
lr = LinearRegression()

In [97]:
# Fit linear regression on the standardized training features.
lr.fit(X_train_scaled, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [98]:
# Held-out R^2 on standardized features. The recorded value equals the
# unscaled linear-regression score, since ordinary least squares predictions
# do not change when features are standardized.
lr.score(X_test_scaled, y_test)

0.5608684699787283

In [99]:
from sklearn.tree import DecisionTreeRegressor

In [100]:
# Single unconstrained decision tree (all defaults, so no depth limit).
# NOTE(review): unlike the other estimators in this notebook, no random_state
# is set, so the fitted tree and its score may differ slightly between runs —
# consider random_state=42 for reproducibility.
dt = DecisionTreeRegressor()

In [101]:
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [102]:
# Held-out R^2 of the single decision tree.
dt.score(X_test, y_test)

0.011691684717581974

In [112]:
# AdaBoost ensemble with a default (unconstrained) decision tree as the base
# learner, for comparison with the linear-regression base learner used earlier.
ada_dt = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=42)

In [113]:
ada_dt.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeRegressor()
,n_estimators,50
,learning_rate,1.0
,loss,'linear'
,random_state,42

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [114]:
ada_dt.score(X_test, y_test)

0.3644238804965537

In [115]:
# Rebinds `param_grid` once more, this time for the AdaBoost + decision-tree
# search: 4 ensemble sizes x 3 learning rates (12 candidate combinations).
# The default learning rate of 1.0 is not included in this grid.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.25, 0.5, 0.75]
}

In [116]:
# Grid search over the AdaBoost + decision-tree ensemble using the most
# recently bound `param_grid`, with scikit-learn's default scoring and
# cross-validation.
ada_dt_grid = GridSearchCV(ada_dt, param_grid)

In [117]:
ada_dt_grid.fit(X_train, y_train)

0,1,2
,estimator,AdaBoostRegre...ndom_state=42)
,param_grid,"{'learning_rate': [0.25, 0.5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [118]:
# Held-out R^2 of the refitted best AdaBoost + decision-tree configuration.
ada_dt_grid.score(X_test, y_test)

0.3857450770591705

## Old Testing Method

In [None]:
# test_sizes = np.arange(0.2, 0.4, 0.01)
# r2_scores = []
# rmse_scores = []
# for test_size in test_sizes:
#     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=test_size)
#     lr = LinearRegression()
#     rf = RandomForestRegressor()
#     gbr = GradientBoostingRegressor()
#     lr.fit(X_train, y_train)
#     rf.fit(X_train, y_train)
#     gbr.fit(X_train, y_train)
#     r2_scores.append({'Linear Regression': lr.score(X_test, y_test), \
#                       'Random Forest': rf.score(X_test, y_test), \
#                       'Gradient Boosting Regression': gbr.score(X_test, y_test)})
#     lr_preds = lr.predict(X_test)
#     rf_preds = rf.predict(X_test)
#     gbr_preds = gbr.predict(X_test)
#     baseline_preds = np.full_like(y_test, y_test.mean())
#     rmse_scores.append({'Linear Regression': root_mean_squared_error(y_test, lr_preds), \
#                         'Random Forest': root_mean_squared_error(y_test, rf_preds), \
#                         'Gradient Boosting Regression': root_mean_squared_error(y_test, gbr_preds), \
#                         'Baseline': root_mean_squared_error(y_test, baseline_preds)})

# r2 = pd.DataFrame(r2_scores)
# rmse = pd.DataFrame(rmse_scores)

# r2['test_size'] = test_sizes
# rmse['test_size'] = test_sizes

# r2

# sns.lineplot(data=r2, x='test_size', y='Linear Regression', marker='o', label='Linear Regression')
# sns.lineplot(data=r2, x='test_size', y='Random Forest', marker='o', label='Random Forest')
# sns.lineplot(data=r2, x='test_size', y='Gradient Boosting Regression', marker='o', label='Gradient Boosting Regression')
# plt.ylabel("R2 Score")
# plt.xticks(test_sizes, rotation=45);

# rmse

# sns.lineplot(data=rmse, x='test_size', y='Linear Regression', marker='o', label='Linear Regression')
# sns.lineplot(data=rmse, x='test_size', y='Random Forest', marker='o', label='Random Forest')
# sns.lineplot(data=rmse, x='test_size', y='Gradient Boosting Regression', marker='o', label='Gradient Boosting Regression')
# sns.lineplot(data=rmse, x='test_size', y='Baseline', marker='o', label='Baseline')
# plt.ylabel("RMSE Score")
# plt.xticks(test_sizes, rotation=45);

# ### KNN

# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.preprocessing import StandardScaler

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.36)

# sc = StandardScaler()
# X_train_sc = sc.fit_transform(X_train)
# X_test_sc  = sc.transform(X_test)

# scores = []
# for neighbor in range(3, 40, 2):
#     knn = KNeighborsRegressor(n_neighbors=neighbor)
    
#     knn.fit(X_train_sc, y_train)
#     k_preds = knn.predict(X_test_sc)
#     scores.append({'K': neighbor, 'R2': knn.score(X_test_sc, y_test), 'RMSE': root_mean_squared_error(y_test, k_preds)})

# knn_scores = pd.DataFrame(scores)

# sns.lineplot(data=knn_scores, x='K', y='R2', marker='o')
# plt.xticks(range(3, 40, 2));

# sns.lineplot(data=knn_scores, x='K', y='RMSE', marker='o')
# plt.xticks(range(3, 40, 2));

# knn_scores



# ### Best Model Score Analysis

# The best model was Linear Regression with a test size of 0.22

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.22)
# lr = LinearRegression()
# lr.fit(X_train, y_train)
# r2_scorev = lr.score(X_test, y_test)
# lr_preds = lr.predict(X_test)
# # baseline_preds = np.full_like(y_test, y_test.mean())
# rmse_score = root_mean_squared_error(y_test, lr_preds)

# sns.scatterplot(x=y_test, y=lr_preds)
# plt.xlabel("Actual")
# plt.ylabel("Predictions")
# plt.title("Actual vs Predicted GPA (Linear Regression)")
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
# plt.tight_layout()
# # plt.savefig("../images/act_vs_pred_lr.png")

# break

# import joblib

# joblib.dump(lr, "reg.pkl")







