In [46]:
%pip install scikit-lego



In [47]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklego.linear_model import LADRegression
from sklearn.model_selection import train_test_split

In [48]:
# Read the cleaned student-performance table from the working directory
# and preview its first rows.
student_csv = "StudentPerformanceFactorsCleaned.csv"
df = pd.read_csv(student_csv)
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Female,70


In [49]:
# Summarize the frame: number of rows, plus dtype and non-null count per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6443 entries, 0 to 6442
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6443 non-null   int64 
 1   Attendance                  6443 non-null   int64 
 2   Parental_Involvement        6443 non-null   object
 3   Access_to_Resources         6443 non-null   object
 4   Extracurricular_Activities  6443 non-null   object
 5   Sleep_Hours                 6443 non-null   int64 
 6   Previous_Scores             6443 non-null   int64 
 7   Motivation_Level            6443 non-null   object
 8   Internet_Access             6443 non-null   object
 9   Tutoring_Sessions           6443 non-null   int64 
 10  Family_Income               6443 non-null   object
 11  Teacher_Quality             6443 non-null   object
 12  School_Type                 6443 non-null   object
 13  Peer_Influence              6443 non-null   obje

In [50]:
# Show the column labels of the loaded frame.
df.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Gender', 'Exam_Score'],
      dtype='object')

We first split the data into training (60%), validation (20%), and testing (20%) sets.

In [51]:
# Target column, kept as a standalone Series.
y = df["Exam_Score"]

# Randomly draw 60% of the rows for training; the seed fixes the draw.
df_train = df.sample(frac=0.6, random_state=2)

# Everything not drawn for training forms the remaining 40% hold-out pool.
tmp = df.loc[~df.index.isin(df_train.index)]

# Half of the hold-out pool (20% of all rows) becomes the validation set ...
df_validation = tmp.sample(frac=0.5, random_state=3789)

# ... and whatever is left of the pool (the other 20%) is the test set.
df_test = tmp.loc[~tmp.index.isin(df_validation.index)]

# Report the (rows, columns) shape of each split.
print(f"Training data: {df_train.shape}")
print(f"Validation data: {df_validation.shape}")
print(f"Testing data: {df_test.shape}")

Training data: (3866, 19)
Validation data: (1288, 19)
Testing data: (1289, 19)


We now fit a linear regression of exam score on hours studied, and make predictions on the training and validation sets.

In [52]:
# Single-feature inputs reshaped to one column (n_rows, 1), the 2-D layout
# that LinearRegression is given below.
x_train = df_train['Hours_Studied'].to_numpy().reshape(-1, 1)
x_validation = df_validation['Hours_Studied'].to_numpy().reshape(-1, 1)

# Matching targets for each split.
y_train = df_train['Exam_Score']
y_validation = df_validation['Exam_Score']

# Least-squares fit of Exam_Score on Hours_Studied (fit() returns the fitted estimator).
ls_area_fit = LinearRegression().fit(x_train, y_train)

# Predictions on the training rows (in-sample) ...
pred_train_df = ls_area_fit.predict(x_train)

# ... and on the validation rows (out-of-sample).
pred_val_df = ls_area_fit.predict(x_validation)

We calculate MSE and $R^2$.

In [53]:
# Step 5: Calculate evaluation metrics for the training set
train_mse = mean_squared_error(y_train, pred_train_df)
train_r2 = r2_score(y_train, pred_train_df)

print(f"Training MSE: {train_mse}")
print(f"Training R²: {train_r2}")

# Step 6: Calculate evaluation metrics for the validation set
validation_mse = mean_squared_error(y_validation, pred_val_df)
validation_r2 = r2_score(y_validation, pred_val_df)

print(f"Validation MSE: {validation_mse}")
print(f"Validation R²: {validation_r2}")


Training MSE: 11.732696665642777
Training R²: 0.1975764984706978
Validation MSE: 12.75541594990444
Validation R²: 0.2015234091425332


Next, we plot the training data together with the fitted regression line.

In [54]:
# Scatter of the training observations with the fitted least-squares line on top.
fig = go.Figure()

# Observed (Hours_Studied, Exam_Score) pairs from the training split.
fig.add_trace(
    go.Scatter(x=df_train['Hours_Studied'],
                y=df_train['Exam_Score'],
                mode='markers',
                name='Actual'
))

# mode='lines' joins points in the order given, so sort the x-values first;
# otherwise the trace doubles back over itself across the unsorted sample.
hours_sorted = df_train['Hours_Studied'].sort_values()

# Fitted line: intercept + slope * hours, evaluated at the sorted x-values.
fig.add_trace(
    go.Scatter(x=hours_sorted,
                y=ls_area_fit.intercept_ + hours_sorted * ls_area_fit.coef_[0],
                mode='lines',
                name='LS',
                line={'dash': 'solid',
                      'color': 'black'}
))

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title='Exam score vs. hours studied (training data)',
    xaxis_title='Hours_Studied',
    yaxis_title='Exam_Score',
)

fig.show()

Evaluate it on the validation data.

In [55]:
# Create a DataFrame to compare true vs predicted values for validation data
pred_val_df = pd.DataFrame({
    'true': y_validation,
    'predicted': pred_val_df
})

# Display the comparison
print(pred_val_df.head())

      true  predicted
6332    65  68.418919
823     64  67.856129
6092    70  68.981710
4245    66  67.293339
1924    72  69.263105


In [60]:
# Pull the two comparison columns out once so each metric reads cleanly.
val_true = pred_val_df['true']
val_pred = pred_val_df['predicted']

# Error magnitudes: root-mean-squared, mean absolute, and median absolute error.
ls_rmse = np.sqrt(mean_squared_error(val_true, val_pred))
ls_mae = mean_absolute_error(val_true, val_pred)
ls_mad = np.median(np.abs(val_true - val_pred))

# Agreement measures: Pearson correlation (off-diagonal of the 2x2 matrix) and R².
ls_corr = np.corrcoef(val_true, val_pred)[0, 1]
ls_r2 = r2_score(val_true, val_pred)

print(f"LS RMSE: {ls_rmse}")
print(f"LS MAE: {ls_mae}")
print(f"LS MAD: {ls_mad}")
print(f"LS Correlation: {ls_corr}")
print(f"LS R²: {ls_r2}")

LS RMSE: 3.57147251843052
LS MAE: 2.5077887288670846
LS MAD: 2.1677587569089383
LS Correlation: 0.4509614218410936
LS R²: 0.2015234091425332


Our correlation matches the value we obtained in last week's check.

Since the $R^2$ is low and the correlation is only moderate, the model appears to be underfitting. Hours studied alone may not be a strong enough predictor of exam score.

In [61]:
%pip install mlxtend

