# Student Performance Dataset

https://www.kaggle.com/datasets/rabieelkharoua/students-performance-dataset

Credit goes to the creator/provider of the Dataset, Rabie El Kharoua.

This work is licensed under the specific Creative Commons license, CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/).

No specific changes where to the data itself.

## Import Libraries

In [84]:
from scipy import stats
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import tensorflow as tf

# Read CSV

In [85]:
filepath = "student_performance_data.csv"
df = pd.read_csv(filepath)

In [86]:
display(df)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.210570,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,3388,18,1,0,3,10.680555,2,0,4,1,0,0,0,3.455509,0.0
2388,3389,17,0,0,1,7.583217,4,1,4,0,1,0,0,3.279150,4.0
2389,3390,16,1,0,2,6.805500,20,0,2,0,0,0,1,1.142333,2.0
2390,3391,16,1,1,0,12.416653,17,0,2,0,1,1,0,1.803297,1.0


# Descriptive Analytics

In [87]:
df.shape

(2392, 15)

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [89]:
df.describe()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645,0.51087,0.877508,1.746237,9.771992,14.541388,0.301421,2.122074,0.383361,0.303512,0.196906,0.157191,1.906186,2.983696
std,690.655244,1.123798,0.499986,1.028476,1.000411,5.652774,8.467417,0.458971,1.122813,0.486307,0.45987,0.397744,0.364057,0.915156,1.233908
min,1001.0,15.0,0.0,0.0,0.0,0.001057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1598.75,15.0,0.0,0.0,1.0,5.043079,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.174803,2.0
50%,2196.5,16.0,1.0,0.0,2.0,9.705363,15.0,0.0,2.0,0.0,0.0,0.0,0.0,1.893393,4.0
75%,2794.25,17.0,1.0,2.0,2.0,14.40841,22.0,1.0,3.0,1.0,1.0,0.0,0.0,2.622216,4.0
max,3392.0,18.0,1.0,3.0,4.0,19.978094,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


# Data Normalization

In [90]:
normalized_df = df.copy()

for col in df.columns:
    normalized_df[col] = normalized_df[col]/normalized_df[col].max()

display(normalized_df)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,0.295106,0.944444,1.0,0.000000,0.50,0.992774,0.241379,1.0,0.50,0.0,0.0,1.0,0.0,0.732299,0.50
1,0.295401,1.000000,0.0,0.000000,0.25,0.771283,0.000000,0.0,0.25,0.0,0.0,0.0,0.0,0.760729,0.25
2,0.295696,0.833333,0.0,0.666667,0.75,0.210759,0.896552,0.0,0.50,0.0,0.0,0.0,0.0,0.028151,1.00
3,0.295991,0.944444,1.0,0.000000,0.75,0.501991,0.482759,0.0,0.75,1.0,0.0,0.0,0.0,0.513555,0.75
4,0.296285,0.944444,1.0,0.000000,0.50,0.233881,0.586207,1.0,0.75,0.0,0.0,0.0,0.0,0.322015,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,0.998821,1.000000,1.0,0.000000,0.75,0.534613,0.068966,0.0,1.00,1.0,0.0,0.0,0.0,0.863877,0.00
2388,0.999116,0.944444,0.0,0.000000,0.25,0.379577,0.137931,1.0,1.00,0.0,1.0,0.0,0.0,0.819787,1.00
2389,0.999410,0.888889,1.0,0.000000,0.50,0.340648,0.689655,0.0,0.50,0.0,0.0,0.0,1.0,0.285583,0.50
2390,0.999705,0.888889,1.0,0.333333,0.00,0.621513,0.586207,0.0,0.50,0.0,1.0,1.0,0.0,0.450824,0.25


# Measure Correlations

In [91]:
corr = normalized_df.corr()

for col in df.columns:
    output = corr[[col]].sort_values(by=col, axis=0, ascending=False).head()
    display(output)
    output = corr[[col]].sort_values(by=col, axis=0, ascending=False).tail()
    display(output)

Unnamed: 0,StudentID
StudentID,1.0
StudyTimeWeekly,0.026976
Absences,0.014841
Volunteering,0.008011
ParentalSupport,0.003016


Unnamed: 0,StudentID
Ethnicity,-0.01299
Gender,-0.014625
Sports,-0.020703
Age,-0.042255
GradeClass,-0.0985


Unnamed: 0,Age
Age,1.0
Gender,0.044895
ParentalSupport,0.033197
ParentalEducation,0.025099
Volunteering,0.013074


Unnamed: 0,Age
Tutoring,-0.012076
Extracurricular,-0.025061
Ethnicity,-0.028473
StudentID,-0.042255
Sports,-0.04632


Unnamed: 0,Gender
Gender,1.0
Age,0.044895
GradeClass,0.022998
Absences,0.021479
Ethnicity,0.01601


Unnamed: 0,Gender
Extracurricular,-0.005964
Sports,-0.008897
GPA,-0.01336
StudentID,-0.014625
Tutoring,-0.031597


Unnamed: 0,Ethnicity
Ethnicity,1.0
ParentalEducation,0.033595
GPA,0.02776
ParentalSupport,0.020922
Gender,0.01601


Unnamed: 0,Ethnicity
Music,-0.014627
Tutoring,-0.01744
GradeClass,-0.023326
Absences,-0.025712
Age,-0.028473


Unnamed: 0,ParentalEducation
ParentalEducation,1.0
GradeClass,0.041031
Music,0.039439
Absences,0.036518
Ethnicity,0.033595


Unnamed: 0,ParentalEducation
StudentID,-0.002307
StudyTimeWeekly,-0.011051
Tutoring,-0.01734
ParentalSupport,-0.017463
GPA,-0.035854


Unnamed: 0,StudyTimeWeekly
StudyTimeWeekly,1.0
GPA,0.179275
ParentalSupport,0.0358
Tutoring,0.02893
StudentID,0.026976


Unnamed: 0,StudyTimeWeekly
Age,-0.0068
ParentalEducation,-0.011051
Volunteering,-0.016604
Extracurricular,-0.02286
GradeClass,-0.134131


Unnamed: 0,Absences
Absences,1.0
GradeClass,0.728633
Sports,0.041454
ParentalEducation,0.036518
Gender,0.021479


Unnamed: 0,Absences
Age,-0.011511
Tutoring,-0.015534
Volunteering,-0.018528
Ethnicity,-0.025712
GPA,-0.919314


Unnamed: 0,Tutoring
Tutoring,1.0
GPA,0.145119
StudyTimeWeekly,0.02893
Sports,0.006278
Extracurricular,0.004865


Unnamed: 0,Tutoring
ParentalEducation,-0.01734
Ethnicity,-0.01744
Gender,-0.031597
Volunteering,-0.050898
GradeClass,-0.111695


Unnamed: 0,ParentalSupport
ParentalSupport,1.0
GPA,0.190774
StudyTimeWeekly,0.0358
Music,0.035122
Age,0.033197


Unnamed: 0,ParentalSupport
Volunteering,-0.006036
Sports,-0.006176
Extracurricular,-0.008381
ParentalEducation,-0.017463
GradeClass,-0.136823


Unnamed: 0,Extracurricular
Extracurricular,1.0
GPA,0.094078
ParentalEducation,0.007479
Tutoring,0.004865
Absences,0.00036


Unnamed: 0,Extracurricular
Sports,-0.01182
Music,-0.014191
StudyTimeWeekly,-0.02286
Age,-0.025061
GradeClass,-0.069733


Unnamed: 0,Sports
Sports,1.0
GPA,0.057859
Absences,0.041454
StudyTimeWeekly,0.006836
Tutoring,0.006278


Unnamed: 0,Sports
Extracurricular,-0.01182
Music,-0.020474
StudentID,-0.020703
GradeClass,-0.026654
Age,-0.04632


Unnamed: 0,Music
Music,1.0
GPA,0.073318
ParentalEducation,0.039439
ParentalSupport,0.035122
Volunteering,0.017224


Unnamed: 0,Music
Tutoring,-0.011385
Extracurricular,-0.014191
Ethnicity,-0.014627
Sports,-0.020474
GradeClass,-0.036065


Unnamed: 0,Volunteering
Volunteering,1.0
Music,0.017224
Ethnicity,0.013468
GradeClass,0.013156
Age,0.013074


Unnamed: 0,Volunteering
ParentalSupport,-0.006036
Extracurricular,-0.007427
StudyTimeWeekly,-0.016604
Absences,-0.018528
Tutoring,-0.050898


Unnamed: 0,GPA
GPA,1.0
ParentalSupport,0.190774
StudyTimeWeekly,0.179275
Tutoring,0.145119
Extracurricular,0.094078


Unnamed: 0,GPA
StudentID,-0.002697
Gender,-0.01336
ParentalEducation,-0.035854
GradeClass,-0.782835
Absences,-0.919314


Unnamed: 0,GradeClass
GradeClass,1.0
Absences,0.728633
ParentalEducation,0.041031
Gender,0.022998
Volunteering,0.013156


Unnamed: 0,GradeClass
StudentID,-0.0985
Tutoring,-0.111695
StudyTimeWeekly,-0.134131
ParentalSupport,-0.136823
GPA,-0.782835


In [92]:
for param in df.drop('GradeClass', axis=1).columns:
    pearson_coef, p_value = stats.pearsonr(df[param], df['GradeClass'])
    print(param)
    print(f'The Pearson Correlation Coefficient for {param} is {pearson_coef} with a P-value of P = {p_value}')

StudentID
The Pearson Correlation Coefficient for StudentID is -0.09849989295454192 with a P-value of P = 1.388395326544007e-06
Age
The Pearson Correlation Coefficient for Age is -0.006250264255547962 with a P-value of P = 0.7599613017314245
Gender
The Pearson Correlation Coefficient for Gender is 0.022997762354162836 with a P-value of P = 0.26087004331186564
Ethnicity
The Pearson Correlation Coefficient for Ethnicity is -0.02332582157243182 with a P-value of P = 0.2541280553087374
ParentalEducation
The Pearson Correlation Coefficient for ParentalEducation is 0.041031287770300205 with a P-value of P = 0.044796807379303
StudyTimeWeekly
The Pearson Correlation Coefficient for StudyTimeWeekly is -0.13413050292456996 with a P-value of P = 4.5056399779171405e-11
Absences
The Pearson Correlation Coefficient for Absences is 0.728632710454859 with a P-value of P = 0.0
Tutoring
The Pearson Correlation Coefficient for Tutoring is -0.11169457883793804 with a P-value of P = 4.324570869955078e-08
P

# Data Processing/Feature Engineering

In [93]:
X = normalized_df.drop('GradeClass', axis=1)
y = normalized_df[['GradeClass']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Statistical Modeling

In [94]:
reg = LinearRegression().fit(X_train, y_train)

In [95]:
predictions_linear_regression = reg.predict(X_test)

# Ensemble Learning

In [96]:
base_models = [
    ('lr', LinearRegression()),
    ('svr', SVR(kernel='rbf', C=1.0, epsilon=0.2)),
    ('dt', DecisionTreeRegressor(random_state=42))
]

meta_model = LinearRegression()

stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

stacking_model.fit(X_train, y_train)

predictions_stacking = stacking_model.predict(X_test)

  y = column_or_1d(y, warn=True)


# Neural Networks

In [97]:
input_shape = X_test.shape[1]
output_shape = y_test.shape[1]

In [98]:
model = Sequential()
model.add(Input(shape=(input_shape,)))
model.add(Dense(output_shape))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=[
    tf.keras.metrics.MeanAbsoluteError(),
    tf.keras.metrics.MeanSquaredError(),
    tf.keras.metrics.RootMeanSquaredError(),
    tf.keras.metrics.R2Score()
    ])

model.summary()

model.fit(X_train, y_train, epochs=100, batch_size=10, verbose=1)

predictions_neural_networks = model.predict(X_test)

Epoch 1/100
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 282us/step - loss: 0.5155 - mean_absolute_error: 0.5895 - mean_squared_error: 0.5155 - r2_score: -4.4792 - root_mean_squared_error: 0.7150
Epoch 2/100
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242us/step - loss: 0.2638 - mean_absolute_error: 0.4122 - mean_squared_error: 0.2638 - r2_score: -1.8324 - root_mean_squared_error: 0.5127
Epoch 3/100
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248us/step - loss: 0.1938 - mean_absolute_error: 0.3547 - mean_squared_error: 0.1938 - r2_score: -1.0400 - root_mean_squared_error: 0.4400
Epoch 4/100
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257us/step - loss: 0.1323 - mean_absolute_error: 0.2942 - mean_squared_error: 0.1323 - r2_score: -0.3944 - root_mean_squared_error: 0.3633
Epoch 5/100
[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 257us/step - loss: 0.1000 - mean_absolute_error: 0.250

# Evaluation

In [99]:
get_percentage = lambda x : (x * 100)

## Statistical Modeling

In [100]:
linear_mae = mean_absolute_error(y_test, predictions_linear_regression)
linear_mse = mean_squared_error(y_test, predictions_linear_regression)
linear_rmse = root_mean_squared_error(y_test, predictions_linear_regression)
linear_r2_score = r2_score(y_test, predictions_linear_regression)

statistical_modeling_metrics = [linear_mae, linear_mse, linear_rmse, linear_r2_score]

In [101]:
linear_mae = get_percentage(linear_mae)
linear_mse = get_percentage(linear_mse)
linear_rmse = get_percentage(linear_rmse)
linear_r2_score = get_percentage(linear_r2_score)

In [102]:
print('Mean Absolute Error: %.2f%%' % linear_mae)
print('Mean Squared Error: %.2f%%' % linear_mse)
print('Root Mean Absolute Error: %.2f%%' % linear_rmse)
print('R^2 Score: %.2f%%' % linear_r2_score)

Mean Absolute Error: 13.59%
Mean Squared Error: 3.32%
Root Mean Absolute Error: 18.21%
R^2 Score: 64.49%


In [103]:
predictions_linear_regression = pd.DataFrame(predictions_linear_regression, columns=['Predictions'])
display(predictions_linear_regression)

Unnamed: 0,Predictions
0,0.927324
1,0.492300
2,0.668899
3,0.284843
4,1.197530
...,...
474,0.851363
475,0.728665
476,0.639770
477,0.481474


In [104]:
predictions_linear_regression.describe()

Unnamed: 0,Predictions
count,479.0
mean,0.738128
std,0.242047
min,0.17686
25%,0.545652
50%,0.735456
75%,0.918261
max,1.297307


## Ensemble Learning

In [105]:
ensemble_mae = mean_absolute_error(y_test, predictions_stacking)
ensemble_mse = mean_squared_error(y_test, predictions_stacking)
ensemble_rmse = root_mean_squared_error(y_test, predictions_stacking)
ensemble_r2_score = r2_score(y_test, predictions_stacking)

ensemble_metrics = [ensemble_mae, ensemble_mse, ensemble_rmse, ensemble_r2_score]

In [106]:
ensemble_mae = get_percentage(ensemble_mae)
ensemble_mse = get_percentage(ensemble_mse)
ensemble_rmse = get_percentage(ensemble_rmse)
ensemble_r2_score = get_percentage(ensemble_r2_score)

In [107]:
print('Mean Absolute Error: %.2f%%' % ensemble_mae)
print('Mean Squared Error: %.2f%%' % ensemble_mse)
print('Root Mean Absolute Error: %.2f%%' % ensemble_rmse)
print('R^2 Score: %.2f%%' % ensemble_r2_score)

Mean Absolute Error: 6.92%
Mean Squared Error: 1.95%
Root Mean Absolute Error: 13.96%
R^2 Score: 79.14%


## Neural Networks

In [108]:
evaluation = model.evaluate(X_test, y_test)

[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416us/step - loss: 0.0355 - mean_absolute_error: 0.1407 - mean_squared_error: 0.0355 - r2_score: 0.6389 - root_mean_squared_error: 0.1883


In [109]:
tf_mae = evaluation[1]
tf_mse = evaluation[2]
tf_rmse = evaluation[3]
tf_r2_score = evaluation[4]

neural_networks_metrics = [tf_mae, tf_mse, tf_rmse, tf_r2_score]

In [110]:
tf_mae = get_percentage(tf_mae)
tf_mse = get_percentage(tf_mse)
tf_rmse = get_percentage(tf_rmse)
tf_r2_score = get_percentage(tf_r2_score)

In [111]:
print('Mean Absolute Error: %.2f%%' % tf_mae)
print('Mean Squared Error: %.2f%%' % tf_mse)
print('Root Mean Absolute Error: %.2f%%' % tf_rmse)
print('R^2 Score: %.2f%%' % tf_r2_score)

Mean Absolute Error: 13.74%
Mean Squared Error: 3.47%
Root Mean Absolute Error: 18.63%
R^2 Score: 62.83%


In [112]:
predictions_neural_networks = pd.DataFrame(predictions_neural_networks, columns=['Predictions'])
display(predictions_neural_networks)

Unnamed: 0,Predictions
0,0.904191
1,0.507877
2,0.683299
3,0.319408
4,1.219061
...,...
474,0.846595
475,0.750999
476,0.605828
477,0.555389


In [113]:
predictions_neural_networks.describe()

Unnamed: 0,Predictions
count,479.0
mean,0.741496
std,0.238988
min,0.159727
25%,0.553795
50%,0.73528
75%,0.931008
max,1.24324


# Evaluation Comparison

In [114]:
metrics_dict = {'Linear Regression':statistical_modeling_metrics, 'Ensemble Learning (Stacking)':ensemble_metrics, 'Neural Networks':neural_networks_metrics}
metrics_df = pd.DataFrame.from_dict(metrics_dict)
display(metrics_df)

Unnamed: 0,Linear Regression,Ensemble Learning (Stacking),Neural Networks
0,0.135896,0.069219,0.137418
1,0.033157,0.019482,0.034715
2,0.182091,0.139577,0.186319
3,0.644942,0.791382,0.62826


In [115]:
print(metrics_df.iloc[0, :].idxmin(axis=0))
print(metrics_df.iloc[1, :].idxmin(axis=0))
print(metrics_df.iloc[2, :].idxmin(axis=0))

Ensemble Learning (Stacking)
Ensemble Learning (Stacking)
Ensemble Learning (Stacking)


In [116]:
print(metrics_df.iloc[3, :].idxmax(axis=0))

Ensemble Learning (Stacking)


# Predicted vs Real Values

In [129]:
index = random.randint(0, len(X_test.index.to_list()))

row = X_test.iloc[[index]]

final_predicted = stacking_model.predict(row)
print('Predicted Value: ' + str(final_predicted[0] * max(df['GradeClass'])))
print('Real Value: ' + str(y_test.iloc[index][0] * max(df['GradeClass'])))

Predicted Value: 3.9869222893195144
Real Value: 4.0


  print('Real Value: ' + str(y_test.iloc[index][0] * max(df['GradeClass'])))


# Conclusion

The best performing model is the Ensemble Learning model (Stacking Regressor).