Exploration of the "Student performance dataset"

In [1]:
import os
import zipfile
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import seaborn as sns
import torch as torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from student_performance import student_model, student_preprocessing
from training_functions import train_model, activation_function

In [2]:
df = pd.read_csv('./data/minahilfatima12328/performance-trends-in-education/StudentPerformanceFactors.csv')
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [3]:
df.describe()
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [4]:
categorical_vars = [col for col in df.columns if df[col].dtype == pd.CategoricalDtype]
numerical_vars = [col for col in df.columns if df[col].dtype != pd.CategoricalDtype]
categorical_vars, numerical_vars    

(['Parental_Involvement',
  'Access_to_Resources',
  'Extracurricular_Activities',
  'Motivation_Level',
  'Internet_Access',
  'Family_Income',
  'Teacher_Quality',
  'School_Type',
  'Peer_Influence',
  'Learning_Disabilities',
  'Parental_Education_Level',
  'Distance_from_Home',
  'Gender'],
 ['Hours_Studied',
  'Attendance',
  'Sleep_Hours',
  'Previous_Scores',
  'Tutoring_Sessions',
  'Physical_Activity',
  'Exam_Score'])

In [5]:
mean_df = df[numerical_vars].mean()
std_df = df[numerical_vars].std()

In [6]:
for col in numerical_vars:
    df[col] = (df[col]-mean_df[col])/std_df[col]

In [7]:
# sns.histplot(df['Attendance'], bins=10)

In [8]:
# sns.heatmap(df[numerical_vars])

In [9]:
# sns.lmplot(data=df, x="Previous_Scores", y="Exam_Score", hue="Parental_Involvement")

In [10]:
# sns.lmplot(data=df, x="Hours_Studied", y="Exam_Score", hue="Parental_Involvement")

In [11]:
# sns.regplot(data=df, x="Attendance", y="Exam_Score")

In [12]:
# sns.regplot(data=df, x="Sleep_Hours", y="Exam_Score")

In [13]:
# sns.histplot(df['Exam_Score'])

In [14]:
# sns.swarmplot(data=df, x="Parental_Involvement", y="Exam_Score")

In [15]:
# sns.swarmplot(data=df, x="Motivation_Level", y="Exam_Score")

In [16]:
# sns.swarmplot(data=df, x="Teacher_Quality", y="Exam_Score")

In [None]:
# sns.swarmplot(data=df, x="Learning_Disabilities", y="Exam_Score") 

In [18]:
# sns.swarmplot(data=df, x="Parental_Education_Level", y="Exam_Score")

In [19]:
# sns.swarmplot(data=df, x="Gender", y="Exam_Score")

In [20]:
# sns.scatterplot(data=df, x="Hours_Studied", y="Sleep_Hours")

In [21]:
# sns.scatterplot(data=df, x="Tutoring_Sessions", y="Exam_Score")

In [22]:
X_train_scaled,X_val_scaled,X_test_scaled,y_train,y_val,y_test, scaler_X, scaler_y = student_preprocessing(df,test_size=0.2,val_size=0.2)

In [23]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [26]:
# Modèle régularisé pour éviter l'overfitting
# Architecture plus petite [64, 32] au lieu de [128, 64, 32]
# Dropout élevé (0.5) pour forte régularisation
model = student_model(
    input_dim=X_train_scaled.shape[1], 
    mode='relu',
    hidden_dims=[64, 32],  # Architecture réduite pour éviter l'overfitting
    dropout_rate=0.5,  # Dropout élevé pour régularisation forte
    use_batch_norm=False  # Désactivé car peut interférer avec dropout élevé
).to(device)

criterion = nn.MSELoss()
learning_rate = 0.001  # Learning rate réduit pour apprentissage plus stable
num_epochs = 100
batch_size = 32

# Optimizer avec weight decay augmenté pour plus de régularisation
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-3)


# Note: y_train, y_val, y_test sont maintenant des numpy arrays (déjà scalés), pas des pandas Series
train_dataset = TensorDataset(torch.tensor(X_train_scaled, dtype=torch.float32).to(device), torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device))
val_dataset = TensorDataset(torch.tensor(X_val_scaled, dtype=torch.float32).to(device), torch.tensor(y_val, dtype=torch.float32).view(-1, 1).to(device))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [27]:
history = train_model(model, criterion, optimizer, num_epochs,train_loader, val_loader,device)

Epoch 1/100, Training Loss: 0.8153, Validation Loss: 0.6413
Epoch 2/100, Training Loss: 0.5180, Validation Loss: 0.4663
Epoch 3/100, Training Loss: 0.4346, Validation Loss: 0.4476
Epoch 4/100, Training Loss: 0.3915, Validation Loss: 0.4395
Epoch 5/100, Training Loss: 0.3746, Validation Loss: 0.4396
Epoch 6/100, Training Loss: 0.3529, Validation Loss: 0.4301
Epoch 7/100, Training Loss: 0.3513, Validation Loss: 0.4304
Epoch 8/100, Training Loss: 0.3309, Validation Loss: 0.4345
Epoch 9/100, Training Loss: 0.3377, Validation Loss: 0.4287
Epoch 10/100, Training Loss: 0.3280, Validation Loss: 0.4220
Epoch 11/100, Training Loss: 0.3464, Validation Loss: 0.4304
Epoch 12/100, Training Loss: 0.3303, Validation Loss: 0.4279
Epoch 13/100, Training Loss: 0.3313, Validation Loss: 0.4310
Epoch 14/100, Training Loss: 0.3270, Validation Loss: 0.4274
Epoch 15/100, Training Loss: 0.3280, Validation Loss: 0.4219
Epoch 16/100, Training Loss: 0.3247, Validation Loss: 0.4373
Epoch 17/100, Training Loss: 0.32