# Machine Learning Model Training

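This notebook implements the training phase for predicting student performance (G3 grade) using processed and engineered data. It first trains decision-tree classifiers for binned G1 and G2 grades, adds their predictions as extra features, and then fits linear regression models for G3.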

### Import Dependencies

In [16]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
import os
import joblib

### Load Processed Data

In [17]:
# Read the four processed training tables, one CSV per cohort
# (Mat/Por file prefix x FE/M file suffix). Only training data is loaded here.
X_train_mat_f = pd.read_csv('processed_data/X_MatFE_enhanced.csv')
X_train_mat_m = pd.read_csv('processed_data/X_MatM_enhanced.csv')
X_train_por_f = pd.read_csv('processed_data/X_PorFE_enhanced.csv')
X_train_por_m = pd.read_csv('processed_data/X_PorM_enhanced.csv')

# Report the (rows, columns) shape of every table; one print call, one line
# of output per argument.
print(
    "Data loaded successfully:",
    f"X_train_mat_f shape: {X_train_mat_f.shape}",
    f"X_train_mat_m shape: {X_train_mat_m.shape}",
    f"X_train_por_f shape: {X_train_por_f.shape}",
    f"X_train_por_m shape: {X_train_por_m.shape}",
    sep="\n",
)

Data loaded successfully:
X_train_mat_f shape: (208, 36)
X_train_mat_m shape: (186, 36)
X_train_por_f shape: (383, 36)
X_train_por_m shape: (265, 36)


### Encode Categorical Variables

In [18]:
# Encode categorical variables
def encode_categorical(df):
    """Label-encode every object-dtype column of a DataFrame.

    The encoding is applied to a copy, so the DataFrame passed in by the
    caller is left unchanged and the call can be repeated safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose object-dtype columns should be converted to integer codes.

    Returns
    -------
    tuple
        ``(encoded_df, label_encoders)`` where ``encoded_df`` is the encoded
        copy of ``df`` and ``label_encoders`` maps each encoded column name to
        the ``LabelEncoder`` fitted on that column.
    """
    # Work on a copy: assigning encoded columns straight into `df` would
    # silently overwrite the caller's raw data.
    encoded = df.copy()
    label_encoders = {}
    # NOTE(review): only columns reported as 'object' dtype are encoded; if
    # the installed pandas infers a dedicated string dtype these would be
    # skipped - confirm against the environment in use.
    for column in encoded.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        encoded[column] = le.fit_transform(encoded[column])
        # Keep the fitted encoder so the same mapping can be reused later.
        label_encoders[column] = le
    return encoded, label_encoders

# Label-encode each cohort separately. Every call yields the encoded frame
# together with that cohort's fitted encoders (keyed by column name), and the
# frame variables are rebound to the encoded versions.
(
    (X_train_mat_f, le_mat_f),
    (X_train_mat_m, le_mat_m),
    (X_train_por_f, le_por_f),
    (X_train_por_m, le_por_m),
) = map(
    encode_categorical,
    (X_train_mat_f, X_train_mat_m, X_train_por_f, X_train_por_m),
)

### Split Data into Features and Target Variables

In [19]:
# Separate a table into model inputs and the column to predict
def split_features_target(df, target):
    """Split ``df`` into a feature frame and a target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    target : str
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``target`` column and
        ``y`` is the ``target`` column itself.
    """
    return df.drop(columns=[target]), df[target]

# Build (features, target) pairs for every cohort, once with G1 as the target
# and once with G2.
# NOTE(review): only the target column is removed from the features, so the
# remaining grade columns (e.g. G3, and G2 when predicting G1) stay in as
# inputs - confirm this is intended and not target leakage.

# G1 as target
(
    (X_train_mat_f_g1, y_train_mat_f_g1),
    (X_train_mat_m_g1, y_train_mat_m_g1),
    (X_train_por_f_g1, y_train_por_f_g1),
    (X_train_por_m_g1, y_train_por_m_g1),
) = (
    split_features_target(cohort, 'G1')
    for cohort in (X_train_mat_f, X_train_mat_m, X_train_por_f, X_train_por_m)
)

# G2 as target
(
    (X_train_mat_f_g2, y_train_mat_f_g2),
    (X_train_mat_m_g2, y_train_mat_m_g2),
    (X_train_por_f_g2, y_train_por_f_g2),
    (X_train_por_m_g2, y_train_por_m_g2),
) = (
    split_features_target(cohort, 'G2')
    for cohort in (X_train_mat_f, X_train_mat_m, X_train_por_f, X_train_por_m)
)

### Classify Grades into 5 Distinct Classes

In [20]:
# Bin a grade column into 5 ordinal classes
def classify_grades(y):
    """Discretize a series of grades into 5 ordinal classes.

    A fresh ``KBinsDiscretizer`` (``strategy='uniform'``) is fitted on ``y``
    for every call, so the bin edges depend only on the values passed in and
    the fitted discretizer is not kept.

    Parameters
    ----------
    y : pandas.Series
        Grades to bin.

    Returns
    -------
    numpy.ndarray
        Integer class labels, one row per input value (the input is reshaped
        to a single column before binning).
    """
    grades_column = y.values.reshape(-1, 1)
    binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
    return binner.fit_transform(grades_column).astype(int)

# Convert every G1 / G2 target series into binned class labels. Each series
# is discretized on its own, so bin edges are computed per cohort and grade.

# G1 class labels
(
    y_train_mat_f_g1_class,
    y_train_mat_m_g1_class,
    y_train_por_f_g1_class,
    y_train_por_m_g1_class,
) = (
    classify_grades(grades)
    for grades in (y_train_mat_f_g1, y_train_mat_m_g1, y_train_por_f_g1, y_train_por_m_g1)
)

# G2 class labels
(
    y_train_mat_f_g2_class,
    y_train_mat_m_g2_class,
    y_train_por_f_g2_class,
    y_train_por_m_g2_class,
) = (
    classify_grades(grades)
    for grades in (y_train_mat_f_g2, y_train_mat_m_g2, y_train_por_f_g2, y_train_por_m_g2)
)

### Train Decision Tree Classifiers for G1 and G2 Prediction

In [21]:
# One decision-tree classifier per (grade, sex) pair, all with the same fixed
# seed so repeated runs build identical trees.
# NOTE(review): only the mat_* cohorts are used for training here; the por_*
# frames prepared above are not consumed in this cell.
dt_g1_female = DecisionTreeClassifier(random_state=42)
dt_g1_male = DecisionTreeClassifier(random_state=42)
dt_g2_female = DecisionTreeClassifier(random_state=42)
dt_g2_male = DecisionTreeClassifier(random_state=42)

# Fit the G1 classifiers on the binned G1 labels
dt_g1_female.fit(X_train_mat_f_g1, y_train_mat_f_g1_class)
dt_g1_male.fit(X_train_mat_m_g1, y_train_mat_m_g1_class)

# Fit the G2 classifiers on the binned G2 labels
dt_g2_female.fit(X_train_mat_f_g2, y_train_mat_f_g2_class)
dt_g2_male.fit(X_train_mat_m_g2, y_train_mat_m_g2_class)

### Save the Trained Decision Tree Models

In [22]:
# Persist the four fitted classifiers under models/, creating the directory
# on first use (no error if it already exists).
os.makedirs(name='models', exist_ok=True)
joblib.dump(value=dt_g1_female, filename='models/dt_g1_female.joblib')
joblib.dump(value=dt_g1_male, filename='models/dt_g1_male.joblib')
joblib.dump(value=dt_g2_female, filename='models/dt_g2_female.joblib')
joblib.dump(value=dt_g2_male, filename='models/dt_g2_male.joblib')

print("Models saved successfully.")

Models saved successfully.


### Predict G1 and G2 Using the Trained Decision Tree Models

In [23]:
# Predict G1 and G2 using the trained Decision Tree models
# Each classifier is applied to the same feature frame it was fitted on, so
# these are in-sample predictions of the binned grade class, not raw grades.
# The results are written into X_train_mat_f / X_train_mat_m in place as new
# columns 'G1_pred' and 'G2_pred', which makes them part of the feature set
# when G3 is later split off from these frames.
# NOTE(review): because the frames are mutated here, re-running the earlier
# G1/G2 split cell after this one would pull the *_pred columns into those
# feature sets as well - run the notebook top to bottom.
X_train_mat_f['G1_pred'] = dt_g1_female.predict(X_train_mat_f_g1)
X_train_mat_m['G1_pred'] = dt_g1_male.predict(X_train_mat_m_g1)
X_train_mat_f['G2_pred'] = dt_g2_female.predict(X_train_mat_f_g2)
X_train_mat_m['G2_pred'] = dt_g2_male.predict(X_train_mat_m_g2)

### Prepare Data for G3 Prediction

In [24]:
# Build the G3 training sets for the two mat_* cohorts. Only the 'G3' column
# is removed, so every other column of the frame stays in the features.
(X_train_mat_f_g3, y_train_mat_f_g3), (X_train_mat_m_g3, y_train_mat_m_g3) = (
    split_features_target(cohort, 'G3')
    for cohort in (X_train_mat_f, X_train_mat_m)
)

### Train Linear Regression Model for G3 Prediction

In [25]:
# One ordinary least-squares regressor per sex, each fitted on its own
# cohort's G3 feature frame and raw G3 target.
lr_g3_female = LinearRegression()
lr_g3_male = LinearRegression()

lr_g3_female.fit(X_train_mat_f_g3, y_train_mat_f_g3)
lr_g3_male.fit(X_train_mat_m_g3, y_train_mat_m_g3)

### Save the Trained Linear Regression Models

In [26]:
# Persist both fitted G3 regressors under models/ (the directory is created
# by the earlier cell that saves the decision trees).
joblib.dump(value=lr_g3_female, filename='models/lr_g3_female.joblib')
joblib.dump(value=lr_g3_male, filename='models/lr_g3_male.joblib')

print("Models saved successfully.")

Models saved successfully.
