# Machine Learning Model Training

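This notebook implements the training phase for predicting student performance in mathematics from the processed and engineered data. Separate models are trained for female and male students: decision tree classifiers for the binned G1 and G2 grades, and linear regression models for the final G3 grade.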

### Import Dependencies

In [47]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
import os
import joblib

### Load Processed Data

In [48]:
def load_training_data(base_path='processed_data/'):
    """Load the per-gender mathematics training features and targets from CSV.

    Parameters
    ----------
    base_path : str or os.PathLike, default 'processed_data/'
        Directory holding ``matfe_X_train.csv``, ``matfe_y_train.csv``,
        ``matm_X_train.csv`` and ``matm_y_train.csv``. A trailing path
        separator is optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_mat_f, y_train_mat_f, X_train_mat_m, y_train_mat_m)``:
        female features, female targets, male features, male targets.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    def _read(filename):
        # os.path.join works whether or not base_path ends with a separator,
        # unlike plain string concatenation.
        return pd.read_csv(os.path.join(base_path, filename))

    # Load training data
    X_train_mat_f = _read('matfe_X_train.csv')
    y_train_mat_f = _read('matfe_y_train.csv')
    X_train_mat_m = _read('matm_X_train.csv')
    y_train_mat_m = _read('matm_y_train.csv')

    # Report shapes so a mismatch between features and targets is visible
    # immediately in the cell output.
    print("Data loaded successfully:")
    print(f"Mathematics Female - X: {X_train_mat_f.shape}, y: {y_train_mat_f.shape}")
    print(f"Mathematics Male - X: {X_train_mat_m.shape}, y: {y_train_mat_m.shape}")

    return X_train_mat_f, y_train_mat_f, X_train_mat_m, y_train_mat_m

# Bind the four training frames at notebook scope for the cells that follow
(X_train_mat_f, y_train_mat_f,
 X_train_mat_m, y_train_mat_m) = load_training_data()

Data loaded successfully:
Mathematics Female - X: (145, 33), y: (145, 3)
Mathematics Male - X: (130, 33), y: (130, 3)


### Model-specific utilities

In [49]:
def classify_grades(grade_series):
    """Bin a grade column into ordinal classes using five quantile bins.

    The values are reshaped to a single column, a ``KBinsDiscretizer``
    (``n_bins=5``, ``strategy='quantile'``, ``encode='ordinal'``) is fitted on
    them, and the resulting bin indices are returned as a flat array.

    The discretizer is fitted on exactly the series passed in and is not
    returned, so the bin edges are specific to each call.
    """
    column = grade_series.to_numpy().reshape(-1, 1)
    discretizer = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=5)
    binned = discretizer.fit_transform(column)
    return binned.ravel()

def split_features_target(df, target):
    """Separate a dataframe into its feature columns and one target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the features and the target column.
    target : column label
        Name of the column to use as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df`` without the ``target`` column and
        ``y`` is the ``target`` column. ``df`` itself is left unchanged.
    """
    return df.drop(columns=[target]), df[target]

### Prepare data for G1 and G2 models

In [50]:
# Bin the first- and second-period grades into ordinal classes, one cohort at a time
y_train_mat_f_g1, y_train_mat_f_g2 = (
    classify_grades(y_train_mat_f[period]) for period in ('G1', 'G2')
)
y_train_mat_m_g1, y_train_mat_m_g2 = (
    classify_grades(y_train_mat_m[period]) for period in ('G1', 'G2')
)

# Show which class labels each binning actually produced
print("\nGrade Classification Results:")
binned_by_gender = {
    'Female': (y_train_mat_f_g1, y_train_mat_f_g2),
    'Male': (y_train_mat_m_g1, y_train_mat_m_g2),
}
for gender, (g1, g2) in binned_by_gender.items():
    print(f"\n{gender} Students:")
    print(f"G1 classes: {np.unique(g1)}")
    print(f"G2 classes: {np.unique(g2)}")


Grade Classification Results:

Female Students:
G1 classes: [0. 1. 2. 3. 4.]
G2 classes: [0. 1. 2. 3. 4.]

Male Students:
G1 classes: [0. 1. 2. 3. 4.]
G2 classes: [0. 1. 2. 3. 4.]


### Preparing data for G3

In [51]:
# Pull out the final-grade targets and report how many samples each cohort has
print("Verifying G3 target data:")
y_train_mat_f_g3 = y_train_mat_f['G3']
y_train_mat_m_g3 = y_train_mat_m['G3']
print(f"Female G3 values: {len(y_train_mat_f_g3)} samples")
print(f"Male G3 values: {len(y_train_mat_m_g3)} samples")

# Build the G3 feature matrices: G1/G2 are removed if they are present
# (errors='ignore' makes this a no-op when the columns are absent)
intermediate_grades = ['G1', 'G2']
X_train_mat_f_g3 = X_train_mat_f.drop(columns=intermediate_grades, errors='ignore')
X_train_mat_m_g3 = X_train_mat_m.drop(columns=intermediate_grades, errors='ignore')

print("\nG3 Training Data Prepared:")
print(f"Female features shape: {X_train_mat_f_g3.shape}")
print(f"Male features shape: {X_train_mat_m_g3.shape}")

Verifying G3 target data:
Female G3 values: 145 samples
Male G3 values: 130 samples

G3 Training Data Prepared:
Female features shape: (145, 33)
Male features shape: (130, 33)


### Train decision tree models

In [52]:
# Fit one decision tree classifier per grade period and gender on the binned
# G1/G2 labels. random_state is fixed so the fitted trees are reproducible.

# Train G1 models
dt_g1_female = DecisionTreeClassifier(random_state=42)
dt_g1_female.fit(X_train_mat_f, y_train_mat_f_g1)

dt_g1_male = DecisionTreeClassifier(random_state=42)
dt_g1_male.fit(X_train_mat_m, y_train_mat_m_g1)

# Train G2 models
dt_g2_female = DecisionTreeClassifier(random_state=42)
dt_g2_female.fit(X_train_mat_f, y_train_mat_f_g2)

dt_g2_male = DecisionTreeClassifier(random_state=42)
dt_g2_male.fit(X_train_mat_m, y_train_mat_m_g2)

# The G3 linear regression models are fitted in the "Train linear regression G3"
# section below; fitting them here as well only repeated that work.

print("\nAll models trained successfully")


All models trained successfully


### Train linear regression models for G3

In [53]:
# Fit an ordinary least-squares regressor on the G3 target for each cohort
print("Training Linear Regression models for G3:")

# Female cohort
lr_g3_female = LinearRegression().fit(X_train_mat_f_g3, y_train_mat_f_g3)
n_features_female = X_train_mat_f_g3.shape[1]
print(f"Female G3 model trained with {n_features_female} features")

# Male cohort
lr_g3_male = LinearRegression().fit(X_train_mat_m_g3, y_train_mat_m_g3)
n_features_male = X_train_mat_m_g3.shape[1]
print(f"Male G3 model trained with {n_features_male} features")

# Sanity check: the number of fitted coefficients should match the feature count
print("\nModel Coefficients:")
n_coef_female = len(lr_g3_female.coef_)
n_coef_male = len(lr_g3_male.coef_)
print(f"Female model: {n_coef_female} coefficients")
print(f"Male model: {n_coef_male} coefficients")

Training Linear Regression models for G3:
Female G3 model trained with 33 features
Male G3 model trained with 33 features

Model Coefficients:
Female model: 33 coefficients
Male model: 33 coefficients


### Save models

In [54]:
# Make sure the output directory exists (no error if it is already there)
os.makedirs('models', exist_ok=True)

# Destination file -> fitted estimator, written in this order:
# the decision tree classifiers first, then the linear regressions
artifacts = {
    'models/dt_g1_female.joblib': dt_g1_female,
    'models/dt_g1_male.joblib': dt_g1_male,
    'models/dt_g2_female.joblib': dt_g2_female,
    'models/dt_g2_male.joblib': dt_g2_male,
    'models/lr_g3_female.joblib': lr_g3_female,
    'models/lr_g3_male.joblib': lr_g3_male,
}
for destination, estimator in artifacts.items():
    joblib.dump(estimator, destination)

print("Models saved successfully in 'models' directory")

Models saved successfully in 'models' directory
