# Machine Learning Model Training

This notebook implements the training phase for predicting student performance (G3 grade) using processed and engineered data.

### Import Dependencies

In [35]:
# Import required libraries
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import joblib

### Prepare classification method for G1 and G2

In [36]:
def classify_grades_edu(grade_series):
    """
    Bin 0-20 grades into five ordinal educational categories.

    The bands are deliberately unequal in width:
    - 0-5: Poor (0)
    - 6-10: Below Average (1)
    - 11-14: Average (2)
    - 15-17: Good (3)
    - 18-20: Excellent (4)

    Intervals are right-closed, so a fractional grade such as 5.5 falls
    into the next band up (Below Average).

    Parameters
    ----------
    grade_series : pandas.Series or 1-D array-like
        Grades on the 0-20 scale. Plain lists, tuples and NumPy arrays are
        accepted in addition to Series.

    Returns
    -------
    numpy.ndarray
        One category code (0-4) per input grade, in input order. Grades
        that are missing or fall outside 0-20 have no bin and come back as
        NaN, which makes the whole result floating point.
    """
    # Series/Index expose their data through .values; anything else is coerced
    # so callers are not forced to wrap lists or arrays in a Series first.
    if isinstance(grade_series, (pd.Series, pd.Index)):
        grades = grade_series.values
    else:
        grades = np.asarray(grade_series)

    bins = [0, 5, 10, 14, 17, 20]
    labels = [0, 1, 2, 3, 4]

    # include_lowest=True closes the first interval on the left ([0, 5]) so a
    # grade of exactly 0 is binned instead of being treated as out of range.
    binned_grades = pd.cut(grades, bins=bins, labels=labels, include_lowest=True)

    # pd.cut yields a Categorical; convert it to plain numbers for the model
    return pd.to_numeric(binned_grades)

### Load Processed Data

In [37]:
# Directory holding the processed, feature-engineered CSV splits
base_path = 'processed_data/'


def _read_split(stem, split):
    """Read `<base_path><stem>_enhanced_<split>.csv` into a DataFrame."""
    return pd.read_csv(f'{base_path}{stem}_enhanced_{split}.csv')


# NOTE(review): the `_f` ("Female") frames are read from the `*_full_*` files
# and the `_m` ("Male") frames from the `*M_*` files - confirm that `full`
# really denotes the female subset and not the complete dataset.

# Mathematics: training features (X) and targets (y)
X_train_mat_f = _read_split('X_Pmat_full', 'train')
X_train_mat_m = _read_split('X_PmatM', 'train')
y_train_mat_f = _read_split('y_Pmat_full', 'train')
y_train_mat_m = _read_split('y_PmatM', 'train')

# Portuguese: training features (X) and targets (y)
X_train_por_f = _read_split('X_Ppor_full', 'train')
X_train_por_m = _read_split('X_PporM', 'train')
y_train_por_f = _read_split('y_Ppor_full', 'train')
y_train_por_m = _read_split('y_PporM', 'train')

# Held-out test splits; not used for fitting here, loaded for later evaluation
X_test_mat_f = _read_split('X_Pmat_full', 'test')
X_test_mat_m = _read_split('X_PmatM', 'test')
y_test_mat_f = _read_split('y_Pmat_full', 'test')
y_test_mat_m = _read_split('y_PmatM', 'test')

X_test_por_f = _read_split('X_Ppor_full', 'test')
X_test_por_m = _read_split('X_PporM', 'test')
y_test_por_f = _read_split('y_Ppor_full', 'test')
y_test_por_m = _read_split('y_PporM', 'test')

# Quick sanity report: row/column counts of every training frame
print("Enhanced data loaded successfully:")
print(f"Mathematics Female - X: {X_train_mat_f.shape}, y: {y_train_mat_f.shape}")
print(f"Mathematics Male - X: {X_train_mat_m.shape}, y: {y_train_mat_m.shape}")
print(f"Portuguese Female - X: {X_train_por_f.shape}, y: {y_train_por_f.shape}")
print(f"Portuguese Male - X: {X_train_por_m.shape}, y: {y_train_por_m.shape}")

# Registry of fitted estimators, addressed as models[subject][group][target];
# each group starts with its own empty dict that the training cells fill in.
models = {
    subject: {'female': {}, 'male': {}}
    for subject in ('math', 'por')
}

Enhanced data loaded successfully:
Mathematics Female - X: (175, 46), y: (175, 3)
Mathematics Male - X: (83, 46), y: (83, 3)
Portuguese Female - X: (322, 46), y: (322, 3)
Portuguese Male - X: (125, 46), y: (125, 3)


### Train random forest classifiers for G1 and G2

In [38]:
# Fit one random-forest classifier per subject, group and period grade (G1, G2)
print("\nTraining G1 and G2 classification models...")


def _fit_grade_classifier(features, grades):
    """Fit a 100-tree, fixed-seed random forest on the binned `grades`."""
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    # Raw grades are bucketed into category codes before being used as labels
    return forest.fit(features, classify_grades_edu(grades))


# Mathematics, female
rf_g1_mat_f = _fit_grade_classifier(X_train_mat_f, y_train_mat_f['G1'])
rf_g2_mat_f = _fit_grade_classifier(X_train_mat_f, y_train_mat_f['G2'])
models['math']['female'].update({'G1': rf_g1_mat_f, 'G2': rf_g2_mat_f})

# Mathematics, male
rf_g1_mat_m = _fit_grade_classifier(X_train_mat_m, y_train_mat_m['G1'])
rf_g2_mat_m = _fit_grade_classifier(X_train_mat_m, y_train_mat_m['G2'])
models['math']['male'].update({'G1': rf_g1_mat_m, 'G2': rf_g2_mat_m})

# Portuguese, female
rf_g1_por_f = _fit_grade_classifier(X_train_por_f, y_train_por_f['G1'])
rf_g2_por_f = _fit_grade_classifier(X_train_por_f, y_train_por_f['G2'])
models['por']['female'].update({'G1': rf_g1_por_f, 'G2': rf_g2_por_f})

# Portuguese, male
rf_g1_por_m = _fit_grade_classifier(X_train_por_m, y_train_por_m['G1'])
rf_g2_por_m = _fit_grade_classifier(X_train_por_m, y_train_por_m['G2'])
models['por']['male'].update({'G1': rf_g1_por_m, 'G2': rf_g2_por_m})

print("G1 and G2 classification models trained successfully.")


Training G1 and G2 classification models...
G1 and G2 classification models trained successfully.


### Train linear regression models for G3

In [39]:
def _fit_g3_regressor(features, targets):
    """Fit ordinary least squares for G3 on the features plus actual G1/G2.

    Returns the augmented design matrix together with the fitted model.
    """
    # NOTE(review): G3 is fit on the raw G1/G2 grades, whereas the G1/G2
    # classifiers output binned category codes - confirm what is fed in at
    # prediction time.
    design = pd.concat([features, targets[['G1', 'G2']]], axis=1)
    regressor = LinearRegression()
    regressor.fit(design, targets['G3'])
    return design, regressor


# Mathematics, female
X_g3_mat_f, lr_g3_mat_f = _fit_g3_regressor(X_train_mat_f, y_train_mat_f)
models['math']['female']['G3'] = lr_g3_mat_f

# Mathematics, male
X_g3_mat_m, lr_g3_mat_m = _fit_g3_regressor(X_train_mat_m, y_train_mat_m)
models['math']['male']['G3'] = lr_g3_mat_m

# Portuguese, female
X_g3_por_f, lr_g3_por_f = _fit_g3_regressor(X_train_por_f, y_train_por_f)
models['por']['female']['G3'] = lr_g3_por_f

# Portuguese, male
X_g3_por_m, lr_g3_por_m = _fit_g3_regressor(X_train_por_m, y_train_por_m)
models['por']['male']['G3'] = lr_g3_por_m

print("G3 regression models trained successfully.")

G3 regression models trained successfully.


### Save models

In [40]:
# Make sure the output directory exists before writing any artifact
os.makedirs('models', exist_ok=True)


def _dump_estimators(estimators_by_stem):
    """Write each estimator to `models/<stem>_model.joblib`, in dict order."""
    for stem, estimator in estimators_by_stem.items():
        joblib.dump(estimator, f'models/{stem}_model.joblib')


# File stem -> fitted estimator, grouped by subject and group
_dump_estimators({
    # Mathematics, female
    'math_female_G1': rf_g1_mat_f,
    'math_female_G2': rf_g2_mat_f,
    'math_female_G3': lr_g3_mat_f,
    # Mathematics, male
    'math_male_G1': rf_g1_mat_m,
    'math_male_G2': rf_g2_mat_m,
    'math_male_G3': lr_g3_mat_m,
    # Portuguese, female
    'por_female_G1': rf_g1_por_f,
    'por_female_G2': rf_g2_por_f,
    'por_female_G3': lr_g3_por_f,
    # Portuguese, male
    'por_male_G1': rf_g1_por_m,
    'por_male_G2': rf_g2_por_m,
    'por_male_G3': lr_g3_por_m,
})

print("All models saved successfully")

All models saved successfully
