In [54]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [55]:
from pathlib import Path
import sys
BASE_DIR = Path.cwd().parent
sys.path.append(str(BASE_DIR))
from config import PROCESSED_DATA, MODEL_DIR
df = pd.read_csv(PROCESSED_DATA)

In [56]:
df.head()

Unnamed: 0,school,sex,age,Pstatus,studytime,failures,paid,activities,higher,romantic,...,G2_10,G3_10,G_Avg,school_encoded,sex_encoded,Pstatus_encoded,paid_encoded,activities_encoded,higher_encoded,romantic_encoded
0,GP,F,18.0,A,2.0,0,no,no,yes,no,...,3.0,3.0,2.75,0,0,0,0,0,1,0
1,GP,F,17.0,T,2.0,0,no,no,yes,no,...,2.5,3.0,2.5,0,0,1,0,0,1,0
2,GP,F,15.0,T,2.0,3,yes,no,yes,no,...,4.0,5.0,3.75,0,0,1,1,0,1,0
3,GP,F,15.0,T,3.0,0,yes,yes,yes,yes,...,7.0,7.5,7.25,0,0,1,1,1,1,1
4,GP,F,16.0,T,2.0,0,yes,no,yes,no,...,5.0,5.0,4.0,0,0,1,1,0,1,0


In [57]:
RANDOM_STATE = 42

In [58]:
# Thiết lập Cross-Validation cố định
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [59]:
features = ['sex_encoded', 'age', 'failures', 'higher_encoded', 'absences', 'G_Avg']

target = "G3_10"

In [60]:
# Chuẩn bị dữ liệu 
X = df[features].copy()
y = df[target].copy()

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [62]:
models = {
    "Baseline(mean)": {
        "model": DummyRegressor(strategy="mean"),
        "params": {}
    },
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}
    },
    "DecisionTree": {
        "model": DecisionTreeRegressor(random_state=RANDOM_STATE),
        "params": {
            "max_depth": [3, 5, 7], 
            "min_samples_split": [5, 10], 
            "min_samples_leaf": [2, 5] 
        }
    },
    "RandomForest": {
        "model": RandomForestRegressor(random_state=RANDOM_STATE),
        "params": {
            "max_depth": [3, 7], 
            "min_samples_leaf": [5, 10, 20], 
            "n_estimators": [50, 100]
        }
    }
}

In [63]:
best_models=[]

In [64]:
models_metrics = {}

In [65]:
for name, config in models.items():
    print(f"Training {name}")
    
    grid = GridSearchCV(config["model"], config["params"], cv=cv, scoring="neg_mean_squared_error")
    grid.fit(X_train, y_train)

    y_pred = grid.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    best_models.append({
    "model": name,
    "best_params": grid.best_params_,
    "rmse": rmse,
    "r2": r2

   
})
    models_metrics[name] = {
        "best_params": grid.best_params_,
        "rmse": rmse,
        "r2": r2,
        "best_estimator": grid.best_estimator_
    }

Training Baseline(mean)
Training LinearRegression
Training DecisionTree
Training RandomForest


In [66]:
best_models

[{'model': 'Baseline(mean)',
  'best_params': {},
  'rmse': np.float64(2.2750903260811937),
  'r2': -0.009709643515769084},
 {'model': 'LinearRegression',
  'best_params': {},
  'rmse': np.float64(1.1154570568507811),
  'r2': 0.7572803555315948},
 {'model': 'DecisionTree',
  'best_params': {'max_depth': 5,
   'min_samples_leaf': 5,
   'min_samples_split': 5},
  'rmse': np.float64(0.7936066661427824),
  'r2': 0.8771401923999244},
 {'model': 'RandomForest',
  'best_params': {'max_depth': 7, 'min_samples_leaf': 5, 'n_estimators': 100},
  'rmse': np.float64(0.8091451388073762),
  'r2': 0.8722820099663473}]

In [67]:
results_df = pd.DataFrame(best_models)
results_df 

Unnamed: 0,model,best_params,rmse,r2
0,Baseline(mean),{},2.27509,-0.00971
1,LinearRegression,{},1.115457,0.75728
2,DecisionTree,"{'max_depth': 5, 'min_samples_leaf': 5, 'min_s...",0.793607,0.87714
3,RandomForest,"{'max_depth': 7, 'min_samples_leaf': 5, 'n_est...",0.809145,0.872282


In [68]:
results_df.sort_values(by="rmse")

Unnamed: 0,model,best_params,rmse,r2
2,DecisionTree,"{'max_depth': 5, 'min_samples_leaf': 5, 'min_s...",0.793607,0.87714
3,RandomForest,"{'max_depth': 7, 'min_samples_leaf': 5, 'n_est...",0.809145,0.872282
1,LinearRegression,{},1.115457,0.75728
0,Baseline(mean),{},2.27509,-0.00971


In [69]:
best_row = results_df.sort_values(by="rmse").iloc[0]

best_row

model                                               DecisionTree
best_params    {'max_depth': 5, 'min_samples_leaf': 5, 'min_s...
rmse                                                    0.793607
r2                                                       0.87714
Name: 2, dtype: object

In [70]:
best_model_name = best_row["model"]

best_model_name


'DecisionTree'

In [71]:
best_params = best_row['best_params']
best_params

{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}

In [72]:
best_model = models[best_model_name]["model"].set_params(**best_params)

In [73]:
best_model.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,5
,min_samples_split,5
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [74]:
joblib.dump(best_model,MODEL_DIR/ "best_model.pkl")

['D:\\SW\\DATASCIENCE\\ML\\Student-Final-Grade-Prediction\\models\\best_model.pkl']

In [75]:
joblib.load(MODEL_DIR/"best_model.pkl").predict(X_test)

array([4.125     , 6.25      , 2.85714286, 4.86363636, 4.39285714,
       6.25      , 9.28571429, 4.125     , 1.3       , 5.63636364,
       7.34615385, 3.4375    , 6.44      , 5.63636364, 7.        ,
       4.86363636, 3.6       , 5.47368421, 7.85294118, 1.3       ,
       7.34615385, 7.57142857, 7.34615385, 2.85714286, 4.875     ,
       9.28571429, 5.47368421, 4.875     , 8.9       , 5.47368421,
       4.39285714, 4.125     , 7.57142857, 6.44      , 3.6       ,
       2.85714286, 0.        , 7.34615385, 5.63636364, 4.125     ,
       3.4375    , 4.875     , 7.34615385, 4.86363636, 7.85294118,
       5.25      , 6.02272727, 7.        , 6.02272727, 7.85294118,
       6.9375    , 7.34615385, 4.875     , 4.39285714, 2.85714286,
       6.02272727, 5.47368421, 0.7       , 7.57142857, 8.9       ,
       6.02272727, 4.86363636, 4.39285714, 2.85714286, 3.4375    ,
       8.7       , 4.125     , 4.39285714, 4.39285714, 7.57142857,
       4.125     , 4.86363636, 7.        , 9.28571429, 5.63636

In [76]:
joblib.dump(models_metrics, MODEL_DIR/"models_metrics.pkl")
print("✅ Đã lưu metrics của baseline và candidates: models_metrics.pkl")
print("📊 Metrics đã lưu:")
for name, metrics in models_metrics.items():
    print(f"  {name}: RMSE={metrics['rmse']:.4f}, R²={metrics['r2']:.4f}")

✅ Đã lưu metrics của baseline và candidates: models_metrics.pkl
📊 Metrics đã lưu:
  Baseline(mean): RMSE=2.2751, R²=-0.0097
  LinearRegression: RMSE=1.1155, R²=0.7573
  DecisionTree: RMSE=0.7936, R²=0.8771
  RandomForest: RMSE=0.8091, R²=0.8723
