In [None]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fixed random seed, passed as random_state to the CV splitter, the train/test
# split and the tree-based models so that runs are reproducible.
RANDOM_STATE = 42

In [None]:
# Fixed cross-validation scheme: 5 shuffled folds with a seeded shuffle, so every
# candidate model is scored on the same splits. KFold is already imported in the
# notebook's import cell, so it is not re-imported here.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Columns used as model inputs, in the order they are selected from the frame.
features = [
    "sex",
    "age",
    "failures",
    "higher",
    "paid",
    "absences",
    "G_Avg",
]
# Name of the column the regressors are trained to predict.
target = "G3_10"

In [None]:
# Split the frame into the input matrix and the target vector. Both are copies,
# so later changes to X or y do not write back into `df`.
# NOTE(review): `df` is not created in any visible cell -- confirm it is loaded
# earlier in the notebook, otherwise Restart & Run All fails here.
X = df.loc[:, features].copy()
y = df.loc[:, target].copy()

In [None]:
# Hold out 20% of the rows as a test split; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Every feature that is not listed as categorical is treated as numeric.
categorical_feats = ["sex", "higher", "paid"]
numeric_feats = [name for name in features if name not in categorical_feats]

# Numeric columns are standardised; categorical columns are one-hot encoded into
# a dense array, and categories unseen during fit are ignored at transform time.
numeric_step = ("num", StandardScaler(), numeric_feats)
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_feats,
)

# Columns not named in either step are dropped from the model input.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
)

In [None]:
# Candidate regressors keyed by display name. Each entry pairs an unfitted
# estimator ("model") with its hyper-parameter grid ("params"). Grid keys carry
# the "model__" prefix so they address the pipeline step named "model"; an empty
# grid means the estimator is fitted as configured, without a search.
models = {
    "Baseline(mean)": dict(
        model=DummyRegressor(strategy="mean"),
        params={},
    ),
    "LinearRegression": dict(
        model=LinearRegression(),
        params={},
    ),
    "DecisionTree": dict(
        model=DecisionTreeRegressor(random_state=RANDOM_STATE),
        params={
            "model__max_depth": [3, 5, 10],
            "model__min_samples_split": [2, 5],
        },
    ),
    "RandomForest": dict(
        model=RandomForestRegressor(random_state=RANDOM_STATE),
        params={
            "model__n_estimators": [50, 100],
            "model__max_depth": [5, 10],
        },
    ),
}

In [None]:
# Fit every candidate (with a grid search when a parameter grid is given), score
# it on the held-out test split, and collect one record per candidate.
best_models = []
for name, cfg in models.items():
    print(f"\n--- Training {name} ---")
    # Pipeline does not copy its steps, so each candidate gets its own unfitted
    # clones. Without this, the un-tuned pipelines would all fit -- and afterwards
    # share -- the single notebook-level `preprocessor`, and the estimators stored
    # in `models` would be fitted in place.
    pipe = Pipeline([
        ("pre", clone(preprocessor)),
        ("model", clone(cfg["model"])),
    ])
    param_grid = cfg["params"]

    if param_grid:
        # Tune on the training split only, using the shared CV splitter.
        grid = GridSearchCV(pipe, param_grid, cv=cv, scoring="neg_mean_squared_error", n_jobs=-1)
        grid.fit(X_train, y_train)
        best_est = grid.best_estimator_
        best_params = grid.best_params_
    else:
        # Nothing to tune: fit the pipeline as configured.
        best_est = pipe.fit(X_train, y_train)
        best_params = {}

    # Evaluate on the held-out test split.
    y_pred = best_est.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"{name} -> RMSE: {rmse:.6f}, R2: {r2:.6f}, best_params: {best_params}")
    best_models.append({
        "model": name,
        "best_params": best_params,
        "rmse": rmse,
        "r2": r2,
        "pipeline": best_est
    })

In [None]:
# Rank the candidates by cross-validated RMSE on the training split instead of
# by their test-set RMSE. Picking the winner on the test split would turn that
# split into a selection set and make the winner's reported test metrics
# optimistically biased. The test "rmse" and "r2" columns stay report-only.
def _cv_rmse(pipeline):
    """Return the K-fold RMSE of `pipeline` on the training split.

    `cross_val_score` fits clones of the estimator, so the already-fitted
    pipeline passed in is left untouched.
    """
    neg_mse = cross_val_score(
        pipeline, X_train, y_train, cv=cv, scoring="neg_mean_squared_error"
    )
    return float(np.sqrt(-neg_mse.mean()))


results_df = (
    pd.DataFrame(best_models)
    .assign(cv_rmse=[_cv_rmse(entry["pipeline"]) for entry in best_models])
    .sort_values("cv_rmse")
    .reset_index(drop=True)
)

print(results_df[["model", "best_params", "cv_rmse", "rmse", "r2"]])

# First row is the candidate with the lowest cross-validated RMSE; the RMSE and
# R2 printed below are its held-out test metrics.
best_row = results_df.iloc[0]
print("\nBest model:", best_row["model"], "RMSE:", best_row["rmse"], "R2:", best_row["r2"])

In [None]:
# Persist the selected fitted pipeline (preprocessing + estimator) to disk.
selected_pipeline = best_row["pipeline"]
joblib.dump(selected_pipeline, "best_pipeline_model.pkl")
print("Saved best pipeline to best_pipeline_model.pkl")

In [None]:
# Smoke-test the saved artifact: reload it and predict on the test split.
# joblib files are pickle-based, so only load files this notebook wrote itself.
reloaded_pipeline = joblib.load("best_pipeline_model.pkl")
reloaded_pipeline.predict(X_test)