#### Imports

In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import mlflow
import mlflow.sklearn

#### Load Data

In [11]:
# Read the semicolon-separated student dataset from the data directory.
df = pd.read_csv("../data/student-mat.csv", sep=";")

# Columns listed in binary_cols are recoded "yes" -> 1 and "no" -> 0.
binary_map = {"yes": 1, "no": 0}
binary_cols = [
    'schoolsup', 'famsup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]

# Element-wise lookup; a value that is not a key of binary_map passes through
# unchanged because it is also used as the .get() fallback.
encoded_flags = df[binary_cols].map(lambda value: binary_map.get(value, value))
df[binary_cols] = encoded_flags

#### Target and features

In [12]:
# Regression target is the "G3" column; every other column is a feature.
target = "G3"
X = df.drop(columns=[target])
y = df[target]

# Split feature names by dtype so each group gets its own transformer.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns are standardised, object columns are one-hot encoded.
# handle_unknown="ignore" stops the fitted pipeline from raising when transform
# sees a category level that was absent from the training split (the default,
# "error", would abort prediction). With drop="first" such a value is encoded
# as all zeros, i.e. the same as the dropped baseline level, and scikit-learn
# emits a warning, so unseen levels remain visible in the logs.
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols),
    (
        "cat",
        OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
        categorical_cols,
    ),
])

#### Model configurations

In [13]:
# Candidate regressors, keyed by the label under which each one is logged.
# Both tree-based estimators use random_state=42.
model_configs = dict(
    LinearRegression=LinearRegression(),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=100, random_state=42),
)

#### Start MLflow experiment

In [15]:
# Point MLflow at the tracking server BEFORE selecting the experiment.
# set_experiment() looks up / creates the experiment in whichever tracking
# store is active when it is called, so on a fresh kernel the reverse order
# would create the experiment in the default store instead of on this server.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("student-performance-regression")

# One MLflow run per candidate model: fit, score on the hold-out set, log.
for model_name, model in model_configs.items():
    with mlflow.start_run(run_name=model_name):
        # Build pipeline: shared preprocessing followed by the regressor.
        # Preprocessing is fitted inside the pipeline, i.e. on X_train only.
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("regressor", model)
        ])

        # Fit on the training split and predict the held-out split.
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        # Hold-out metrics.
        mae = mean_absolute_error(y_test, preds)
        rmse = root_mean_squared_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        # Log params and metrics. n_estimators is only logged for estimators
        # that expose it (the random forest among the configured models).
        mlflow.log_param("model", model_name)
        if hasattr(model, "n_estimators"):
            mlflow.log_param("n_estimators", model.n_estimators)

        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2", r2)

        # Log the whole fitted pipeline (preprocessing + regressor) so the
        # artifact can consume raw feature frames directly.
        mlflow.sklearn.log_model(pipeline, "model")

        print(f"✅ Logged {model_name}: MAE={mae:.2f}, RMSE={rmse:.2f}, R²={r2:.2f}")

2025/07/14 23:31:51 INFO mlflow.tracking.fluent: Experiment with name 'student-performance-regression' does not exist. Creating a new experiment.


✅ Logged LinearRegression: MAE=1.65, RMSE=2.38, R²=0.72
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/270823155224265531/runs/65aa5c3d82f9405aa58e5d647abe23c4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/270823155224265531




✅ Logged DecisionTree: MAE=1.43, RMSE=2.61, R²=0.67
🏃 View run DecisionTree at: http://127.0.0.1:5000/#/experiments/270823155224265531/runs/03016e9115264a689d7c9d07fd748787
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/270823155224265531




✅ Logged RandomForest: MAE=1.19, RMSE=1.95, R²=0.82
🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/270823155224265531/runs/136ab25da4984b32b20fc2a748525e71
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/270823155224265531


#### Register model

In [29]:
from mlflow.tracking import MlflowClient

# Register the model artifact of the most recent MLflow run in this kernel.
# NOTE: this is simply whichever run was logged last (after the training loop,
# the final entry of model_configs); it is not selected by metric. Re-check
# this if the model order changes or other runs are logged in between.
last_run = mlflow.last_active_run()
if last_run is None:
    # last_active_run() returns None when no run exists in this session;
    # fail with a clear message instead of an AttributeError on `.info`.
    raise RuntimeError(
        "No MLflow run found in this session; run the training cell before "
        "registering a model."
    )

# "model" is the artifact path under which the pipeline was logged.
model_uri = f"runs:/{last_run.info.run_id}/model"
model_name = "StudentPerformanceModel"

# Creates the registered model if needed and adds a new version; the returned
# ModelVersion is displayed as the cell output.
mlflow.register_model(model_uri=model_uri, name=model_name)

Successfully registered model 'StudentPerformanceModel'.
2025/07/14 23:39:26 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: StudentPerformanceModel, version 1
Created version '1' of model 'StudentPerformanceModel'.


<ModelVersion: aliases=[], creation_timestamp=1752529166429, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1752529166429, metrics=None, model_id=None, name='StudentPerformanceModel', params=None, run_id='136ab25da4984b32b20fc2a748525e71', run_link='', source='models:/m-1af2f6ce9d2945b7bfc494523b84d0cc', status='READY', status_message=None, tags={}, user_id='', version='1'>