
# 🎓 Student Habits → Exam Score (Regression) — Colab Notebook

Ce notebook installe les dépendances, charge votre CSV, entraîne 2 modèles (Régression Linéaire & RandomForest),
évalue les performances (MAE/RMSE/R² + CV 5-fold) et sauvegarde le **meilleur modèle**.


In [1]:

# ✅ Installation des dépendances (exécuter une fois)
!pip -q install pandas numpy scikit-learn joblib
import sklearn
print(sklearn.__version__)


1.6.1


## 📥 Charger le fichier CSV

In [2]:

# Option A — Téléverser depuis votre ordinateur (recommandé)
from google.colab import files
uploaded = files.upload()  # Sélectionnez votre CSV
CSV_PATH = list(uploaded.keys())[0]  # nom du fichier téléversé

# Option B — Utiliser un chemin déjà présent (décommentez si besoin)
# CSV_PATH = "/content/student_habits_performance.csv"
print("CSV utilisé:", CSV_PATH)


Saving student_habits_performance (1).csv to student_habits_performance (1).csv
CSV utilisé: student_habits_performance (1).csv


## ⚙️ Préparation & Fonctions utilitaires

In [3]:
import json
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib

TARGET = "exam_score"

CATEGORICAL = [
    "gender",
    "part_time_job",
    "diet_quality",
    "parental_education_level",
    "internet_quality",
    "extracurricular_participation",
]

NUMERIC_BASE = [
    "study_hours_per_day",
    "social_media_hours",
    "netflix_hours",
    "attendance_percentage",
    "sleep_hours",
    "exercise_frequency",
    "mental_health_rating",
    "age",
]

def build_lifestyle_index(df: pd.DataFrame) -> pd.Series:
    tmp = df[NUMERIC_BASE].astype(float)
    z = (tmp - tmp.mean()) / tmp.std(ddof=0)
    lifestyle = (
        + 1.2*z["study_hours_per_day"]
        - 0.6*z["social_media_hours"]
        - 0.6*z["netflix_hours"]
        + 0.8*z["attendance_percentage"]
        + 0.5*z["sleep_hours"]
        + 0.6*z["exercise_frequency"]
        + 0.8*z["mental_health_rating"]
        - 0.1*z["age"]
    )
    return lifestyle

def make_preprocess(numeric_cols, categorical_cols):
    return ColumnTransformer([
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ])

def evaluate(pipe, X_train, X_test, y_train, y_test):
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mae = float(mean_absolute_error(y_test, preds))
    # RMSE sans utiliser l'argument 'squared'
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    r2 = float(r2_score(y_test, preds))
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_r2 = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="r2")
    return {
        "MAE": round(mae, 4),
        "RMSE": round(rmse, 4),
        "R2_test": round(r2, 4),
        "R2_CV_mean": round(float(cv_r2.mean()), 4),
        "R2_CV_std": round(float(cv_r2.std()), 4),
    }

## 🔬 Chargement, EDA minimale & Entraînement

In [16]:

# Chargement & nettoyage
df = pd.read_csv(CSV_PATH).drop_duplicates().copy()
print("Shape:", df.shape)
display(df.head())

# Feature engineering
df["lifestyle_index"] = build_lifestyle_index(df)

numeric = NUMERIC_BASE + ["lifestyle_index"]
categorical = CATEGORICAL

X = df[categorical + numeric]
y = df[TARGET].values

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocess = make_preprocess(numeric, categorical)

# Modèles
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
}

all_metrics = {}
best_model_name, best_r2 = None, -1.0
best_pipe = None

for name, model in models.items():
    pipe = Pipeline([("prep", preprocess), ("model", model)])
    metrics = evaluate(pipe, X_train, X_test, y_train, y_test)
    all_metrics[name] = metrics
    print(f"\n{name} ->", json.dumps(metrics, indent=2, ensure_ascii=False))
    if metrics["R2_test"] > best_r2:
        best_r2 = metrics["R2_test"]
        best_model_name = name
        best_pipe = pipe

# Fit meilleur modèle sur toutes les données
best_pipe.fit(X, y)

# Sauvegardes
ART = Path("/content/artifacts"); ART.mkdir(exist_ok=True, parents=True)
model_path = ART / f"best_model_{best_model_name}.joblib"
joblib.dump(best_pipe, model_path)

# Noms des features après encodage
ohe = best_pipe.named_steps["prep"].named_transformers_["cat"]
feature_names = numeric + list(ohe.get_feature_names_out(categorical))
with open(ART / "feature_names.json", "w", encoding="utf-8") as f:
    json.dump(feature_names, f, ensure_ascii=False, indent=2)

with open(ART / "metrics.json", "w", encoding="utf-8") as f:
    json.dump(all_metrics, f, ensure_ascii=False, indent=2)

print("\n✅ Meilleur modèle:", best_model_name)
print("📁 Modèle enregistré:", str(model_path))
print("📄 metrics.json et feature_names.json sauvegardés dans /content/artifacts")


Shape: (1000, 16)


Unnamed: 0,student_id,age,gender,study_hours_per_day,social_media_hours,netflix_hours,part_time_job,attendance_percentage,sleep_hours,diet_quality,exercise_frequency,parental_education_level,internet_quality,mental_health_rating,extracurricular_participation,exam_score
0,S1000,23,Female,0.0,1.2,1.1,No,85.0,8.0,Fair,6,Master,Average,8,Yes,56.2
1,S1001,20,Female,6.9,2.8,2.3,No,97.3,4.6,Good,6,High School,Average,8,No,100.0
2,S1002,21,Male,1.4,3.1,1.3,No,94.8,8.0,Poor,1,High School,Poor,1,No,34.3
3,S1003,23,Female,1.0,3.9,1.0,No,71.0,9.2,Poor,4,Master,Good,1,Yes,26.8
4,S1004,19,Female,5.0,4.4,0.5,No,90.9,4.9,Fair,3,Master,Good,1,No,66.4



LinearRegression -> {
  "MAE": 4.1893,
  "RMSE": 5.1455,
  "R2_test": 0.8968,
  "R2_CV_mean": 0.8943,
  "R2_CV_std": 0.0174
}

RandomForest -> {
  "MAE": 4.3907,
  "RMSE": 5.4346,
  "R2_test": 0.8848,
  "R2_CV_mean": 0.8855,
  "R2_CV_std": 0.0145
}

✅ Meilleur modèle: LinearRegression
📁 Modèle enregistré: /content/artifacts/best_model_LinearRegression.joblib
📄 metrics.json et feature_names.json sauvegardés dans /content/artifacts


## ⬇️ Télécharger les artefacts

In [10]:
import os
from google.colab import files
files.download("/content/artifacts/metrics.json")
files.download("/content/artifacts/feature_names.json")
# Pour le modèle (taille > 32MB possible) :
files.download("/content/artifacts/" + [f for f in os.listdir("/content/artifacts") if f.startswith("best_model_")][0])


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [15]:

import joblib
import pandas as pd
import numpy as np # Import numpy as it's used in build_lifestyle_index

# 2) Charger le pipeline entraîné
# En supposant que le répertoire 'artifacts' se trouve dans le répertoire de travail actuel
pipe_path = "artifacts/best_model_LinearRegression.joblib" # Or RandomForest
try:
    pipe = joblib.load(pipe_path)
except FileNotFoundError:
    print(f"Error: Model file not found at {pipe_path}. Please ensure the model was saved correctly and the path is correct.")
    # You might want to exit or handle this error appropriately
    raise # Re-raise the exception after printing the message


# 3) Créer un exemple d’entrée (mêmes colonnes que l’entraînement SAUF lifestyle_index initialement)
sample = pd.DataFrame([{
    "gender": "Female",
    "part_time_job": "No",
    "diet_quality": "Fair",
    "parental_education_level": "Master",
    "internet_quality": "Average",
    "extracurricular_participation": "Yes",
    "study_hours_per_day": 3.0,
    "social_media_hours": 2.0,
    "netflix_hours": 1.0,
    "attendance_percentage": 92.0,
    "sleep_hours": 7.0,
    "exercise_frequency": 3,
    "mental_health_rating": 7,
    "age": 21,
}])

# Calculer l'indice de style de vie pour les données de l'échantillon
sample["lifestyle_index"] = build_lifestyle_index(sample)

# S'assurer que les colonnes sont dans le même ordre que lors de l'apprentissage si le pipeline s'y attend
# (StandardScaler et OneHotEncoder dans ColumnTransformer sont généralement agnostiques à l'ordre
# mais c'est une bonne pratique de maintenir l'ordre ou de vérifier les noms des éléments)
# Il se peut que vous ayez besoin des noms d'entités sauvegardés lors de l'entraînement pour réorganiser correctement l'ordre.
# En supposant que l'ordre dans la création de l'échantillon corresponde à l'ordre original NUMERIC_BASE + CATEGORICAL
# et que build_lifestyle_index l'ajoute correctement.

# 4) Prédire
pred = pipe.predict(sample)
print("Score prédit:", float(pred[0]))

Score prédit: 67.30726303135387
