In [None]:
# ===============================
# 🎬 Entraînement modèle reco Louve
# ===============================
import os
import re

import joblib
import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, confusion_matrix, classification_report
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.calibration import CalibratedClassifierCV

import xgboost as xgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost

# ============ ⚙️ Configuration ============
# The MySQL connection string must come from the environment; abort early
# when it is absent or empty.
DATABASE_URL = os.environ.get("MYSQL_URL")
if not DATABASE_URL:
    raise RuntimeError("Missing MYSQL_URL")

# Only movies that have a synopsis are loaded.
SQL_QUERY = """
SELECT movie_id, title, synopsis, rating, genres, release_year
FROM movies
WHERE synopsis IS NOT NULL
"""

# Reproducibility and sub-sampling.
RANDOM_STATE = 42
SAMPLE_SIZE = 10_000

# Labelling threshold and text-feature dimensions.
LIKE_THRESHOLD = 4.0
TFIDF_MAX_FEATURES = 5000
SVD_COMPONENTS = 100

# Folder that receives the serialized artifacts; created if missing.
ARTIFACT_DIR = "E3_E4_API_app/model"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# MLflow experiment/run naming; the tracking URI falls back to MLflow's
# current value when MLFLOW_TRACKING_URI is not set.
EXPERIMENT_NAME = "monitoring_model-github"
RUN_NAME = "monitoring_train"
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", mlflow.get_tracking_uri())
)

# ============ 📦 Chargement données ============
# Matches every character that is not an ASCII letter, a digit, or whitespace.
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z0-9\s]")


def preprocess_text(text: str) -> str:
    """Return *text* lowercased, with non-alphanumeric characters blanked out.

    Each character that is neither an ASCII letter, a digit, nor whitespace
    is replaced by a single space; whitespace runs are not collapsed.
    """
    return _NON_ALNUM_PATTERN.sub(" ", text.lower())

# Run SQL_QUERY against the configured database and keep the result in `df`.
engine = create_engine(DATABASE_URL)
df = pd.read_sql(SQL_QUERY, engine)
# fillna("") ensures preprocess_text always receives a string before it
# calls .lower(); the cleaned text is stored alongside the raw synopsis.
df["synopsis_clean"] = df["synopsis"].fillna("").apply(preprocess_text)

# ============ 🔢 Preprocessing ============
# TF-IDF over the cleaned synopses, capped at TFIDF_MAX_FEATURES terms and
# using scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words="english")
tfidf_matrix_full = vectorizer.fit_transform(df["synopsis_clean"])

# Truncated SVD reduces the TF-IDF matrix to SVD_COMPONENTS dense columns.
# NOTE(review): both transformers are fitted on every row of df, i.e. before
# the train/test split further down — confirm this is intended.
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
tfidf_svd_full = svd.fit_transform(tfidf_matrix_full)

# Genres
def split_genres(s: str):
    """Split a "|"-separated genre string into a list of trimmed names.

    Falsy input (empty string, None) yields an empty list, and segments that
    are empty after stripping whitespace are dropped.
    """
    stripped_parts = (part.strip() for part in s.split("|")) if s else ()
    return [name for name in stripped_parts if name]

# Multi-hot encode the genres: one binary column per distinct genre label.
df["genres_list"] = df["genres"].fillna("").apply(split_genres)
mlb = MultiLabelBinarizer()
genres_encoded_full = mlb.fit_transform(df["genres_list"])

# Release year: missing values are replaced by the column mean, then the
# column is standardized.
scaler_year = StandardScaler()
year_scaled_full = scaler_year.fit_transform(
    df[["release_year"]].fillna(df["release_year"].mean())
)

# KNN similarity (on the raw TF-IDF matrix, cosine distance, brute force).
nn_full = NearestNeighbors(metric="cosine", algorithm="brute")
nn_full.fit(tfidf_matrix_full)
# Six neighbours are requested and the first column is discarded, leaving
# five similarity scores per row.
# NOTE(review): this presumes column 0 is each row's match with itself —
# verify for duplicate or empty synopses.
distances_full, _ = nn_full.kneighbors(tfidf_matrix_full, n_neighbors=6)
neighbor_scores_full = 1 - distances_full[:, 1:]

# Per-row summary statistics of the neighbour similarities, stacked as
# four feature columns (mean, max, min, std).
sim_mean_full = neighbor_scores_full.mean(axis=1)
sim_max_full = neighbor_scores_full.max(axis=1)
sim_min_full = neighbor_scores_full.min(axis=1)
sim_std_full = neighbor_scores_full.std(axis=1)
sim_stats_full = np.column_stack([sim_mean_full, sim_max_full, sim_min_full, sim_std_full])

# ============ 🎯 Features + labels ============
# Binary target: 1 when the rating reaches LIKE_THRESHOLD, otherwise 0.
y = (df["rating"] >= LIKE_THRESHOLD).astype(int).to_numpy()
# Feature matrix column order: SVD text components, genre indicators,
# scaled year, neighbour-similarity statistics.
X_full = np.column_stack([tfidf_svd_full, genres_encoded_full, year_scaled_full, sim_stats_full])

# Sub-sampling: when SAMPLE_SIZE is set and smaller than the dataset, draw
# that many rows without replacement using a seeded generator.
if SAMPLE_SIZE and SAMPLE_SIZE < len(df):
    rng = np.random.default_rng(RANDOM_STATE)
    idx = rng.choice(len(df), size=SAMPLE_SIZE, replace=False)
    X = X_full[idx]
    y = y[idx]
    df_used = df.iloc[idx].reset_index(drop=True)
else:
    X = X_full
    df_used = df.reset_index(drop=True)

# ============ ✂️ Train/test split ============
# 80/20 split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Class-imbalance weight: negatives / positives in the training split,
# falling back to 1.0 when there are no positive examples.
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
scale_pos_weight = (n_neg / n_pos) if n_pos > 0 else 1.0

# ============ 🤖 XGBoost model ============
# Fixed hyperparameters for the classifier (presumably the result of an
# earlier tuning run — the search itself is not part of this file).
params = {
    "colsample_bytree": 0.935552788417904,
    "gamma": 0.09983689107917987,
    "learning_rate": 0.11256219891445009,
    "max_depth": 3,
    "min_child_weight": 3,
    "n_estimators": 284,
    "subsample": 0.7511572371061874,
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
    "eval_metric": "logloss",
    "use_label_encoder": False,
    "scale_pos_weight": scale_pos_weight,
}

mlflow.set_experiment(EXPERIMENT_NAME)
with mlflow.start_run(run_name=RUN_NAME):
    mlflow.log_param("like_threshold", LIKE_THRESHOLD)
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # 🎯 Calibration: with an integer cv, CalibratedClassifierCV clones the
    # estimator and refits it on each fold, then fits a sigmoid calibrator
    # on the held-out part of that fold.
    xgb_calibrated = CalibratedClassifierCV(model, method='sigmoid', cv=5)
    xgb_calibrated.fit(X_train, y_train)

    # ⚖️ Calibrated predictions and positive-class probabilities on the test set.
    y_pred = xgb_calibrated.predict(X_test)
    y_prob = xgb_calibrated.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # The calibrated wrapper is a scikit-learn estimator, not a native
    # XGBoost model, so it must be logged with the sklearn flavor; the
    # xgboost flavor expects an XGBoost model it can serialize natively.
    mlflow.sklearn.log_model(xgb_calibrated, "xgb_hybrid_like_dislike_model")

    print(f"✅ Metrics — ACC: {acc:.4f} | ROC-AUC: {roc:.4f} | P: {prec:.4f} | R: {rec:.4f} | F1: {f1:.4f}")


# ============ 🤖 Confusion matrix ============
# Evaluate the calibrated predictions on the held-out test set.
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

# ============ 📦 Saving artifacts ============
# Calibrated model
joblib.dump(xgb_calibrated, os.path.join(ARTIFACT_DIR, "xgb_classifier_model.joblib"))

# TF-IDF vectorizer, SVD, full TF-IDF matrix and the fitted kNN index
joblib.dump(vectorizer, os.path.join(ARTIFACT_DIR, "reco_vectorizer.joblib"))
joblib.dump(svd, os.path.join(ARTIFACT_DIR, "svd_model.joblib"))
joblib.dump(tfidf_matrix_full, os.path.join(ARTIFACT_DIR, "tfidf_matrix_full.joblib"))
joblib.dump(nn_full, os.path.join(ARTIFACT_DIR, "nn_full.joblib"))

# Genre binarizer and year scaler
joblib.dump(mlb, os.path.join(ARTIFACT_DIR, "mlb_model.joblib"))
joblib.dump(scaler_year, os.path.join(ARTIFACT_DIR, "scaler_year.joblib"))

# 🎯 Save the scaler used for `proba_scaled`
# NOTE(review): the scaler is fitted on the test-set probabilities only, so
# values outside that observed range will map outside [0, 1] unless the
# consumer clips them — confirm against the API code.
scaler_proba = MinMaxScaler(feature_range=(0, 1))
scaler_proba.fit(y_prob.reshape(-1, 1))  # calibration for the API
joblib.dump(scaler_proba, os.path.join(ARTIFACT_DIR, "scaler_proba.joblib"))

print("🎉 Artefacts sauvegardés dans", ARTIFACT_DIR)

8 Netflix
337 Disney Plus
119 Amazon Prime Video
350 Apple TV+
2 Apple TV
283 Crunchyroll
381 Canal+
3 Google Play Movies
58 Canal VOD
147 M6+
61 Orange VOD
234 Arte
192 YouTube
188 YouTube Premium
138 FILMO
59 Bbox VOD
35 Rakuten TV
11 MUBI
310 LaCinetek
324 Cinemas a la Demande
415 Animation Digital Network
190 Curiosity Stream
475 DOCSVILLE
513 Shadowz
538 Plex
2077 Plex Channel
546 WOW Presents Plus
550 Tenk
551 Magellan TV
554 BroadwayHD
559 Filmzie
444 Dekkoo
239 Universcine
1967 Molotov TV
567 True Story
569 DocAlliance Films
315 Hoichoi
10 Amazon Video
300 Pluto TV
677 Eventive
685 Cine+ OCS Amazon Channel 
588 MGM Amazon Channel
201 MUBI Amazon Channel
692 Cultpix
701 FilmBox+
1732 Universcine Amazon Channel
1734 Filmo Amazon Channel
1733 Action Max Amazon Channel
1735 Insomnia Amazon Channel
1736 Shadowz Amazon Channel
1737 INA  madelen Amazon Channel
1738 Benshi Amazon Channel
1754 TF1+
1771 Takflix
309 Sun Nxt
1796 Netflix Standard with Ads
531 Paramount Plus
582 Paramount+