# Recommandation de film

## Modèle

In [1]:
import os
import re
import joblib
import numpy as np
import pandas as pd

from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors

import xgboost as xgb
import mlflow
import mlflow.xgboost

In [None]:
# =========================
# ⚙️ Config
# =========================
DATABASE_URL = os.getenv("MYSQL_URL")
if not DATABASE_URL:
    raise RuntimeError("Missing MYSQL_URL")

SQL_QUERY = """
SELECT movie_id, title, synopsis, rating, genres, release_year
FROM movies
WHERE synopsis IS NOT NULL
"""

mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", mlflow.get_tracking_uri()))
EXPERIMENT_NAME = "monitoring_model-github"
RUN_NAME = "monitoring_train"

LIKE_THRESHOLD = 4.0          # seuil like/dislike
TFIDF_MAX_FEATURES = 5000
SVD_COMPONENTS = 100
RANDOM_STATE = 42
SAMPLE_SIZE = 10_000          # None = tout le dataset

ARTIFACT_DIR = "E3_E4_API_app/model"
os.makedirs(ARTIFACT_DIR, exist_ok=True)


In [3]:
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

engine = create_engine(DATABASE_URL)
df = pd.read_sql(SQL_QUERY, engine)

df["synopsis_clean"] = df["synopsis"].fillna("").apply(preprocess_text)
df.head()


Unnamed: 0,movie_id,title,synopsis,rating,genres,release_year,synopsis_clean
0,2,Ariel,A Finnish man goes to the city to find a job a...,7.1,Crime,1988.0,a finnish man goes to the city to find a job a...
1,2,Ariel,"Salla, petite ville minière de la Laponie. Tai...",7.1,Crime,1988.0,salla petite ville mini re de la laponie tai...
2,5,Four Rooms,It's Ted the Bellhop's first night on the job....,5.9,Comedy,1995.0,it s ted the bellhop s first night on the job ...
3,6,Judgment Night,"Four young friends, while taking a shortcut en...",6.5,Action,1993.0,four young friends while taking a shortcut en...
4,6,La Nuit du jugement,"Quatre copains, voulant se rendre à un match d...",6.5,Crime|Thriller,1993.0,quatre copains voulant se rendre un match d...


In [4]:
# TF-IDF
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words="english")
tfidf_matrix_full = vectorizer.fit_transform(df["synopsis_clean"])

# Réduction SVD
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=RANDOM_STATE)
tfidf_svd_full = svd.fit_transform(tfidf_matrix_full)

# Genres
def split_genres(s: str):
    if not s: return []
    return [g.strip() for g in s.split("|") if g.strip()]

df["genres_list"] = df["genres"].fillna("").apply(split_genres)
mlb = MultiLabelBinarizer()
genres_encoded_full = mlb.fit_transform(df["genres_list"])

# Année
scaler_year = StandardScaler()
year_scaled_full = scaler_year.fit_transform(
    df[["release_year"]].fillna(df["release_year"].mean())
)

# Similarité kNN
nn_full = NearestNeighbors(metric="cosine", algorithm="brute")
nn_full.fit(tfidf_matrix_full)
distances_full, _ = nn_full.kneighbors(tfidf_matrix_full, n_neighbors=6)
neighbor_scores_full = 1 - distances_full[:, 1:]

sim_mean_full = neighbor_scores_full.mean(axis=1)
sim_max_full = neighbor_scores_full.max(axis=1)
sim_min_full = neighbor_scores_full.min(axis=1)
sim_std_full = neighbor_scores_full.std(axis=1)

sim_stats_full = np.column_stack([sim_mean_full, sim_max_full, sim_min_full, sim_std_full])


In [5]:
# Label binaire
y = (df["rating"] >= LIKE_THRESHOLD).astype(int).to_numpy()

# Features
X_full = np.column_stack([tfidf_svd_full, genres_encoded_full, year_scaled_full, sim_stats_full])

# Échantillonnage optionnel
if SAMPLE_SIZE and SAMPLE_SIZE < len(df):
    rng = np.random.default_rng(RANDOM_STATE)
    idx = rng.choice(len(df), size=SAMPLE_SIZE, replace=False)
    X = X_full[idx]
    y = y[idx]
    df_used = df.iloc[idx].reset_index(drop=True)
else:
    X = X_full
    df_used = df.reset_index(drop=True)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
scale_pos_weight = (n_neg / n_pos) if n_pos > 0 else 1.0

In [7]:
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME):
    params = {
        "colsample_bytree": 0.935552788417904,
        "gamma": 0.09983689107917987,
        "learning_rate": 0.11256219891445009,
        "max_depth": 3,
        "min_child_weight": 3,
        "n_estimators": 284,
        "subsample": 0.7511572371061874,
        "random_state": 42,
        "n_jobs": -1,
        "eval_metric": "logloss",
        "use_label_encoder": False,
        "scale_pos_weight": scale_pos_weight,
    }
    mlflow.log_param("like_threshold", LIKE_THRESHOLD)
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_prob)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", roc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    mlflow.xgboost.log_model(model, "xgb_hybrid_like_dislike_model")

print(f"✅ Metrics — ACC: {acc:.4f} | ROC-AUC: {roc:.4f} | P: {prec:.4f} | R: {rec:.4f} | F1: {f1:.4f}")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)


✅ Metrics — ACC: 0.6710 | ROC-AUC: 0.7458 | P: 0.8078 | R: 0.6440 | F1: 0.7166


In [8]:
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Confusion Matrix:
 [[510 198]
 [460 832]]

Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.72      0.61       708
           1       0.81      0.64      0.72      1292

    accuracy                           0.67      2000
   macro avg       0.67      0.68      0.66      2000
weighted avg       0.71      0.67      0.68      2000



In [9]:
joblib.dump(model, os.path.join(ARTIFACT_DIR, "xgb_classifier_model.joblib"))
joblib.dump(vectorizer, os.path.join(ARTIFACT_DIR, "reco_vectorizer.joblib"))
joblib.dump(svd, os.path.join(ARTIFACT_DIR, "svd_model.joblib"))
joblib.dump(tfidf_matrix_full, os.path.join(ARTIFACT_DIR, "tfidf_matrix_full.joblib"))
joblib.dump(mlb, os.path.join(ARTIFACT_DIR, "mlb_model.joblib"))
joblib.dump(scaler_year, os.path.join(ARTIFACT_DIR, "scaler_year.joblib"))
joblib.dump(nn_full, os.path.join(ARTIFACT_DIR, "nn_full.joblib"))

# df.to_csv(os.path.join(ARTIFACT_DIR, "movies_full.csv"), index=False)

print("🎉 Artefacts sauvegardés dans", ARTIFACT_DIR)


🎉 Artefacts sauvegardés dans model
