In [1]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sqlalchemy import create_engine
import re
import os
import joblib
import nltk

nltk.download("stopwords")
nltk.download("wordnet")

# =========================
# 1. Charger les données
# =========================
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
engine = create_engine(DATABASE_URL)
query = "SELECT movie_id, title, synopsis, rating, genres, release_year FROM movies WHERE synopsis IS NOT NULL"
df = pd.read_sql(query, engine)

# Nettoyage texte
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# =========================
# 2. TF-IDF + SVD sur tout le dataset
# =========================
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix_full = vectorizer.fit_transform(df["synopsis_clean"].fillna(""))

svd_full = TruncatedSVD(n_components=100, random_state=42)
tfidf_svd_full = svd_full.fit_transform(tfidf_matrix_full)

# =========================
# 3. Genres et années
# =========================
df["genres_list"] = df["genres"].fillna("").apply(lambda x: x.split("|"))

mlb = MultiLabelBinarizer()
genres_encoded_full = mlb.fit_transform(df["genres_list"])

scaler_year = StandardScaler()
year_scaled_full = scaler_year.fit_transform(df[["release_year"]].fillna(df["release_year"].mean()))

# =========================
# 4. Nearest Neighbors sur full dataset
# =========================
nn_full = NearestNeighbors(metric="cosine", algorithm="brute")
nn_full.fit(tfidf_matrix_full)
distances_full, indices_full = nn_full.kneighbors(tfidf_matrix_full, n_neighbors=6)
neighbor_scores_full = 1 - distances_full[:, 1:]

sim_mean_full = neighbor_scores_full.mean(axis=1)
sim_max_full = neighbor_scores_full.max(axis=1)
sim_min_full = neighbor_scores_full.min(axis=1)
sim_std_full = neighbor_scores_full.std(axis=1)

# =========================
# 5. Échantillon pour entraînement XGB
# =========================
df_sample = df.sample(10_000, random_state=42).reset_index(drop=True)

tfidf_matrix_sample = vectorizer.transform(df_sample["synopsis_clean"].fillna(""))
tfidf_svd_sample = svd_full.transform(tfidf_matrix_sample)

genres_encoded_sample = mlb.transform(df_sample["genres_list"])
year_scaled_sample = scaler_year.transform(df_sample[["release_year"]].fillna(df["release_year"].mean()))

nn_sample = NearestNeighbors(metric="cosine", algorithm="brute")
nn_sample.fit(tfidf_matrix_sample)
distances_sample, indices_sample = nn_sample.kneighbors(tfidf_matrix_sample, n_neighbors=6)
neighbor_scores_sample = 1 - distances_sample[:, 1:]

sim_mean_sample = neighbor_scores_sample.mean(axis=1)
sim_max_sample = neighbor_scores_sample.max(axis=1)
sim_min_sample = neighbor_scores_sample.min(axis=1)
sim_std_sample = neighbor_scores_sample.std(axis=1)

# Classification : like / dislike
threshold = 7.0  # note >= 7 -> "like"
y_class_sample = (df_sample["rating"] >= threshold).astype(int)

# Features
X_sample = np.column_stack([
    tfidf_svd_sample,
    genres_encoded_sample,
    year_scaled_sample,
    sim_mean_sample,
    sim_max_sample,
    sim_min_sample,
    sim_std_sample
])

X_train, X_test, y_train, y_test = train_test_split(X_sample, y_class_sample, test_size=0.2, random_state=42)

# =========================
# 6. MLflow & XGB
# =========================
mlflow.set_experiment("movies_reco_pipeline_classif")

with mlflow.start_run(run_name="xgb_hybrid_classif"):
    params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "random_state": 42,
        "n_jobs": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss"
    }
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Évaluation
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("auc", auc)
    mlflow.xgboost.log_model(model, "xgb_classifier_model")

print("✅ Pipeline classification terminé ! Accuracy:", acc, "| AUC:", auc)

# =========================
# 7. Sauvegarde des artefacts
# =========================
os.makedirs("model", exist_ok=True)

# Sauvegarde du modèle
joblib.dump(model, "model/xgb_classifier_model.joblib")
print("✅ Modèle XGB sauvegardé !")

# Sauvegarde du vectorizer
joblib.dump(vectorizer, "model/reco_vectorizer.joblib")
print("✅ TfidfVectorizer sauvegardé !")

# Sauvegarde du SVD
joblib.dump(svd_full, "model/svd_model.joblib")
print("✅ SVD sauvegardé !")

# Sauvegarde de la TF-IDF matrix complète
joblib.dump(tfidf_matrix_full, "model/tfidf_matrix_full.joblib")
print("✅ TF-IDF matrix complète sauvegardée !")

# Sauvegarde du movie index
df[["title"]].to_csv("model/movie_index.csv", index=False)
print("✅ Movie index complet sauvegardé !")

# Sauvegarde du MultiLabelBinarizer
joblib.dump(mlb, "model/mlb_model.joblib")
print("✅ MultiLabelBinarizer sauvegardé !")

# Sauvegarde du StandardScaler pour l'année
joblib.dump(scaler_year, "model/scaler_year.joblib")
print("✅ StandardScaler pour l'année sauvegardé !")

# Sauvegarde du NearestNeighbors complet
joblib.dump(nn_full, "model/nn_full.joblib")
print("✅ NearestNeighbors complet sauvegardé !")

# Sauvegarde du DataFrame complet
df.to_csv("model/movies_full.csv", index=False)
print("✅ DataFrame complet sauvegardé !")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\loulo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\loulo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj

✅ Pipeline classification terminé ! Accuracy: 0.832 | AUC: 0.5694185487226879
✅ Modèle XGB sauvegardé !
✅ TfidfVectorizer sauvegardé !
✅ SVD sauvegardé !
✅ TF-IDF matrix complète sauvegardée !
✅ Movie index complet sauvegardé !
✅ MultiLabelBinarizer sauvegardé !
✅ StandardScaler pour l'année sauvegardé !
✅ NearestNeighbors complet sauvegardé !
✅ DataFrame complet sauvegardé !
