In [None]:
from sqlalchemy import create_engine
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
engine = create_engine(DATABASE_URL)

# 1. Charger les données
query = "SELECT movie_id, title, genres, synopsis FROM movies WHERE synopsis IS NOT NULL"
df = pd.read_sql(query, engine)

# 2. Nettoyage
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# 3. Vectorisation TF-IDF
vectorizer = TfidfVectorizer(
    max_features=30000, 
    ngram_range=(1,3), 
    stop_words="english", 
    sublinear_tf=True,
    norm='l2', 
    min_df=5, 
    max_df=0.8
)

tfidf_matrix = vectorizer.fit_transform(df["synopsis_clean"])

In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# lancement : mlflow ui

# ------------------------------
# Connexion MLflow
# ------------------------------
mlflow.set_experiment("movie_cosine_similarity_test")

with mlflow.start_run(run_name="TFIDF_Cosine_Test") as run:
    # ------------------------------
    # Paramètres TF-IDF à logger
    # ------------------------------
    max_features = 80000
    ngram_range = (1,3)
    min_df = 5
    max_df = 0.8
    sublinear_tf = True
    norm = 'l2'

    mlflow.log_param("max_features", max_features)
    mlflow.log_param("ngram_range", ngram_range)
    mlflow.log_param("min_df", min_df)
    mlflow.log_param("max_df", max_df)
    mlflow.log_param("sublinear_tf", sublinear_tf)
    mlflow.log_param("norm", norm)

    # ------------------------------
    # Préprocessing
    # ------------------------------
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def preprocess_text(text):
        text = text.lower()
        text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
        words = text.split()
        words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
        return " ".join(words)

    df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

    # ------------------------------
    # TF-IDF
    # ------------------------------
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words="english",
        sublinear_tf=sublinear_tf,
        norm=norm,
        min_df=min_df,
        max_df=max_df
    )

    tfidf_matrix = vectorizer.fit_transform(df["synopsis_clean"])

    # ------------------------------
    # Sample pour test
    # ------------------------------
    sample_df = df.sample(5, random_state=42)
    closest_movies = []

    for idx, row in sample_df.iterrows():
        cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
        cosine_sim[idx] = -1  # ignorer lui-même
        top_idx = cosine_sim.argmax()
        closest_movies.append({
            "Tested Movie": row['title'],
            "Closest Movie": df.iloc[top_idx]['title'],
            "Cosine Score": cosine_sim[top_idx]
        })

    result_df = pd.DataFrame(closest_movies)
    print(result_df)

    # ------------------------------
    # Logger dans MLflow
    # ------------------------------
    mlflow.log_metric("mean_cosine_score", result_df['Cosine Score'].mean())

    # Optionnel : log dataframe complet comme artifact CSV
    result_df.to_csv("cosine_test_results.csv", index=False)
    mlflow.log_artifact("cosine_test_results.csv")


In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sqlalchemy import create_engine
import re

# =========================
# 1. Charger les données
# =========================
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
engine = create_engine(DATABASE_URL)

query = "SELECT movie_id, title, synopsis, rating, genres FROM movies WHERE synopsis IS NOT NULL"
df = pd.read_sql(query, engine)

# Nettoyage
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# Genres en liste
df["genres_list"] = df["genres"].fillna("").apply(lambda x: [g.strip() for g in x.split(",")])

# Sample pour limiter mémoire
df_sample = df.sample(10_000, random_state=42).reset_index(drop=True)

# =========================
# TF-IDF + SVD
# =========================
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df_sample["synopsis_clean"].fillna(""))

svd = TruncatedSVD(n_components=50, random_state=42)
tfidf_svd = svd.fit_transform(tfidf_matrix)

# =========================
# Nearest Neighbors
# =========================
nn = NearestNeighbors(metric="cosine", algorithm="brute")
nn.fit(tfidf_matrix)
distances, indices = nn.kneighbors(tfidf_matrix, n_neighbors=10)  # 10 voisins
neighbors = indices[:, 1:]
neighbor_scores = 1 - distances[:, 1:]

sim_mean = neighbor_scores.mean(axis=1)
sim_max = neighbor_scores.max(axis=1)
sim_min = neighbor_scores.min(axis=1)
sim_std = neighbor_scores.std(axis=1)

# =========================
# Genres one-hot
# =========================
mlb = MultiLabelBinarizer()
genres_ohe = mlb.fit_transform(df_sample["genres_list"])

# =========================
# Target binaire : like ou pas
# =========================
df_sample["like"] = (df_sample["rating"] >= 7).astype(int)

# =========================
# Construction features
# =========================
X = np.column_stack([tfidf_svd, sim_mean, sim_max, sim_min, sim_std, genres_ohe])
y = df_sample["like"]

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =========================
# MLflow tracking
# =========================
mlflow.set_experiment("movies_reco_classification")

with mlflow.start_run(run_name="hybrid_cosine_xgb_classif"):
    params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "objective": "binary:logistic",
        "random_state": 42,
        "n_jobs": -1
    }
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Evaluation
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", auc)
    mlflow.xgboost.log_model(model, "xgb_model_classif")

    # =========================
    # Fonction recommandations
    # =========================
    def recommend_movies(model, movie_id, user_rating=8, top_k=5):
        idx = df_sample.index[df_sample["movie_id"] == movie_id][0]
        neighbor_idx = neighbors[idx]
        sims = neighbor_scores[idx]

        rec_movies = []
        rec_features = []

        for i, sim_score in zip(neighbor_idx, sims):
            rec_movies.append(df_sample.iloc[i]["title"])
            feat = np.hstack([
                tfidf_svd[i],
                sim_mean[i],
                sim_max[i],
                sim_min[i],
                sim_std[i],
                genres_ohe[i]
            ])
            rec_features.append(feat)

        rec_features = scaler.transform(np.array(rec_features))
        pred_scores = model.predict_proba(rec_features)[:,1]

        # Pondération par la note de l'utilisateur
        pred_scores *= (user_rating / 10)

        top_idx = np.argsort(pred_scores)[-top_k:][::-1]
        return [(rec_movies[i], pred_scores[i]) for i in top_idx]

    # Exemple
    movie_id = df_sample.iloc[0]["movie_id"]
    recommendations = recommend_movies(model, movie_id, user_rating=9)

    rec_df = pd.DataFrame(recommendations, columns=["title", "pred_score"])
    rec_df.to_csv("recommendations_classif.csv", index=False)
    mlflow.log_artifact("recommendations_classif.csv")

print("✅ Pipeline terminé ! Accuracy:", acc, "| AUC:", auc)
print("✅ Recommandations exemples :\n", rec_df)


In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sqlalchemy import create_engine
import re

# =========================
# 1. Charger les données
# =========================
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
engine = create_engine(DATABASE_URL)
query = "SELECT movie_id, title, synopsis, rating, genres, release_year FROM movies WHERE synopsis IS NOT NULL"
df = pd.read_sql(query, engine)

# Nettoyage texte
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# Sample pour limiter la mémoire
df_sample = df.sample(10_000, random_state=42).reset_index(drop=True)

# =========================
# TF-IDF + SVD
# =========================
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df_sample["synopsis_clean"].fillna(""))
svd = TruncatedSVD(n_components=100, random_state=42)
tfidf_svd = svd.fit_transform(tfidf_matrix)

# =========================
# Encodage genres
# =========================
df_sample["genres_list"] = df_sample["genres"].fillna("").apply(lambda x: x.split("|"))
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(df_sample["genres_list"])

# =========================
# Normalisation année de sortie
# =========================
scaler_year = StandardScaler()
year_scaled = scaler_year.fit_transform(df_sample[["release_year"]].fillna(df_sample["release_year"].mean()))

# =========================
# Nearest Neighbors (Cosine sur TF-IDF)
# =========================
nn = NearestNeighbors(metric="cosine", algorithm="brute")
nn.fit(tfidf_matrix)
distances, indices = nn.kneighbors(tfidf_matrix, n_neighbors=6)
neighbors = indices[:, 1:]
neighbor_scores = 1 - distances[:, 1:]

# Features statistiques des voisins
sim_mean = neighbor_scores.mean(axis=1)
sim_max = neighbor_scores.max(axis=1)
sim_min = neighbor_scores.min(axis=1)
sim_std = neighbor_scores.std(axis=1)

# =========================
# Construction features X
# =========================
rating_norm = df_sample["rating"] / 10.0  # note sur 10
X = np.column_stack([tfidf_svd, genres_encoded, year_scaled, sim_mean, sim_max, sim_min, sim_std, rating_norm])
y = df_sample["rating"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =========================
# MLflow Tracking
# =========================
mlflow.set_experiment("movies_reco_pipeline_hybrid_v2")

with mlflow.start_run(run_name="cosine_xgboost_genres_year"):

    params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "random_state": 42,
        "n_jobs": -1
    }
    mlflow.log_params(params)

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)

    # Évaluation
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2)

    # Log modèle
    mlflow.xgboost.log_model(model, "xgboost_model")

    # =========================
    # Fonction de recommandation
    # =========================
    def recommend_movies(model, movie_id, user_rating=8.0, top_k=5):
        idx = df_sample.index[df_sample["movie_id"] == movie_id][0]

        neighbor_idx = neighbors[idx]
        sims = neighbor_scores[idx]

        rec_movies = []
        rec_features = []

        for i, sim_score in zip(neighbor_idx, sims):
            rec_movies.append(df_sample.iloc[i]["title"])
            feat = np.hstack([
                tfidf_svd[i],
                genres_encoded[i],
                year_scaled[i],
                sim_mean[i],
                sim_max[i],
                sim_min[i],
                sim_std[i],
                user_rating / 10.0  # pondération selon note utilisateur
            ])
            rec_features.append(feat)

        rec_features = np.array(rec_features)
        pred_scores = model.predict(rec_features)

        top_idx = np.argsort(pred_scores)[-top_k:][::-1]
        recommended = [(rec_movies[i], pred_scores[i]) for i in top_idx]
        return recommended

    # Exemple test
    movie_id = df_sample.iloc[0]["movie_id"]
    user_rating = 8.0
    recommendations = recommend_movies(model, movie_id, user_rating)

    rec_df = pd.DataFrame(recommendations, columns=["title", "pred_score"])
    rec_df.to_csv("recommendations_v2.csv", index=False)
    mlflow.log_artifact("recommendations_v2.csv")

print("✅ Pipeline terminé ! RMSE:", rmse, "| R²:", r2)
print("✅ Recommandations exemples :")
print(rec_df)


In [40]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
from sqlalchemy import create_engine
import re
import os
import joblib

# =========================
# 1. Charger les données
# =========================
DATABASE_URL = "mysql+pymysql://louve:%40Marley080922@mysql-louve.alwaysdata.net/louve_movies"
engine = create_engine(DATABASE_URL)
query = "SELECT movie_id, title, synopsis, rating, genres, release_year FROM movies WHERE synopsis IS NOT NULL"
df = pd.read_sql(query, engine)

# Nettoyage texte
def preprocess_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    return text

df["synopsis_clean"] = df["synopsis"].apply(preprocess_text)

# =========================
# 2. TF-IDF + SVD sur tout le dataset
# =========================
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix_full = vectorizer.fit_transform(df["synopsis_clean"].fillna(""))

svd_full = TruncatedSVD(n_components=100, random_state=42)
tfidf_svd_full = svd_full.fit_transform(tfidf_matrix_full)

# =========================
# 3. Genres et années
# =========================
df["genres_list"] = df["genres"].fillna("").apply(lambda x: x.split("|"))

mlb = MultiLabelBinarizer()
genres_encoded_full = mlb.fit_transform(df["genres_list"])

scaler_year = StandardScaler()
year_scaled_full = scaler_year.fit_transform(df[["release_year"]].fillna(df["release_year"].mean()))

# =========================
# 4. Nearest Neighbors sur full dataset
# =========================
nn_full = NearestNeighbors(metric="cosine", algorithm="brute")
nn_full.fit(tfidf_matrix_full)
distances_full, indices_full = nn_full.kneighbors(tfidf_matrix_full, n_neighbors=6)
neighbor_scores_full = 1 - distances_full[:, 1:]

sim_mean_full = neighbor_scores_full.mean(axis=1)
sim_max_full = neighbor_scores_full.max(axis=1)
sim_min_full = neighbor_scores_full.min(axis=1)
sim_std_full = neighbor_scores_full.std(axis=1)

# =========================
# 5. Échantillon pour entraînement XGB
# =========================
df_sample = df.sample(10_000, random_state=42).reset_index(drop=True)

tfidf_matrix_sample = vectorizer.transform(df_sample["synopsis_clean"].fillna(""))
tfidf_svd_sample = svd_full.transform(tfidf_matrix_sample)

genres_encoded_sample = mlb.transform(df_sample["genres_list"])
year_scaled_sample = scaler_year.transform(df_sample[["release_year"]].fillna(df["release_year"].mean()))

nn_sample = NearestNeighbors(metric="cosine", algorithm="brute")
nn_sample.fit(tfidf_matrix_sample)
distances_sample, indices_sample = nn_sample.kneighbors(tfidf_matrix_sample, n_neighbors=6)
neighbor_scores_sample = 1 - distances_sample[:, 1:]

sim_mean_sample = neighbor_scores_sample.mean(axis=1)
sim_max_sample = neighbor_scores_sample.max(axis=1)
sim_min_sample = neighbor_scores_sample.min(axis=1)
sim_std_sample = neighbor_scores_sample.std(axis=1)

# Classification : like / dislike
threshold = 7.0  # note >= 7 -> "like"
y_class_sample = (df_sample["rating"] >= threshold).astype(int)

# Features
X_sample = np.column_stack([
    tfidf_svd_sample,
    genres_encoded_sample,
    year_scaled_sample,
    sim_mean_sample,
    sim_max_sample,
    sim_min_sample,
    sim_std_sample
])

X_train, X_test, y_train, y_test = train_test_split(X_sample, y_class_sample, test_size=0.2, random_state=42)

# =========================
# 6. MLflow & XGB
# =========================
mlflow.set_experiment("movies_reco_pipeline_classif")

with mlflow.start_run(run_name="xgb_hybrid_classif"):
    params = {
        "n_estimators": 300,
        "max_depth": 6,
        "learning_rate": 0.05,
        "random_state": 42,
        "n_jobs": -1,
        "use_label_encoder": False,
        "eval_metric": "logloss"
    }
    mlflow.log_params(params)

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Évaluation
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("auc", auc)
    mlflow.xgboost.log_model(model, "xgb_classifier_model")

print("✅ Pipeline classification terminé ! Accuracy:", acc, "| AUC:", auc)

# =========================
# 7. Sauvegarde des artefacts
# =========================
os.makedirs("model", exist_ok=True)

# Sauvegarde du modèle
joblib.dump(model, "model/xgb_classifier_model.joblib")
print("✅ Modèle XGB sauvegardé !")

# Sauvegarde du vectorizer
joblib.dump(vectorizer, "model/reco_vectorizer.joblib")
print("✅ TfidfVectorizer sauvegardé !")

# Sauvegarde du SVD
joblib.dump(svd_full, "model/svd_model.joblib")
print("✅ SVD sauvegardé !")

# Sauvegarde de la TF-IDF matrix complète
joblib.dump(tfidf_matrix_full, "model/tfidf_matrix_full.joblib")
print("✅ TF-IDF matrix complète sauvegardée !")

# Sauvegarde du movie index
df[["title"]].to_csv("model/movie_index.csv", index=False)
print("✅ Movie index complet sauvegardé !")

# Sauvegarde du MultiLabelBinarizer
joblib.dump(mlb, "model/mlb_model.joblib")
print("✅ MultiLabelBinarizer sauvegardé !")

# Sauvegarde du StandardScaler pour l'année
joblib.dump(scaler_year, "model/scaler_year.joblib")
print("✅ StandardScaler pour l'année sauvegardé !")

# Sauvegarde du NearestNeighbors complet
joblib.dump(nn_full, "model/nn_full.joblib")
print("✅ NearestNeighbors complet sauvegardé !")

# Sauvegarde du DataFrame complet
df.to_csv("model/movies_full.csv", index=False)
print("✅ DataFrame complet sauvegardé !")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  self.get_booster().save_model(fname)


✅ Pipeline classification terminé ! Accuracy: 0.832 | AUC: 0.5694185487226879
✅ Modèle XGB sauvegardé !
✅ TfidfVectorizer sauvegardé !
✅ SVD sauvegardé !
✅ TF-IDF matrix complète sauvegardée !
✅ Movie index complet sauvegardé !
✅ MultiLabelBinarizer sauvegardé !
✅ StandardScaler pour l'année sauvegardé !
✅ NearestNeighbors complet sauvegardé !
✅ DataFrame complet sauvegardé !
