# Feature Engineering - Optimisé basé sur l'analyse exploratoire

In [5]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the cleaned anime table; the path is resolved relative to the
# current working directory of the kernel.
clean_data_path = "../data/processed/anime_clean.csv"
df = pd.read_csv(clean_data_path)

## 1. Gestion des Données Manquantes

In [6]:
print("Nettoyage des données manquantes...")

# A missing synopsis becomes an empty string so every row holds a str.
df['synopsis'] = df['synopsis'].fillna('')

# Gaps in the numeric columns are filled, column by column, with that
# column's median computed over the whole frame.
numeric_features = ['score', 'members', 'year', 'popularity']
column_medians = df[numeric_features].median()
df[numeric_features] = df[numeric_features].fillna(column_medians)

Nettoyage des données manquantes...


## 2. Features Textuelles - TF-IDF Optimisé

In [7]:
print("Extraction TF-IDF avec paramètres optimisés...")

# Vectoriser configuration: unigrams and bigrams, English stop words
# removed, terms kept only if they occur in at least 2 documents and in at
# most 80% of them, vocabulary capped at 3000 entries.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    max_features=3000,
)
tfidf_matrix = tfidf.fit_transform(df['synopsis'])

# Project the TF-IDF matrix onto 100 SVD components (seeded for
# reproducibility).
svd = TruncatedSVD(n_components=100, algorithm='randomized', random_state=42)
tfidf_reduced = svd.fit_transform(tfidf_matrix)

explained_variance = svd.explained_variance_ratio_.sum()
print(f"Variance expliquée: {explained_variance:.3f}")

Extraction TF-IDF avec paramètres optimisés...
Variance expliquée: 0.255


## 3. Features Numériques avec Normalisation Robuste

In [8]:
print("Normalisation des features numériques...")

# Derived columns: log(1 + members) and the square of the score.
df['log_members'] = np.log1p(df['members'])
df['score_squared'] = df['score'].pow(2)

# Standardise the raw and derived numeric columns together; the fitted
# scaler is kept so the same transformation can be reapplied later.
features_to_scale = ['score', 'log_members', 'year', 'popularity', 'score_squared']
numeric_frame = df[features_to_scale]
scaler = StandardScaler().fit(numeric_frame)
numeric_scaled = scaler.transform(numeric_frame)

Normalisation des features numériques...


## 4. Features Catégorielles Enrichies

In [9]:
print("Encodage des features catégorielles...")

# Combine genres + themes + demographics into one space-separated string.
# Missing fields are replaced by '' first: string + NaN yields NaN in pandas,
# so a single missing field would otherwise blank out the whole combined
# value and discard the tags the row does have.
category_parts = df[['genres', 'themes', 'demographics']].fillna('')
df['all_categories'] = (
    category_parts['genres'] + ' ' +
    category_parts['themes'] + ' ' +
    category_parts['demographics']
)

# One-hot encoding of the combined string as a single categorical column.
# max_categories=80 caps the number of output columns; less frequent values
# are grouped into one "infrequent" column by scikit-learn.
# NOTE(review): each distinct combined string is one category, so individual
# genres/themes are not encoded separately — confirm this is intended.
onehot = OneHotEncoder(handle_unknown='ignore', max_categories=80)
genres_encoded = onehot.fit_transform(df['all_categories'].values.reshape(-1, 1))

# One-hot encoding of the anime type; categories unseen at fit time are
# encoded as all zeros at transform time (handle_unknown='ignore').
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['type']])

Encodage des features catégorielles...


## 5. Features d'Interaction Basées sur les Corrélations

In [10]:
print("Création de features d'interaction...")

# Fixed reference year against which `year` is differenced.
REFERENCE_YEAR = 2024

# Two hand-built interaction terms:
#   score x log(1 + members)
#   popularity x (reference year - year)
members_log = np.log1p(df['members'])
years_before_reference = REFERENCE_YEAR - df['year']
df['score_members_interaction'] = df['score'] * members_log
df['popularity_year_interaction'] = df['popularity'] * years_before_reference

# Standardise the interaction terms with their own scaler, kept for reuse.
interaction_features = ['score_members_interaction', 'popularity_year_interaction']
interaction_scaler = StandardScaler()
interaction_scaled = interaction_scaler.fit_transform(df[interaction_features])

Création de features d'interaction...


## 6. Assemblage Final des Features

In [11]:
print("Assemblage des features...")

# Column-wise concatenation of every feature group. scipy.sparse.hstack
# returns a sparse container even when some inputs are dense arrays.
final_features = hstack([
    tfidf_reduced,
    numeric_scaled,
    genres_encoded,
    type_encoded,
    interaction_scaled
])

# Column names, listed in the same order as the blocks stacked above.
feature_columns = (
    [f'tfidf_svd_{i}' for i in range(tfidf_reduced.shape[1])] +
    features_to_scale +
    onehot.get_feature_names_out().tolist() +
    type_encoder.get_feature_names_out().tolist() +
    interaction_features
)

# Fail loudly if names and columns disagree instead of silently truncating
# the name list, which would hide a mislabelled feature matrix.
if len(feature_columns) != final_features.shape[1]:
    raise ValueError(
        f"Nombre de noms de colonnes ({len(feature_columns)}) différent du "
        f"nombre de features ({final_features.shape[1]})"
    )

# Dense DataFrame view of the features, plus identifier columns taken
# positionally from df (row order is the same as in the stacked matrix).
features_df = pd.DataFrame(final_features.toarray(), columns=feature_columns)
features_df['mal_id'] = df['mal_id'].values
features_df['title'] = df['title'].values

Assemblage des features...


## 7. Sauvegarde des Artéfacts

In [12]:
print("Sauvegarde des processeurs et features...")

# Make sure the output directories exist so the dumps below do not fail
# with FileNotFoundError on a fresh checkout.
models_dir = Path('../models')
processed_dir = Path('../data/processed')
for output_dir in (models_dir, processed_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Fitted preprocessing objects, keyed by the file name they are saved under.
fitted_processors = {
    'tfidf_vectorizer.pkl': tfidf,
    'svd_reducer.pkl': svd,
    'numeric_scaler.pkl': scaler,
    'genre_encoder.pkl': onehot,
    'type_encoder.pkl': type_encoder,
    'interaction_scaler.pkl': interaction_scaler,
}
for filename, processor in fitted_processors.items():
    joblib.dump(processor, models_dir / filename)

# Engineered features: a CSV of the dense DataFrame and a joblib dump of
# the stacked sparse matrix.
features_df.to_csv(processed_dir / 'features_engineered.csv', index=False)
joblib.dump(final_features, processed_dir / 'final_features_sparse.pkl')

n_rows_cols = final_features.shape
print("Feature engineering optimisé terminé!")
print(f"Shape final: {n_rows_cols}")
print(f"Nombre de features: {n_rows_cols[1]}")

Sauvegarde des processeurs et features...
Feature engineering optimisé terminé!
Shape final: (18882, 197)
Nombre de features: 197
