# Feature Engineering - Optimis√© bas√© sur l'analyse exploratoire

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from scipy.sparse import hstack
import joblib

df = pd.read_csv("../data/processed/anime_clean.csv")

## 1. Gestion des Donn√©es Manquantes

In [2]:
print("Nettoyage des donn√©es manquantes...")
# Remplacer les synopsis vides par une cha√Æne vide
df['synopsis'] = df['synopsis'].fillna('')

# Remplacer les valeurs manquantes num√©riques
numeric_features = ['score', 'members', 'year', 'popularity']
df[numeric_features] = df[numeric_features].fillna(df[numeric_features].median())

Nettoyage des donn√©es manquantes...


## 2. Features Textuelles - TF-IDF Optimis√©

In [3]:
print("Extraction TF-IDF avec param√®tres optimis√©s...")

# Param√®tres bas√©s sur l'analyse des synopsis
tfidf = TfidfVectorizer(
    max_features=3000,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

tfidf_matrix = tfidf.fit_transform(df['synopsis'])

# R√©duction de dimension
svd = TruncatedSVD(n_components=100, random_state=42, algorithm='randomized')
tfidf_reduced = svd.fit_transform(tfidf_matrix)

print(f"Variance expliqu√©e: {svd.explained_variance_ratio_.sum():.3f}")

Extraction TF-IDF avec param√®tres optimis√©s...
Variance expliqu√©e: 0.255


## 3. Features Num√©riques avec Normalisation Robuste

In [4]:
print("Normalisation des features num√©riques...")

# Transformations bas√©es sur l'analyse
df['log_members'] = np.log1p(df['members'])
df['score_squared'] = df['score'] ** 2

features_to_scale = ['score', 'log_members', 'year', 'popularity', 'score_squared']
scaler = StandardScaler()
numeric_scaled = scaler.fit_transform(df[features_to_scale])

Normalisation des features num√©riques...


## 4. Features Cat√©gorielles Enrichies

In [5]:
print("Encodage des features cat√©gorielles...")

# Combinaison genres + th√®mes + demographics
df['all_categories'] = (
    df['genres'] + ' ' + 
    df['themes'] + ' ' + 
    df['demographics']
)

# One-Hot Encoding
onehot = OneHotEncoder(handle_unknown='ignore', max_categories=80)
genres_encoded = onehot.fit_transform(df['all_categories'].values.reshape(-1, 1))

# Encodage du type
type_encoder = OneHotEncoder(handle_unknown='ignore')
type_encoded = type_encoder.fit_transform(df[['type']])

Encodage des features cat√©gorielles...


## 5. Features d'Interaction Bas√©es sur les Corr√©lations

In [6]:
print("Cr√©ation de features d'interaction...")

# Interactions bas√©es sur les corr√©lations observ√©es
df['score_members_interaction'] = df['score'] * np.log1p(df['members'])
df['popularity_year_interaction'] = df['popularity'] * (2024 - df['year'])

interaction_features = ['score_members_interaction', 'popularity_year_interaction']
interaction_scaler = StandardScaler()
interaction_scaled = interaction_scaler.fit_transform(df[interaction_features])

Cr√©ation de features d'interaction...


## 6. Assemblage Final des Features

In [9]:
print("Assemblage des features...")

# Concat√©nation de toutes les features
final_features = hstack([
    tfidf_reduced,
    numeric_scaled,
    genres_encoded,
    type_encoded,
    interaction_scaled
])

# Cr√©ation du DataFrame final
feature_columns = (
    [f'tfidf_svd_{i}' for i in range(tfidf_reduced.shape[1])] +
    features_to_scale +
    onehot.get_feature_names_out().tolist() +
    type_encoder.get_feature_names_out().tolist() +
    interaction_features
)

features_df = pd.DataFrame(final_features.toarray() if hasattr(final_features, 'toarray') else final_features, 
                          columns=feature_columns[:final_features.shape[1]])
features_df['mal_id'] = df['mal_id'].values
features_df['title'] = df['title'].values

Assemblage des features...


In [10]:
print("‚úÖ V√©rification finale des features...")

# V√©rifier les dimensions
print(f"Dimensions finales: {final_features.shape}")
print(f"Nombre total de features: {final_features.shape[1]}")

# V√©rifier la variance des features num√©riques
var_check = pd.DataFrame(numeric_scaled, columns=features_to_scale).var()
print("\nVariance moyenne des features num√©riques:", var_check.mean().round(4))

# Test de coh√©rence sur TF-IDF
print("\nVariance expliqu√©e (SVD):", round(svd.explained_variance_ratio_.sum(), 3))

# Test m√©moire (utile avant mod√®le)
import sys
size_mb = final_features.data.nbytes / 1024 / 1024
print(f"Taille m√©moire de la matrice finale : {size_mb:.2f} MB")


‚úÖ V√©rification finale des features...
Dimensions finales: (18882, 197)
Nombre total de features: 197

Variance moyenne des features num√©riques: 1.0001

Variance expliqu√©e (SVD): 0.255
Taille m√©moire de la matrice finale : 15.23 MB


## 7. Sauvegarde des Art√©facts

In [8]:
print("Sauvegarde des processeurs et features...")

# Sauvegarde des processeurs
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')
joblib.dump(svd, '../models/svd_reducer.pkl')
joblib.dump(scaler, '../models/numeric_scaler.pkl')
joblib.dump(onehot, '../models/genre_encoder.pkl')
joblib.dump(type_encoder, '../models/type_encoder.pkl')
joblib.dump(interaction_scaler, '../models/interaction_scaler.pkl')

# Sauvegarde des features
features_df.to_csv('../data/processed/features_engineered.csv', index=False)
joblib.dump(final_features, '../data/processed/final_features_sparse.pkl')

print("Feature engineering optimis√© termin√©!")
print(f"Shape final: {final_features.shape}")
print(f"Nombre de features: {final_features.shape[1]}")

Sauvegarde des processeurs et features...
Feature engineering optimis√© termin√©!
Shape final: (18882, 197)
Nombre de features: 197


## üìò R√©sum√© du Feature Engineering

| Type de Feature | M√©thode | D√©tails | Nb Features |
|-----------------|----------|----------|--------------|
| Texte | TF-IDF + SVD | ngram=(1,2), 3000 max_features, 100 comps | 100 |
| Num√©riques | StandardScaler | score, members, year, popularity, interactions | 5+2 |
| Cat√©gorielles | OneHotEncoder | genres+th√®mes+type | ~80 |
| Interactions | Log & Multiplicatives | score√ómembers, popularity√óyear | 2 |

**Total features** ‚âà `final_features.shape[1]`  
**Fichier sauvegard√©** : `../data/processed/features_engineered.csv`  
**Processeurs** sauvegard√©s : `../models/*.pkl`
