#  Notebook : `01b_exploration_data.ipynb`

### **Objectif**

  - Explorer le dataset hybride avant la vectorisation
  - Identifier les tendances, corrélations et biais
  - Guider les choix de features et modèles pour la suite

---

In [None]:
# ================================================================
# 🧠 1. Imports & setup
# ================================================================


import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

# Plot defaults applied to the figures created after this cell.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

# Input / output locations (relative paths, resolved from the working directory).
DATA_PROCESSED = "../../data/processed/"
REPORTS_DIR = "../../reports/"
# Make sure the report folder and its "figures" sub-folder exist.
for out_dir in (REPORTS_DIR, os.path.join(REPORTS_DIR, "figures")):
    os.makedirs(out_dir, exist_ok=True)

# Load the reduced hybrid dataset and report its size.
df = pd.read_csv(os.path.join(DATA_PROCESSED, "train_hybride_reduced.csv"))
print(f"‚úÖ Dataset charg√© : {len(df):,} lignes, {len(df.columns)} colonnes")

# Optional input: the user-favourites file is read only when it is already
# on disk; otherwise user_favs is None and later cells test for that.
user_favs_path = os.path.join(DATA_PROCESSED, "user_favs_temp.csv")
if os.path.exists(user_favs_path):
    user_favs = pd.read_csv(user_favs_path)
    print(f"Interractions utilisateur charg√©es : {len(user_favs):,} lignes")
else:
    user_favs = None


[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


‚úÖ Dataset charg√© : 1,462,009 lignes, 16 colonnes


In [None]:
# ================================================================
# 2. Data overview
# ================================================================
# First rows, column dtypes / non-null counts, and numeric summary statistics.
display(df.head(3))
print("\n--- INFO ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it must not be wrapped in print() (that appends a stray "None" line).
df.info()
print("\n--- DESCRIPTION NUM√âRIQUE ---")
display(df.describe().T)

# Cardinality of the two identifier columns.
print(f"Utilisateurs uniques : {df['username'].nunique()}")
print(f"Animes uniques : {df['mal_id'].nunique()}")


Unnamed: 0,mal_id,title,genres,themes,type,rating,score,members,popularity,year,synopsis_length,completion_rate,drop_rate,weighted_score,total_votes,username
0,45649,The First Slam Dunk,"['Award Winning', 'Sports']","['School', 'Team Sports']",Movie,PG-13 - Teens 13 or older,8.72,69803,2881,0.0,113,0.555468,0.006759,8.668793,32134.0,ishikawas
1,38680,Fruits Basket 1st Season,"['Drama', 'Romance', 'Supernatural']","['Love Polygon', 'School']",TV,PG-13 - Teens 13 or older,8.21,916323,211,2019.0,213,0.569366,0.039585,8.165223,435739.0,ishikawas
2,795,Oniisama e...,"['Drama', 'Girls Love']","['Psychological', 'School']",TV,PG-13 - Teens 13 or older,7.86,53824,3315,1991.0,176,0.249485,0.031223,7.837709,11960.0,ishikawas



--- INFO ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462009 entries, 0 to 1462008
Data columns (total 16 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   mal_id           1462009 non-null  int64  
 1   title            1462009 non-null  object 
 2   genres           1462009 non-null  object 
 3   themes           1462009 non-null  object 
 4   type             1462009 non-null  object 
 5   rating           1462009 non-null  object 
 6   score            1462009 non-null  float64
 7   members          1462009 non-null  int64  
 8   popularity       1462009 non-null  int64  
 9   year             1462009 non-null  float64
 10  synopsis_length  1462009 non-null  int64  
 11  completion_rate  1462009 non-null  float64
 12  drop_rate        1462009 non-null  float64
 13  weighted_score   1462009 non-null  float64
 14  total_votes      1462009 non-null  float64
 15  username         1462009 non-null  object 
dtypes: f

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mal_id,1462009.0,20976.73,17800.84,1.0,2890.0,17074.0,36649.0,62568.0
score,1462009.0,8.23552,0.653661,0.0,7.92,8.31,8.64,9.29
members,1462009.0,1354196.0,1030303.0,23228.0,499878.0,1121020.0,1977969.0,4230312.0
popularity,1462009.0,448.3562,741.9353,1.0,44.0,148.0,497.0,5006.0
year,1462009.0,1760.092,666.228,0.0,2005.0,2012.0,2017.0,2026.0
synopsis_length,1462009.0,163.1697,35.057,0.0,140.0,165.0,183.0,370.0
completion_rate,1462009.0,0.6378789,0.1724834,0.0,0.554265,0.6645215,0.7585827,0.967253
drop_rate,1462009.0,0.02795773,0.02121955,0.0,0.01484,0.02353282,0.03471266,0.3805747
weighted_score,1462009.0,8.205439,0.5874769,0.0,7.87965,8.278976,8.611927,10.0
total_votes,1462009.0,805069.4,699650.1,0.0,240910.0,632398.0,1221747.0,2980061.0


Utilisateurs uniques : 192174
Animes uniques : 4828


In [None]:


import matplotlib
# The backend is intentionally left unchanged here. Forcing the
# non-interactive "Agg" backend with matplotlib.use("Agg") is a global switch:
# every later plt.show() then displays nothing and only emits a
# "non-interactive backend" warning. Figures that merely need to be written
# to disk can be saved with savefig() and closed under any backend.
import matplotlib.pyplot as plt
import seaborn as sns

# ================================================================
# 3. Univariate analysis
# ================================================================
# One histogram (30 bins, KDE overlay) per numeric column that is present in
# df; each figure is written to <REPORTS_DIR>/figures/dist_<column>.png and
# closed without being displayed.
num_cols = ['score','weighted_score','completion_rate','drop_rate','members','popularity','year','synopsis_length']
for col in num_cols:
    if col not in df.columns:
        continue
    values = df[col]
    if col == "year":
        # NOTE(review): year == 0 looks like a missing-value placeholder
        # (describe() reports min 0 and mean ~1760 while the quartiles are
        # 2005-2017) -- confirm against the preprocessing step. Left in, it
        # stretches the x-axis to 0 and squeezes every real year into one or
        # two bins, so it is excluded from the histogram only.
        values = values[values > 0]
    # Explicit figure/axes so nothing is drawn onto a figure left open by
    # another cell.
    fig, ax = plt.subplots()
    sns.histplot(values, bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution de {col}")
    # os.path.join avoids the doubled separator produced by
    # f"{REPORTS_DIR}/figures/..." when REPORTS_DIR already ends with "/".
    fig.savefig(os.path.join(REPORTS_DIR, "figures", f"dist_{col}.png"), bbox_inches="tight")
    plt.close(fig)  # free the figure; many are created in this loop


In [None]:
# Genre and type analysis
if "genres" in df.columns:
    # The genres column holds stringified Python lists such as
    # "['Drama', 'Romance']". Splitting on "," alone leaves the brackets and
    # quotes attached ("['Drama'", "'Romance']"), so the same genre is counted
    # under several spellings. Strip the list syntax before counting.
    genre_labels = (
        df["genres"]
        .dropna()
        .str.strip("[]")
        .str.split(",")
        .explode()
        .str.strip()
        .str.strip("'\"")
    )
    # Rows whose list was empty ("[]") yield an empty label: drop them.
    top_genres = genre_labels[genre_labels != ""].value_counts().head(20)

    # Explicit figure/axes: the plot must not depend on (or draw over)
    # whatever figure happens to be current.
    fig, ax = plt.subplots()
    top_genres.plot(kind="bar", color="teal", ax=ax)
    ax.set_title("Top 20 genres les plus fr√©quents")
    # NOTE(review): counts are per row of df, not per distinct anime, so the
    # y-axis label may overstate what is measured -- confirm intended unit.
    ax.set_ylabel("Nombre d'animes")
    fig.savefig(os.path.join(REPORTS_DIR, "figures", "top_genres.png"), bbox_inches="tight")
    plt.show()
    plt.close(fig)  # show() does not close figures on non-interactive backends

if "type" in df.columns:
    # Horizontal count plot, most frequent type first.
    fig, ax = plt.subplots()
    sns.countplot(data=df, y="type", order=df["type"].value_counts().index, ax=ax)
    ax.set_title("Distribution par type d'anime")
    fig.savefig(os.path.join(REPORTS_DIR, "figures", "type_distribution.png"), bbox_inches="tight")
    plt.show()
    plt.close(fig)


  plt.show()
  plt.show()


In [None]:
# ================================================================
# 4. Correlations between numeric variables
# ================================================================
# Pearson correlation matrix over the numeric columns present in df.
corr_cols = [c for c in num_cols if c in df.columns]
corr = df[corr_cols].corr()

# Each plot gets its own explicit figure: relying on the implicit "current
# figure" makes the scatter plot below draw on top of the heatmap whenever
# plt.show() does not close the figure (non-interactive backends).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Corr√©lations entre variables num√©riques")
fig.savefig(os.path.join(REPORTS_DIR, "figures", "correlation_heatmap.png"), bbox_inches="tight")
plt.show()
plt.close(fig)

# Targeted example: score against completion rate.
# Exactly coincident (score, completion_rate) pairs add no new positions to
# the plot, only rendering cost and saturated opacity, so each distinct pair
# is drawn once.
scatter_points = df[["score", "completion_rate"]].drop_duplicates()
fig, ax = plt.subplots()
sns.scatterplot(data=scatter_points, x="score", y="completion_rate", alpha=0.5, ax=ax)
ax.set_title("Score vs Completion Rate")
fig.savefig(os.path.join(REPORTS_DIR, "figures", "score_vs_completion.png"), bbox_inches="tight")
plt.show()
plt.close(fig)


  plt.show()
  plt.show()


In [None]:
# ================================================================
# 5. User behaviour analysis
# ================================================================
# Runs only when the optional user_favs table was loaded; it is grouped by
# its "username" and "mal_id" columns to count rows per user and per anime.
if user_favs is not None:
    user_activity = user_favs.groupby("username").size()
    anime_popularity = user_favs.groupby("mal_id").size()

    # Explicit figure/axes so the histogram is never drawn onto a figure
    # left open by an earlier cell.
    fig, ax = plt.subplots()
    sns.histplot(user_activity, bins=30, ax=ax)
    ax.set_title("Distribution du nombre de favoris par utilisateur")
    ax.set_xlabel("Nombre de favoris")
    ax.set_ylabel("Nombre d'utilisateurs")
    fig.savefig(os.path.join(REPORTS_DIR, "figures", "user_activity.png"), bbox_inches="tight")
    plt.show()
    plt.close(fig)

    # The 20 mal_id values with the most rows in user_favs.
    top_animes = anime_popularity.sort_values(ascending=False).head(20)
    fig, ax = plt.subplots()
    top_animes.plot(kind="bar", color="orange", ax=ax)
    ax.set_title("Top 20 animes les plus ajout√©s en favoris")
    ax.set_ylabel("Nombre de favoris")
    fig.savefig(os.path.join(REPORTS_DIR, "figures", "top_favoris.png"), bbox_inches="tight")
    plt.show()
    plt.close(fig)

    print(f"Nb moyen de favoris par utilisateur : {user_activity.mean():.2f}")
    print(f"Nb moyen de favoris par anime : {anime_popularity.mean():.2f}")
else:
    print("‚ö†Ô∏è user_favs non disponible pour cette exploration (aucun fichier trouv√©).")


‚ö†Ô∏è user_favs non disponible pour cette exploration (aucun fichier trouv√©).


In [None]:
# ================================================================
# 6. Trends and key insights
# ================================================================
print("üéØ INSIGHTS PR√âLIMINAIRES")

# Score vs completion rate: print the Pearson correlation and a short reading.
if 'completion_rate' in df.columns and 'score' in df.columns:
    corr_val = df['completion_rate'].corr(df['score'])
    print(f"‚û°Ô∏è Corr√©lation score ‚Üî completion_rate : {corr_val:.2f}")
    if corr_val > 0.5:
        print("   ‚Üí Les animes bien not√©s ont tendance √† √™tre termin√©s plus souvent.")
    elif abs(corr_val) < 0.2:
        # "Weak" is judged on the magnitude: a strongly negative coefficient
        # must not be reported as a weak relationship.
        print("   ‚Üí Le score n‚Äôexplique que faiblement le taux de compl√©tion.")

# Members vs popularity: a strongly negative coefficient is reported as an
# inverse relationship.
if 'members' in df.columns and 'popularity' in df.columns:
    corr_pop = df['members'].corr(df['popularity'])
    print(f"‚û°Ô∏è Corr√©lation members ‚Üî popularity : {corr_pop:.2f}")
    if corr_pop < -0.7:
        print("   ‚Üí Popularity est une m√©trique inverse du nombre de membres (confirm√©).")

# Mean score per release year.
if 'year' in df.columns:
    # NOTE(review): year == 0 looks like a missing-value placeholder
    # (describe() reports min 0 while the quartiles are 2005-2017) -- confirm
    # against the preprocessing step. Kept in, it adds a point at x = 0 that
    # compresses the whole real trend into the right edge of the plot.
    trend = df[df['year'] > 0].groupby('year')['score'].mean().dropna()
    fig, ax = plt.subplots()
    trend.plot(title="√âvolution moyenne du score par ann√©e", ax=ax)
    fig.savefig(os.path.join(REPORTS_DIR, "figures", "trend_score_year.png"), bbox_inches="tight")
    plt.show()
    plt.close(fig)  # show() does not close figures on non-interactive backends


üéØ INSIGHTS PR√âLIMINAIRES
‚û°Ô∏è Corr√©lation score ‚Üî completion_rate : 0.12
   ‚Üí Le score n‚Äôexplique que faiblement le taux de compl√©tion.
‚û°Ô∏è Corr√©lation members ‚Üî popularity : -0.59


  plt.show()


In [None]:
# ================================================================
# 7. Automated report (optional)
# ================================================================
# Best-effort step: any failure while building or writing the profiling
# report is printed instead of stopping the notebook.
try:
    profile = ProfileReport(df, title="Anime Hybrid Dataset EDA", minimal=True)
    report_path = os.path.join(REPORTS_DIR, "anime_eda_report.html")
    profile.to_file(report_path)
    print("üíæ Rapport EDA sauvegard√© dans /reports/anime_eda_report.html")
except Exception as err:
    print("‚ö†Ô∏è Impossible de g√©n√©rer le rapport Profiling :", err)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [00:02<00:00,  5.48it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

üíæ Rapport EDA sauvegard√© dans /reports/anime_eda_report.html


In [None]:
# ================================================================
# 8. Conclusions & directions for modelling
# ================================================================
# Prints a fixed block of text: nothing below is computed from df, so the
# wording has to be kept in sync with the results by hand.
# NOTE(review): the saved output of the insights cell reports a
# score/completion_rate correlation of 0.12 and a members/popularity
# correlation of -0.59, which does not obviously support the "notable
# correlation" and "highly correlated" bullets below -- confirm or reword.
print("""
üìå Conclusions de l‚Äôexploration :
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
- Les distributions de scores et de completion_rate indiquent une base biais√©e vers les animes populaires.
- Les genres et types dominants (Action, Comedy, Drama, TV) doivent √™tre √©quilibr√©s ou pond√©r√©s.
- Corr√©lation notable entre engagement (completion_rate) et score ‚Üí utile comme feature de qualit√©.
- Variables 'members' et 'popularity' tr√®s corr√©l√©es ‚Üí √† fusionner ou normaliser.
- synopsis_length variable selon le type ‚Üí indicateur indirect de profondeur narrative.

üöÄ Recommandations pour le prochain notebook (02_feature_engineering.ipynb) :
- TF-IDF ou Sentence Embeddings sur synopsis (feature textuelle principale)
- One-hot / Multi-label encoding sur genres, themes, type
- Normalisation MinMax sur scores, rates et members
- Construction d‚Äôun vecteur hybride : contenu (TF-IDF) + stats (num√©riques normalis√©es)
""")



üìå Conclusions de l‚Äôexploration :
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
- Les distributions de scores et de completion_rate indiquent une base biais√©e vers les animes populaires.
- Les genres et types dominants (Action, Comedy, Drama, TV) doivent √™tre √©quilibr√©s ou pond√©r√©s.
- Corr√©lation notable entre engagement (completion_rate) et score ‚Üí utile comme feature de qualit√©.
- Variables 'members' et 'popularity' tr√®s corr√©l√©es ‚Üí √† fusionner ou normaliser.
- synopsis_length variable selon le type ‚Üí indicateur indirect de profondeur narrative.

üöÄ Recommandations pour le prochain notebook (02_feature_engineering.ipynb) :
- TF-IDF ou Sentence Embeddings sur synopsis (feature textuelle principale)
- One-hot / Multi-label encoding sur genres, themes, type
- Normalisation MinMax sur scores, rates et members
- Construction d‚Äôun vecteur hybride : contenu (TF-IDF) + stats (num√©riques normalis√©es)

