In [None]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# Load the three intermediate datasets: keywords, NER output and sentiment results.
df_keywords, df_ner, df_sentiment = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/df_agri_keywords.csv",
        "datasets/df_agri_ner.csv",
        "datasets/processed_results.csv",
    )
)

In [None]:
# Build df_final: the keyword table enriched with the NER and sentiment columns.
ner_cols = ["actor_persons", "actor_organizations", "actor_locations"]
sentiment_cols = ["sentiment", "sentiment_confidence"]

# The three tables are combined row by row (index-aligned assignment), so they
# must have the same number of rows; otherwise pandas would silently NaN-fill
# or drop rows instead of failing.
# NOTE(review): this assumes the three CSV files list the same segments in the
# same order -- confirm, or merge on a shared key if one exists.
if not (len(df_keywords) == len(df_ner) == len(df_sentiment)):
    raise ValueError(
        "Row count mismatch between inputs: "
        f"keywords={len(df_keywords)}, ner={len(df_ner)}, "
        f"sentiment={len(df_sentiment)}"
    )

# Work on a copy so df_keywords stays exactly as loaded: later cells add
# columns to df_final in place and must not alter the raw table.
df_final = df_keywords.copy()
df_final[ner_cols] = df_ner[ner_cols]
df_final[sentiment_cols] = df_sentiment[sentiment_cols]
df_final.to_csv("datasets/df_agri_final.csv", index=False, encoding="utf-8")

In [None]:
# Preview the first five rows of the merged dataset.
df_final.head(n=5)

In [None]:
# Wider preview: the first ten rows.
df_final.head(n=10)

## Analysis

In [None]:
# Plot defaults shared by every figure in the notebook.
plt.style.use("default")
sns.set_palette("husl")
plt.rcParams.update({"figure.figsize": (14, 8), "font.size": 10})

# Quick check of the dataset dimensions and available columns.
n_rows, n_columns = df_final.shape
print(f"Nombre de segments: {n_rows}")
print(f"Nombre de colonnes: {n_columns}")
print("\nColonnes disponibles:")
print(list(df_final.columns))

### Question 1: Quels sont les modèles agricoles représentés dans les reportages ?

Nous allons analyser la colonne `themes` pour identifier les modèles agricoles et thématiques environnementales abordés.

In [None]:
# Analyser les th√®mes
import ast
from collections import Counter


# Helper that turns one raw `themes` cell into a list of theme names.
def extract_themes(themes_str):
    """Split a comma-separated themes value into a list of clean theme names.

    Missing values (NaN/None) and the empty string give an empty list.
    Otherwise the value is converted to ``str``, split on commas, each piece
    is stripped of surrounding whitespace and blank pieces are discarded.
    """
    if pd.isna(themes_str) or themes_str == "":
        return []
    cleaned = str(themes_str).strip(",").strip()
    pieces = (piece.strip() for piece in cleaned.split(","))
    return [piece for piece in pieces if piece]


# Flatten the themes of every segment into one list.
all_themes = [
    theme for themes in df_final["themes"] for theme in extract_themes(themes)
]

# Tally how often each theme occurs and show the most frequent ones.
theme_counts = Counter(all_themes)
print(f"Nombre total de th√®mes uniques: {len(theme_counts)}")
print("\nTop 20 des th√®mes les plus fr√©quents:")
for theme_name, n_mentions in theme_counts.most_common(20):
    print(f"  {theme_name}: {n_mentions}")

In [None]:
# Figure 1: the 15 most frequent themes as a horizontal bar chart.
top_themes = theme_counts.most_common(15)
themes_df = pd.DataFrame(top_themes, columns=["Theme", "Count"])

axis_label_style = {"fontsize": 12, "fontweight": "bold"}

fig, ax = plt.subplots(figsize=(14, 8))
bars = ax.barh(themes_df["Theme"], themes_df["Count"], color="#2ecc71")
ax.set_xlabel("Nombre de mentions", **axis_label_style)
ax.set_ylabel("Th√®me / Mod√®le agricole", **axis_label_style)
ax.set_title(
    "Top 15 des mod√®les agricoles et th√®mes environnementaux dans les reportages",
    fontsize=14,
    fontweight="bold",
    pad=20,
)
# Most frequent theme at the top.
ax.invert_yaxis()

# Write each count just past the end of its bar.
for bar in bars:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.text(
        bar_length,
        bar_middle,
        f" {int(bar_length)}",
        ha="left",
        va="center",
        fontweight="bold",
    )

plt.tight_layout()
plt.show()

In [None]:
# Group the detailed themes into broad thematic families.
categories_mapping = {
    "biodiversite": [
        "biodiversite_causes_indirectes",
        "biodiversite_causes_directes",
        "biodiversite_consequences",
        "biodiversite_solutions",
    ],
    "changement_climatique": [
        "changement_climatique_causes_indirectes",
        "changement_climatique_causes_directes",
        "changement_climatique_consequences",
        "changement_climatique_solutions",
    ],
    "attenuation_climatique": [
        "attenuation_climatique_solutions_indirectes",
        "attenuation_climatique_solutions_directes",
    ],
    "adaptation_climatique": ["adaptation_climatique_solutions"],
    "ressources": [
        "ressources_consequences",
        "ressources_causes_indirectes",
        "ressources_solutions",
    ],
    "pollution": [
        "pollution_causes_indirectes",
        "pollution_causes_directes",
        "pollution_consequences",
        "pollution_solutions",
    ],
}

# Reverse lookup: detailed theme -> family name.
theme_to_category = {
    theme: category
    for category, themes_list in categories_mapping.items()
    for theme in themes_list
}

# Sum the mentions per family; themes absent from the mapping go to "autres".
category_counts = Counter()
for theme, count in theme_counts.items():
    category_counts[theme_to_category.get(theme, "autres")] += count

# One row per family, most mentioned first.
cat_df = pd.DataFrame(category_counts.most_common(), columns=["Cat√©gorie", "Count"])

bold = {"fontweight": "bold"}
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: bar chart of mentions per family.
bars = ax1.bar(cat_df["Cat√©gorie"], cat_df["Count"], color="#3498db")
ax1.set_xlabel("Cat√©gorie", fontsize=12, **bold)
ax1.set_ylabel("Nombre de mentions", fontsize=12, **bold)
ax1.set_title("Distribution des th√®mes par grande cat√©gorie", fontsize=13, **bold)
ax1.tick_params(axis="x", rotation=45)
for bar in bars:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.0
    ax1.text(bar_center, bar_top, f"{int(bar_top)}", ha="center", va="bottom", **bold)

# Right panel: the same counts as shares of the total.
colors = sns.color_palette("husl", len(cat_df))
wedges, texts, autotexts = ax2.pie(
    cat_df["Count"],
    labels=cat_df["Cat√©gorie"],
    autopct="%1.1f%%",
    colors=colors,
    startangle=90,
)
ax2.set_title("Proportion des cat√©gories de th√®mes", fontsize=13, **bold)
# Percentage labels sit inside the wedges: make them white and bold.
for autotext in autotexts:
    autotext.set(color="white", fontweight="bold")

plt.tight_layout()
plt.show()

### Question 2: Quels sont les acteurs prenant la parole sur l'agriculture dans les médias en France ?

Nous allons analyser les colonnes `actor_persons` (personnes) et `actor_organizations` (organisations).

In [None]:
# Analyser les acteurs (personnes et organisations)


def extract_actors(actor_str):
    """Extract a list of clean string items from one list-like cell value.

    Despite its name, the notebook also uses this helper for keywords, nouns,
    secondary categories and locations: anything stored as the text of a
    Python list.

    Args:
        actor_str: Raw cell value, normally the ``repr`` of a Python list
            (e.g. ``"['A', 'B']"``) or a plain comma-separated string.

    Returns:
        list[str]: The items with surrounding whitespace and quotes removed.
        Missing values, ``""`` and ``"[]"`` give ``[]``. Items that are empty
        after cleaning are dropped so they are never counted as entities.
    """
    if pd.isna(actor_str) or actor_str == "" or actor_str == "[]":
        return []
    try:
        parsed = ast.literal_eval(actor_str)
    except (ValueError, SyntaxError):
        # Not a valid Python literal: fall back to a plain comma split and
        # strip leftover brackets/quotes, then any whitespace they enclosed.
        candidates = [
            part.strip().strip("'\"[]").strip() for part in str(actor_str).split(",")
        ]
    else:
        if isinstance(parsed, str):
            # A single quoted name, e.g. "'Macron'".
            parsed = [parsed]
        elif not isinstance(parsed, (list, tuple)):
            # Numbers, dicts, ... carry no list of names.
            return []
        # A value such as "'A', 'B'" (no brackets) parses as a tuple and is
        # handled exactly like a list.
        candidates = [str(item).strip().strip("'\"") for item in parsed if item]
    # Filter AFTER cleaning so whitespace-only or bracket-only items vanish.
    return [candidate for candidate in candidates if candidate]


# Flatten every person mention across all segments.
all_persons = [
    person
    for persons in df_final["actor_persons"]
    for person in extract_actors(persons)
]

# Same for organisations.
all_organizations = [
    organization
    for orgs in df_final["actor_organizations"]
    for organization in extract_actors(orgs)
]

# Mention counts per distinct actor.
person_counts = Counter(all_persons)
org_counts = Counter(all_organizations)

print(f"Nombre total de personnes uniques: {len(person_counts)}")
print(f"Nombre total d'organisations uniques: {len(org_counts)}")
for heading, counter in (
    ("\nTop 15 des personnes les plus mentionn√©es:", person_counts),
    ("\nTop 15 des organisations les plus mentionn√©es:", org_counts),
):
    print(heading)
    for actor_name, n_mentions in counter.most_common(15):
        print(f"  {actor_name}: {n_mentions}")

In [None]:
# Figure 2: the 20 most-mentioned people.
top_persons = person_counts.most_common(20)
if not top_persons:
    print("Aucune donn√©e sur les personnes")
else:
    persons_df = pd.DataFrame(top_persons, columns=["Personne", "Count"])
    bold = {"fontweight": "bold"}

    fig, ax = plt.subplots(figsize=(14, 10))
    bars = ax.barh(persons_df["Personne"], persons_df["Count"], color="#e74c3c")
    ax.set_xlabel("Nombre de mentions", fontsize=12, **bold)
    ax.set_ylabel("Personne", fontsize=12, **bold)
    ax.set_title(
        "Top 20 des personnes prenant la parole sur l'agriculture",
        fontsize=14,
        pad=20,
        **bold,
    )
    # Most-mentioned person at the top.
    ax.invert_yaxis()

    # Write each count just past the end of its bar.
    for bar in bars:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.text(
            bar_length, bar_middle, f" {int(bar_length)}", ha="left", va="center", **bold
        )

    plt.tight_layout()
    plt.show()

In [None]:
# Figure 3: the 20 most-mentioned organisations.
top_orgs = org_counts.most_common(20)
if not top_orgs:
    print("Aucune donn√©e sur les organisations")
else:
    orgs_df = pd.DataFrame(top_orgs, columns=["Organisation", "Count"])
    bold = {"fontweight": "bold"}

    fig, ax = plt.subplots(figsize=(14, 10))
    bars = ax.barh(orgs_df["Organisation"], orgs_df["Count"], color="#9b59b6")
    ax.set_xlabel("Nombre de mentions", fontsize=12, **bold)
    ax.set_ylabel("Organisation", fontsize=12, **bold)
    ax.set_title(
        "Top 20 des organisations cit√©es dans les reportages agricoles",
        fontsize=14,
        pad=20,
        **bold,
    )
    # Most-mentioned organisation at the top.
    ax.invert_yaxis()

    # Write each count just past the end of its bar.
    for bar in bars:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.text(
            bar_length, bar_middle, f" {int(bar_length)}", ha="left", va="center", **bold
        )

    plt.tight_layout()
    plt.show()

In [None]:
# Figure 4: people vs organisations, total mentions and distinct actors.
actor_types = ["Personnes", "Organisations"]
actor_counts_data = [len(all_persons), len(all_organizations)]
unique_counts = [len(person_counts), len(org_counts)]


def _label_bar_tops(axis, rectangles):
    """Write the integer height of each bar, centred just above it."""
    for rect in rectangles:
        top = rect.get_height()
        axis.text(
            rect.get_x() + rect.get_width() / 2.0,
            top,
            f"{int(top)}",
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=12,
        )


type_colors = ["#e74c3c", "#9b59b6"]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: total number of mentions.
bars1 = ax1.bar(actor_types, actor_counts_data, color=type_colors)
ax1.set_ylabel("Nombre total de mentions", fontsize=12, fontweight="bold")
ax1.set_title(
    "Nombre total de mentions par type d'acteur", fontsize=13, fontweight="bold"
)
_label_bar_tops(ax1, bars1)

# Right panel: number of distinct actors.
bars2 = ax2.bar(actor_types, unique_counts, color=type_colors)
ax2.set_ylabel("Nombre d'acteurs uniques", fontsize=12, fontweight="bold")
ax2.set_title("Nombre d'acteurs uniques par type", fontsize=13, fontweight="bold")
_label_bar_tops(ax2, bars2)

plt.tight_layout()
plt.show()

### Question 3: Quels sont les angles avec lesquels sont traités les sujets agricoles ?

Nous allons analyser:
1. Les mots-clés (`keywords_filtered`) pour identifier les angles de traitement
2. Le sentiment (`sentiment`) pour comprendre le ton des reportages
3. Les catégories secondaires (`secondary_categories`)

In [None]:
# Flatten the filtered keywords of every segment. extract_actors is reused
# here because the column is stored in the same list-as-text format.
all_keywords = [
    keyword
    for keywords in df_final["keywords_filtered"]
    for keyword in extract_actors(keywords)
]

# Frequency of each distinct keyword.
keyword_counts = Counter(all_keywords)
print(f"Nombre total de mots-cl√©s uniques: {len(keyword_counts)}")
print("\nTop 30 des mots-cl√©s les plus fr√©quents:")
for keyword_text, n_occurrences in keyword_counts.most_common(30):
    print(f"  {keyword_text}: {n_occurrences}")

In [None]:
# Identify the editorial angles through thematic families of trigger terms.
angles_categories = {
    "√âconomie/Pouvoir d'achat": [
        "prix",
        "co√ªt",
        "√©conomie",
        "march√©",
        "euros",
        "budget",
        "argent",
        "pouvoir achat",
        "revenus",
        "salaire",
        "financier",
    ],
    "Emploi": [
        "emploi",
        "travail",
        "ch√¥mage",
        "m√©tier",
        "formation",
        "embauche",
        "travailleurs",
    ],
    "Sant√©": [
        "sant√©",
        "maladie",
        "m√©decin",
        "h√¥pital",
        "soins",
        "patient",
        "alimentation",
    ],
    "Environnement": [
        "environnement",
        "√©cologie",
        "pollution",
        "climat",
        "biodiversit√©",
        "nature",
        "durable",
        "vert",
    ],
    "R√©glementation/Normes": [
        "norme",
        "r√®glement",
        "loi",
        "interdiction",
        "autorisation",
        "r√©glementation",
        "directive",
        "l√©gislation",
    ],
    "Mobilisation/Col√®re": [
        "col√®re",
        "manifestation",
        "protestation",
        "mobilisation",
        "gr√®ve",
        "blocage",
        "agriculteurs",
    ],
    "Technologie/Innovation": [
        "technologie",
        "innovation",
        "num√©rique",
        "robot",
        "digital",
        "moderne",
        "nouveau",
    ],
}


def _word_start_pattern(terms):
    """Compile a regex matching any of `terms` only where it starts a word.

    A raw substring test counts "emploi" as a hit for "loi", "information"
    for "formation" or "ouvert" for "vert". Anchoring on the start of a word
    removes those mid-word hits while still accepting plurals and derived
    forms ("lois", "climatique"). Prefix collisions such as "vertical" for
    "vert" remain possible.
    """
    if not terms:
        return re.compile(r"(?!)")  # matches nothing, like any([]) did
    alternatives = "|".join(re.escape(term) for term in terms)
    # (?<!\w) rather than \b so a term whose first character is not a word
    # character can still match at the very beginning of the keyword.
    return re.compile(rf"(?<!\w)(?:{alternatives})")


# Total keyword occurrences per angle. A keyword counts at most once per
# angle, but may contribute to several angles.
angle_counts = {}
for angle, keywords_list in angles_categories.items():
    angle_pattern = _word_start_pattern(keywords_list)
    angle_counts[angle] = sum(
        occurrences
        for keyword, occurrences in keyword_counts.items()
        if angle_pattern.search(keyword.lower())
    )

angle_df = pd.DataFrame(list(angle_counts.items()), columns=["Angle", "Count"])
angle_df = angle_df.sort_values("Count", ascending=False)

print("\nDistribution des angles de traitement:")
print(angle_df)

In [None]:
# Figure 5: word cloud of the most frequent keywords.
from wordcloud import WordCloud

# Frequencies of the 100 most common keywords feed the cloud.
keyword_freq = dict(keyword_counts.most_common(100))

if keyword_freq:
    fig, ax = plt.subplots(figsize=(16, 10))
    wordcloud = WordCloud(
        width=1600,
        height=800,
        background_color="white",
        colormap="viridis",
        relative_scaling=0.5,
        min_font_size=10,
        # Fixed seed: the layout is randomised, so without it every run of
        # the notebook produces a different figure.
        random_state=42,
    ).generate_from_frequencies(keyword_freq)

    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(
        "Nuage de mots des principaux mots-cl√©s dans les reportages agricoles",
        fontsize=16,
        fontweight="bold",
        pad=20,
    )
    plt.tight_layout()
    plt.show()
else:
    # generate_from_frequencies cannot draw a cloud from zero words; skip the
    # figure instead of aborting the notebook run.
    print("Aucun mot-cl√© disponible pour le nuage de mots")

In [None]:
# Figure 6: how segments are distributed across sentiment labels.
sentiment_dist = df_final["sentiment"].value_counts()
n_segments = len(df_final)

bold = {"fontweight": "bold"}
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: bar chart, one fixed colour per known label (blue otherwise).
colors_sent = {"positive": "#2ecc71", "negative": "#e74c3c", "neutral": "#95a5a6"}
bar_colors = [colors_sent.get(label, "#3498db") for label in sentiment_dist.index]
bars = ax1.bar(sentiment_dist.index, sentiment_dist.values, color=bar_colors)
ax1.set_xlabel("Sentiment", fontsize=12, **bold)
ax1.set_ylabel("Nombre de segments", fontsize=12, **bold)
ax1.set_title(
    "Distribution des sentiments dans les reportages agricoles", fontsize=13, **bold
)
# Annotate each bar with its count and its share of all segments.
for bar in bars:
    bar_top = bar.get_height()
    share = 100 * bar_top / n_segments
    ax1.text(
        bar.get_x() + bar.get_width() / 2.0,
        bar_top,
        f"{int(bar_top)}\n({share:.1f}%)",
        ha="center",
        va="bottom",
        **bold,
    )

# Right panel: the same distribution as a pie chart.
wedges, texts, autotexts = ax2.pie(
    sentiment_dist.values,
    labels=sentiment_dist.index,
    autopct="%1.1f%%",
    colors=bar_colors,
    startangle=90,
)
ax2.set_title("Proportion des sentiments", fontsize=13, **bold)
for autotext in autotexts:
    autotext.set(color="white", fontweight="bold", fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Figure 7: spread of the sentiment confidence score, one box per label.
bold_label = {"fontsize": 12, "fontweight": "bold"}

fig, ax = plt.subplots(figsize=(14, 6))
df_final.boxplot(column="sentiment_confidence", by="sentiment", ax=ax)
ax.set_xlabel("Sentiment", **bold_label)
ax.set_ylabel("Confiance", **bold_label)
ax.set_title(
    "Distribution de la confiance du sentiment par type", fontsize=13, fontweight="bold"
)
# Blank out the figure-level title that the grouped boxplot adds.
plt.suptitle("")

plt.tight_layout()
plt.show()

# Summary statistics behind the boxes.
confidence_stats = df_final.groupby("sentiment")["sentiment_confidence"].describe()
print("\nStatistiques de confiance par sentiment:")
print(confidence_stats)

In [None]:
# Flatten the secondary categories of every segment (list-as-text column).
all_categories = [
    category
    for categories in df_final["secondary_categories"]
    for category in extract_actors(categories)
]

# NOTE: this rebinds `category_counts`, which held the theme-family counts
# until this point; from here on it refers to secondary categories.
category_counts = Counter(all_categories)
print(f"Nombre de cat√©gories secondaires uniques: {len(category_counts)}")
print("\nTop 15 des cat√©gories secondaires:")
for category_name, n_mentions in category_counts.most_common(15):
    print(f"  {category_name}: {n_mentions}")

In [None]:
# Figure 8: the 15 most frequent secondary categories.
if not category_counts:
    print("Aucune cat√©gorie secondaire trouv√©e")
else:
    top_cats = category_counts.most_common(15)
    cats_df = pd.DataFrame(top_cats, columns=["Cat√©gorie", "Count"])
    bold = {"fontweight": "bold"}

    fig, ax = plt.subplots(figsize=(14, 8))
    bars = ax.barh(cats_df["Cat√©gorie"], cats_df["Count"], color="#f39c12")
    ax.set_xlabel("Nombre de mentions", fontsize=12, **bold)
    ax.set_ylabel("Cat√©gorie secondaire", fontsize=12, **bold)
    ax.set_title(
        "Top 15 des cat√©gories secondaires dans les reportages agricoles",
        fontsize=14,
        pad=20,
        **bold,
    )
    # Most frequent category at the top.
    ax.invert_yaxis()

    # Write each count just past the end of its bar.
    for bar in bars:
        bar_length = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.text(
            bar_length, bar_middle, f" {int(bar_length)}", ha="left", va="center", **bold
        )

    plt.tight_layout()
    plt.show()

### Analyses croisées et visualisations avancées

In [None]:
# Figure 9: sentiment breakdown per TV channel (stacked bars).
channel_sentiment = (
    df_final.groupby(["channel_name", "sentiment"]).size().unstack(fill_value=0)
)

# groupby sorts the sentiment labels, so the columns come out in sorted order
# (e.g. negative, neutral, positive). Colours are therefore looked up by
# label: a positional list would paint "negative" green and "positive" grey.
sentiment_palette = {
    "positive": "#2ecc71",
    "negative": "#e74c3c",
    "neutral": "#95a5a6",
}
# Labels outside the palette still get distinct colours.
fallback_palette = sns.color_palette("husl", len(channel_sentiment.columns))
stack_colors = [
    sentiment_palette.get(str(label).lower(), fallback_palette[position])
    for position, label in enumerate(channel_sentiment.columns)
]

fig, ax = plt.subplots(figsize=(14, 8))
channel_sentiment.plot(kind="bar", stacked=True, ax=ax, color=stack_colors)
ax.set_xlabel("Cha√Æne de t√©l√©vision", fontsize=12, fontweight="bold")
ax.set_ylabel("Nombre de segments", fontsize=12, fontweight="bold")
ax.set_title(
    "Distribution des sentiments par cha√Æne de t√©l√©vision",
    fontsize=14,
    fontweight="bold",
    pad=20,
)
ax.legend(title="Sentiment", title_fontsize=11, fontsize=10)
ax.tick_params(axis="x", rotation=45)
plt.tight_layout()
plt.show()

# Row-normalised table: share of each sentiment within a channel.
print("\nR√©partition des sentiments par cha√Æne (%):")
print((channel_sentiment.div(channel_sentiment.sum(axis=1), axis=0) * 100).round(1))

In [None]:
# Figure 10: segment duration per sentiment (mean bars + boxplot).
duration_by_sentiment = df_final.groupby("sentiment")["duration_seconds"].agg(
    ["mean", "median", "std"]
)

# The groupby index is sorted by label, so colours are looked up by label
# instead of being assigned by position (which mismatched the labels).
sentiment_palette = {
    "positive": "#2ecc71",
    "negative": "#e74c3c",
    "neutral": "#95a5a6",
}
# Labels outside the palette still get distinct colours.
fallback_palette = sns.color_palette("husl", len(duration_by_sentiment.index))
mean_bar_colors = [
    sentiment_palette.get(str(label).lower(), fallback_palette[position])
    for position, label in enumerate(duration_by_sentiment.index)
]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: mean duration per sentiment.
bars1 = ax1.bar(
    duration_by_sentiment.index,
    duration_by_sentiment["mean"],
    color=mean_bar_colors,
)
ax1.set_xlabel("Sentiment", fontsize=12, fontweight="bold")
ax1.set_ylabel("Dur√©e moyenne (secondes)", fontsize=12, fontweight="bold")
ax1.set_title(
    "Dur√©e moyenne des segments par sentiment", fontsize=13, fontweight="bold"
)
for bar in bars1:
    height = bar.get_height()
    ax1.text(
        bar.get_x() + bar.get_width() / 2.0,
        height,
        f"{height:.1f}s",
        ha="center",
        va="bottom",
        fontweight="bold",
    )

# Right panel: full duration distribution per sentiment.
df_final.boxplot(column="duration_seconds", by="sentiment", ax=ax2)
ax2.set_xlabel("Sentiment", fontsize=12, fontweight="bold")
ax2.set_ylabel("Dur√©e (secondes)", fontsize=12, fontweight="bold")
ax2.set_title(
    "Distribution de la dur√©e des segments par sentiment",
    fontsize=13,
    fontweight="bold",
)
# Blank out the figure-level title that the grouped boxplot adds.
plt.suptitle("")

plt.tight_layout()
plt.show()

print("\nStatistiques de dur√©e par sentiment:")
print(duration_by_sentiment)

In [None]:
# Figure 11: number of segments over time.
# NOTE: this cell adds the columns segment_start_dt, month and week to
# df_final; later cells rely on them.
df_final["segment_start_dt"] = pd.to_datetime(df_final["segment_start"], utc=True)

# Periods carry no timezone. Dropping it explicitly (the UTC wall time is
# kept) gives the same periods as before without the pandas warning raised
# when to_period is called on tz-aware timestamps.
segment_start_naive = df_final["segment_start_dt"].dt.tz_localize(None)
df_final["month"] = segment_start_naive.dt.to_period("M")
df_final["week"] = segment_start_naive.dt.to_period("W")

# Segments per calendar month.
monthly_counts = df_final.groupby("month").size()

fig, ax = plt.subplots(figsize=(16, 6))
monthly_counts.plot(
    kind="line", marker="o", ax=ax, linewidth=2, markersize=8, color="#3498db"
)
ax.set_xlabel("Mois", fontsize=12, fontweight="bold")
ax.set_ylabel("Nombre de segments", fontsize=12, fontweight="bold")
ax.set_title(
    "√âvolution temporelle du nombre de reportages agricoles par mois",
    fontsize=14,
    fontweight="bold",
    pad=20,
)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Compute the covered range once and reuse it in both messages.
period_start = df_final["segment_start_dt"].min()
period_end = df_final["segment_start_dt"].max()
print(f"\nP√©riode couverte: de {period_start} √† {period_end}")
print(f"Nombre total de jours: {(period_end - period_start).days}")

In [None]:
# Figure 12: heatmap of the number of segments per channel and month.
# Despite its name, the pivot holds segment counts (non-null segment_id
# values), not sentiment scores.
pivot_sentiment_time = df_final.pivot_table(
    values="segment_id",
    index="channel_name",
    columns="month",
    aggfunc="count",
    fill_value=0,
)

bold_label = {"fontsize": 12, "fontweight": "bold"}
heatmap_options = {
    "annot": True,
    "fmt": "g",
    "cmap": "YlOrRd",
    "cbar_kws": {"label": "Nombre de segments"},
}

fig, ax = plt.subplots(figsize=(16, 8))
sns.heatmap(pivot_sentiment_time, ax=ax, **heatmap_options)
ax.set_xlabel("Mois", **bold_label)
ax.set_ylabel("Cha√Æne", **bold_label)
ax.set_title(
    "Nombre de reportages agricoles par cha√Æne et par mois",
    fontsize=14,
    fontweight="bold",
    pad=20,
)
plt.tight_layout()
plt.show()

In [None]:
# Figure 13: keyword count vs. average density, plus the density distribution.
bold_label = {"fontsize": 12, "fontweight": "bold"}
bold_title = {"fontsize": 13, "fontweight": "bold"}
mean_density = df_final["avg_density"].mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: scatter plot, points coloured by segment duration.
density_scatter = ax1.scatter(
    df_final["num_keywords"],
    df_final["avg_density"],
    alpha=0.5,
    c=df_final["duration_seconds"],
    cmap="viridis",
)
ax1.set_xlabel("Nombre de mots-cl√©s", **bold_label)
ax1.set_ylabel("Densit√© moyenne", **bold_label)
ax1.set_title("Relation entre nombre de mots-cl√©s et densit√©", **bold_title)
cbar = plt.colorbar(density_scatter, ax=ax1)
cbar.set_label("Dur√©e (s)", fontsize=10)

# Right panel: histogram of the density with its mean marked.
ax2.hist(
    df_final["avg_density"], bins=30, color="#3498db", alpha=0.7, edgecolor="black"
)
ax2.set_xlabel("Densit√© moyenne", **bold_label)
ax2.set_ylabel("Fr√©quence", **bold_label)
ax2.set_title("Distribution de la densit√© des mots-cl√©s", **bold_title)
ax2.axvline(
    mean_density,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Moyenne: {mean_density:.1f}",
)
ax2.legend()

plt.tight_layout()
plt.show()

# Headline numbers for the two variables.
keyword_density_corr = df_final["num_keywords"].corr(df_final["avg_density"])
print("\nStatistiques sur les mots-cl√©s:")
print(f"Nombre moyen de mots-cl√©s par segment: {df_final['num_keywords'].mean():.2f}")
print(f"Densit√© moyenne: {mean_density:.2f}")
print(f"Corr√©lation num_keywords - avg_density: {keyword_density_corr:.3f}")

### Synthèse et conclusions

Résumé des principales découvertes de l'analyse.

In [None]:
# Text summary of the results computed in the cells above.
print("=" * 80)
print("SYNTH√àSE DE L'ANALYSE DES REPORTAGES AGRICOLES")
print("=" * 80)

# Missing channel names are dropped first: str.join would otherwise raise a
# TypeError on NaN.
channel_names = df_final["channel_name"].dropna().astype(str).unique()

print("\nüìä DONN√âES G√âN√âRALES:")
print(f"  ‚Ä¢ Nombre total de segments analys√©s: {len(df_final)}")
print(
    f"  ‚Ä¢ P√©riode couverte: {(df_final['segment_start_dt'].max() - df_final['segment_start_dt'].min()).days} jours"
)
print(f"  ‚Ä¢ Cha√Ænes: {', '.join(channel_names)}")
print(f"  ‚Ä¢ Dur√©e totale: {df_final['duration_seconds'].sum() / 3600:.1f} heures")

print("\nüåæ QUESTION 1 - MOD√àLES AGRICOLES:")
print(f"  ‚Ä¢ Nombre de th√®mes identifi√©s: {len(theme_counts)}")
# Guarded like the person/organisation lines below, so an empty result no
# longer raises IndexError.
if theme_counts:
    top_theme, top_theme_mentions = theme_counts.most_common(1)[0]
    print(f"  ‚Ä¢ Th√®me le plus fr√©quent: {top_theme} ({top_theme_mentions} mentions)")
if not cat_df.empty:
    print(
        f"  ‚Ä¢ Cat√©gorie dominante: {cat_df.iloc[0]['Cat√©gorie']} ({cat_df.iloc[0]['Count']} mentions)"
    )

print("\nüë• QUESTION 2 - ACTEURS:")
print(f"  ‚Ä¢ Personnes uniques: {len(person_counts)}")
print(f"  ‚Ä¢ Organisations uniques: {len(org_counts)}")
print(f"  ‚Ä¢ Total mentions personnes: {len(all_persons)}")
print(f"  ‚Ä¢ Total mentions organisations: {len(all_organizations)}")
if person_counts:
    top_person, top_person_mentions = person_counts.most_common(1)[0]
    print(f"  ‚Ä¢ Personne la plus cit√©e: {top_person} ({top_person_mentions} mentions)")
if org_counts:
    top_org, top_org_mentions = org_counts.most_common(1)[0]
    print(f"  ‚Ä¢ Organisation la plus cit√©e: {top_org} ({top_org_mentions} mentions)")

print("\nüì∞ QUESTION 3 - ANGLES DE TRAITEMENT:")
print(f"  ‚Ä¢ Mots-cl√©s uniques: {len(keyword_counts)}")
print("  ‚Ä¢ Distribution des sentiments:")
for sent, count in sentiment_dist.items():
    print(f"    - {str(sent).capitalize()}: {count} ({100 * count / len(df_final):.1f}%)")
print(
    f"  ‚Ä¢ Confiance moyenne du sentiment: {df_final['sentiment_confidence'].mean():.3f}"
)
# By this point `category_counts` holds the secondary-category counts.
if category_counts:
    top_secondary, top_secondary_mentions = category_counts.most_common(1)[0]
    print(
        f"  ‚Ä¢ Cat√©gorie secondaire principale: {top_secondary} ({top_secondary_mentions} mentions)"
    )

print("\n" + "=" * 80)

## 📋 Conclusions détaillées

### Question 1: Modèles agricoles représentés

**Principaux thèmes identifiés:**
- **Biodiversité** (9.2% des mentions) : Concepts généraux et causes de la perte de biodiversité
- **Changement climatique** (8.8%) : Constat du changement climatique et ses causes
- **Ressources** (3.3%) : Gestion des ressources naturelles
- **Atténuation climatique** (4.4%) : Solutions pour réduire l'impact climatique  
- **Adaptation climatique** (1.7%) : Mesures d'adaptation au changement

**Observation:** La catégorie "autres" (72.5%) suggère une grande diversité de sujets ne relevant pas strictement de ces catégories environnementales prédéfinies.

---

### Question 2: Acteurs médiatiques

**Personnes:**
- 3,248 personnes uniques identifiées
- Les personnalités politiques dominent (Donald Trump: 81 mentions, Emmanuel Macron: 51)
- Présence significative de personnalités diverses (journalistes, experts, célébrités)

**Organisations:**
- 345 organisations uniques mentionnées
- Prédominance des institutions européennes (Union européenne: 72 mentions)
- Forte présence des institutions françaises (Assemblée nationale: 54)
- Mentions d'entreprises privées (Nestlé, Renault) et d'organisations internationales (UNESCO, OMS)

---

### Question 3: Angles de traitement

**Principaux angles identifiés:**
1. **Économie/Pouvoir d'achat** (5,529 mentions) : Angle dominant centré sur les aspects économiques
2. **Environnement** (3,251 mentions) : Questions écologiques et climatiques
3. **Réglementation/Normes** (2,071 mentions) : Aspects législatifs et normatifs
4. **Santé** (1,658 mentions) : Impact sanitaire et alimentaire
5. **Emploi** (1,499 mentions) : Questions d'emploi dans le secteur
6. **Technologie/Innovation** (900 mentions) : Modernisation et innovation
7. **Mobilisation/Colère** (544 mentions) : Manifestations et protestation des agriculteurs

**Distribution des sentiments:**
- **Négatif**: 43.8% - Prédominance de la tonalité négative
- **Positif**: 35.2% - Présence substantielle de reportages positifs
- **Neutre**: 21.0% - Traitement factuel

**Observations temporelles:**
- Pics de couverture en mai, juillet et août 2025
- Variations saisonnières reflétant probablement les cycles agricoles et événements spécifiques
- France 2 assure la plus forte couverture (913 segments), suivie de TF1 (731)

## 🔬 Analyses croisées approfondies - Point de vue Data Analyst Agricole

Analyses détaillées des relations entre acteurs, sentiments, localisations et thématiques agricoles.

### 1. Analyse des substantifs agricoles (nouns_in_keywords)

In [None]:
# Flatten the nouns found in the keywords of every segment.
all_nouns = []
for nouns in df_final["nouns_in_keywords"]:
    all_nouns.extend(extract_actors(nouns))

noun_counts = Counter(all_nouns)

# Trigger terms for each agriculture-specific category.
agricultural_terms = {
    "Production": ["production", "r√©colte", "rendement", "cultures", "√©levage"],
    "Produits": [
        "produits",
        "produit",
        "aliments",
        "nourriture",
        "viande",
        "l√©gumes",
        "fruits",
        "c√©r√©ales",
    ],
    "Acteurs agricoles": [
        "agriculteurs",
        "agriculteur",
        "√©leveurs",
        "producteurs",
        "exploitants",
        "fermiers",
    ],
    "Territoire": ["terre", "terrains", "parcelles", "exploitation", "ferme", "champs"],
    "√âconomie": ["prix", "march√©", "vente", "revenus", "co√ªts", "subventions", "aide"],
    "Environnement": [
        "eau",
        "sol",
        "climat",
        "environnement",
        "nature",
        "biodiversit√©",
    ],
    "Pratiques": ["techniques", "m√©thodes", "pratiques", "travail", "gestion"],
}


def _term_start_pattern(terms):
    """Compile a regex matching any of `terms` only where it starts a word.

    A raw substring test counts "nouveau" or "bureau" as "eau" and
    "r√©solution" as "sol". Anchoring on the start of a word removes those
    mid-word hits while keeping plurals and derived forms. Prefix collisions
    such as "solution" for "sol" remain possible.
    """
    if not terms:
        return re.compile(r"(?!)")  # matches nothing, like any([]) did
    alternatives = "|".join(re.escape(term) for term in terms)
    # (?<!\w) rather than \b so a term whose first character is not a word
    # character can still match at the very beginning of the noun.
    return re.compile(rf"(?<!\w)(?:{alternatives})")


# Total noun occurrences per category. A noun counts at most once per
# category, but may contribute to several categories.
agri_category_counts = {}
for category, terms in agricultural_terms.items():
    category_pattern = _term_start_pattern(terms)
    agri_category_counts[category] = sum(
        noun_count
        for noun, noun_count in noun_counts.items()
        if category_pattern.search(noun.lower())
    )

agri_cat_df = pd.DataFrame(
    list(agri_category_counts.items()), columns=["Cat√©gorie agricole", "Mentions"]
)
agri_cat_df = agri_cat_df.sort_values("Mentions", ascending=False)

print("üåæ Distribution des substantifs par cat√©gorie agricole:")
print(agri_cat_df)
print("\nTop 30 substantifs les plus fr√©quents:")
for noun, count in noun_counts.most_common(30):
    print(f"  {noun}: {count}")

In [None]:
# Figure: agricultural noun categories and the top agriculture-specific nouns.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# Left panel: mentions per agricultural category.
bars = ax1.barh(
    agri_cat_df["Cat√©gorie agricole"], agri_cat_df["Mentions"], color="#27ae60"
)
ax1.set_xlabel("Nombre de mentions", fontsize=12, fontweight="bold")
ax1.set_ylabel("Cat√©gorie", fontsize=12, fontweight="bold")
ax1.set_title(
    "Distribution des substantifs par cat√©gorie agricole",
    fontsize=13,
    fontweight="bold",
)
ax1.invert_yaxis()
for bar in bars:
    width = bar.get_width()
    ax1.text(
        width,
        bar.get_y() + bar.get_height() / 2,
        f" {int(width)}",
        ha="left",
        va="center",
        fontweight="bold",
    )

# Right panel: the 15 most frequent nouns matching any agricultural term.
# A term must begin a word of the noun: a raw substring test would also
# accept "nouveau" for "eau" or "r√©solution" for "sol".
# (?<!\w) rather than \b so a term whose first character is not a word
# character can still match at the very beginning of the noun.
all_agri_terms = [term for terms in agricultural_terms.values() for term in terms]
if all_agri_terms:
    agri_term_pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(term) for term in all_agri_terms) + ")"
    )
else:
    agri_term_pattern = re.compile(r"(?!)")  # matches nothing

agri_specific_nouns = [
    (noun, count)
    for noun, count in noun_counts.items()
    if agri_term_pattern.search(noun.lower())
]

agri_specific_nouns = sorted(agri_specific_nouns, key=lambda x: x[1], reverse=True)[:15]
nouns_df = pd.DataFrame(agri_specific_nouns, columns=["Substantif", "Count"])

bars2 = ax2.barh(nouns_df["Substantif"], nouns_df["Count"], color="#16a085")
ax2.set_xlabel("Fr√©quence", fontsize=12, fontweight="bold")
ax2.set_ylabel("Substantif", fontsize=12, fontweight="bold")
ax2.set_title("Top 15 des substantifs agricoles", fontsize=13, fontweight="bold")
ax2.invert_yaxis()
for bar in bars2:
    width = bar.get_width()
    ax2.text(
        width,
        bar.get_y() + bar.get_height() / 2,
        f" {int(width)}",
        ha="left",
        va="center",
        fontweight="bold",
        fontsize=9,
    )

plt.tight_layout()
plt.show()

### 2. Analyse géographique : Localisation des reportages agricoles (actor_locations)

In [None]:
# Flatten the location mentions of every segment.
all_locations = []
for locations in df_final["actor_locations"]:
    all_locations.extend(extract_actors(locations))

location_counts = Counter(all_locations)

# Lower-case markers used to classify each location by substring match.
french_regions = [
    "bretagne",
    "normandie",
    "auvergne",
    "occitanie",
    "nouvelle-aquitaine",
    "grand est",
    "hauts-de-france",
    "provence",
    "pays de la loire",
    "centre-val de loire",
    "bourgogne",
    "√Æle-de-france",
    "corse",
]
french_cities = [
    "paris",
    "lyon",
    "marseille",
    "toulouse",
    "bordeaux",
    "lille",
    "nantes",
    "strasbourg",
    "rennes",
    "nice",
    "montpellier",
]
# Hoisted out of the loop: the list does not change between iterations.
european_markers = [
    "allemagne",
    "italie",
    "espagne",
    "belgique",
    "europe",
    "bruxelles",
]
# Values that stand for "no location".
placeholder_values = ["", "null", "none"]

location_types = {
    "France (r√©gions)": 0,
    "France (villes)": 0,
    "Europe": 0,
    "International": 0,
    "Autres": 0,
}

# The first matching rule wins. Any location mentioning "france" that is not
# a known region is counted under "France (villes)".
for location, count in location_counts.items():
    loc_lower = location.lower()
    if any(region in loc_lower for region in french_regions):
        location_types["France (r√©gions)"] += count
    elif any(city in loc_lower for city in french_cities) or "france" in loc_lower:
        location_types["France (villes)"] += count
    elif any(marker in loc_lower for marker in european_markers):
        location_types["Europe"] += count
    elif loc_lower not in placeholder_values:
        location_types["International"] += count
    else:
        # Placeholder values were previously dropped, so "Autres" stayed at 0
        # and the buckets did not add up to the total number of mentions.
        location_types["Autres"] += count

print(f"üó∫Ô∏è Nombre total de localisations uniques: {len(location_counts)}")
print(f"Total de mentions de lieux: {len(all_locations)}")
print("\nTop 30 des localisations les plus mentionn√©es:")
for location, count in location_counts.most_common(30):
    print(f"  {location}: {count}")
print("\nDistribution par type de localisation:")
for loc_type, count in location_types.items():
    print(f"  {loc_type}: {count}")

In [None]:
# Geographic figure: 20 most mentioned locations (left) and share of
# mentions per location type (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
bold_label = {"fontsize": 12, "fontweight": "bold"}

# Left panel is only drawn when at least one location was counted.
top_locations = location_counts.most_common(20)
if top_locations:
    loc_df = pd.DataFrame(top_locations, columns=["Localisation", "Count"])

    bars = ax1.barh(loc_df["Localisation"], loc_df["Count"], color="#e67e22")
    ax1.set_xlabel("Nombre de mentions", **bold_label)
    ax1.set_ylabel("Localisation", **bold_label)
    ax1.set_title(
        "Top 20 des localisations dans les reportages agricoles",
        fontsize=13,
        fontweight="bold",
    )
    ax1.invert_yaxis()

    # Value label at the end of each bar.
    for rectangle in bars:
        value = rectangle.get_width()
        ax1.text(
            value,
            rectangle.get_y() + rectangle.get_height() / 2,
            f" {int(value)}",
            ha="left",
            va="center",
            fontweight="bold",
            fontsize=9,
        )

# Right panel: keep only the location types that were actually observed,
# largest first.
loc_type_df = pd.DataFrame(list(location_types.items()), columns=["Type", "Count"])
observed_types = loc_type_df["Count"] > 0
loc_type_df = loc_type_df[observed_types].sort_values("Count", ascending=False)

colors_loc = sns.color_palette("Set2", len(loc_type_df))
wedges, texts, autotexts = ax2.pie(
    loc_type_df["Count"],
    labels=loc_type_df["Type"],
    autopct="%1.1f%%",
    colors=colors_loc,
    startangle=90,
)
ax2.set_title("R√©partition g√©ographique des reportages", fontsize=13, fontweight="bold")

# Percentage labels inside the wedges are drawn in bold white.
for percent_label in autotexts:
    percent_label.set_fontweight("bold")
    percent_label.set_color("white")

plt.tight_layout()
plt.show()

### 3. Analyse croisée : Sentiment selon les acteurs (personnes vs organisations)

In [None]:
# Flag, for every segment, whether at least one person / organisation /
# location was extracted. Missing cells count as "no actor".
for flag_column, source_column in (
    ("has_person", "actor_persons"),
    ("has_organization", "actor_organizations"),
    ("has_location", "actor_locations"),
):
    df_final[flag_column] = df_final[source_column].apply(
        lambda cell: False if pd.isna(cell) else len(extract_actors(cell)) > 0
    )

# One boolean mask per group of segments compared below.
# NOTE(review): "Sans acteur" only requires no person and no organisation;
# segments with a location still fall into it - confirm this is intended.
actor_masks = {
    "Avec personnes": df_final["has_person"],
    "Avec organisations": df_final["has_organization"],
    "Avec localisation": df_final["has_location"],
    "Sans acteur": ~(df_final["has_person"] | df_final["has_organization"]),
}

# Sentiment counts per group (rows: sentiment labels, columns: groups).
sentiment_by_actor_type = pd.DataFrame(
    {
        group: df_final.loc[mask, "sentiment"].value_counts()
        for group, mask in actor_masks.items()
    }
).fillna(0)

print("üìä R√©partition des sentiments selon le type d'acteur pr√©sent:")
print(sentiment_by_actor_type)

print("\nüìä R√©partition en pourcentage:")
group_totals = sentiment_by_actor_type.sum()
sentiment_share_pct = sentiment_by_actor_type / group_totals * 100
print(sentiment_share_pct.round(1))

# Mean sentiment confidence per group.
conf_by_actor = {
    group: df_final.loc[mask, "sentiment_confidence"].mean()
    for group, mask in actor_masks.items()
}

print("\nüéØ Confiance moyenne du sentiment selon les acteurs:")
for actor_type, conf in conf_by_actor.items():
    print(f"  {actor_type}: {conf:.3f}")

In [None]:
# Sentiment by actor type: stacked segment counts (left) and mean sentiment
# confidence (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))

# Colours are looked up by sentiment label rather than by row position: the
# row order of `sentiment_by_actor_type` comes from aligning several
# value_counts() results and is not guaranteed to be negative/neutral/positive,
# so a positional colour list could paint the wrong sentiment red or green.
sentiment_colors = {
    "negative": "#e74c3c",
    "neutral": "#95a5a6",
    "positive": "#2ecc71",
}
known_labels = [
    label for label in sentiment_colors if label in sentiment_by_actor_type.index
]
other_labels = [
    label for label in sentiment_by_actor_type.index if label not in sentiment_colors
]
sentiment_plot_df = sentiment_by_actor_type.reindex(known_labels + other_labels)
# Any unexpected sentiment label gets a neutral dark fallback colour.
bar_colors = [sentiment_colors.get(label, "#34495e") for label in sentiment_plot_df.index]

sentiment_plot_df.T.plot(kind="bar", stacked=True, ax=ax1, color=bar_colors)
ax1.set_xlabel("Type d'acteur", fontsize=12, fontweight="bold")
ax1.set_ylabel("Nombre de segments", fontsize=12, fontweight="bold")
ax1.set_title(
    "Distribution des sentiments selon le type d'acteur pr√©sent",
    fontsize=13,
    fontweight="bold",
)
ax1.legend(title="Sentiment", title_fontsize=11)
ax1.tick_params(axis="x", rotation=45)

# Mean confidence per actor type.
conf_df = pd.DataFrame(
    list(conf_by_actor.items()), columns=["Type d'acteur", "Confiance"]
)
bars = ax2.bar(conf_df["Type d'acteur"], conf_df["Confiance"], color="#3498db")
ax2.set_xlabel("Type d'acteur", fontsize=12, fontweight="bold")
ax2.set_ylabel("Confiance moyenne", fontsize=12, fontweight="bold")
ax2.set_title(
    "Confiance du sentiment selon les acteurs pr√©sents", fontsize=13, fontweight="bold"
)
ax2.tick_params(axis="x", rotation=45)

# Keep the original zoomed window (0.70-0.85) as the default, but widen it
# whenever a mean falls outside so that no bar or label is clipped.
# The y-axis is truncated on purpose; differences look larger than they are.
valid_conf = conf_df["Confiance"].dropna()
if not valid_conf.empty:
    ax2.set_ylim(
        min(0.7, valid_conf.min() - 0.02),
        max(0.85, valid_conf.max() + 0.02),
    )
else:
    ax2.set_ylim(0.7, 0.85)

for bar in bars:
    height = bar.get_height()
    if pd.isna(height):
        # A group with no segments has no mean confidence: nothing to label.
        continue
    ax2.text(
        bar.get_x() + bar.get_width() / 2.0,
        height,
        f"{height:.3f}",
        ha="center",
        va="bottom",
        fontweight="bold",
    )

plt.tight_layout()
plt.show()

### 4. Analyse approfondie : Top acteurs et leur sentiment associé

In [None]:
# Sentiment profile of the most frequently mentioned persons and organisations.


def _actor_sentiment_records(frame, actor_column, actors, actor_type):
    """Build one record per (actor, sentiment) pair for the given actors.

    Parameters:
        frame: DataFrame holding `actor_column`, "sentiment" and
            "sentiment_confidence".
        actor_column: name of the column parsed with `extract_actors`.
        actors: iterable of actor names to profile.
        actor_type: label stored in the "Type" field of every record.

    Returns:
        A list of dicts with keys "Acteur", "Type", "Sentiment", "Count" and
        "Confiance_moy". Actors found in no segment produce no record; for
        the others, zero-count sentiments are kept (with a NaN mean).
    """
    # Parse the column once instead of once per actor.
    parsed = frame[actor_column].apply(
        lambda cell: extract_actors(cell) if pd.notna(cell) else []
    )
    records = []
    for actor in actors:
        segments = frame[parsed.apply(lambda names: actor in names)]
        if len(segments) == 0:
            continue
        for sentiment in ["negative", "positive", "neutral"]:
            confidences = segments.loc[
                segments["sentiment"] == sentiment, "sentiment_confidence"
            ]
            records.append(
                {
                    "Acteur": actor,
                    "Type": actor_type,
                    "Sentiment": sentiment,
                    "Count": len(confidences),
                    "Confiance_moy": confidences.mean(),
                }
            )
    return records


def _sentiment_profile(records_df, top_n=10):
    """Pivot records into an actor x sentiment count table with a "Total" column.

    Returns the `top_n` actors by total, or an empty table with the expected
    columns when there is no record (pivot_table would raise otherwise).
    """
    sentiment_columns = ["negative", "neutral", "positive"]
    if len(records_df) == 0:
        return pd.DataFrame(
            columns=sentiment_columns + ["Total"],
            index=pd.Index([], name="Acteur"),
        )
    profile = records_df.pivot_table(
        index="Acteur",
        columns="Sentiment",
        values="Count",
        aggfunc="sum",  # explicit: these are counts, not values to average
        fill_value=0,
    ).reindex(columns=sentiment_columns, fill_value=0)
    profile["Total"] = profile.sum(axis=1)
    return profile.sort_values("Total", ascending=False).head(top_n)


top_persons_list = [person for person, _ in person_counts.most_common(15)]
top_orgs_list = [org for org, _ in org_counts.most_common(15)]

person_sentiment_data = _actor_sentiment_records(
    df_final, "actor_persons", top_persons_list, "Personne"
)
org_sentiment_data = _actor_sentiment_records(
    df_final, "actor_organizations", top_orgs_list, "Organisation"
)

person_sent_df = pd.DataFrame(person_sentiment_data)
org_sent_df = pd.DataFrame(org_sentiment_data)

# Top 10 persons with their sentiment profile.
print("üë§ TOP 10 PERSONNES - Profil de sentiment:")
top_persons_profile = _sentiment_profile(person_sent_df)
print(top_persons_profile)

# Top 10 organisations with their sentiment profile.
print("\nüè¢ TOP 10 ORGANISATIONS - Profil de sentiment:")
top_orgs_profile = _sentiment_profile(org_sent_df)
print(top_orgs_profile)

In [None]:
# Stacked percentage bars of the sentiment profile for the top persons
# (upper panel) and the top organisations (lower panel).
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(16, 12))

sentiment_columns = ["negative", "neutral", "positive"]
sentiment_palette = ["#e74c3c", "#95a5a6", "#2ecc71"]

# Convert counts to row-wise percentages of each actor's total.
top_persons_profile_pct = (
    top_persons_profile[sentiment_columns].div(top_persons_profile["Total"], axis=0)
    * 100
)
top_orgs_profile_pct = (
    top_orgs_profile[sentiment_columns].div(top_orgs_profile["Total"], axis=0) * 100
)

panels = (
    (
        ax1,
        top_persons_profile_pct,
        "Personne",
        "Profil de sentiment des top 10 personnes (en %)",
    ),
    (
        ax2,
        top_orgs_profile_pct,
        "Organisation",
        "Profil de sentiment des top 10 organisations (en %)",
    ),
)
for axis, shares, actor_label, panel_title in panels:
    shares.plot(kind="barh", stacked=True, ax=axis, color=sentiment_palette)
    axis.set_xlabel("Pourcentage", fontsize=12, fontweight="bold")
    axis.set_ylabel(actor_label, fontsize=12, fontweight="bold")
    axis.set_title(panel_title, fontsize=14, fontweight="bold")
    axis.legend(title="Sentiment", loc="lower right", title_fontsize=11)
    # Largest total at the top of the panel.
    axis.invert_yaxis()

plt.tight_layout()
plt.show()

### 5. Analyse territoriale croisée : Sentiment et localisation

In [None]:
# Sentiment profile of the 15 most mentioned locations.
top_locations_list = [loc for loc, _ in location_counts.most_common(15)]

# Parse the location column once instead of once per top location.
parsed_locations = df_final["actor_locations"].apply(
    lambda cell: extract_actors(cell) if pd.notna(cell) else []
)

location_sentiment_data = []
for location in top_locations_list:
    loc_segments = df_final[parsed_locations.apply(lambda places: location in places)]
    for sentiment in ["negative", "positive", "neutral"]:
        confidences = loc_segments.loc[
            loc_segments["sentiment"] == sentiment, "sentiment_confidence"
        ]
        count = len(confidences)
        # Only observed (location, sentiment) pairs are recorded.
        if count > 0:
            location_sentiment_data.append(
                {
                    "Localisation": location,
                    "Sentiment": sentiment,
                    "Count": count,
                    "Confiance_moy": confidences.mean(),
                }
            )

loc_sent_df = pd.DataFrame(location_sentiment_data)

# Sentiment profile per location.
if len(loc_sent_df) > 0:
    # Zero-count pairs are not recorded above, so a sentiment that never
    # occurs among the top locations would be missing from the pivot and
    # break the percentage columns below; reindex guarantees all three.
    loc_sentiment_profile = loc_sent_df.pivot_table(
        index="Localisation",
        columns="Sentiment",
        values="Count",
        aggfunc="sum",  # explicit: these are counts, not values to average
        fill_value=0,
    ).reindex(columns=["negative", "neutral", "positive"], fill_value=0)
    loc_sentiment_profile["Total"] = loc_sentiment_profile.sum(axis=1)
    loc_sentiment_profile = loc_sentiment_profile.sort_values(
        "Total", ascending=False
    ).head(12)

    print("üó∫Ô∏è TOP 12 LOCALISATIONS - Profil de sentiment:")
    print(loc_sentiment_profile)

    # Share of negative / positive segments per location, in percent.
    loc_sentiment_profile["%_negative"] = (
        loc_sentiment_profile["negative"] / loc_sentiment_profile["Total"] * 100
    ).round(1)
    loc_sentiment_profile["%_positive"] = (
        loc_sentiment_profile["positive"] / loc_sentiment_profile["Total"] * 100
    ).round(1)

    print("\nüìä Pourcentages de sentiment n√©gatif/positif par localisation:")
    print(
        loc_sentiment_profile[["%_negative", "%_positive", "Total"]].sort_values(
            "%_negative", ascending=False
        )
    )
else:
    print("Pas assez de donn√©es de localisation avec sentiment")

In [None]:
# Sentiment by location: stacked percentage bars (left) and a
# positive-vs-negative scatter sized by number of segments (right).
if len(loc_sent_df) > 0 and len(loc_sentiment_profile) > 0:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    # reindex (rather than plain column selection) so that a sentiment absent
    # from the profile is drawn as 0% instead of raising a KeyError; the fixed
    # column order also keeps the colour list aligned with the labels.
    sentiment_counts = loc_sentiment_profile.reindex(
        columns=["negative", "neutral", "positive"], fill_value=0
    )
    loc_sentiment_pct = sentiment_counts.div(loc_sentiment_profile["Total"], axis=0) * 100
    loc_sentiment_pct.plot(
        kind="barh", stacked=True, ax=ax1, color=["#e74c3c", "#95a5a6", "#2ecc71"]
    )
    ax1.set_xlabel("Pourcentage", fontsize=12, fontweight="bold")
    ax1.set_ylabel("Localisation", fontsize=12, fontweight="bold")
    ax1.set_title(
        "Profil de sentiment par localisation (en %)", fontsize=14, fontweight="bold"
    )
    ax1.legend(title="Sentiment", title_fontsize=11)
    ax1.invert_yaxis()

    # Scatter plot: % positive (x) against % negative (y); marker area grows
    # with the location's total.
    ax2.scatter(
        loc_sentiment_profile["%_positive"],
        loc_sentiment_profile["%_negative"],
        s=loc_sentiment_profile["Total"] * 10,
        alpha=0.6,
        color="#3498db",
    )

    # Label every point with its location name (the profile's index).
    for idx, row in loc_sentiment_profile.iterrows():
        ax2.annotate(idx, (row["%_positive"], row["%_negative"]), fontsize=9, alpha=0.7)

    ax2.set_xlabel("% Sentiment positif", fontsize=12, fontweight="bold")
    ax2.set_ylabel("% Sentiment n√©gatif", fontsize=12, fontweight="bold")
    ax2.set_title(
        "Cartographie sentiment positif vs n√©gatif par localisation\n(taille = nombre de mentions)",
        fontsize=13,
        fontweight="bold",
    )
    ax2.grid(True, alpha=0.3)
    # Reference lines at the 50% mark on both axes.
    ax2.axhline(y=50, color="r", linestyle="--", alpha=0.5)
    ax2.axvline(x=50, color="r", linestyle="--", alpha=0.5)

    plt.tight_layout()
    plt.show()
else:
    print("Visualisation impossible : donn√©es insuffisantes")

### 6. Analyse sémantique : Substantifs selon le sentiment

In [None]:
# Noun frequencies per sentiment class: one Counter per sentiment, built from
# the nouns of every segment carrying that sentiment.
nouns_by_sentiment = {
    sentiment: Counter(
        noun
        for nouns in df_final.loc[
            df_final["sentiment"] == sentiment, "nouns_in_keywords"
        ]
        for noun in extract_actors(nouns)
    )
    for sentiment in ["negative", "positive", "neutral"]
}

print("üìù TOP 20 SUBSTANTIFS PAR SENTIMENT:\n")

for heading, sentiment in (
    ("üî¥ SENTIMENT N√âGATIF:", "negative"),
    ("\nüü¢ SENTIMENT POSITIF:", "positive"),
    ("\n‚ö™ SENTIMENT NEUTRE:", "neutral"),
):
    print(heading)
    for noun, count in nouns_by_sentiment[sentiment].most_common(20):
        print(f"  {noun}: {count}")

# Differential vocabulary: the 50 most frequent nouns of each sentiment, as
# sets, so that nouns exclusive to one sentiment can be isolated.
negative_specific = {
    noun for noun, _ in nouns_by_sentiment["negative"].most_common(50)
}
positive_specific = {
    noun for noun, _ in nouns_by_sentiment["positive"].most_common(50)
}
neutral_specific = {
    noun for noun, _ in nouns_by_sentiment["neutral"].most_common(50)
}

print("\nüéØ MOTS DISTINCTIFS:")
print(
    f"Seulement dans n√©gatif (top 50): {negative_specific - positive_specific - neutral_specific}"
)
print(
    f"Seulement dans positif (top 50): {positive_specific - negative_specific - neutral_specific}"
)

In [None]:
# Side-by-side comparison of the 15 most frequent nouns for each sentiment.
fig, axes = plt.subplots(1, 3, figsize=(18, 7))

# (sentiment key, panel title suffix, bar colour) - one entry per panel.
sentiments_to_plot = [
    ("negative", "üî¥ N√©gatif", "#e74c3c"),
    ("positive", "üü¢ Positif", "#2ecc71"),
    ("neutral", "‚ö™ Neutre", "#95a5a6"),
]

for axis, (sentiment, title, color) in zip(axes, sentiments_to_plot):
    top_nouns = nouns_by_sentiment[sentiment].most_common(15)
    if not top_nouns:
        # No noun for this sentiment: the panel is left empty.
        continue

    nouns_df_sent = pd.DataFrame(top_nouns, columns=["Substantif", "Count"])
    bars = axis.barh(nouns_df_sent["Substantif"], nouns_df_sent["Count"], color=color)
    axis.set_xlabel("Fr√©quence", fontsize=11, fontweight="bold")
    axis.set_ylabel("Substantif", fontsize=11, fontweight="bold")
    axis.set_title(f"Top 15 substantifs - {title}", fontsize=12, fontweight="bold")
    axis.invert_yaxis()

    # Frequency written at the end of each bar.
    for rectangle in bars:
        value = rectangle.get_width()
        axis.text(
            value,
            rectangle.get_y() + rectangle.get_height() / 2,
            f" {int(value)}",
            ha="left",
            va="center",
            fontsize=8,
            fontweight="bold",
        )

plt.tight_layout()
plt.show()

### 7. Synthèse : Insights clés pour l'agriculture française

In [None]:
# Print a text summary built from the results of the previous cells.
print("=" * 80)
print("üåæ SYNTH√àSE ANALYTIQUE - AGRICULTURE DANS LES M√âDIAS FRAN√áAIS")
print("=" * 80)

print("\n1Ô∏è‚É£ TERMINOLOGIE ET LANGAGE AGRICOLE:")
if len(agri_cat_df) > 0:
    # NOTE(review): the first row is taken as the dominant category, which
    # assumes agri_cat_df is sorted by "Mentions" descending - confirm.
    top_category = agri_cat_df.iloc[0]
    top_category_name = top_category["Cat√©gorie agricole"]
    top_category_mentions = top_category["Mentions"]
    print(
        f"   ‚Ä¢ Cat√©gorie dominante: {top_category_name} ({top_category_mentions} mentions)"
    )
# The two statements below are fixed editorial text, not computed values.
print("   ‚Ä¢ Focus principal: Aspects √©conomiques (prix, production, march√©)")
print("   ‚Ä¢ Termes environnementaux tr√®s pr√©sents (eau, sol, climat)")

print("\n2Ô∏è‚É£ DIMENSION G√âOGRAPHIQUE:")
total_loc_mentions = sum(location_types.values())
if total_loc_mentions > 0:
    print(f"   ‚Ä¢ Total de mentions g√©ographiques: {total_loc_mentions}")
    for loc_type, count in sorted(
        location_types.items(), key=lambda x: x[1], reverse=True
    ):
        if count > 0:
            pct = count / total_loc_mentions * 100
            print(f"   ‚Ä¢ {loc_type}: {count} ({pct:.1f}%)")

print("\n3Ô∏è‚É£ ACTEURS ET SENTIMENT:")
print(
    f"   ‚Ä¢ Segments avec personnes: {df_final['has_person'].sum()} ({df_final['has_person'].mean() * 100:.1f}%)"
)
print(
    f"   ‚Ä¢ Segments avec organisations: {df_final['has_organization'].sum()} ({df_final['has_organization'].mean() * 100:.1f}%)"
)
print(
    f"   ‚Ä¢ Segments avec localisation: {df_final['has_location'].sum()} ({df_final['has_location'].mean() * 100:.1f}%)"
)

print("\n4Ô∏è‚É£ PROFIL DE SENTIMENT DES ACTEURS:")
for actor_type in ["Avec personnes", "Avec organisations"]:
    if actor_type in sentiment_by_actor_type.columns:
        actor_counts = sentiment_by_actor_type[actor_type]
        actor_total = actor_counts.sum()
        # Skip groups without any segment: no dominant sentiment to report.
        if actor_total > 0:
            dominant_sentiment = actor_counts.idxmax()
            dominant_pct = actor_counts.max() / actor_total * 100
            print(
                f"   ‚Ä¢ {actor_type}: majoritairement {dominant_sentiment} ({dominant_pct:.1f}%)"
            )

print("\n5Ô∏è‚É£ CONFIANCE DES ANALYSES:")
avg_conf = df_final["sentiment_confidence"].mean()
print(f"   ‚Ä¢ Confiance moyenne globale: {avg_conf:.3f}")
# NaN means (groups without segments) are ignored: max()/min() give
# order-dependent results in their presence. Values are formatted explicitly
# instead of printing a raw (label, value) tuple.
valid_conf_by_actor = {
    label: value for label, value in conf_by_actor.items() if pd.notna(value)
}
if valid_conf_by_actor:
    highest_conf_label = max(valid_conf_by_actor, key=valid_conf_by_actor.get)
    lowest_conf_label = min(valid_conf_by_actor, key=valid_conf_by_actor.get)
    print(
        f"   ‚Ä¢ Plus haute confiance: {highest_conf_label} ({valid_conf_by_actor[highest_conf_label]:.3f})"
    )
    print(
        f"   ‚Ä¢ Plus faible confiance: {lowest_conf_label} ({valid_conf_by_actor[lowest_conf_label]:.3f})"
    )

print("\n6Ô∏è‚É£ INSIGHTS CL√âS POUR L'AGRICULTURE:")
print("   ‚úì Le traitement m√©diatique privil√©gie l'angle √©conomique")
print("   ‚úì Fort ancrage territorial des reportages (localisations fr√©quentes)")
print("   ‚úì Pr√©sence significative d'acteurs politiques et institutionnels")
# The overall tone is computed from the data instead of being hard-coded, so
# the statement cannot go stale when the dataset changes.
sentiment_shares = df_final["sentiment"].value_counts(normalize=True) * 100
if not sentiment_shares.empty:
    if sentiment_shares.idxmax() == "negative":
        print(
            f"   ‚úì Tonalit√© globalement n√©gative ({sentiment_shares['negative']:.1f}%) refl√©tant les d√©fis du secteur"
        )
    else:
        print(
            f"   ‚úì Sentiment dominant: {sentiment_shares.idxmax()} ({sentiment_shares.max():.1f}%)"
        )
print("   ‚úì Questions environnementales omnipr√©sentes dans le vocabulaire")

print("\n" + "=" * 80)