# 🔎 Projet – Exploration du Corpus de Discours US
### Interface de recherche + Analyse temporelle
**Réalisé par : Cyrine Nighaoui**


In [1]:
import pandas as pd
import sys
sys.path.append("../src")

from Corpus import Corpus

# Load the speeches from a tab-separated file located in the notebook's
# working directory.
df = pd.read_csv("discours_US.csv", sep="\t")

# Register every row of the table as one document of the corpus.
# itertuples() is used instead of iterrows(): it does not build a pandas
# Series per row (faster) and there is no unused index variable.
corpus = Corpus("discours_US")
for row in df.itertuples(index=False):
    corpus.add_document(
        titre=row.descr,
        auteur=row.speaker,
        date=row.date,
        url=row.link,
        texte=row.text
    )

In [2]:
import pickle

# Restore the search engine object that was serialized to "engine.pkl".
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load an engine.pkl that was produced by a trusted run of this project.
# NOTE(review): presumably the engine was built from the same corpus as the
# one created in this notebook, so that the ids it returns match
# corpus.id2doc -- TODO confirm.
with open("engine.pkl", "rb") as f:
    engine = pickle.load(f)

# Confirmation message shown once loading succeeded.
print("Moteur TF-IDF chargÃ©.")


Moteur TF-IDF chargÃ©.


In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Free-text query box and slider selecting how many results to show.
query_input = widgets.Text(description="Query:", value="america")
k_slider = widgets.IntSlider(min=1, max=20, value=5, description="Top k:")

# Speaker filter. Missing speakers are dropped first: a NaN mixed with
# strings would make sorted() raise a TypeError.
authors = sorted(df["speaker"].dropna().unique())
author_filter = widgets.Dropdown(options=["Tous"] + authors, description="Auteur:")

# Date filter. The labels stay the raw strings of the "date" column (they are
# compared as-is with the documents' dates), but they are ordered
# chronologically rather than alphabetically; labels that cannot be parsed
# as dates are listed last.
date_labels = df["date"].dropna().drop_duplicates()
chronological = pd.to_datetime(date_labels, errors="coerce").sort_values(
    kind="mergesort", na_position="last"
)
dates = date_labels.loc[chronological.index].tolist()
date_filter = widgets.Dropdown(options=["Toutes"] + dates, description="Date:")

# Button that triggers the search.
btn = widgets.Button(description="Rechercher")

# Area in which the result table is rendered.
output = widgets.Output()


In [4]:
def clique_bouton(b):
    """Run the search described by the form and show the best hits.

    Called when the search button is clicked; the argument ``b`` is not used.
    Reads the query text and the result count from the widgets, asks the
    engine for its hits, keeps only those matching the selected author and
    date (when a specific one is selected), and displays the first ``k`` rows
    inside ``output``, replacing whatever was shown before.
    """
    with output:
        clear_output(wait=True)

        search_text = query_input.value
        max_rows = k_slider.value

        # Request many hits up front so that the filters below are applied
        # before the list is cut down to the requested size.
        hits = engine.search(search_text, top=9999)

        # Keep only the documents of the chosen speaker ("Tous" = no filter).
        chosen_author = author_filter.value
        if chosen_author != "Tous":
            same_author = hits["id"].apply(
                lambda doc_id: corpus.id2doc[doc_id].auteur == chosen_author
            )
            hits = hits[same_author]

        # Keep only the documents of the chosen date ("Toutes" = no filter).
        chosen_date = date_filter.value
        if chosen_date != "Toutes":
            same_date = hits["id"].apply(
                lambda doc_id: corpus.id2doc[doc_id].date == chosen_date
            )
            hits = hits[same_date]

        # Show the first k remaining rows.
        display(hits.head(max_rows))


In [5]:
# Connect the search button to its click handler, then render the search
# form (query box, result-count slider, both filters, button) followed by
# the area that receives the results.
btn.on_click(clique_bouton)
display(query_input, k_slider, author_filter, date_filter, btn, output)


Text(value='america', description='Query:')

IntSlider(value=5, description='Top k:', max=20, min=1)

Dropdown(description='Auteur:', options=('Tous', 'CLINTON', 'TRUMP'), value='Tous')

Dropdown(description='Date:', options=('Toutes', 'April 12, 2015', 'April 14, 2015', 'April 20, 2015', 'April â€¦

Button(description='Rechercher', style=ButtonStyle())

Output()

# 📊 Analyse : Évolution temporelle d’un mot


In [6]:
# Extract the year of each speech in the table. Dates that cannot be parsed
# become NaT, so their year is missing.
df['parsed_date'] = pd.to_datetime(df['date'], errors='coerce')
df['year'] = df['parsed_date'].dt.year

# Map doc_id -> year for quick access. The date is parsed the same way as
# above instead of slicing its first four characters, which only works for
# dates that start with the year and fails for every date of the form
# "April 12, 2015". Documents whose date cannot be parsed get None.
doc_year = {}
for doc_id, doc in corpus.id2doc.items():
    parsed = pd.to_datetime(doc.date, errors='coerce')
    doc_year[doc_id] = None if pd.isna(parsed) else int(parsed.year)

# Preview of the parsed columns; kept as the last expression of the cell so
# that it is actually displayed.
df[['date', 'parsed_date', 'year']].head()


In [None]:
import matplotlib.pyplot as plt

# Area in which the frequency plot is rendered.
output_evol = widgets.Output()

# Button that triggers the computation of the plot.
btn_evolution = widgets.Button(button_style="primary", description="Voir Ã©volution")

# Text box holding the word whose frequency is tracked over time.
mot_input = widgets.Text(description="Mot:", value="america")


In [8]:
def evolution_temporelle(b):
    """Plot, year by year, how often the requested word occurs in the corpus.

    Called when the "evolution" button is clicked; the argument ``b`` is not
    used. The word is read from ``mot_input`` and compared case-insensitively
    with the whitespace-separated tokens of every document, after removing
    ASCII punctuation surrounding both the word and the tokens (so that
    "America," counts as "america"). Documents whose date cannot be parsed or
    whose text is not a string are skipped. The plot replaces the previous
    content of ``output_evol``.
    """
    import string  # local import: only this handler needs it

    with output_evol:
        clear_output(wait=True)

        mot = mot_input.value.strip().lower().strip(string.punctuation)
        if not mot:
            print("Veuillez saisir un mot.")
            return

        # Occurrences of the word, summed per year.
        freq_par_an = {}
        for _doc_id, doc in corpus.id2doc.items():
            parsed = pd.to_datetime(doc.date, errors='coerce')
            # An unparseable date yields NaT, whose .year is NaN (not None),
            # so the missing case has to be detected with pd.isna.
            if pd.isna(parsed) or not isinstance(doc.texte, str):
                continue
            year = int(parsed.year)
            freq = sum(
                1
                for token in doc.texte.lower().split()
                if token.strip(string.punctuation) == mot
            )
            freq_par_an[year] = freq_par_an.get(year, 0) + freq

        if not freq_par_an:
            print("Aucun document daté dans le corpus.")
            return

        # Years in chronological order with their counts.
        annees = sorted(freq_par_an)
        freqs = [freq_par_an[a] for a in annees]

        # Line plot; one tick per year so that the axis shows whole years.
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.plot(annees, freqs, marker="o")
        ax.set_title(f"Évolution du mot '{mot}' dans le temps")
        ax.set_xlabel("Année")
        ax.set_ylabel("Fréquence")
        ax.set_xticks(annees)
        ax.grid(True)
        plt.show()


In [9]:
# Connect the button to the handler that draws the frequency plot.
btn_evolution.on_click(evolution_temporelle)

# Render the word box, the button and the area that receives the plot.
display(mot_input, btn_evolution, output_evol)


Text(value='america', description='Mot:')

Button(button_style='primary', description='Voir Ã©volution', style=ButtonStyle())

Output()

# 🔚 Conclusion

Ce projet propose :

- une interface claire permettant d’interroger un corpus de discours politiques

- un moteur de recherche basé sur TF-IDF

- des filtres permettant un accès ciblé par auteur ou période

- une visualisation temporelle permettant d’étudier l’évolution d’un thème

L’ensemble constitue un outil simple mais puissant d’exploration de corpus textuels.