In [None]:
import sys, re
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

import ipywidgets as widgets
from IPython.display import display, clear_output

# The notebook is expected to run from .../TD_PROGSPE_PYTHON/notebooks;
# the sibling "src" and "data" folders are located relative to that directory.
ROOT = Path.cwd().resolve()
SRC = ROOT.parent.joinpath("src").resolve()
DATA = ROOT.parent.joinpath("data").resolve()

# Put the source folder first on the import path, but only once, so that
# re-running this cell does not keep growing sys.path.
if sys.path.count(str(SRC)) == 0:
    sys.path.insert(0, str(SRC))

from td8_discours import read_discours_tsv, parse_date_safe, split_into_sentences
from Corpus import Corpus
from Document import Document
from SearchEngine import SearchEngine


In [None]:
def _clean_field(value, default=""):
    """Return ``value`` as a stripped string, or ``default`` when it is missing or blank.

    ``str(NaN)`` is the literal ``"nan"``, which is truthy, so the pattern
    ``str(value).strip() or default`` never falls back for a missing cell.
    Missing scalars (None, NaN, NaT, pd.NA) are therefore detected explicitly.
    """
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return default
    return str(value).strip() or default


# Column order of the per-sentence frame; fixed so the frame keeps its
# columns even when no sentence was produced.
DOC_COLUMNS = ["auteur", "date", "texte", "url", "titre"]

csv_path = DATA / "discours_US.csv"
df_raw = read_discours_tsv(csv_path)

# NOTE(review): presumably discards the previous Corpus instance so that
# re-running this cell starts from an empty corpus; confirm in Corpus.
Corpus.reset_singleton()
corpus = Corpus("Corpus TD9 - Discours US")

docs_rows = []  # one record per sentence, used by the TD9 analyses (comparison / time)

for _, row in df_raw.iterrows():
    auteur = _clean_field(row.get("speaker"), "Unknown")
    titre = _clean_field(row.get("descr"), "Discours")
    date = parse_date_safe(row.get("date", None))
    url = _clean_field(row.get("link"), "")
    # The raw text is handed over untouched; how missing text is treated is
    # up to split_into_sentences (not visible here).
    text = row.get("text", "")

    # Each sentence becomes its own document, mirrored as a plain record.
    for sent in split_into_sentences(text):
        corpus.add_document(Document(titre=titre, auteur=auteur, date=date, url=url, texte=sent))
        docs_rows.append({"auteur": auteur, "date": date, "texte": sent, "url": url, "titre": titre})

se = SearchEngine(corpus)
df_docs = pd.DataFrame(docs_rows, columns=DOC_COLUMNS)

print("✅ Corpus:", corpus)
df_docs.head()


In [None]:
# Very common English words that are dropped by tokenize().
STOPWORDS = {
    "the","a","an","and","or","to","of","in","on","for","with","is","are","was","were",
    "be","been","it","that","this","as","at","by","from","we","you","they","i","our",
    "your","their","not","but","will","would","can","could","should","my","me","us"
}

def tokenize(text: str):
    """Split ``text`` into lowercase word tokens, dropping short words and stopwords.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a string (None, NaN, numbers, ...)
        yields an empty list instead of raising.

    Returns
    -------
    list of str
        Tokens made of ASCII letters and inner apostrophes, at least three
        characters long and not present in ``STOPWORDS``.
    """
    if not isinstance(text, str):
        return []

    # Map the typographic apostrophe to the ASCII one so that contractions
    # such as "don’t" stay a single token.
    normalized = text.lower().replace("\u2019", "'")

    # Strip quoting apostrophes so that 'freedom' and freedom are the same word.
    candidates = (raw.strip("'") for raw in re.findall(r"[a-z']+", normalized))
    return [tok for tok in candidates if len(tok) >= 3 and tok not in STOPWORDS]

def tf_counts(sub_df: pd.DataFrame):
    """Count how often each token occurs in the ``texte`` column of ``sub_df``.

    Parameters
    ----------
    sub_df : pd.DataFrame
        Frame with a ``texte`` column holding the text to tokenize.

    Returns
    -------
    dict
        Mapping token -> number of occurrences over all rows.
    """
    counts = {}
    # Drop missing entries first: astype(str) would otherwise turn them into
    # the literal strings "nan" / "None", which would be counted as words.
    for sentence in sub_df["texte"].dropna().astype(str):
        for token in tokenize(sentence):
            counts[token] = counts.get(token, 0) + 1
    return counts

def top_distinctive_words(df_docs, a1, a2, top_n=20):
    """Rank the words that author ``a1`` uses relatively more often than author ``a2``.

    The score of a word is its relative frequency for ``a1`` minus its
    relative frequency for ``a2``, each computed from ``tf_counts`` over the
    rows whose ``auteur`` column equals that author.

    Parameters
    ----------
    df_docs : pd.DataFrame
        Frame with ``auteur`` and ``texte`` columns.
    a1, a2 : str
        Author names to compare.
    top_n : int, default 20
        Number of rows to keep.

    Returns
    -------
    pd.DataFrame
        Columns ``mot``, ``score(a1-a2)``, ``tf_a1``, ``tf_a2``, sorted by
        decreasing score, ties broken alphabetically.
    """
    c1 = tf_counts(df_docs[df_docs["auteur"] == a1])
    c2 = tf_counts(df_docs[df_docs["auteur"] == a2])

    # Total token counts; fall back to 1 so an author without tokens does not
    # trigger a division by zero.
    n1 = sum(c1.values()) or 1
    n2 = sum(c2.values()) or 1

    scored = [
        (w, (c1.get(w, 0) / n1) - (c2.get(w, 0) / n2), c1.get(w, 0), c2.get(w, 0))
        for w in set(c1) | set(c2)
    ]

    # Set iteration order for strings changes between interpreter sessions, so
    # sorting on the score alone would order tied words differently from run
    # to run. The word itself is the secondary key to keep the table stable.
    scored.sort(key=lambda item: (-item[1], item[0]))
    return pd.DataFrame(scored[:top_n], columns=["mot", "score(a1-a2)", "tf_a1", "tf_a2"])

def word_over_time(df_docs, word: str, auteur=None):
    """Sum the occurrences of ``word`` per calendar month.

    Parameters
    ----------
    df_docs : pd.DataFrame
        Frame with ``auteur``, ``date`` and ``texte`` columns.
    word : str
        Word to look for; it is lowercased and stripped before matching
        against the output of ``tokenize``.
    auteur : str or None
        Restrict the count to this author. ``None``, an empty value or
        ``"ALL"`` keeps every author.

    Returns
    -------
    pd.DataFrame
        Columns ``month`` (timestamp of the month start) and ``cnt`` (total
        occurrences in that month). Rows without a date are ignored.
    """
    target = word.lower().strip()

    restrict = bool(auteur) and auteur != "ALL"
    selected = df_docs[df_docs["auteur"] == auteur] if restrict else df_docs
    dated = selected.dropna(subset=["date"])

    # Truncate every date to the first instant of its month.
    months = pd.to_datetime(dated["date"]).dt.to_period("M").dt.to_timestamp()
    # Occurrences of the target word in each sentence.
    hits = dated["texte"].astype(str).map(lambda sentence: tokenize(sentence).count(target))

    per_sentence = dated.assign(month=months, cnt=hits)
    return per_sentence.groupby("month")["cnt"].sum().reset_index()


In [None]:
# Data shared by the widgets: the sorted author names, and the same list
# preceded by an "ALL" entry for selectors that accept "every author".
authors = df_docs["auteur"].unique().tolist()
authors.sort()
authors_all = ["ALL", *authors]

# --- Tab 1: Search ---
q = widgets.Text(
    value="climate change",
    description="Query:",
    layout=widgets.Layout(width="70%"),
)
topk = widgets.IntSlider(
    value=10,
    min=1,
    max=50,
    step=1,
    description="TopK:",
    continuous_update=False,
)
btn_search = widgets.Button(description="Search", button_style="success")
out_search = widgets.Output()


def on_search(_):
    """Button callback: run the current query and show the result in ``out_search``."""
    with out_search:
        # Replace whatever the previous search displayed.
        clear_output()
        display(se.search(q.value, top_k=int(topk.value), show_progress=False))


btn_search.on_click(on_search)
tab_search = widgets.VBox([
    widgets.HBox([q, btn_search]),
    topk,
    out_search,
])

# --- Tab 2: Author comparison ---
a1 = widgets.Dropdown(options=authors, value=authors[0], description="A1:")
# Second selector starts on the second author when there is one, otherwise on the first.
a2 = widgets.Dropdown(
    options=authors,
    value=authors[min(1, len(authors) - 1)],
    description="A2:",
)
topn = widgets.IntSlider(
    value=20,
    min=5,
    max=50,
    step=5,
    description="TopN:",
    continuous_update=False,
)
btn_cmp = widgets.Button(description="Compare", button_style="primary")
out_cmp = widgets.Output()


def on_compare(_):
    """Button callback: show the words most distinctive of A1 versus A2 in ``out_cmp``."""
    with out_cmp:
        # Replace the table from the previous comparison.
        clear_output()
        display(top_distinctive_words(df_docs, a1.value, a2.value, top_n=int(topn.value)))


btn_cmp.on_click(on_compare)
tab_cmp = widgets.VBox([
    widgets.HBox([a1, a2, btn_cmp]),
    topn,
    out_cmp,
])

# --- Tab 3: Evolution over time ---
w_word = widgets.Text(value="climate", description="Word:")
w_auth = widgets.Dropdown(options=authors_all, value="ALL", description="Author:")
btn_time = widgets.Button(description="Plot", button_style="warning")
out_time = widgets.Output()


def on_time(_):
    """Button callback: show the monthly counts of the chosen word as a table and a line plot."""
    with out_time:
        # Replace the table and figure from the previous click.
        clear_output()

        chosen_author = w_auth.value
        serie = word_over_time(
            df_docs,
            w_word.value,
            auteur=chosen_author if chosen_author != "ALL" else None,
        )
        display(serie)

        fig, ax = plt.subplots()
        ax.plot(serie["month"], serie["cnt"])
        plt.xticks(rotation=45)
        ax.set_title(f"Occurrences de '{w_word.value}' (par mois) - {w_auth.value}")
        ax.set_xlabel("Mois")
        ax.set_ylabel("Occurrences")
        plt.show()


btn_time.on_click(on_time)
tab_time = widgets.VBox([
    widgets.HBox([w_word, w_auth, btn_time]),
    out_time,
])

# Assemble the three panels into one tabbed view, label each tab, and render it.
tabs = widgets.Tab(children=[tab_search, tab_cmp, tab_time])
for position, label in enumerate(("Search", "Compare", "Time")):
    tabs.set_title(position, label)
display(tabs)
