## Lecture du fichier CSV des discours américains

In [2]:
import pandas as pd

# Load the tab-separated speeches file into a DataFrame.
df = pd.read_csv("discours_US.csv", delimiter="\t")

## Analyse de la répartition des orateurs (speakers)

In [3]:
# Count how many speeches each speaker has in the dataset.
df.loc[:, "speaker"].value_counts()


speaker
CLINTON    93
TRUMP      71
Name: count, dtype: int64

## Création du corpus à partir des discours

In [4]:
import sys

# Make the project sources importable. The membership test keeps sys.path
# free of duplicate entries when this cell is executed more than once.
if "../src" not in sys.path:
    sys.path.append("../src")

from Corpus import Corpus

corpus = Corpus("discours_US")

# Register every speech of the DataFrame as one corpus document.
# itertuples() is used instead of iterrows(): the row index was never used,
# and namedtuples avoid building a throwaway Series for each row.
for row in df.itertuples(index=False):
    corpus.add_document(
        titre=row.descr,        # description of the speech
        auteur=row.speaker,     # speaker
        date=row.date,          # already in the expected format
        url=row.link,           # link to the archive
        texte=row.text          # content of the speech
    )



## Vérification du corpus construit

In [5]:
# Display the corpus' `ndoc` and `naut` attributes as a tuple.
# Presumably the number of documents and of distinct authors: the recorded
# output is (164, 2), which matches 93 + 71 speeches by 2 speakers.
corpus.ndoc, corpus.naut


(164, 2)

## Recherche simple par mot-clé dans le corpus

In [6]:
# Keyword lookup for "america" through Corpus.search. The recorded output is
# a list of (document id, Document object) pairs.
# NOTE(review): the whole result list is displayed unsliced, which floods the
# output with object reprs; consider showing only the first few matches.
corpus.search("america")


[('doc_0', <document.Document at 0x21d568535d0>),
 ('doc_2', <document.Document at 0x21d56870610>),
 ('doc_3', <document.Document at 0x21d56866d90>),
 ('doc_4', <document.Document at 0x21d56870990>),
 ('doc_5', <document.Document at 0x21d56867010>),
 ('doc_6', <document.Document at 0x21d568709d0>),
 ('doc_7', <document.Document at 0x21d56870ad0>),
 ('doc_8', <document.Document at 0x21d56870a50>),
 ('doc_9', <document.Document at 0x21d56870b50>),
 ('doc_10', <document.Document at 0x21d56870b90>),
 ('doc_11', <document.Document at 0x21d56870c10>),
 ('doc_12', <document.Document at 0x21d56870c90>),
 ('doc_13', <document.Document at 0x21d56870d10>),
 ('doc_14', <document.Document at 0x21d56870150>),
 ('doc_15', <document.Document at 0x21d56870e10>),
 ('doc_16', <document.Document at 0x21d56870e90>),
 ('doc_17', <document.Document at 0x21d56870f10>),
 ('doc_19', <document.Document at 0x21d568710d0>),
 ('doc_20', <document.Document at 0x21d56871190>),
 ('doc_23', <document.Document at 0x21d5

## 🔎 Analyse de contexte avec le concordancier

In [7]:
# Show the first 10 concordance hits for "america". In the recorded output
# each hit is (left context, matched word, right context, document id).
# window=30 presumably sets the context width in characters on each side —
# TODO confirm against Corpus.concorde.
corpus.concorde("america", window=30)[:10]


[('use when families are strong,',
  'America',
  "is strong. So I'm hitting the",
  'doc_0'),
 ('esses starting and growing in',
  'America',
  'again. We have stalled out. I',
  'doc_2'),
 ('ths about race and justice in',
  'America',
  '. There is something profoundl',
  'doc_3'),
 ('ne this shared vision of what',
  'America',
  'can be and should be. I learn',
  'doc_3'),
 ('sweeping small-town and rural',
  'America',
  'as well. We have to do more a',
  'doc_3'),
 ('sty about race and justice in',
  'America',
  '. And, yes, a time for reform.',
  'doc_3'),
 ('use when families are strong,',
  'America',
  'is strong. And I am convinced',
  'doc_4'),
 ('lly undermining what has made',
  'America',
  'unique‚Äîthe way we have assimi',
  'doc_4'),
 ('iew the right thing to do for',
  'America',
  '. If you compare us to other c',
  'doc_4'),
 ('the kindergarten teachers in',
  'America',
  '. And when you think about val',
  'doc_5')]

## 🤖 Chargement ou construction du moteur TF-IDF (SAFE MODE)

In [8]:
import os
import pickle

from SearchEngine import SearchEngine


ENGINE_PATH = "engine.pkl"

# Reuse the pickled engine when one is available; otherwise build it from
# `corpus` and cache it for the next run.
# NOTE(review): the cache is identified by file name only, not by corpus
# content. Delete engine.pkl whenever the input data or the SearchEngine
# code changes, or a stale engine will be loaded silently.
engine = None

if os.path.exists(ENGINE_PATH):
    print("üîÅ Moteur TF-IDF d√©j√† existant. Chargement depuis engine.pkl...")

    try:
        # SECURITY: pickle.load can execute arbitrary code. Only load an
        # engine.pkl that was produced by this notebook.
        with open(ENGINE_PATH, "rb") as f:
            engine = pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError, ImportError) as exc:
        # Truncated or outdated cache file: rebuild instead of crashing on
        # every subsequent run.
        print(f"Cache {ENGINE_PATH} illisible ({exc!r}). Reconstruction du moteur TF-IDF...")
        engine = None

else:
    print("‚öôÔ∏è Aucun moteur trouv√©. Construction du moteur TF-IDF...")

if engine is None:
    engine = SearchEngine(corpus, mode="tfidf")

    # Write to a temporary file and rename it afterwards, so that a failed or
    # interrupted dump never leaves a partial engine.pkl behind.
    tmp_path = ENGINE_PATH + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(engine, f)
    os.replace(tmp_path, ENGINE_PATH)

    print("‚úÖ Moteur TF-IDF construit et sauvegard√©.")


üîÅ Moteur TF-IDF d√©j√† existant. Chargement depuis engine.pkl...


## Vérification des dimensions de la matrice TF-IDF

In [13]:
# Display the shape of the engine's matrix. The recorded output is
# (164, 12203): presumably documents x vocabulary terms, since 164 matches
# the corpus size — confirm in SearchEngine.
engine.matrix.shape


(164, 12203)

## Recherche multi-mots avec le moteur TF-IDF

In [14]:
# Query the engine with two terms and request the 5 best results. The
# recorded output is a table with `id`, `titre` and `score` columns.
engine.search("america freedom", top=5)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:00<00:00, 780.04it/s]


Unnamed: 0,id,titre,score
90,doc_90,Remarks at a Rally at the James L. Knight Cent...,0.068383
72,doc_72,"Remarks at the XFinity Arena in Everett, Washi...",0.066183
54,doc_54,"Remarks in San Diego, California",0.039486
140,doc_140,Remarks at the Collier County Fairgrounds in N...,0.039355
71,doc_71,Remarks at the Mississippi Coliseum in Jackson...,0.038743
