# Analyse de Sentiment des Articles GDELT avec VADER
Ce notebook récupère des articles via GDELT, calcule leur sentiment avec VADER, et les insère dans une base MariaDB ou SQLite.

In [1]:
# pip install gdeltdoc vaderSentiment beautifulsoup4 sqlalchemy pymysql
import pandas as pd
import re, html
from bs4 import BeautifulSoup
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from gdeltdoc import GdeltDoc, Filters, repeat
import mariadb

In [2]:
# ==== DB SETUP ===============================================================
USE_MARIADB = True

from sqlalchemy import create_engine, text

USER = "root"
PWD  = "2003"
HOST = "127.0.0.1"
PORT = 3306
DB   = "NewsVader"

if USE_MARIADB:
    ENGINE_URL = f"mysql+pymysql://{USER}:{PWD}@{HOST}:{PORT}/{DB}?charset=utf8mb4"
else:
    ENGINE_URL = "sqlite:///NewsVader.db"

# Crée la base si elle n'existe pas (MariaDB)
if USE_MARIADB:
    ADMIN_URL = f"mysql+pymysql://{USER}:{PWD}@{HOST}:{PORT}/?charset=utf8mb4"
    admin_engine = create_engine(ADMIN_URL, future=True, pool_pre_ping=True)
    with admin_engine.begin() as conn:
        conn.exec_driver_sql(f"""
            CREATE DATABASE IF NOT EXISTS {DB}
            CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci;
        """)

engine = create_engine(ENGINE_URL, future=True, pool_pre_ping=True)

DDL_ARTICLES_SQLITE = """
CREATE TABLE IF NOT EXISTS articles (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  source TEXT,
  url TEXT NOT NULL,
  title TEXT,
  description TEXT,
  content TEXT,
  full_text TEXT,
  seendate TEXT,
  published_at TEXT,
  language TEXT,
  sentiment_compound REAL,
  sentiment_pos REAL,
  sentiment_neu REAL,
  sentiment_neg REAL,
  sentiment_label TEXT,
  UNIQUE(url) ON CONFLICT IGNORE
);
"""
DDL_ARTICLES_MYSQL = """
CREATE TABLE IF NOT EXISTS `articles` (
  `id` BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
  `source` VARCHAR(255),
  `url` TEXT NOT NULL,                 -- pas d'index direct sur url
  `url_hash` CHAR(32) NOT NULL,        -- md5(url) pour la dédup
  `title` TEXT,
  `description` MEDIUMTEXT,
  `content` MEDIUMTEXT,
  `full_text` MEDIUMTEXT,
  `gdelt_date` VARCHAR(32) NULL,       -- une seule colonne date, brute GDELT
  `language` VARCHAR(16),
  `sentiment_compound` DOUBLE,
  `sentiment_pos` DOUBLE,
  `sentiment_neu` DOUBLE,
  `sentiment_neg` DOUBLE,
  `sentiment_label` VARCHAR(16),
  UNIQUE KEY `uk_url_hash` (`url_hash`),
  KEY `idx_gdelt_date` (`gdelt_date`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
"""


from sqlalchemy.exc import OperationalError

try:
    with engine.begin() as conn:
        conn.exec_driver_sql(DDL_ARTICLES_MYSQL if USE_MARIADB else DDL_ARTICLES_SQLITE)
    print("Connexion et schéma OK :", ENGINE_URL)
except OperationalError as e:
    # MariaDB error code 1050: Table already exists
    if USE_MARIADB and hasattr(e.orig, 'args') and e.orig.args[0] == 1050:
        print("Table 'articles' existe déjà, création ignorée.")
        print("Connexion OK :", ENGINE_URL)
    else:
        raise


Table 'articles' existe déjà, création ignorée.
Connexion OK : mysql+pymysql://root:2003@127.0.0.1:3306/NewsVader?charset=utf8mb4


In [3]:
# ==== VADER + CLEAN ==========================================================
analyzer = SentimentIntensityAnalyzer()
def clean_text_soft(text: str) -> str:
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = BeautifulSoup(text, "html.parser").get_text(" ", strip=True)
    text = re.sub(r'(https?://\S+|www\.\S+)', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def label_from_compound(x: float) -> str:
    return "Positive" if x >= 0.05 else ("Negative" if x <= -0.05 else "Neutral")

In [4]:
# ==== GDELT -> DF avec Multiple Batches ====================================
def get_multiple_batches(num_batches=6):
    gd = GdeltDoc()
    all_articles = []
    from datetime import datetime, timedelta
    import time
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2025, 9, 20)
    total_days = (end_date - start_date).days
    days_per_batch = total_days // num_batches
    current_date = start_date
    for i in range(num_batches):
        if i == num_batches - 1:
            period_end = end_date
        else:
            period_end = current_date + timedelta(days=days_per_batch)
        f = Filters(
            start_date=current_date.strftime("%Y-%m-%d"),
            end_date=period_end.strftime("%Y-%m-%d"),
            num_records=250,
            language="ENGLISH",
            domain=["bbc.co.uk", "bloomberg.com", "theguardian.com", "ft.com","economist.com"]
        )
        try:
            df_batch = gd.article_search(f)
            if not df_batch.empty:
                all_articles.append(df_batch)
                print(f"Batch {i+1} ({current_date.strftime('%Y-%m-%d')} à {period_end.strftime('%Y-%m-%d')}): {len(df_batch)} articles")
        except Exception as e:
            print(f"Erreur batch {i+1}: {e}")
        current_date = period_end
        time.sleep(1)
    if all_articles:
        final_df = pd.concat(all_articles, ignore_index=True)
        final_df = final_df.drop_duplicates(subset=['url'], keep='first')
        print(f"Total final après suppression doublons: {len(final_df)} articles")
        return final_df
    return pd.DataFrame()

In [5]:
# Récupération des articles
df = get_multiple_batches(6)

Batch 1 (2024-01-01 à 2024-04-14): 250 articles
Batch 2 (2024-04-14 à 2024-07-27): 250 articles
Batch 3 (2024-07-27 à 2024-11-08): 250 articles
Batch 4 (2024-11-08 à 2025-02-20): 250 articles
Batch 5 (2025-02-20 à 2025-06-04): 250 articles
Batch 6 (2025-06-04 à 2025-09-20): 250 articles
Total final après suppression doublons: 1500 articles


In [6]:
# ==== SCORING ================================================================
def sentiment_on_df(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    title   = df["title"]        if "title" in df.columns else pd.Series([""]*len(df))
    content = df["content"]      if "content" in df.columns else pd.Series([""]*len(df))
    desc    = df["description"]  if "description" in df.columns else pd.Series([""]*len(df))
    snip    = df["snippet"]      if "snippet" in df.columns else pd.Series([""]*len(df))
    lang    = df["language"]     if "language" in df.columns else pd.Series([""]*len(df))
    url     = df["url"]          if "url" in df.columns else df.get("DocumentIdentifier", pd.Series([""]*len(df)))
    source  = df["domain"]       if "domain" in df.columns else df.get("sourceCommonName", pd.Series([""]*len(df)))
    seendt  = df["seendate"]     if "seendate" in df.columns else pd.Series([None]*len(df))
    pubdt   = df["publishdate"]  if "publishdate" in df.columns else df.get("date", pd.Series([None]*len(df)))
    full_text = (title.fillna("") + " " + content.fillna("") + " " + desc.fillna("") + " " + snip.fillna("")).map(clean_text_soft)
    scores = full_text.map(lambda t: analyzer.polarity_scores(t) if t else {"compound":0,"pos":0,"neu":1,"neg":0})
    out = pd.DataFrame({
        "source":  source.astype(str).str[:255],
        "url":     url.astype(str).str[:1024],
        "title":   title.astype(str),
        "description": desc.astype(str),
        "content": content.astype(str),
        "full_text": full_text,
        "language": lang.astype(str).str[:16],
        "seendate": seendt,
        "published_at": pubdt
    })
    out["sentiment_compound"] = scores.map(lambda s: s["compound"])
    out["sentiment_pos"]      = scores.map(lambda s: s["pos"])
    out["sentiment_neu"]      = scores.map(lambda s: s["neu"])
    out["sentiment_neg"]      = scores.map(lambda s: s["neg"])
    out["sentiment_label"]    = out["sentiment_compound"].map(label_from_compound)
    print("🔍 COLONNES DATES ORIGINALES:")
    if not seendt.empty and seendt.notna().any():
        print(f"seendate exemples: {seendt.dropna().head(3).tolist()}")
    if not pubdt.empty and pubdt.notna().any():
        print(f"published_at exemples: {pubdt.dropna().head(3).tolist()}")
    print(f"Résultat final - seendate nulles: {out['seendate'].isna().sum()}/{len(out)}")
    print(f"Résultat final - published_at nulles: {out['published_at'].isna().sum()}/{len(out)}")
    return out

In [7]:
scored = sentiment_on_df(df)

🔍 COLONNES DATES ORIGINALES:
seendate exemples: ['20240124T114500Z', '20240208T064500Z', '20240216T233000Z']
Résultat final - seendate nulles: 0/1500
Résultat final - published_at nulles: 1500/1500


In [8]:
# ==== UPSERT EN DB ===========================================================
from datetime import datetime, date

def _to_sql_value_dt(x):
    # None / NaN -> None ; sinon on passe tel quel
    if x is None:
        return None
    try:
        import pandas as pd
        if pd.isna(x):
            return None
    except Exception:
        pass
    if isinstance(x, str):
        s = x.strip()
        if s == "" or s.lower() in ("none", "nan", "nat", "null"):
            return None
        return s
    return str(x)

def upsert_articles(df_scored: pd.DataFrame):
    if df_scored.empty:
        print("Aucun article à insérer.")
        return 0, 0

    cols = ["source","url","title","description","content","full_text",
            "seendate","published_at","language",
            "sentiment_compound","sentiment_pos","sentiment_neu","sentiment_neg","sentiment_label"]

    for c in cols:
        if c not in df_scored.columns:
            df_scored[c] = None

    payload = []
    for _, r in df_scored.iterrows():
        rec = {c: r.get(c) for c in cols}
        rec["seendate"]     = _to_sql_value_dt(rec["seendate"])
        rec["published_at"] = _to_sql_value_dt(rec["published_at"])
        payload.append(rec)

    with engine.begin() as conn:
        if USE_MARIADB:
            sql = text("""
            INSERT INTO articles
              (source, url, title, description, content, full_text,
               seendate, published_at, language,
               sentiment_compound, sentiment_pos, sentiment_neu, sentiment_neg, sentiment_label)
            VALUES
              (:source, :url, :title, :description, :content, :full_text,
               :seendate, :published_at, :language,
               :sentiment_compound, :sentiment_pos, :sentiment_neu, :sentiment_neg, :sentiment_label)
            ON DUPLICATE KEY UPDATE
              title=VALUES(title),
              description=VALUES(description),
              content=VALUES(content),
              full_text=VALUES(full_text),
              seendate=VALUES(seendate),
              published_at=VALUES(published_at),
              language=VALUES(language),
              sentiment_compound=VALUES(sentiment_compound),
              sentiment_pos=VALUES(sentiment_pos),
              sentiment_neu=VALUES(sentiment_neu),
              sentiment_neg=VALUES(sentiment_neg),
              sentiment_label=VALUES(sentiment_label);
            """)
        else:
            sql = text("""
            INSERT INTO articles
              (source, url, title, description, content, full_text,
               seendate, published_at, language,
               sentiment_compound, sentiment_pos, sentiment_neu, sentiment_neg, sentiment_label)
            VALUES
              (:source, :url, :title, :description, :content, :full_text,
               :seendate, :published_at, :language,
               :sentiment_compound, :sentiment_pos, :sentiment_neu, :sentiment_neg, :sentiment_label)
            ON CONFLICT(url) DO UPDATE SET
              title=excluded.title,
              description=excluded.description,
              content=excluded.content,
              full_text=excluded.full_text,
              seendate=excluded.seendate,
              published_at=excluded.published_at,
              language=excluded.language,
              sentiment_compound=excluded.sentiment_compound,
              sentiment_pos=excluded.sentiment_pos,
              sentiment_neu=excluded.sentiment_neu,
              sentiment_neg=excluded.sentiment_neg,
              sentiment_label=excluded.sentiment_label;
            """)

        conn.execute(sql, payload)

    print(f"Écrit dans la base: {len(payload)} lignes (insert+update confondus).")
    return len(payload), 0


In [9]:
def ensure_dates_are_varchar(engine, db_name="NewsVader"):
    sql_check = f"""
    SELECT COLUMN_NAME, DATA_TYPE, COLUMN_TYPE
    FROM INFORMATION_SCHEMA.COLUMNS
    WHERE TABLE_SCHEMA = '{db_name}'
      AND TABLE_NAME   = 'articles'
      AND COLUMN_NAME IN ('seendate','published_at');
    """
    try:
        with engine.begin() as conn:
            rows = list(conn.exec_driver_sql(sql_check))
            print("Types actuels (seendate/published_at):", rows)

            # si la table n'existe pas encore, rien à faire (ton CREATE suivra)
            if not rows:
                print("Table 'articles' introuvable pour contrôle; elle sera créée ensuite.")
                return

            need_alter = any(r[1] in ("datetime", "timestamp") for r in rows)
            if need_alter:
                print("↪️  Conversion en VARCHAR(32)…")
                conn.exec_driver_sql("""
                    ALTER TABLE `articles`
                      MODIFY `seendate` VARCHAR(32) NULL,
                      MODIFY `published_at` VARCHAR(32) NULL;
                """)
                print("✅ ALTER appliqué.")
            else:
                print("✅ Déjà en VARCHAR(32), pas d'ALTER nécessaire.")
    except Exception as e:
        # Montre l’erreur SQL complète pour comprendre
        import traceback
        print("❌ ALTER/Vérif échoué :", e)
        traceback.print_exc()

# — appelle-le ici —
ensure_dates_are_varchar(engine, DB)


Types actuels (seendate/published_at): []
Table 'articles' introuvable pour contrôle; elle sera créée ensuite.


In [10]:
if USE_MARIADB:
    # Vérifie si la table existe avant d'appliquer ALTER
    with engine.begin() as conn:
        result = conn.exec_driver_sql("""
            SELECT COUNT(*) FROM information_schema.tables
            WHERE table_schema = DATABASE() AND table_name = 'articles';
        """)
        exists = result.scalar()
        if exists:
            conn.exec_driver_sql("""
                ALTER TABLE articles
                  MODIFY seendate VARCHAR(32) NULL,
                  MODIFY published_at VARCHAR(32) NULL;
            """)
            print("ALTER TABLE appliqué.")
        else:
            print("La table 'articles' n'existe pas encore. ALTER TABLE ignoré.")

La table 'articles' n'existe pas encore. ALTER TABLE ignoré.


In [11]:
if not scored.empty:
    print(scored[["title","sentiment_label","sentiment_compound"]].head(10))
    upsert_articles(scored)
    print("OK. Base prête :", ENGINE_URL)
else:
    print("Aucun article renvoyé par GDELT.")

                                               title sentiment_label  \
0  Adventures in application compatibility : The ...        Positive   
1  India schoolteachers are drafting better lesso...        Positive   
2  If youre just going to sit there doing nothing...         Neutral   
3  Black and African American Community - Inclusi...        Positive   
4        Personal Cloud Storage – Microsoft OneDrive         Neutral   
5  Microsoft Copilot for Microsoft 365 - Business...         Neutral   
6  AI - Powered Low - Code Tools | Microsoft Powe...        Negative   
7  Buy Microsoft 365 Personal ( formerly Office 3...         Neutral   
8                    Windows Subsystem for Android™️         Neutral   
9  Educators and students now have a secure AI  s...        Positive   

   sentiment_compound  
0              0.3400  
1              0.7003  
2              0.0000  
3              0.3818  
4              0.0000  
5              0.0000  
6             -0.2732  
7              

ProgrammingError: (pymysql.err.ProgrammingError) (1146, "Table 'newsvader.articles' doesn't exist")
[SQL: 
            INSERT INTO articles
              (source, url, title, description, content, full_text,
               seendate, published_at, language,
               sentiment_compound, sentiment_pos, sentiment_neu, sentiment_neg, sentiment_label)
            VALUES
              (%(source)s, %(url)s, %(title)s, %(description)s, %(content)s, %(full_text)s,
               %(seendate)s, %(published_at)s, %(language)s,
               %(sentiment_compound)s, %(sentiment_pos)s, %(sentiment_neu)s, %(sentiment_neg)s, %(sentiment_label)s)
            ON DUPLICATE KEY UPDATE
              title=VALUES(title),
              description=VALUES(description),
              content=VALUES(content),
              full_text=VALUES(full_text),
              seendate=VALUES(seendate),
              published_at=VALUES(published_at),
              language=VALUES(language),
              sentiment_compound=VALUES(sentiment_compound),
              sentiment_pos=VALUES(sentiment_pos),
              sentiment_neu=VALUES(sentiment_neu),
              sentiment_neg=VALUES(sentiment_neg),
              sentiment_label=VALUES(sentiment_label);
            ]
[parameters: [{'source': 'devblogs.microsoft.com', 'url': 'https://devblogs.microsoft.com/oldnewthing/20230324-00/?p=107966', 'title': 'Adventures in application compatibility : The case of the jump into the middle of an instruction from nowhere', 'description': '', 'content': '', 'full_text': 'Adventures in application compatibility : The case of the jump into the middle of an instruction from nowhere', 'seendate': '20240124T114500Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.34, 'sentiment_pos': 0.124, 'sentiment_neu': 0.876, 'sentiment_neg': 0.0, 'sentiment_label': 'Positive'}, {'source': 'news.microsoft.com', 'url': 'https://news.microsoft.com/source/asia/features/indias-schoolteachers-are-drafting-better-lesson-plans-faster-thanks-to-a-copilot/', 'title': 'India schoolteachers are drafting better lesson plans faster , thanks to a copilot', 'description': '', 'content': '', 'full_text': 'India schoolteachers are drafting better lesson plans faster , thanks to a copilot', 'seendate': '20240208T064500Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.7003, 'sentiment_pos': 0.345, 'sentiment_neu': 0.655, 'sentiment_neg': 0.0, 'sentiment_label': 'Positive'}, {'source': 'devblogs.microsoft.com', 'url': 'https://devblogs.microsoft.com/oldnewthing/20240216-00/?p=109409', 'title': 'If youre just going to sit there doing nothing , at least do nothing correctly', 'description': '', 'content': '', 'full_text': 'If youre just going to sit there doing nothing , at least do nothing correctly', 'seendate': '20240216T233000Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.0, 'sentiment_pos': 0.0, 'sentiment_neu': 1.0, 'sentiment_neg': 0.0, 'sentiment_label': 'Neutral'}, {'source': 'news.microsoft.com', 'url': 'https://news.microsoft.com/inclusionisinnovation/blackandafricanamerican/', 'title': 'Black and African American Community - Inclusion is Innovation', 'description': '', 'content': '', 'full_text': 'Black and African American Community - Inclusion is Innovation', 'seendate': '20240227T133000Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.3818, 'sentiment_pos': 0.245, 'sentiment_neu': 0.755, 'sentiment_neg': 0.0, 'sentiment_label': 'Positive'}, {'source': 'microsoft.com', 'url': 'https://www.microsoft.com/en-us/microsoft-365/onedrive/online-cloud-storage', 'title': 'Personal Cloud Storage – Microsoft OneDrive', 'description': '', 'content': '', 'full_text': 'Personal Cloud Storage – Microsoft OneDrive', 'seendate': '20240227T133000Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.0, 'sentiment_pos': 0.0, 'sentiment_neu': 1.0, 'sentiment_neg': 0.0, 'sentiment_label': 'Neutral'}, {'source': 'microsoft.com', 'url': 'https://www.microsoft.com/en-us/microsoft-365/business/copilot-for-microsoft-365?icid=mscom_marcom_CPW2a_M365forBusiness_Copilot', 'title': 'Microsoft Copilot for Microsoft 365 - Business Plans', 'description': '', 'content': '', 'full_text': 'Microsoft Copilot for Microsoft 365 - Business Plans', 'seendate': '20240227T133000Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.0, 'sentiment_pos': 0.0, 'sentiment_neu': 1.0, 'sentiment_neg': 0.0, 'sentiment_label': 'Neutral'}, {'source': 'microsoft.com', 'url': 'https://www.microsoft.com/en-us/power-platform', 'title': 'AI - Powered Low - Code Tools | Microsoft Power Platform', 'description': '', 'content': '', 'full_text': 'AI - Powered Low - Code Tools | Microsoft Power Platform', 'seendate': '20240227T133000Z', 'published_at': None, 'language': 'English', 'sentiment_compound': -0.2732, 'sentiment_pos': 0.0, 'sentiment_neu': 0.826, 'sentiment_neg': 0.174, 'sentiment_label': 'Negative'}, {'source': 'microsoft.com', 'url': 'https://www.microsoft.com/en-us/microsoft-365/p/microsoft-365-personal/cfq7ttc0k5bf?icid=mscom_marcom_CPH3a_M365Personal', 'title': 'Buy Microsoft 365 Personal ( formerly Office 365 ) - Subscription Price', 'description': '', 'content': '', 'full_text': 'Buy Microsoft 365 Personal ( formerly Office 365 ) - Subscription Price', 'seendate': '20240229T103000Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.0, 'sentiment_pos': 0.0, 'sentiment_neu': 1.0, 'sentiment_neg': 0.0, 'sentiment_label': 'Neutral'}  ... displaying 10 of 1500 total bound parameter sets ...  {'source': 'bbc.co.uk', 'url': 'https://www.bbc.co.uk/news/articles/cpvjdw3wry1o', 'title': 'Police hunt Oban for firearm linked to Euan Johnston killing', 'description': '', 'content': '', 'full_text': 'Police hunt Oban for firearm linked to Euan Johnston killing', 'seendate': '20250708T154500Z', 'published_at': None, 'language': 'English', 'sentiment_compound': -0.6597, 'sentiment_pos': 0.0, 'sentiment_neu': 0.672, 'sentiment_neg': 0.328, 'sentiment_label': 'Negative'}, {'source': 'bbc.co.uk', 'url': 'https://www.bbc.co.uk/iplayer/episode/m002b1px/dr-xands-con-or-cure-series-3-episode-2', 'title': 'Dr Xands Con or Cure - Series 3 : Episode 2', 'description': '', 'content': '', 'full_text': 'Dr Xands Con or Cure - Series 3 : Episode 2', 'seendate': '20250708T154500Z', 'published_at': None, 'language': 'English', 'sentiment_compound': 0.0, 'sentiment_pos': 0.0, 'sentiment_neu': 1.0, 'sentiment_neg': 0.0, 'sentiment_label': 'Neutral'}]]
(Background on this error at: https://sqlalche.me/e/20/f405)