In [22]:
import numpy as np
import pandas as pd
from gnews import GNews
import yfinance as yf
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse
from gnews import GNews
import time
import json
from scrapping_articles_fun import extract_news_fixed


In [None]:
def align_with_index(df1, df2, date_col='Date', join_type='inner'):
    """
    Re-index two frames on a shared date column and align their rows.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``date_col``. Neither input is modified.
    date_col : str
        Name of the column used as the alignment key.
    join_type : str
        One of 'inner', 'outer', 'left', 'right'; forwarded to
        ``DataFrame.align`` as its ``join`` argument.

    Returns
    -------
    tuple of pandas.DataFrame
        The two row-aligned frames, each with ``date_col`` restored as a
        regular column.
    """
    # Move the key column into the index of each frame.
    left, right = (frame.set_index(date_col) for frame in (df1, df2))

    # Row-wise (axis=0) alignment on the shared index.
    aligned_pair = left.align(right, join=join_type, axis=0)

    # Turn the key back into an ordinary column before returning.
    return tuple(frame.reset_index() for frame in aligned_pair)


In [None]:
# Load the France (OAT) and Germany (Bund) 10-year yield histories.
# Forward slashes are used so the paths resolve on Windows, macOS and Linux alike;
# the previous "data\\..." form only worked on Windows.
df_oat = pd.read_csv("data/Rendement de l'Obligation France 10 ans - Données Historiques (1).csv")
df_bund = pd.read_csv("data/Rendement de l'Obligation Allemagne 10 ans - Données Historiques.csv")

# Keep only the dates present in both files so the two series line up row by row.
df_oat_aligned, df_bund_aligned = align_with_index(df_oat, df_bund, date_col='Date', join_type='inner')

# 'Dernier' is read as text with a decimal comma; swap it for a dot and cast to float.
# regex=False makes the replacement a plain literal substitution on every pandas version.
df_oat_aligned['Dernier'] = df_oat_aligned['Dernier'].str.replace(',', '.', regex=False).astype(float)
df_bund_aligned['Dernier'] = df_bund_aligned['Dernier'].str.replace(',', '.', regex=False).astype(float)

# Dates are day/month/year strings; parse them only after alignment (alignment is done on the raw strings).
# NOTE(review): nothing here sorts the rows chronologically, so the row order is whatever the
# CSV files provide. The differencing / ADF / ARIMA steps further down assume time order — confirm.
df_oat_aligned['Date'] = pd.to_datetime(df_oat_aligned['Date'], format='%d/%m/%Y')
df_bund_aligned['Date'] = pd.to_datetime(df_bund_aligned['Date'], format='%d/%m/%Y')

In [None]:
# Element-wise OAT / Bund ratio of the 'Dernier' columns (rows were aligned on Date at load time).
y = df_oat_aligned['Dernier'].div(df_bund_aligned['Dernier'])

In [None]:
# Plot the first `numbers_of_points` observations of the OAT/Bund ratio against their dates.
numbers_of_points = 700
sample_window = slice(None, numbers_of_points)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    df_oat_aligned['Date'][sample_window],
    y[sample_window],
    label='Ratio OAT/Bund 10 ans',
    color='blue',
)
ax.set(
    title="Ratio OAT/Bund 10 ans",
    xlabel="Date",
    ylabel="Ratio OAT/Bund",
)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the level of the ratio (first `numbers_of_points` values).
from statsmodels.tsa.stattools import adfuller

adf_result = adfuller(y[:numbers_of_points])

# The first two entries of the result are the test statistic and its p-value.
for caption, value in zip(('ADF Statistic:', 'p-value:'), adf_result):
    print(caption, value)

In [None]:
# Working sample: the first `numbers_of_points` ratio values ...
spread_one_year = y[:numbers_of_points]

# ... and its first difference, with the undefined (NaN) entries removed.
spread_one_year_diff = spread_one_year.diff(periods=1)
spread_one_year_diff = spread_one_year_diff.dropna()

In [None]:
# Plot the first-differenced ratio against its observation dates.
#
# The dates are looked up through the differenced series' own index rather than with a
# positional slice [1:numbers_of_points]: `dropna()` may have removed more than just the
# first row (e.g. a NaN in the ratio), in which case a positional slice would no longer
# have the same length as the series and `ax.plot` would raise.
diff_dates = df_oat_aligned.loc[spread_one_year_diff.index, 'Date']

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(diff_dates, spread_one_year_diff, label='Ratio OAT/Bund 10 ans Diff', color='blue')
ax.set_title("Ratio OAT/Bund 10 ans Diff")
ax.set_xlabel("Date")
ax.set_ylabel("Ratio OAT/Bund Diff")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Augmented Dickey-Fuller test on the first-differenced ratio.
from statsmodels.tsa.stattools import adfuller

adf_result = adfuller(spread_one_year_diff)

# Entry 0 is the test statistic, entry 1 the p-value.
adf_statistic, adf_pvalue = adf_result[0], adf_result[1]
print('ADF Statistic:', adf_statistic)
print('p-value:', adf_pvalue)

In [None]:
# Autocorrelation (top) and partial autocorrelation (bottom) of the differenced ratio, 40 lags each.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

fig, ax = plt.subplots(2, 1, figsize=(16, 10))
for draw_correlogram, panel in zip((plot_acf, plot_pacf), ax):
    draw_correlogram(spread_one_year_diff, ax=panel, lags=40)
plt.show()

In [None]:
# ARIMA with p = 3, d = 1, q = 3, fitted on the undifferenced sample
# (the d = 1 term makes the model difference the series itself).
from statsmodels.tsa.arima.model import ARIMA

arima_order = (3, 1, 3)
model = ARIMA(spread_one_year, order=arima_order)
model_fit = model.fit()

fit_summary = model_fit.summary()
print(fit_summary)

In [None]:
import os

from fredapi import Fred

# The FRED API key is read from the FRED_API_KEY environment variable instead of being
# hardcoded, so the secret never lands in the notebook or in version control.
# A free key can be obtained at: https://fred.stlouisfed.org/docs/api/api-key.html
# NOTE(review): a literal key used to be stored in this cell; it should be treated as
# leaked and regenerated.
FRED_API_KEY = os.environ.get('FRED_API_KEY')
if not FRED_API_KEY:
    raise RuntimeError(
        "FRED_API_KEY is not set. Define it in the environment before running this cell "
        "(free key: https://fred.stlouisfed.org/docs/api/api-key.html)."
    )
fred = Fred(api_key=FRED_API_KEY)

# FRED series id -> column label used in the combined table.
series_dict = {
    'FPCPITOTLZGFRA': 'FR_Inflation_CPI',
    'CLVMNACSCAB1GQFR': 'FR_PIB',
    'LRHUTTTTFRM156S': 'FR_Unemployment_Rate',
    'GGGDTAFRA188N': 'FR_Debt_Rate',
    'FPCPITOTLZGDEU': 'DE_Inflation_CPI',
    'CLVMNACSCAB1GQDE': 'DE_PIB_Growth',
    'LRHUTTTTDEM156S': 'DE_Unemployment_Rate',
    'GGGDTADEA188N': 'DE_Debt_Rate',
}

# Labels whose series are converted from levels to growth rates below.
# (The two labels are named inconsistently but are kept as-is because they become CSV column names.)
gdp_labels = ('FR_PIB', 'DE_PIB_Growth')

data_frames = []
for series_id, label in series_dict.items():
    # Best effort: a series that fails to download is reported and skipped.
    try:
        data = fred.get_series(series_id, observation_start='2020-01-01', observation_end='2026-01-08')
        df = pd.DataFrame({label: data})
        if label in gdp_labels:
            # Turn GDP into a period-over-period change: 400 * ln(x_t / x_{t-1}), rounded to 2 decimals.
            # The factor 400 annualises a percentage change assuming quarterly observations — TODO confirm.
            df[label] = (np.log(df[label]/df[label].shift()) * 400).round(2)
        data_frames.append(df)
    except Exception as e:
        print(f"Erreur pour {label}: {e}")

# pd.concat() on an empty list fails with an unhelpful message; fail explicitly instead.
if not data_frames:
    raise ValueError("No FRED series could be downloaded; see the errors printed above.")

# Column-wise concatenation: one column per series, rows aligned on the observation date index.
macro_data = pd.concat(data_frames, axis=1)
print(macro_data.head())

# No further processing is applied; the name is kept for the cells that use it.
macro_processed = macro_data
macro_processed.to_csv('macro_processed.csv')

In [None]:
# Source filters and search queries used when scraping articles from GNews.

# Name fragments of French news outlets; passed to extract_news_fixed as `domain_patterns`.
# NOTE(review): how the helper matches these fragments is not visible here. If it is a
# substring match, short entries such as "obs" or "opinion" may match unrelated domains — confirm.
DOMAIN_PATTERNS_FR = [
    "lemonde", "figaro", "echos", "tribune", "liberation",
    "opinion", "capital", "challenges", "boursorama",
    "parisien", "croix", "express", "obs", "nouvelobs",
    "bfm", "francetvinfo", "franceinfo", "20minutes",
    "lepoint", "marianne", "mediapart", "ladepeche",
    "ouest-france", "sudouest",
]

# Name fragments of German news outlets; passed to extract_news_fixed as `domain_patterns`.
DOMAIN_PATTERNS_DE = [
    "handelsblatt", "faz", "sueddeutsche", "welt",
    "spiegel", "wiwo", "manager-magazin", "capital",
    "boersen-zeitung", "zeit", "tagesschau", "ntv",
    "focus", "stern", "tagesspiegel",
]

# Broad French search terms; passed to extract_news_fixed as `broad_queries`.
BROAD_QUERIES_FR = [
    "économie",
    "finance", 
    "politique",
    "France",
]

# Broad German search terms; passed to extract_news_fixed as `broad_queries`.
BROAD_QUERIES_DE = [
    "wirtschaft",
    "finanzen",
    "politik",
    "Deutschland",
]

# Full French keyword set (previous version, including TPE/PME/energy/etc.).
# Maps a lowercase keyword or phrase to an integer weight; passed to extract_news_fixed
# as `scoring_keywords`.
# NOTE(review): how the weights are combined into `relevance_score` is defined in the helper,
# not here — presumably matched weights are summed; confirm.
SCORING_KEYWORDS_FR = {
    # Monetary policy
    "bce": 5, "banque centrale européenne": 5, "banque centrale": 3,
    "lagarde": 3, "christine lagarde": 3,
    "politique monétaire": 5, "taux directeur": 5, "taux d'intérêt": 4,
    
    # Inflation & prices
    "inflation": 5, "désinflation": 3, "ipc": 3, "hausse des prix": 3,
    "prix": 2, "prix de l'énergie": 3,
    
    # Energy
    "énergie": 2, "électricité": 2, "prix électricité": 3,
    "gaz": 2, "prix du gaz": 3, "pétrole": 2, "prix du pétrole": 3,
    "carburant": 2, "essence": 1,
    
    # Debt & budget
    "dette publique": 5, "dette": 3, "déficit budgétaire": 5, "déficit": 4,
    "budget": 3, "loi de finances": 4, "finances publiques": 4,
    
    # Bonds
    "oat": 6, "obligations": 4, "emprunt d'état": 4, "trésor": 3,
    "spread": 6, "prime de risque": 5, "rendement": 3,
    
    # Credit ratings
    "notation": 4, "moody's": 3, "fitch": 3, "s&p": 3, "dégradation": 3,
    
    # Macro
    "pib": 3, "croissance": 2, "récession": 3, "ralentissement": 2,
    "chômage": 2, "emploi": 2, "salaires": 2, "pouvoir d'achat": 2,
    
    # Businesses
    "tpe": 2, "pme": 2, "petites entreprises": 2, "faillites": 2,
    "investissement": 2, "production industrielle": 2,
    
    # Trade
    "commerce extérieur": 2, "exportations": 2, "importations": 2,
    "tarifs douaniers": 3, "douanes": 2, "protectionnisme": 2,
    "tensions commerciales": 3,
    
    # Politics
    "gouvernement": 2, "assemblée nationale": 2, "élections": 2,
    "réforme": 2, "instabilité politique": 3,
    
    # Markets
    "marchés financiers": 2, "bourse": 2, "cac 40": 2, "banques": 2,
    
    # Crises
    "crise": 2, "tension": 2, "incertitude": 2, "risque": 2,
}

# German keyword set, organised in the same thematic sections as the French one.
# Maps a lowercase keyword or phrase to an integer weight; passed to extract_news_fixed
# as `scoring_keywords`.
SCORING_KEYWORDS_DE = {
    # Monetary policy
    "ezb": 5, "europäische zentralbank": 5, "zentralbank": 3,
    "lagarde": 3, "geldpolitik": 5, "leitzins": 5, "zinsen": 4,
    
    # Inflation
    "inflation": 5, "inflationsrate": 4, "verbraucherpreise": 3,
    "preisanstieg": 3, "preise": 2,
    
    # Energy
    "energie": 2, "energiepreise": 3, "strom": 2, "strompreise": 3,
    "gas": 2, "gaspreise": 3, "öl": 2, "ölpreis": 3,
    "kraftstoff": 2, "benzin": 1,
    
    # Debt & budget
    "staatsschulden": 5, "schulden": 3, "haushaltsdefizit": 5, "defizit": 4,
    "haushalt": 3, "finanzpolitik": 3, "schuldenbremse": 4,
    
    # Bonds
    "bundesanleihe": 6, "bund": 4, "anleihen": 4, "staatsanleihe": 4,
    "spread": 6, "risikoprämie": 5, "rendite": 3,
    
    # Credit ratings
    "rating": 4, "moody's": 3, "fitch": 3, "s&p": 3, "herabstufung": 3,
    
    # Macro
    "bip": 3, "wachstum": 2, "rezession": 3, "konjunktur": 2,
    "arbeitslosigkeit": 2, "beschäftigung": 2, "löhne": 2, "kaufkraft": 2,
    
    # Businesses
    "kmu": 2, "kleine unternehmen": 2, "mittelstand": 2, "insolvenzen": 2,
    "investitionen": 2, "industrieproduktion": 2,
    
    # Trade
    "außenhandel": 2, "exporte": 2, "importe": 2,
    "zölle": 3, "protektionismus": 2, "handelsspannungen": 3,
    
    # Politics
    "bundesregierung": 2, "bundestag": 2, "wahlen": 2,
    "reform": 2, "regierungskrise": 3,
    
    # Markets
    "finanzmärkte": 2, "börse": 2, "dax": 2, "banken": 2,
    
    # Crises
    "krise": 2, "spannung": 2, "unsicherheit": 2, "risiko": 2,
}

# Scrape French articles from GNews, keep the scored ones, and persist them.
# May require a VPN or proxy if GNews blocks requests from your IP. Adjust the date
# range and keywords as needed to get a good number of relevant articles.
articles_fr, _ = extract_news_fixed(
    broad_queries=BROAD_QUERIES_FR,
    scoring_keywords=SCORING_KEYWORDS_FR,
    domain_patterns=DOMAIN_PATTERNS_FR,
    start_date="2020-01-01",
    language="fr",
    country="FR",
    min_score=2,
    period_days=14,
    verbose=True,
)

if articles_fr:
    print("\nTop 15 articles (meilleurs scores):")
    for i, a in enumerate(articles_fr[:15]):
        print(f"\n{i+1}. [Score {a['relevance_score']}] [{a['source']}]")
        print(f"{a['title']}")
        print(f"KW: {', '.join(a['matched_keywords'][:5])}")
else:
    # This branch belongs to the `if`, not to the `for`: as a for/else it ran after every
    # non-empty listing and never when the result was actually empty.
    print("\nToujours 0 articles - vérifier les requêtes GNews")

# One path for both the dump and the reload, so the CSV below always reflects this run.
# (The dump used to go to "articles_fr_final2npart.json" while the reload read
# "articles_fr_final.json", i.e. a file this cell never wrote.)
articles_fr_json = "articles_fr_final.json"
with open(articles_fr_json, "w", encoding="utf-8") as f:
    json.dump(articles_fr, f, ensure_ascii=False, indent=2)

# Reload as a DataFrame; from here on `articles_fr` is a DataFrame, no longer a list.
articles_fr = pd.read_json(articles_fr_json)
# Unparseable 'published' values become NaT instead of raising.
articles_fr['date'] = pd.to_datetime(articles_fr['published'], errors='coerce')
articles_fr.index = articles_fr['date']
articles_fr.sort_index(inplace=True)

# Forward slash so the path also resolves outside Windows.
articles_fr.to_csv("data/articles_fr_final.csv", index=True)


# Scrape German articles from GNews, keep the scored ones, and persist them.
# NOTE(review): unlike the French call, `period_days` is not passed here, so the helper's
# default applies — confirm this is intended.
articles_de, _ = extract_news_fixed(
    broad_queries=BROAD_QUERIES_DE,
    scoring_keywords=SCORING_KEYWORDS_DE,
    domain_patterns=DOMAIN_PATTERNS_DE,
    start_date="2020-01-01",
    language="de",
    country="DE",
    min_score=2,
    verbose=True,
)

if articles_de:
    print("Top 15 articles:")
    for i, a in enumerate(articles_de[:15]):
        print(f"\n{i+1}. [Score {a['relevance_score']}] [{a['source']}]")
        print(f"{a['title']}")
        print(f"KW: {', '.join(a['matched_keywords'][:5])}")

# One path for both the dump and the reload, so the two cannot drift apart.
articles_de_json = "articles_de_final.json"
with open(articles_de_json, "w", encoding="utf-8") as f:
    json.dump(articles_de, f, ensure_ascii=False, indent=2)

# Reload as a DataFrame; from here on `articles_de` is a DataFrame, no longer a list.
articles_de = pd.read_json(articles_de_json)
# Unparseable 'published' values become NaT instead of raising.
articles_de['date'] = pd.to_datetime(articles_de['published'], errors='coerce')
articles_de.index = articles_de['date']
articles_de.sort_index(inplace=True)

# Forward slash so the path also resolves outside Windows (was "data\\...").
articles_de.to_csv("data/articles_de_final.csv", index=True)
