In [1]:
import yfinance as yf
from newsapi import NewsApiClient
import pandas as pd
import datetime
import time
from tickers_metadata import tickers_metadata

# API NewsAPI
NEWS_API_KEY = 'e67a21b3ecc14ee395ea4256670b8af7'
newsapi = NewsApiClient(api_key=NEWS_API_KEY)

# -------------------- STOCK DATA --------------------

def download_stock_data(ticker, start_date="2020-01-01", end_date=None):
    if end_date is None:
        end_date = datetime.date.today().strftime('%Y-%m-%d')
    df = yf.download(ticker, start=start_date, end=end_date)
    df.reset_index(inplace=True)
    df["Ticker"] = ticker
    return df

# -------------------- NEWS DATA --------------------

def fetch_news(company_name, ticker, from_date, to_date, page_size=100):
    all_articles = []
    try:
        response = newsapi.get_everything(
            q=company_name,
            from_param=from_date,
            to=to_date,
            language='en',
            sort_by='relevancy',
            page=1,
            page_size=page_size
        )
        articles = response['articles']
        for a in articles:
            all_articles.append({
                'ticker': ticker,
                'company': company_name,
                'title': a['title'],
                'description': a['description'],
                'publishedAt': a['publishedAt'],
                'source': a['source']['name'],
                'url': a['url']
            })
        time.sleep(1)
    except Exception as e:
        print(f"[{company_name}] Erreur news : {e}")
    return pd.DataFrame(all_articles)

# -------------------- MAIN --------------------

if __name__ == "__main__":
    today = datetime.date.today()
    last_week = today - datetime.timedelta(days=7)

    all_stock_data = []
    all_news_data = []

    for entry in tickers_metadata:
        ticker = entry["ticker"]
        name = entry["name"]

        print(f"🔁 {ticker} - {name}")

        # Stock data
        stock_df = download_stock_data(ticker)
        stock_df["Company"] = name
        all_stock_data.append(stock_df)

        # News data
        news_df = fetch_news(name, ticker, last_week.strftime('%Y-%m-%d'), today.strftime('%Y-%m-%d'))
        all_news_data.append(news_df)

    df_stocks = pd.concat(all_stock_data, ignore_index=True)
    df_news = pd.concat(all_news_data, ignore_index=True)

    df_stocks.to_csv("data/stock_data.csv", index=False)
    df_news.to_csv("data/news_data.csv", index=False)

    print("✅ Données stockées dans /data/")


ModuleNotFoundError: No module named 'yfinance'

In [10]:
import pandas as pd

# Charger sans entête pour reconstituer manuellement
df_raw = pd.read_csv("data/stock_data.csv", header=None)

# La première ligne : types de colonnes ("Date", "Close", etc.)
col_types = df_raw.iloc[0]
# La deuxième ligne : tickers associés ("AAPL", "AAPL", ...)
tickers = df_raw.iloc[1]

# Fusionner les deux en noms de colonnes uniques
multi_index = [f"{c}_{t}" if pd.notna(t) else c for c, t in zip(col_types, tickers)]

# Appliquer en tant que noms de colonnes
df_raw.columns = multi_index

# Supprimer les deux premières lignes
df_data = df_raw.iloc[2:].reset_index(drop=True)

# Extraire la colonne Date
# Cherche dynamiquement la colonne contenant "Date"
date_col = [col for col in df_data.columns if "Date" in col][0]
df_data["Date"] = pd.to_datetime(df_data[date_col])

# Initialisation liste pour empiler les données correctement
all_stocks = []

# Identifier tous les tickers présents (hors colonnes "Date")
unique_tickers = tickers.dropna().unique()

for ticker in unique_tickers:
    temp_df = pd.DataFrame({
        "Date": df_data["Date"],
        "Ticker": ticker,
        "Open": pd.to_numeric(df_data.get(f"Open_{ticker}"), errors="coerce"),
        "High": pd.to_numeric(df_data.get(f"High_{ticker}"), errors="coerce"),
        "Low": pd.to_numeric(df_data.get(f"Low_{ticker}"), errors="coerce"),
        "Close": pd.to_numeric(df_data.get(f"Close_{ticker}"), errors="coerce"),
        "Volume": pd.to_numeric(df_data.get(f"Volume_{ticker}"), errors="coerce"),
    })
    all_stocks.append(temp_df)

# Fusion propre (forme longue = tidy format)
df_final = pd.concat(all_stocks, ignore_index=True)

# Optionnel : supprime les lignes vides
df_final.dropna(subset=["Close"], inplace=True)

# Enregistrement final
df_final.to_csv("data/stock_data_clean.csv", index=False)
print("✅ Données stockées dans stock_data_clean.csv")


  df_raw = pd.read_csv("data/stock_data.csv", header=None)


✅ Données stockées dans stock_data_clean.csv


In [11]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# ========== 1. CHARGEMENT ==========

df_stock = pd.read_csv("data/stock_data_clean.csv")
df_news = pd.read_csv("data/news_data.csv")

# ========== 2. NETTOYAGE STOCKS ==========

# Forcer les types numériques
df_stock["Close"] = pd.to_numeric(df_stock["Close"], errors="coerce")
df_stock["Date"] = pd.to_datetime(df_stock["Date"])

# Calcul variation J+1 (%)
df_stock = df_stock.sort_values(by=["Ticker", "Date"])
df_stock["next_close"] = df_stock.groupby("Ticker")["Close"].shift(-1)
df_stock["variation_pct"] = (df_stock["next_close"] - df_stock["Close"]) / df_stock["Close"]

# ========== 3. NETTOYAGE NEWS ==========

df_news["date"] = pd.to_datetime(df_news["publishedAt"]).dt.date
df_news["date"] = pd.to_datetime(df_news["date"])  # Uniformiser avec df_stock

# ========== 4. ANALYSE DE SENTIMENT ==========

analyzer = SentimentIntensityAnalyzer()
df_news["sentiment"] = df_news["title"].astype(str).apply(lambda x: analyzer.polarity_scores(x)["compound"])

# ========== 5. AGRÉGATION JOURNALIÈRE DU SENTIMENT ==========

df_sentiment = df_news.groupby(["date", "ticker"])["sentiment"].mean().reset_index()
df_sentiment.rename(columns={"date": "Date", "ticker": "Ticker"}, inplace=True)

# ========== 6. FUSION FINALE ==========

df_final = pd.merge(df_sentiment, df_stock, on=["Date", "Ticker"], how="inner")

# Optionnel : garder que les colonnes utiles
df_final = df_final[["Date", "Ticker", "sentiment", "variation_pct", "Open", "Close", "High", "Low", "Volume"]]

# Supprimer les lignes avec valeurs manquantes
df_final.dropna(subset=["sentiment", "variation_pct"], inplace=True)

# ========== 7. ENREGISTREMENT ==========
df_final.to_csv("data/final_dataset.csv", index=False)
print("✅ Dataset prêt : data/final_dataset.csv")


✅ Dataset prêt : data/final_dataset.csv


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# 1. Charger le dataset
df = pd.read_csv("data/final_dataset.csv")

# 2. Créer la variable cible binaire
df["target"] = (df["variation_pct"] > 0).astype(int)

# 3. Sélection des features
features = ["sentiment", "Open", "High", "Low", "Close", "Volume"]
X = df[features]
y = df["target"]

# 4. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 5. Modèles à tester
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

# 6. Entraînement et évaluation
for name, model in models.items():
    print(f"\n📊 {name}")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"Accuracy : {acc:.4f}")
    print(classification_report(y_test, preds))



📊 Random Forest
Accuracy : 0.8148
              precision    recall  f1-score   support

           0       0.85      0.96      0.90        23
           1       0.00      0.00      0.00         4

    accuracy                           0.81        27
   macro avg       0.42      0.48      0.45        27
weighted avg       0.72      0.81      0.76        27


📊 Logistic Regression
Accuracy : 0.8519
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        23
           1       0.00      0.00      0.00         4

    accuracy                           0.85        27
   macro avg       0.43      0.50      0.46        27
weighted avg       0.73      0.85      0.78        27



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
import yfinance as yf
import streamlit as st
from newsapi import NewsApiClient
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import datetime
import os
import pandas as pd
import joblib
from train_model import models, features  # réutilise les features du training
from tickers_metadata import tickers_metadata
import matplotlib.pyplot as plt
import pydeck as pdk

# NewsAPI config
NEWS_API_KEY = "e67a21b3ecc14ee395ea4256670b8af7"
newsapi = NewsApiClient(api_key=NEWS_API_KEY)
analyzer = SentimentIntensityAnalyzer()

def enrich_and_update_tickers(tickers_to_add,
                               dataset_path="data/final_dataset.csv",
                               stock_base_path="data/stock_data_clean.csv",
                               news_base_path="data/news_data.csv"):
    if not tickers_to_add:
        return None

    # Chargement des bases existantes
    try:
        df_final_existing = pd.read_csv(dataset_path)
        df_final_existing["Date"] = pd.to_datetime(df_final_existing["Date"])
    except FileNotFoundError:
        df_final_existing = pd.DataFrame()

    try:
        df_news_existing = pd.read_csv(news_base_path)
    except FileNotFoundError:
        df_news_existing = pd.DataFrame()

    all_final = []
    all_news = []

    for ticker in tickers_to_add:
        try:
            st.info(f"\U0001F4E1 Téléchargement des données pour {ticker}...")

            # === 1. Données boursières ===
            stock_df = yf.download(ticker, period="2mo")
            if stock_df.empty:
                st.error(f"❌ Pas de données boursières pour {ticker}")
                continue

            stock_df.reset_index(inplace=True)
            stock_df = stock_df[["Date", "Open", "High", "Low", "Close", "Volume"]]
            stock_df["Date"] = pd.to_datetime(stock_df["Date"])
            stock_df["Ticker"] = ticker
            stock_df["Close"] = pd.to_numeric(stock_df["Close"], errors="coerce")
            stock_df.dropna(subset=["Close"], inplace=True)

            # === 2. News & Sentiment ===
            today = datetime.date.today()
            last_week = today - datetime.timedelta(days=7)
            news = newsapi.get_everything(
                q=ticker,
                from_param=last_week.isoformat(),
                to=today.isoformat(),
                language="en",
                sort_by="relevancy",
                page=1,
                page_size=10,
            )

            news_df = pd.DataFrame([{
                "ticker": ticker,
                "title": a["title"],
                "publishedAt": a["publishedAt"],
                "source": a["source"]["name"],
                "url": a["url"],
                "sentiment": analyzer.polarity_scores(a["title"])['compound']
            } for a in news["articles"]])

            if news_df.empty or "sentiment" not in news_df.columns:
                st.warning(f"⚠️ Aucune news ou pas de sentiment pour {ticker}")
                continue

            news_df["sentiment"] = pd.to_numeric(news_df["sentiment"], errors="coerce")
            news_df.dropna(subset=["sentiment"], inplace=True)
            if news_df.empty:
                st.warning(f"⚠️ Toutes les valeurs de sentiment sont nulles pour {ticker}")
                continue

            news_df["date"] = pd.to_datetime(news_df["publishedAt"]).dt.date
            news_df["date"] = pd.to_datetime(news_df["date"])
            all_news.append(news_df)

            # === 3. Moyenne journalière du sentiment ===
            df_sentiment = news_df.groupby("date")["sentiment"].mean().reset_index()
            df_sentiment.rename(columns={"date": "Date"}, inplace=True)
            df_sentiment["Ticker"] = ticker

            # === 4. Merge avec les données de marché ===
            df_stock = stock_df.copy()
            df_stock["Date"] = pd.to_datetime(df_stock["Date"])
            df_stock = df_stock.sort_values(by=["Ticker", "Date"])
            df_stock["next_close"] = df_stock.groupby("Ticker")["Close"].shift(-1)
            df_stock["variation_pct"] = (df_stock["next_close"] - df_stock["Close"]) / df_stock["Close"]

            df_merged = pd.merge(df_sentiment, df_stock, on=["Date", "Ticker"], how="inner")
            df_merged = df_merged[["Date", "Ticker", "sentiment", "variation_pct", "Open", "Close", "High", "Low", "Volume"]]
            df_merged.dropna(subset=["sentiment", "variation_pct"], inplace=True)

            if df_merged.empty:
                st.warning(f"⚠️ Aucune ligne finale pour {ticker} après fusion.")
                continue

            all_final.append(df_merged)

        except Exception as e:
            st.error(f"❌ Erreur pour {ticker} : {e}")

    if all_final:
        df_new_final = pd.concat(all_final, ignore_index=True)
        df_updated = pd.concat([df_final_existing, df_new_final], ignore_index=True)
        df_updated.to_csv(dataset_path, index=False)
        st.success(f"✅ Données ajoutées à {dataset_path}.")
    else:
        df_new_final = None

    if all_news:
        df_new_news = pd.concat(all_news, ignore_index=True)
        df_news_combined = pd.concat([df_news_existing, df_new_news], ignore_index=True)
        df_news_combined.to_csv(news_base_path, index=False)
        st.success(f"🗞️ News ajoutées à {news_base_path}.")

    return df_new_final

enrich_and_update_tickers("COST")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [33]:
ticker = "COST"
stock_df = yf.download(ticker, period="2mo")
if stock_df.empty:
    st.error(f"❌ Pas de données boursières pour {ticker}")

stock_df.reset_index(inplace=True)
expected_cols = ["Date", "Open", "High", "Low", "Close", "Volume"]
missing = [col for col in expected_cols if col not in stock_df.columns]

if missing:
    st.error(f"❌ Données incomplètes pour {ticker} – colonnes manquantes : {missing}")
stock_df = stock_df[["Date", "Open", "High", "Low", "Close", "Volume"]]
stock_df["Date"] = pd.to_datetime(stock_df["Date"])
stock_df["Ticker"] = ticker
stock_df.columns
stock_df.dropna(subset=["Close"], inplace=True)

[*********************100%***********************]  1 of 1 completed


MultiIndex([(  'Date',     ''),
            (  'Open', 'COST'),
            (  'High', 'COST'),
            (   'Low', 'COST'),
            ( 'Close', 'COST'),
            ('Volume', 'COST'),
            ('Ticker',     '')],
           names=['Price', 'Ticker'])

In [39]:
yf.download("XLV", period="2mo")

[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,XLV,XLV,XLV,XLV,XLV
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-02-10,145.73616,146.084802,144.899401,146.035007,7374300
2025-02-11,145.467194,145.756084,144.879471,145.317778,6634900
2025-02-12,145.258011,146.044959,144.809733,145.118536,7572200
2025-02-13,145.825806,146.14458,144.60055,145.546886,6293100
2025-02-14,144.241943,146.134615,144.241943,145.935389,5299200
2025-02-18,143.963013,144.331582,143.285626,143.365319,11366800
2025-02-19,145.83577,145.875624,143.943098,143.972981,6295900
2025-02-20,146.652603,146.732297,145.536925,145.626574,6264800
2025-02-21,145.865646,146.572917,144.779851,144.919311,8647000
2025-02-24,147.051056,147.64875,145.706268,146.025027,7559400


In [40]:
from datetime import datetime, timedelta

In [51]:
ticker = "SAN.PA"

In [53]:
start_stock = (datetime.today() - timedelta(days=75)).date()
start_news = (datetime.today() - timedelta(days=30)).date()
end_date = datetime.today().date()
df_stock = yf.download(ticker, start=start_stock.isoformat(), end=end_date.isoformat(), auto_adjust=False)
df_stock.reset_index(inplace=True)
if isinstance(df_stock.columns, pd.MultiIndex):
        df_stock.columns = df_stock.columns.get_level_values(0)
df_stock

[*********************100%***********************]  1 of 1 completed


Price,Date,Adj Close,Close,High,Low,Open,Volume
0,2025-01-27,101.339996,101.339996,102.480003,100.120003,100.620003,1881624
1,2025-01-28,101.720001,101.720001,103.239998,100.900002,101.279999,1614837
2,2025-01-29,101.860001,101.860001,102.720001,101.260002,101.400002,1657302
3,2025-01-30,103.620003,103.620003,103.800003,101.739998,102.099998,2387213
4,2025-01-31,104.400002,104.400002,104.720001,103.400002,103.860001,2389467
5,2025-02-03,104.459999,104.459999,104.800003,103.620003,104.0,1804169
6,2025-02-04,102.68,102.68,103.879997,102.360001,102.839996,1602394
7,2025-02-05,103.599998,103.599998,103.879997,102.040001,102.879997,1517578
8,2025-02-06,103.720001,103.720001,104.459999,103.459999,103.800003,1739445
9,2025-02-07,103.360001,103.360001,103.699997,100.400002,100.739998,2066340


In [54]:
df_stock = df_stock.copy()
df_stock.reset_index(inplace=True)
df_stock["Ticker"] = ticker

# Ne garde que les colonnes utiles
df_stock = df_stock[["Date", "Ticker", "Open", "High", "Low", "Close", "Volume"]]
df_stock.sort_values("Date", inplace=True)
df_stock["next_close"] = df_stock["Close"].shift(-1)
df_stock["variation_pct"] = (df_stock["next_close"] - df_stock["Close"]) / df_stock["Close"]
df_stock

Price,Date,Ticker,Open,High,Low,Close,Volume,next_close,variation_pct
0,2025-01-27,SAN.PA,100.620003,102.480003,100.120003,101.339996,1881624,101.720001,0.00375
1,2025-01-28,SAN.PA,101.279999,103.239998,100.900002,101.720001,1614837,101.860001,0.001376
2,2025-01-29,SAN.PA,101.400002,102.720001,101.260002,101.860001,1657302,103.620003,0.017279
3,2025-01-30,SAN.PA,102.099998,103.800003,101.739998,103.620003,2387213,104.400002,0.007527
4,2025-01-31,SAN.PA,103.860001,104.720001,103.400002,104.400002,2389467,104.459999,0.000575
5,2025-02-03,SAN.PA,104.0,104.800003,103.620003,104.459999,1804169,102.68,-0.01704
6,2025-02-04,SAN.PA,102.839996,103.879997,102.360001,102.68,1602394,103.599998,0.00896
7,2025-02-05,SAN.PA,102.879997,103.879997,102.040001,103.599998,1517578,103.720001,0.001158
8,2025-02-06,SAN.PA,103.800003,104.459999,103.459999,103.720001,1739445,103.360001,-0.003471
9,2025-02-07,SAN.PA,100.739998,103.699997,100.400002,103.360001,2066340,103.800003,0.004257


In [55]:
all_news = []
all_final = []

news = newsapi.get_everything(
        q=ticker,
        from_param=start_news.isoformat(),
        to=end_date.isoformat(),
        language="en",
        sort_by="relevancy",
        page_size=50
    )

df_n = pd.DataFrame([{
    "ticker": ticker,
    "title": a["title"],
    "publishedAt": a["publishedAt"],
    "source": a["source"]["name"],
    "url": a["url"],
    "sentiment": analyzer.polarity_scores(a["title"])["compound"]
} for a in news["articles"]])
if df_n.empty:
    print(f"⚠️ Aucune news pour {ticker}")
df_n["date"] = pd.to_datetime(df_n["publishedAt"]).dt.date
df_n["date"] = pd.to_datetime(df_n["date"])
all_news.append(df_n)
df_sentiment = df_n.groupby("date")["sentiment"].mean().reset_index()
df_sentiment.rename(columns={"date": "Date"}, inplace=True)
df_sentiment["Ticker"] = ticker
# Fusion
df_final = pd.merge(df_sentiment, df_stock, on=["Date", "Ticker"], how="outer")
df_final = df_final[["Date", "Ticker", "sentiment", "variation_pct", "Open", "Close", "High", "Low", "Volume"]]
#df_final.dropna(subset=["sentiment", "variation_pct"], inplace=True)
if len(df_final) < 30:
    print(f"⚠️ Moins de 30 jours valides pour {ticker} ({len(df_final)} lignes)")
all_final.append(df_final)
print(f"✅ {ticker} → {len(df_final)} lignes prêtes.")

df_final

⚠️ Aucune news pour SAN.PA


KeyError: 'publishedAt'