In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("../sp500_ohlcv_2005_2025_2.csv")

In [3]:

# Suppose df a les colonnes : Date, Ticker, Open, High, Low, Close, Volume
df = df.sort_values(["Ticker", "Date"])

# --- 1) Log-return
df["log_return"] = np.log(df["Close"]) - np.log(df["Close"].shift(1))

# --- 2) Target direction (r_{t+1} > 0)
df["target"] = (df.groupby("Ticker")["log_return"].shift(-1) > 0).astype(int)

# --- 3) Features momentum
df["mom_5"]  = df.groupby("Ticker")["Close"].transform(lambda x: x / x.shift(5) - 1)
df["mom_21"] = df.groupby("Ticker")["Close"].transform(lambda x: x / x.shift(21) - 1)

# --- 4) Features volatilité
df["vol_5"]  = df.groupby("Ticker")["log_return"].transform(lambda x: x.rolling(5).std())
df["vol_21"] = df.groupby("Ticker")["log_return"].transform(lambda x: x.rolling(21).std())

# --- 5) High–Low range
df["range"] = (df["High"] - df["Low"]) / df["Open"]

# --- 6) Volume z-score
df["volume_z"] = df.groupby("Ticker")["Volume"].transform(
    lambda x: (x - x.mean()) / x.std()
)

df['Return'] = df.groupby('Ticker')['Close'].pct_change(fill_method=None)

# --- 7) Clean
df = df.dropna()


In [4]:
import requests

url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(url, headers=headers)
resp.raise_for_status()  # lève une erreur si 4xx/5xx

tables = pd.read_html(resp.text, header=0)
sp500 = tables[0]

sp500 = sp500.rename(columns={"Symbol": "Ticker", "GICS Sector": "Sector"})
sp500['Ticker'] = sp500['Ticker'].str.replace('.', '-', regex=False)

df = df.sort_values(by=['Ticker', 'Date'])

df = df.merge(
    sp500[['Ticker', 'Sector']], 
    on='Ticker', 
    how='left'
)

df.tail()

  tables = pd.read_html(resp.text, header=0)


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,log_return,target,mom_5,mom_21,vol_5,vol_21,range,volume_z,Return,Sector
2278836,2024-12-24,ZTS,161.553123,162.875546,160.605722,162.540009,1023600.0,0.002553,1,-0.035658,-0.069281,0.013838,0.011887,0.01405,-0.714869,0.002557,Health Care
2278837,2024-12-26,ZTS,161.572877,163.615722,160.88206,163.349274,2167200.0,0.004967,0,-0.008921,-0.073807,0.01159,0.01166,0.016919,-0.25648,0.004979,Health Care
2278838,2024-12-27,ZTS,162.786715,164.345996,161.375477,162.441315,1800100.0,-0.005574,0,0.006605,-0.063176,0.005697,0.011267,0.018248,-0.403625,-0.005558,Health Care
2278839,2024-12-30,ZTS,161.740613,161.898519,159.332611,160.112259,1531400.0,-0.014442,1,-0.015773,-0.082041,0.007613,0.011328,0.015864,-0.511328,-0.014338,Health Care
2278840,2024-12-31,ZTS,160.763608,161.602466,159.747117,160.793213,1327400.0,0.004244,0,-0.008218,-0.0703,0.008295,0.011421,0.011541,-0.593097,0.004253,Health Care


la variable Sector est catégorielle par conséquent on va l'encoder par par la volatilité moyenne du secteur. Ce choix est motivé par la raison suivante. On veut faire comprendre au modèle que certains secteurs sont très volatils et très dépendants de chocs extérieur ainsi pour ces secteurs la, l'évolution futur de l'action est moins évidente.

In [5]:
# 1) Calcul de volatilité moyenne par secteur
sector_vol_mean = df.groupby("Sector")["vol_21"].mean()

# 2) Encodage de Sector par cette moyenne
df["Sector_encoded"] = df["Sector"].map(sector_vol_mean)


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report



def regression(df, features):

    # --- 8) Train/val/test split
    train = df[df["Date"] < "2018-01-01"]
    val   = df[(df["Date"] >= "2018-01-01") & (df["Date"] < "2021-01-01")]
    test  = df[df["Date"] >= "2021-01-01"].copy()   # copy important

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)


    # --- Autres métriques globales
    print("Accuracy :", accuracy_score(y_test, y_pred))

    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    return  model, sector_f1


In [7]:
regression(df, ["mom_5", "mom_21", "vol_5", "vol_21", "range", "volume_z"])


Accuracy : 0.5177679134039983


(LogisticRegression(max_iter=1000),
 Sector
 Financials                0.524993
 Utilities                 0.521979
 Industrials               0.521291
 Real Estate               0.520815
 Energy                    0.518289
 Information Technology    0.516843
 Consumer Staples          0.516335
 Consumer Discretionary    0.515701
 Health Care               0.511678
 Communication Services    0.509799
 Materials                 0.506109
 dtype: float64)

De premier abord, la regression parait ne pas être suffisante pour prédire le signe du rendement

In [8]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt

def training(df, features):

    # --- 8) Train/val/test split
    train = df[df["Date"] < "2018-01-01"]
    val   = df[(df["Date"] >= "2018-01-01") & (df["Date"] < "2021-01-01")]
    test  = df[df["Date"] >= "2021-01-01"].copy()   # copy important

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    # --- 9) Modèle XGBoost
    model = xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.03,
        n_estimators=800,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)


    # --- Autres métriques globales
    print("Accuracy :", accuracy_score(y_test, y_pred))
    # print("F1        :", f1_score(y_test, y_pred))
    # print("AUC       :", roc_auc_score(y_test, y_pred_proba))
    # print()
    # print(confusion_matrix(y_test, y_pred))
    # print()
    # print(classification_report(y_test, y_pred))

     # Ajout des prédictions dans test
    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    return  sector_f1


In [9]:
training(df, ["Close", "mom_5", "mom_21", "vol_5", "vol_21", "range", "volume_z"])

Accuracy : 0.5128854497365115


Sector
Financials                0.518164
Utilities                 0.517800
Real Estate               0.516996
Information Technology    0.515275
Materials                 0.515144
Industrials               0.512890
Energy                    0.512804
Consumer Staples          0.511715
Communication Services    0.507419
Health Care               0.506284
Consumer Discretionary    0.506053
dtype: float64

In [10]:
training(df, ["mom_5", "mom_21", "vol_5", "vol_21", "range", "volume_z"])

Accuracy : 0.5140293068226058


Sector
Financials                0.518523
Utilities                 0.517638
Information Technology    0.517502
Real Estate               0.516803
Industrials               0.514599
Materials                 0.514428
Energy                    0.512079
Consumer Staples          0.511988
Consumer Discretionary    0.509810
Health Care               0.508049
Communication Services    0.507592
dtype: float64

In [11]:
training(df, ["mom_5", "vol_5", "vol_21", "range", "volume_z"])


Accuracy : 0.515229353379596


Sector
Financials                0.521412
Real Estate               0.519275
Utilities                 0.518707
Industrials               0.516551
Information Technology    0.516432
Energy                    0.515660
Consumer Discretionary    0.514949
Materials                 0.514866
Consumer Staples          0.512754
Health Care               0.506539
Communication Services    0.503699
dtype: float64

In [12]:
training(df, ["mom_5", "vol_5", "range", "volume_z"])

Accuracy : 0.5158634916939921


Sector
Financials                0.521732
Utilities                 0.521104
Real Estate               0.518729
Industrials               0.518517
Materials                 0.517851
Information Technology    0.517502
Energy                    0.515796
Consumer Discretionary    0.512613
Consumer Staples          0.512262
Health Care               0.507862
Communication Services    0.503093
dtype: float64

In [13]:
training(df, ["mom_5", "vol_5", "range", "volume_z", "Return"])


Accuracy : 0.512750996359725


Sector
Materials                 0.518090
Financials                0.516460
Real Estate               0.516290
Utilities                 0.515306
Energy                    0.514572
Industrials               0.514380
Information Technology    0.511583
Consumer Staples          0.510895
Consumer Discretionary    0.509099
Communication Services    0.508285
Health Care               0.507014
dtype: float64

In [14]:
training(df, ["mom_5", "vol_5", "range", "volume_z", "Sector_encoded"])

Accuracy : 0.5153658135232002


Sector
Energy                    0.521325
Utilities                 0.520327
Real Estate               0.519178
Materials                 0.519085
Information Technology    0.518953
Industrials               0.518285
Financials                0.515781
Consumer Staples          0.513383
Consumer Discretionary    0.510013
Health Care               0.507302
Communication Services    0.507203
dtype: float64

In [15]:
training(df, ["mom_5", "vol_5", "volume_z"])

Accuracy : 0.5152072789446012


Sector
Financials                0.521306
Real Estate               0.518633
Industrials               0.518452
Utilities                 0.516925
Materials                 0.516697
Information Technology    0.516579
Consumer Discretionary    0.513304
Energy                    0.512714
Consumer Staples          0.510950
Health Care               0.507506
Communication Services    0.504694
dtype: float64

In [16]:
training(df, ["Open", "Close", "High", "Low", "Volume", "mom_5", "mom_21", "vol_5", "vol_21", "range", "volume_z"])

Accuracy : 0.5122653587898394


Sector
Utilities                 0.517508
Real Estate               0.517381
Materials                 0.516577
Financials                0.516287
Information Technology    0.514762
Industrials               0.512890
Consumer Staples          0.511770
Energy                    0.510629
Consumer Discretionary    0.506723
Health Care               0.505080
Communication Services    0.503612
dtype: float64

In [17]:
training(df, ["mom_5", "vol_5", "range", "volume_z"])

Accuracy : 0.5158634916939921


Sector
Financials                0.521732
Utilities                 0.521104
Real Estate               0.518729
Industrials               0.518517
Materials                 0.517851
Information Technology    0.517502
Energy                    0.515796
Consumer Discretionary    0.512613
Consumer Staples          0.512262
Health Care               0.507862
Communication Services    0.503093
dtype: float64

In [18]:
training(df, ["mom_5", "vol_5", "range"])

Accuracy : 0.5166922863897061


Sector
Financials                0.521199
Industrials               0.521163
Utilities                 0.520618
Real Estate               0.517926
Energy                    0.517699
Materials                 0.517134
Information Technology    0.516887
Consumer Staples          0.513875
Consumer Discretionary    0.513690
Health Care               0.509507
Communication Services    0.507246
dtype: float64

In [19]:
training(df, ["mom_5", "vol_5", "range"])

Accuracy : 0.5166922863897061


Sector
Financials                0.521199
Industrials               0.521163
Utilities                 0.520618
Real Estate               0.517926
Energy                    0.517699
Materials                 0.517134
Information Technology    0.516887
Consumer Staples          0.513875
Consumer Discretionary    0.513690
Health Care               0.509507
Communication Services    0.507246
dtype: float64

meme en ajoutant des variable pertinentes, l'accuracy reste sensiblement la meme peu importe le secteur. L'accuracy max réalisé est 0.5167. Peut etre qu'étant donné le fait que les années d'entrainement sont très anciennes par rapport aux années de test le modèle est mal adapté aux nouvelles années

Changeons de méthode d'entrainement : éparpillons des années de test et de validation dans tous le dataset

# Ajout de la variable secteur

In [20]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt

def training2(df, features):

    # --- 8) Train/val/test split
    mask_train = (
        df["Date"].between("2004-01-01", "2008-01-01") |
        df["Date"].between("2010-01-01", "2013-01-01") |
        df["Date"].between("2015-01-01", "2018-01-01") |
        df["Date"].between("2020-01-01", "2023-01-01")
    )

    mask_val = (
        df["Date"].between("2008-01-01", "2009-01-01") |
        df["Date"].between("2013-01-01", "2014-01-01") |
        df["Date"].between("2018-01-01", "2019-01-01") |
        df["Date"].between("2023-01-01", "2024-01-01")
    )

    mask_test = (
        df["Date"].between("2009-01-01", "2010-01-01") |
        df["Date"].between("2014-01-01", "2015-01-01") |
        df["Date"].between("2019-01-01", "2020-01-01") |
        df["Date"].between("2024-01-01", "2025-01-01")
    )

    train = df[mask_train]
    val   = df[mask_val]
    test  = df[mask_test].copy()   # copy pour pouvoir ajouter des colonnes

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    # --- 9) Modèle XGBoost
    model = xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.03,
        n_estimators=800,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Ajout des prédictions dans test
    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    print("Accuracy :", accuracy_score(y_test, y_pred))

    return  sector_f1
    

In [21]:
training2(df, ["mom_21", "vol_21", "range", "volume_z", "Sector_encoded"])


Accuracy : 0.5284668043771447


Sector
Real Estate               0.541104
Utilities                 0.539249
Materials                 0.536179
Financials                0.534383
Industrials               0.530297
Information Technology    0.528648
Consumer Staples          0.525105
Health Care               0.521544
Consumer Discretionary    0.518321
Communication Services    0.517416
Energy                    0.516766
dtype: float64

In [22]:
training2(df, ["mom_5", "vol_5", "volume_z", "Sector_encoded"])


Accuracy : 0.527175768646397


Sector
Utilities                 0.539649
Real Estate               0.539054
Materials                 0.534860
Information Technology    0.529625
Financials                0.528245
Industrials               0.525970
Consumer Staples          0.525388
Health Care               0.523426
Communication Services    0.521362
Consumer Discretionary    0.519351
Energy                    0.512748
dtype: float64

In [23]:
training2(df, ["mom_5", "vol_5", "volume_z"])


Accuracy : 0.5275748160540826


Sector
Financials                0.537192
Utilities                 0.536315
Real Estate               0.534094
Materials                 0.532476
Information Technology    0.528355
Industrials               0.525500
Consumer Staples          0.524963
Health Care               0.522851
Communication Services    0.522044
Consumer Discretionary    0.520038
Energy                    0.511260
dtype: float64

In [24]:
training2(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"])


Accuracy : 0.5293417211533537


Sector
Real Estate               0.539187
Utilities                 0.538949
Financials                0.537290
Materials                 0.534264
Consumer Staples          0.528420
Information Technology    0.528274
Industrials               0.527339
Health Care               0.525448
Communication Services    0.524870
Consumer Discretionary    0.521432
Energy                    0.512450
dtype: float64

la meilleur accuracy est meilleure que celle obtenue précédemment 0.5293 mais reste cependant faible. 

Essayons de préciser nos modèles en les concentrant sur un secteur. Peut être que les modèles précédent essayaient de trop généraliser ce qui les rendait imprécis.

In [25]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, confusion_matrix, classification_report
import xgboost as xgb
import matplotlib.pyplot as plt

def training3(df, features, sector):

    # --- 8) Train/val/test split
    mask_train = (
        (df["Sector"] == sector) &
        (df["Date"].between("2004-01-01", "2008-01-01") |
        df["Date"].between("2010-01-01", "2013-01-01") |
        df["Date"].between("2015-01-01", "2018-01-01") |
        df["Date"].between("2020-01-01", "2023-01-01"))
    )

    mask_val = (
        (df["Sector"] == sector) &
        (df["Date"].between("2008-01-01", "2009-01-01") |
        df["Date"].between("2013-01-01", "2014-01-01") |
        df["Date"].between("2018-01-01", "2019-01-01") |
        df["Date"].between("2023-01-01", "2024-01-01"))
    )

    mask_test = (       
        (df["Sector"] == sector) &
        (df["Date"].between("2009-01-01", "2010-01-01") |
        df["Date"].between("2014-01-01", "2015-01-01") |
        df["Date"].between("2019-01-01", "2020-01-01") |
        df["Date"].between("2024-01-01", "2025-01-01"))
    )

    train = df[mask_train]
    val   = df[mask_val]
    test  = df[mask_test].copy()   # copy pour pouvoir ajouter des colonnes

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    # --- 9) Modèle XGBoost
    model = xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.03,
        n_estimators=800,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Ajout des prédictions dans test
    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    print("Accuracy :", accuracy_score(y_test, y_pred))

    return  sector_f1
    

In [26]:
training3(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "Financials")

Accuracy : 0.5232871709879491


Sector
Financials    0.523287
dtype: float64

In [27]:
training3(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "Utilities")

Accuracy : 0.5264772575696945


Sector
Utilities    0.526477
dtype: float64

In [28]:
training3(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "Information Technology")

Accuracy : 0.5222149837133551


Sector
Information Technology    0.522215
dtype: float64

In [29]:
def training4(df, features, ticker):

    # --- 8) Train/val/test split
    mask_train = (
        (df["Ticker"] == ticker) &
        (df["Date"].between("2004-01-01", "2008-01-01") |
        df["Date"].between("2010-01-01", "2013-01-01") |
        df["Date"].between("2015-01-01", "2018-01-01") |
        df["Date"].between("2020-01-01", "2023-01-01"))
    )

    mask_val = (
        (df["Ticker"] == ticker) &
        (df["Date"].between("2008-01-01", "2009-01-01") |
        df["Date"].between("2013-01-01", "2014-01-01") |
        df["Date"].between("2018-01-01", "2019-01-01") |
        df["Date"].between("2023-01-01", "2024-01-01"))
    )

    mask_test = (       
        (df["Ticker"] == ticker) &
        (df["Date"].between("2009-01-01", "2010-01-01") |
        df["Date"].between("2014-01-01", "2015-01-01") |
        df["Date"].between("2019-01-01", "2020-01-01") |
        df["Date"].between("2024-01-01", "2025-01-01"))
    )

    train = df[mask_train]
    val   = df[mask_val]
    test  = df[mask_test].copy()   # copy pour pouvoir ajouter des colonnes

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    # --- 9) Modèle XGBoost
    model = xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.03,
        n_estimators=800,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Ajout des prédictions dans test
    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    print("Accuracy :", accuracy_score(y_test, y_pred))

    return  sector_f1

In [30]:
training4(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "GOOGL")

Accuracy : 0.5099206349206349


Sector
Communication Services    0.509921
dtype: float64

In [31]:
training4(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "XOM")


Accuracy : 0.5277777777777778


Sector
Energy    0.527778
dtype: float64

In [32]:
training4(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "JPM")


Accuracy : 0.498015873015873


Sector
Financials    0.498016
dtype: float64

In [33]:
training4(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"], "PLD")


Accuracy : 0.4742063492063492


Sector
Real Estate    0.474206
dtype: float64

On remarque que souvent la catégories la moins bien prédites est celle de l'énergie qui est le secteur le plus sensible au choc d'après ce qui a été vu précédemment. Peut être que le faible score provient des période de crises dues à des chocs extérieurs aux marché qui sont imprévisibles à l'aide 'uniquement les données. Essayons de se limiter à la période la plus calme de 2005-2025

In [34]:
def training5(df, features):

    # --- 8) Train/val/test split
    mask_train = (
        df["Date"].between("2010-01-01", "2012-01-01") 
    )

    mask_val = (
        df["Date"].between("2012-01-01", "2013-01-01") 
    )

    mask_test = (
        df["Date"].between("2013-01-01", "2014-01-01") 
    )

    train = df[mask_train]
    val   = df[mask_val]
    test  = df[mask_test].copy()   # copy pour pouvoir ajouter des colonnes

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    # --- 9) Modèle XGBoost
    model = xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.03,
        n_estimators=800,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Ajout des prédictions dans test
    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    print("Accuracy :", accuracy_score(y_test, y_pred))

    return  sector_f1
    

In [35]:
training5(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"])


Accuracy : 0.5262041088323235


Sector
Materials                 0.541235
Industrials               0.531409
Health Care               0.529594
Financials                0.529561
Consumer Staples          0.529295
Information Technology    0.527334
Consumer Discretionary    0.525391
Communication Services    0.520982
Utilities                 0.516694
Energy                    0.513228
Real Estate               0.505200
dtype: float64

Malheuresement les résultats ne sont pas sensiblement meilleur

 Or on a vu précédemment que ces périodes de crise peuvent être caractérisées par des période de volatilité élevée. Ainsi on va retirer du dataset tous les moments ou la volatilité est élevée statistiquement.

In [36]:
def remove_outliers_iqr_per_ticker(df):
    # Fonction appliquée à chaque groupe (chaque Ticker)
    def filter_group(g):
        q1 = g['vol_21'].quantile(0.25)
        q3 = g['vol_21'].quantile(0.75)
        iqr = q3 - q1
        upper = q3 + 3* iqr

        # on garde seulement les lignes sous la borne haute
        return g[g['vol_21'] <= upper]

    # groupby puis concaténation automatique
    return df.groupby('Ticker', group_keys=False).apply(filter_group, include_groups=False)

In [37]:
def training6(df, features):

    # --- 8) Train/val/test split
    mask_train = (
        df["Date"].between("2004-01-01", "2008-01-01") |
        df["Date"].between("2010-01-01", "2013-01-01") |
        df["Date"].between("2015-01-01", "2018-01-01") |
        df["Date"].between("2020-01-01", "2023-01-01")
    )

    mask_val = (
        df["Date"].between("2008-01-01", "2009-01-01") |
        df["Date"].between("2013-01-01", "2014-01-01") |
        df["Date"].between("2018-01-01", "2019-01-01") |
        df["Date"].between("2023-01-01", "2024-01-01")
    )

    mask_test = (
        df["Date"].between("2009-01-01", "2010-01-01") |
        df["Date"].between("2014-01-01", "2015-01-01") |
        df["Date"].between("2019-01-01", "2020-01-01") |
        df["Date"].between("2024-01-01", "2025-01-01")
    )

    train_uncleaned = df[mask_train]
    val_uncleaned   = df[mask_val]
    test_uncleaned  = df[mask_test].copy()   # copy pour pouvoir ajouter des colonnes

    train = remove_outliers_iqr_per_ticker(train_uncleaned)
    val = remove_outliers_iqr_per_ticker(val_uncleaned)
    test = remove_outliers_iqr_per_ticker(test_uncleaned).copy()

    X_train, y_train = train[features], train["target"]
    X_val, y_val     = val[features], val["target"]
    X_test, y_test   = test[features], test["target"]
    
    # --- 9) Modèle XGBoost
    model = xgb.XGBClassifier(
        max_depth=5,
        learning_rate=0.03,
        n_estimators=800,
        subsample=0.7,
        colsample_bytree=0.7,
    )
    
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Prédictions sur le test
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)

    # Ajout des prédictions dans test
    test["pred"] = y_pred

    # --- Calcul F1 par secteur (ajuste "Sector" si ta colonne s'appelle autrement)
    sector_f1 = (
        test.groupby("Sector")
            .apply(lambda g: accuracy_score(g["target"], g["pred"]), include_groups=False)
            .sort_values(ascending=False)
    )

    print("Accuracy :", accuracy_score(y_test, y_pred))

    return  sector_f1
    

In [38]:
training6(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21", "vol_21"])


Accuracy : 0.5288139222858841


Sector
Utilities                 0.540888
Materials                 0.538983
Real Estate               0.536637
Financials                0.536412
Information Technology    0.528453
Consumer Staples          0.527482
Industrials               0.527058
Health Care               0.523812
Communication Services    0.521803
Consumer Discretionary    0.520512
Energy                    0.512863
dtype: float64

In [39]:
training6(df, ["mom_5", "vol_5", "range", "volume_z", "mom_21"])


Accuracy : 0.5290805155301491


Sector
Utilities                 0.541262
Financials                0.537963
Materials                 0.536909
Real Estate               0.536780
Industrials               0.528331
Information Technology    0.528108
Consumer Staples          0.526794
Health Care               0.523671
Communication Services    0.522638
Consumer Discretionary    0.520796
Energy                    0.512015
dtype: float64

# Conclusion

Dans un premier temps, nous avons tenté de construire un modèle de machine learning capable de prédire le signe du rendement journalier des actions du S&P 500.
Cette approche s’est révélée insuffisante : malgré l’utilisation de différents modèles (régression logistique, XGBoost) et d’un ensemble de variables explicatives dérivées du prix (momentum 5 et 21 jours, volatilité instantanée, amplitude intraday, volume standardisé), les performances prédictives sont restées très proches du hasard.

Ce résultat n’est pas dû à un problème de modélisation ou de qualité des données : il reflète au contraire une propriété structurelle des rendements financiers quotidiens.

En effet, la littérature empirique (Lo & MacKinlay, 1999 ; Bouchaud et al., 2003) montre que les rendements journaliers des actifs financiers sont extrêmement bruités et présentent :

une autocorrélation quasi nulle, ce qui signifie que le rendement d’un jour ne contient presque aucune information exploitable pour prédire celui du lendemain ;

une variance dominée par le bruit de marché, lui-même influencé par des facteurs non observables dans les données OHLCV (annonces macroéconomiques, surprises de résultats, flux d’ordres intraday, microstructure, sentiment, chocs exogènes, etc.) ;

une prédictibilité directionnelle théorique très faible, généralement limitée à 2–4 % d’information exploitable, ce qui entraîne un plafond d’accuracy proche de 52–54 % même pour les modèles les plus puissants.

Dans ce contexte, les variables utilisées — basées sur des caractéristiques de prix relativement lentes (momentum, volatilité historique, volume, amplitudes intraday) — sont structurellement peu informatives pour capturer le signal directionnel à un horizon aussi court. La faible performance observée est donc cohérente avec les limites théoriques du problème.

En revanche, ces mêmes variables sont fortement liées à la dynamique de la volatilité, qui présente une autocorrélation élevée et des régimes persistants (volatility clustering). Contrairement aux rendements, la volatilité est un processus beaucoup plus régulier et prévisible, ce qui en fait une cible de modélisation beaucoup plus appropriée.

C’est pourquoi nous avons choisi, dans un second temps, de réorienter notre travail vers la prédiction de la volatilité future, un problème mieux posé et pour lequel les méthodes d’apprentissage peuvent exploiter un véritable signal.