In [132]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime as dt
import yfinance as yf
import numpy as np  
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim

In [133]:
tickers = [
    "AC.PA",   # Accor  
    "AI.PA",   # Air Liquide  
    "AIR.PA",  # Airbus  
    "CS.PA",   # Axa  
    "BNP.PA",  # BNP Paribas  
    "EN.PA",   # Bouygues  
    "BVI.PA",  # Bureau Veritas  
    "CAP.PA",  # Capgemini  
    "CA.PA",   # Carrefour  
    "ACA.PA",  # Crédit Agricole  
    "BN.PA",   # Danone (attention : s'assurer du bon ticker, parfois “BN.PA” peut poser problème selon la source)  
    "DSY.PA",  # Dassault Systèmes  
    "EDEN.PA", # Edenred  
    "ENGI.PA", # Engie  
    "EL.PA",   # EssilorLuxottica  
    "ERF.PA",  # Eurofins Scientific  
    "RMS.PA",  # Hermès (note : sur Yahoo, Hermès peut être “RMS.PA”)  
    "KER.PA",  # Kering  
    "OR.PA",   # L’Oréal  
    "LR.PA",   # Legrand  
    "MC.PA",   # LVMH  
    "ML.PA",   # Michelin  
    "ORA.PA",  # Orange  
    "RI.PA",   # Pernod Ricard  
    "PUB.PA",  # Publicis Groupe  
    "RNO.PA",  # Renault  
    "SAF.PA",  # Safran  
    "SGO.PA",  # Saint-Gobain  
    "SAN.PA",  # Sanofi  
    "SU.PA",   # Schneider Electric  
    "GLE.PA",  # Société Générale  
    "STLAP.PA",# Stellantis  
    "STMPA.PA",# STMicroelectronics (avec le “.PA”)  
    "TEP.PA",  # Teleperformance  
    "HO.PA",   # Thales  
    "TTE.PA",  # TotalEnergies  
    "URW.PA",  # Unibail-Rodamco-Westfield  
    "VIE.PA",  # Veolia Environnement  
    "DG.PA",   # Vinci  
    "VIV.PA"   # Vivendi  
# ,"BOVA11.SA", "BBDC4.SA", "CIEL3.SA", "TIUB4.SA", "PETR4.SA"
]

nb_days = 59
force_dl = False

In [134]:

def date_max(date1 : dt.datetime,date2 : dt.datetime) -> dt.datetime:
    if date1 > date2:
        return date1.replace(hour=0, minute=0, second=0)
    else:
        return date2.replace(hour=0, minute=0, second=0)
    
def dl_data(company : str, start_date : dt.datetime, end_date : dt.datetime, interval : str) -> pd.DataFrame:
    if((end_date-start_date).days > 7):
        start_data = dl_data(company ,date_max(start_date, dt.datetime.now() - dt.timedelta(days=nb_days)), start_date + dt.timedelta(days=7), interval)
        end_data = dl_data(company ,start_date + dt.timedelta(days=7), end_date, interval)
        return pd.concat([start_data, end_data])
    else:
        data = yf.Ticker(company)
        data = data.history(start=start_date, end=end_date, interval=interval)
        data.reset_index(inplace=True)
        return data

In [135]:
if not os.path.exists("data"):
    os.makedirs("data")


datas = []
for company in tickers:
    if not os.path.exists("data/" + company + ".csv") or force_dl:
        
        data = dl_data(company, dt.datetime.now() - dt.timedelta(days=nb_days), dt.datetime.now(), "15m")
        data = data.drop(columns=["Dividends","Stock Splits"])
        # flemme pour le moment    
        # try:
        #     old_data = pd.read_csv("data/" + company + ".csv")
        # except FileNotFoundError:
        #     old_data = pd.DataFrame()
            
        # # Concatenate old and new data but if the datetime is already in the old data, keep the new one
        # data = pd.concat([old_data, data])
        # data = data.drop_duplicates(subset=["Datetime"])
        data.to_csv("data/" + company + ".csv")
        datas.append(data)
    else:
        datas.append(pd.read_csv("data/" + company + ".csv", index_col=0))



In [136]:
lambda_smooth = 0.1

span = 2/lambda_smooth - 1
for data in datas:
    data['log_return'] = np.log(data["Close"]) - np.log(data["Close"].shift(1))
    data['smooth_close'] = data["Close"].ewm(span=span, adjust=False).mean()
    data['y'] = (data["Close"] > data["Close"].shift(1)).astype(int)
    data.dropna(inplace=True)

print(datas[0].head())


                    Datetime       Open       High        Low      Close  \
1  2025-09-22 09:15:00+02:00  41.009998  41.080002  40.910000  40.970001   
2  2025-09-22 09:30:00+02:00  40.990002  41.119999  40.990002  41.080002   
3  2025-09-22 09:45:00+02:00  41.070000  41.169998  40.990002  41.150002   
4  2025-09-22 10:00:00+02:00  41.139999  41.220001  41.139999  41.189999   
5  2025-09-22 10:15:00+02:00  41.200001  41.209999  41.139999  41.139999   

   Volume  log_return  smooth_close  y  
1    8473   -0.001220     41.015001  0  
2    6167    0.002681     41.021501  1  
3    6415    0.001703     41.034351  1  
4    4763    0.000972     41.049916  1  
5    4501   -0.001215     41.058924  0  


In [137]:
cols_to_scale = datas[0].columns
cols_to_scale = cols_to_scale.delete(cols_to_scale.get_loc("Datetime"))
cols_to_scale = cols_to_scale.delete(cols_to_scale.get_loc("y"))

df_concat = pd.concat(datas, keys=tickers, names=['Ticker', 'Row'])

scaler = StandardScaler()
df_concat[cols_to_scale] = scaler.fit_transform(df_concat[cols_to_scale])


In [138]:
for data in datas:
    data[cols_to_scale] = scaler.transform(data[cols_to_scale])

In [139]:
def create_continuous_sequences(df: pd.DataFrame, k: int, interval_minutes: int = 15) -> tuple[list[pd.DataFrame], list[pd.DataFrame]]:
    """
    Crée des séquences continues de taille k+1 (k étapes d'entrée et 1 étape de sortie) à partir du DataFrame.
    S'assure que la différence de temps entre chaque étape est exactement de interval_minutes.
    
    Args:
        df (pd.DataFrame): DataFrame contenant une colonne 'Datetime'.
        k (int): Taille de la fenêtre glissante (séquence d'entrée).
        interval_minutes (int): Intervalle de temps attendu en minutes (défaut: 15).
        
    Returns:
        list[pd.DataFrame]: Liste de DataFrames contenant les séquences d'entrée et de sortie.
    """
    sequences = []
    targets = []
    
    # Vérifications
    if 'Datetime' not in df.columns:
        raise ValueError("Le DataFrame doit avoir une colonne 'Datetime'")
        
    df = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(df['Datetime']):
        df['Datetime'] = pd.to_datetime(df['Datetime'], utc=True)
        
    df = df.sort_values('Datetime')
    if 'Unnamed: 0' in df.columns:
        df = df.drop(columns=['Unnamed: 0'])
    
    
    df['time_diff'] = df['Datetime'].diff()
    
    df['gap'] = df['time_diff'] != pd.Timedelta(minutes=interval_minutes)
    
    df['group_id'] = df['gap'].cumsum()
    
    cols_to_keep = [c for c in df.columns if c not in ['time_diff', 'gap', 'group_id']]
    
    # Pour chaque groupe continu, on extrait les séquences
    for g_id, group in df.groupby('group_id'):
        group_data = group[cols_to_keep]
        
        if len(group_data) > k:
            # Pour avoir une séquence d'entrée de taille k et une cible à k+1,
            for i in range(len(group_data) - k):
                seq = group_data.iloc[i : i+k].copy()
                seq.drop(columns=['Datetime','y'], inplace=True)
                seq = seq.sort_index(axis=1)
                target = group_data.iloc[[i+k]].copy()
                
                sequences.append(seq)
                targets.append(target["y"].values[-1]) # strat moldave
                
    return sequences, targets

In [140]:
inputs = []
outputs = []
k = 20
time_interval = 15
for data in datas:
    seqs, targs = create_continuous_sequences(data, k, time_interval)
    inputs.extend(seqs)
    outputs.extend(targs)




In [141]:
print(type(inputs[0]))
input0 = inputs[0]
num_input0 = input0.to_numpy()
print(num_input0)
print(len(inputs))

<class 'pandas.core.frame.DataFrame'>
[[-3.55099002e-01 -3.54922377e-01 -3.55124963e-01 -3.54982474e-01
  -2.96539891e-01 -3.69198996e-01 -3.54937667e-01]
 [-3.54774183e-01 -3.54804471e-01 -3.54888317e-01 -3.55041522e-01
  -3.57843719e-01  8.14348252e-01 -3.54918472e-01]
 [-3.54567483e-01 -3.54657079e-01 -3.54888317e-01 -3.54805297e-01
  -3.51250766e-01  5.17388611e-01 -3.54880526e-01]
 [-3.54449376e-01 -3.54509676e-01 -3.54444622e-01 -3.54598596e-01
  -3.95168340e-01  2.95596256e-01 -3.54834562e-01]
 [-3.54597018e-01 -3.54539161e-01 -3.54444622e-01 -3.54421419e-01
  -4.02133476e-01 -3.67670854e-01 -3.54807960e-01]
 [-3.54715136e-01 -3.54745512e-01 -3.54710843e-01 -3.54628120e-01
  -4.21992088e-01 -2.94302526e-01 -3.54795831e-01]
 [-3.55069478e-01 -3.54922377e-01 -3.54917903e-01 -3.54775762e-01
  -2.45604012e-01 -8.86286212e-01 -3.54820350e-01]
 [-3.55660057e-01 -3.55158213e-01 -3.55539083e-01 -3.55041522e-01
  -2.61847134e-01 -1.48351142e+00 -3.54901479e-01]
 [-3.55807698e-01 -3.55777

In [142]:
inputs = np.array([df.to_numpy() for df in inputs])
outputs = np.array(outputs)

In [143]:
print(inputs.shape)

(23766, 20, 7)


In [144]:

class StockPriceLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=20, num_layers=1, output_size=1):
        super(StockPriceLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # LSTM layer
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Couche fully connected pour la sortie binaire
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        # x : [batch_size, seq_length, input_size]
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # LSTM forward
        out, _ = self.lstm(x, (h0, c0))
        
        # Prendre la dernière sortie de la séquence
        out = out[:, -1, :]
        out = self.fc(out)
        out = self.sigmoid(out)
        return out



In [145]:
print(inputs[0].shape)
print(outputs[0])

(20, 7)
0


In [None]:
X_train, X_test =  torch.from_numpy(inputs[:int(0.8*len(inputs))]).float(), torch.from_numpy(inputs[int(0.8*len(inputs)):]).float()
y_train, y_test = torch.from_numpy(outputs[:int(0.8*len(outputs))]).float().unsqueeze(1), torch.from_numpy(outputs[int(0.8*len(outputs)):]).float().unsqueeze(1)

In [146]:

# Hyperparamètres
input_size = inputs[0].shape[1]    # Nombre de features (prix + indicateurs techniques)

hidden_size = 20
num_layers = 1
output_size = 1     # Classe binaire
seq_length = 10     # Nombre de pas temporels (historique utilisé)
learning_rate = 0.001


In [147]:

# Initialiser le modèle, la loss et l’optimiseur
model = StockPriceLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.BCELoss()  # Binary Cross Entropy pour classification binaire
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [149]:
num_epochs = 1000

model.train()  # Met le modèle en mode entraînement

for epoch in range(1, num_epochs + 1):
    # Forward pass
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Affichage de la perte toutes les 10 époques
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}")

# Affichage final de la perte
print(f"Loss finale après {num_epochs} époques: {loss.item():.4f}")

Epoch [1/1000], Loss: 0.6917
Epoch [10/1000], Loss: 0.6916
Epoch [20/1000], Loss: 0.6916
Epoch [30/1000], Loss: 0.6915
Epoch [40/1000], Loss: 0.6915
Epoch [50/1000], Loss: 0.6914
Epoch [60/1000], Loss: 0.6913
Epoch [70/1000], Loss: 0.6912
Epoch [80/1000], Loss: 0.6911
Epoch [90/1000], Loss: 0.6909
Epoch [100/1000], Loss: 0.6907
Epoch [110/1000], Loss: 0.6904
Epoch [120/1000], Loss: 0.6901
Epoch [130/1000], Loss: 0.6898
Epoch [140/1000], Loss: 0.6896
Epoch [150/1000], Loss: 0.6893
Epoch [160/1000], Loss: 0.6889
Epoch [170/1000], Loss: 0.6885
Epoch [180/1000], Loss: 0.6880
Epoch [190/1000], Loss: 0.6876
Epoch [200/1000], Loss: 0.6872
Epoch [210/1000], Loss: 0.6867
Epoch [220/1000], Loss: 0.6862
Epoch [230/1000], Loss: 0.6857
Epoch [240/1000], Loss: 0.6851
Epoch [250/1000], Loss: 0.6845
Epoch [260/1000], Loss: 0.6839
Epoch [270/1000], Loss: 0.6835
Epoch [280/1000], Loss: 0.6829
Epoch [290/1000], Loss: 0.6825
Epoch [300/1000], Loss: 0.6821
Epoch [310/1000], Loss: 0.6816
Epoch [320/1000], L