In [15]:
import pandas as pd
from pathlib import Path

# Root folder holding one sub-directory per sector.
data_dir = Path("data/processed/sectors")

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order.
# Sorting makes the sector order (and therefore "the first sector" used by
# later cells) reproducible across machines and kernel restarts.
sectors_list = sorted(
    d.name for d in data_dir.iterdir() if d.is_dir() and d.name != 'unknown'
)

# Map sector name -> log-returns frame read from that sector's CSV.
sector_log_returns = {}
for sector in sectors_list:
    returns_path = data_dir / sector / "log_returns.csv"
    # The first CSV column is used as the row index.
    df = pd.read_csv(returns_path, index_col=0)
    sector_log_returns[sector] = df.iloc[1:]  # Skip first row with NaN values

In [19]:
def preprocessing_dataset(
    log_returns_df: pd.DataFrame,
    window: int = 60,
    min_periods: int = 1,
    clip_value: float = 3.0,
    std_floor: float = 1e-8,
) -> pd.DataFrame:
    """Normalise a frame of log returns (rows = dates, columns = tickers).

    Three steps are applied in order:

    1. Rolling z-score per column over a trailing (causal) window.
    2. Cross-sectional centring: the per-row median of the z-scores is
       subtracted from every column of that row.
    3. Clipping of the result to ``[-clip_value, clip_value]``.

    Parameters
    ----------
    log_returns_df : pd.DataFrame
        Numeric frame of log returns.
    window : int, default 60
        Number of trailing rows used for the rolling mean and std.
    min_periods : int, default 1
        Minimum number of observations required in the window.
    clip_value : float, default 3.0
        Symmetric bound applied to the final values; must be positive.
    std_floor : float, default 1e-8
        Lower bound on the rolling std, guarding against division by zero.

    Returns
    -------
    pd.DataFrame
        Frame with the same index and columns as ``log_returns_df``.

    Raises
    ------
    ValueError
        If ``clip_value`` is not strictly positive.

    Notes
    -----
    With the default ``min_periods=1`` the first row is always NaN (the
    sample std of a single observation is undefined) and the second row's
    z-scores can only be +/- 1/sqrt(2), so the earliest rows carry little
    information. Pass a larger ``min_periods`` (e.g. ``window``) to turn
    the warm-up rows into NaN instead.
    """
    if clip_value <= 0:
        raise ValueError("clip_value must be positive")

    # 1. Rolling z-score (trailing window, hence causal).
    rolling = log_returns_df.rolling(window=window, min_periods=min_periods)
    rolling_mean = rolling.mean()
    rolling_std = rolling.std()
    # Floor the std so a flat window yields 0 instead of dividing by zero.
    z_scored = (log_returns_df - rolling_mean) / rolling_std.clip(lower=std_floor)

    # 2. Cross-sectional centring (median per date). Subtracting the Series
    #    along axis=0 aligns on the row index and broadcasts over columns.
    cross_sectional_median = z_scored.median(axis=1)
    demeaned = z_scored.sub(cross_sectional_median, axis=0)

    # 3. Clip into [-clip_value, clip_value].
    return demeaned.clip(lower=-clip_value, upper=clip_value)


In [24]:
# Preview: run the preprocessing on the first sector only and render it.
# Iterating over a one-element slice is a no-op when the list is empty.
for sector in sectors_list[:1]:
    df = preprocessing_dataset(sector_log_returns[sector])
    display(df)

Unnamed: 0_level_0,CMCSA,DIS,EA,GOOG,GOOGL,IPG,LYV,MTCH,NFLX,OMC,PSKY,T,TKO,TMUS,TTWO,VZ,WBD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2010-01-05,,,,,,,,,,,,,,,,,
2010-01-06,1.414214,0.000000,0.000000,0.000000,-1.110223e-16,1.414214,1.414214,0.000000,1.414214,-1.110223e-16,1.110223e-16,-1.110223e-16,-1.110223e-16,0.000000,-1.110223e-16,0.000000,0.000000
2010-01-07,1.345354,1.207583,0.000000,-0.281852,-2.818478e-01,-0.843786,-0.837149,0.120419,-0.076509,1.542856e-01,1.189291e+00,-5.674194e-01,2.096394e-01,0.862138,-5.182183e-01,0.527914,-0.712770
2010-01-08,-0.694373,0.446374,0.110629,0.731646,7.316483e-01,0.260547,-1.070888,-0.542061,0.000719,9.219829e-01,0.000000e+00,-1.908437e-01,-1.693692e+00,-1.250463,-9.225657e-02,0.064031,-1.163475
2010-01-11,-0.492689,-1.806286,-0.526145,0.277029,2.770254e-01,1.336317,-1.062772,0.000000,-0.157588,7.798161e-03,-2.454006e-01,8.313955e-01,-1.721889e+00,0.690173,1.503161e+00,0.575545,-0.951261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.325203,0.217615,0.711595,-0.015957,-4.525512e-02,-0.525076,-0.116643,0.102638,0.606404,-3.358690e-01,4.954165e-02,-1.681480e-02,6.203284e-01,0.000000,2.870292e-01,-0.433518,-0.113636
2024-12-26,-0.103947,-0.190276,-0.101652,-0.245039,-2.535326e-01,0.519216,-0.156104,0.282650,-0.634242,7.304022e-01,1.337863e-01,0.000000e+00,-1.480105e-01,0.129058,4.364382e-01,0.468320,0.093853
2024-12-27,0.039783,-0.420144,-0.194943,-0.586686,-5.199591e-01,-0.393078,-0.499967,0.216907,-0.716452,0.000000e+00,2.004958e-01,2.473054e-02,3.257785e-02,0.129053,-1.165676e-01,0.452631,0.226742
2024-12-30,-0.099987,-0.108061,-0.423856,0.049198,6.400768e-03,0.000000,-0.056275,0.275286,-0.041300,3.043876e-01,2.687897e-01,-2.909946e-01,2.321434e-01,-0.187324,-4.346705e-01,0.072928,0.121785
