In [1]:
import pandas as pd
from portfolio_management_rl.utils.contstants import DATA_DIR
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from portfolio_management_rl.utils.contstants import WINDOW_SIZE, N_STOCKS, INITIAL_DATE, END_DATE

In [2]:
# Configuration: input/output directories and split lengths.
#
# The base directory is re-imported under an alias so that this cell is
# idempotent. Rebinding DATA_DIR from itself (DATA_DIR = DATA_DIR / ...) would
# append "sp500/all" once more every time the cell is re-run.
from portfolio_management_rl.utils.contstants import DATA_DIR as BASE_DATA_DIR

PROCESSED_DATA_DIR = BASE_DATA_DIR / "sp500/processed"  # root for the train/val/test CSVs written below
DATA_DIR = BASE_DATA_DIR / "sp500/all"  # directory globbed for the raw *.csv tables
YEARS_TEST = 7  # length of the test period, in calendar years (used with pd.DateOffset)
YEARS_VAL = 7  # length of the validation period, in calendar years (used with pd.DateOffset)

In [None]:
# Create the output root. parents=True makes the cell work even when the
# intermediate directories do not exist yet; exist_ok=True keeps it re-runnable.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Load every CSV in DATA_DIR except "companies" into a dict keyed by file
# stem, keeping only the rows between INITIAL_DATE and END_DATE.
all_nan = set()  # column labels that contain at least one NaN in some table
datasets = {}
paths = [p for p in DATA_DIR.glob("*.csv") if p.stem != "companies"]
for csv_path in paths:
    frame = pd.read_csv(csv_path, index_col=0, parse_dates=True).loc[INITIAL_DATE:END_DATE]
    has_nan = frame.isna().sum() > 0
    all_nan.update(frame.columns[has_nan])
    datasets[csv_path.stem] = frame

# Derived tables: element-wise midpoint of two loaded tables.
# Each entry is (new key, first source key, second source key).
midpoint_specs = (
    ("extreme_mean", "low", "high"),
    ("mean_adj", "open", "adj_close"),
    ("mean", "open", "close"),
)
for new_name, first_key, second_key in midpoint_specs:
    datasets[new_name] = (datasets[first_key] + datasets[second_key]) / 2

In [4]:
# Second pass over all tables (now including the derived ones): add every
# column label that has at least one NaN to the running set, then display it.
for table in datasets.values():
    nan_counts = table.isna().sum()
    all_nan.update(nan_counts[nan_counts > 0].index.to_list())
all_nan

{'A',
 'AAL',
 'AAP',
 'ABBV',
 'ABC',
 'ABK',
 'ABMD',
 'ABS',
 'ACAS',
 'ACE',
 'ACGL',
 'ACN',
 'ADBE',
 'ADS',
 'ADSK',
 'ADT',
 'AEE',
 'AES',
 'AET',
 'AGN',
 'AIV',
 'AIZ',
 'AJG',
 'AKAM',
 'AKS',
 'ALB',
 'ALGN',
 'ALL',
 'ALLE',
 'ALTR',
 'ALXN',
 'AMCR',
 'AME',
 'AMG',
 'AMGN',
 'AMP',
 'AMT',
 'AMZN',
 'AN',
 'ANDV',
 'ANET',
 'ANF',
 'ANR',
 'ANSS',
 'AOS',
 'APC',
 'APH',
 'APOL',
 'APTV',
 'ARE',
 'ARG',
 'ARNC',
 'ATI',
 'ATO',
 'ATVI',
 'AV',
 'AVB',
 'AVGO',
 'AVP',
 'AWK',
 'AXON',
 'AYE',
 'AYI',
 'AZO',
 'BBBY',
 'BBY',
 'BCR',
 'BEAM',
 'BEN',
 'BG',
 'BHF',
 'BIG',
 'BIIB',
 'BJS',
 'BKNG',
 'BKR',
 'BLK',
 'BMC',
 'BMS',
 'BR',
 'BRCM',
 'BRK.B',
 'BS',
 'BSX',
 'BTU',
 'BWA',
 'BXLT',
 'BXP',
 'CA',
 'CAH',
 'CAM',
 'CARR',
 'CB',
 'CBE',
 'CBOE',
 'CBRE',
 'CCE',
 'CCI',
 'CCL',
 'CDAY',
 'CDNS',
 'CDW',
 'CE',
 'CEG',
 'CELG',
 'CEPH',
 'CERN',
 'CF',
 'CFC',
 'CFG',
 'CFN',
 'CHK',
 'CHRW',
 'CHTR',
 'CIEN',
 'CME',
 'CMG',
 'CNC',
 'CNX',
 'COF',
 'COG',
 

### Time series split (train / validation / test)

In [5]:
# Chronological train / validation / test split of every table.
#
# Years of earlier rows prepended to the validation and test splits.
# NOTE(review): presumably warm-up history for windowed observations; it makes
# val overlap the end of train and test overlap the end of val -- confirm
# that this overlap is intended.
CONTEXT_YEARS = 3

# Boundaries are anchored on the last date present in the "close" table.
final_date = datasets["close"].index[-1]
initial_test_date = final_date - pd.DateOffset(years=YEARS_TEST)
initial_val_date = initial_test_date - pd.DateOffset(years=YEARS_VAL)
context = pd.DateOffset(years=CONTEXT_YEARS)

train_datasets = {}
val_datasets = {}
test_datasets = {}

# Label-based .loc slices include both endpoints.
for name, table in datasets.items():
    train_datasets[name] = table.loc[:initial_val_date]
    val_datasets[name] = table.loc[initial_val_date - context:initial_test_date]
    test_datasets[name] = table.loc[initial_test_date - context:]

In [6]:
# Length of each split's "close" table divided by 252
# (presumably the trading-days-per-year convention), as (train, val, test).
tuple(len(split["close"]) / 252 for split in (train_datasets, val_datasets, test_datasets))

(26.41269841269841, 9.984126984126984, 9.98015873015873)

In [7]:
# Row counts of the "close" table in each split, as (train, val, test).
tuple(map(len, (train_datasets["close"], val_datasets["close"], test_datasets["close"])))

(6656, 2516, 2515)

# Storing the processed splits

In [11]:
# Write every table of every split to PROCESSED_DATA_DIR/<split>/<name>.csv.
# Before writing, columns listed in all_nan are dropped and only the first
# N_STOCKS remaining columns are kept.
splits = {"train": train_datasets, "val": val_datasets, "test": test_datasets}

for split_name in splits:
    # parents=True: do not depend on the output root having been created by
    # an earlier cell.
    (PROCESSED_DATA_DIR / split_name).mkdir(parents=True, exist_ok=True)

for split_name, split_datasets in splits.items():
    for name, df in split_datasets.items():
        selected = df.drop(columns=all_nan).iloc[:, :N_STOCKS]
        selected.to_csv(PROCESSED_DATA_DIR / split_name / f"{name}.csv", index=True)


In [12]:
 # Preview the frame written for the last test table. It is selected
 # explicitly rather than through the `df` loop variable left over from the
 # storing cell: "mean" is the last key inserted into the datasets dict, so
 # this is the same frame without the hidden state.
 test_datasets["mean"].drop(columns=all_nan).iloc[:, :N_STOCKS]

Unnamed: 0_level_0,^FVX,MMM,ABT,ADM,ADP,AFL,APD,ALK,LNT,MO,...,HST,HPQ,HUM,HBAN,IBM,ITW,INTC,IFF,IP,IPG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-08-01,1.4430,118.395000,36.865000,36.950001,62.493416,31.482500,100.249767,30.982499,26.770000,35.405001,...,18.065000,11.811989,90.849998,8.655,187.600380,73.285000,23.305000,81.450001,45.896278,16.545000
2013-08-02,1.4380,118.120003,36.770000,37.299999,62.686567,31.407499,98.866791,31.297500,26.662500,35.600000,...,18.130000,12.095822,90.424999,8.685,186.739960,73.759998,23.200000,81.709999,46.115744,16.440000
2013-08-05,1.3840,118.044998,36.545000,37.665001,62.831430,31.117500,99.865864,31.190001,26.657500,35.610001,...,18.275000,12.211626,90.584999,8.720,186.739960,73.869999,22.960000,82.125000,45.765532,16.335000
2013-08-06,1.3845,117.660000,36.174999,37.375000,63.090431,30.937500,98.959297,30.700000,26.530000,35.475000,...,17.844999,12.154859,91.529999,8.680,183.197899,73.450001,22.839999,84.110001,45.195858,16.145000
2013-08-07,1.3785,117.565002,36.000000,37.754999,63.257242,30.667500,98.728027,30.442500,26.395000,35.280001,...,17.685000,12.045867,92.024998,8.560,180.764816,73.369999,22.705000,83.209999,44.868992,15.970000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-26,4.1155,111.239998,113.290001,86.850002,248.724998,72.674999,303.279999,47.960001,55.594999,45.500000,...,17.405000,32.970001,451.239990,12.180,140.755005,255.174995,34.040001,87.110001,33.885000,32.525002
2023-07-27,4.1715,110.820000,113.555000,86.345001,252.815002,72.509998,302.945007,47.219999,54.984999,45.629999,...,17.535001,33.135000,449.899994,12.115,142.635002,255.870003,34.684999,85.730003,34.960001,33.290001
2023-07-28,4.1880,111.875000,112.955002,86.334999,252.340004,72.424999,304.194992,47.785000,54.264999,45.590000,...,17.985001,32.910000,453.259995,12.135,143.445000,258.570007,36.790001,85.350002,35.825001,33.555000
2023-07-31,4.1845,111.689999,112.090000,85.974998,248.930000,72.389999,304.164993,48.365000,53.925001,45.355000,...,18.330000,32.785002,455.934998,12.215,143.994995,262.145004,36.320000,85.299999,35.960001,33.855000
