In [3]:
import pandas as pd
import os

In [4]:
def format_data(data):
    data = data.iloc[2:].copy()
    data['Date'] = pd.to_datetime(data['Price'])
    data.drop('Price', axis=1, inplace=True)
    data.set_index('Date', inplace=True)
    return data
    
def import_all_data(folder_with_csv):
    all_data = {}
    for file in os.listdir(folder_with_csv):
        data = format_data(pd.read_csv(os.path.join(folder_with_csv, file)))
        filename = file.rsplit('.', maxsplit=1)[0]
        all_data[filename] = data
    return all_data

def verify_data(data_df, first='2015-01-02', last='2024-12-31'):
    # Check if first date is 2015-01-02 and last date is 2024-12-31
    first_date = data_df.index[0]
    last_date = data_df.index[-1]
    
    expected_first = pd.Timestamp(first)
    expected_last = pd.Timestamp(last)
    
    if first_date != expected_first or last_date != expected_last:
        print(f"Expected range: {expected_first} to {expected_last}")
        print(f"Actual range: {first_date} to {last_date}")
        print("---------------------")
        return False
        
    return True


# Import all data

In [5]:
all_data = import_all_data("data/raw")
good_data = {}
error_data = {}

for ticker, data in all_data.items():
    try:
        if verify_data(data):
            good_data[ticker] = data
        else:
            error_data[ticker] = data
    except Exception as e:
        print(f"Error verifying data for {ticker}: {e}")

Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2020-12-10 00:00:00 to 2024-12-31 00:00:00
---------------------
Error verifying data for BF.B: index 0 is out of bounds for axis 0 with size 0
Error verifying data for BRK.B: index 0 is out of bounds for axis 0 with size 0
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2020-03-19 00:00:00 to 2024-12-31 00:00:00
---------------------
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2022-01-19 00:00:00 to 2024-12-31 00:00:00
---------------------
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2021-04-14 00:00:00 to 2024-12-31 00:00:00
---------------------
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2019-06-12 00:00:00 to 2024-12-31 00:00:00
---------------------
Expected range: 2015-01-02 00:00:00 to 2024-12-31 00:00:00
Actual range: 2019-05-24 00:00:00 to 2024-12-31 00:00:00
---------------------
Expected rang

In [6]:
print(len(good_data))
print(len(error_data))

465
36


In [7]:
close_prices = {ticker: data['Close'] for ticker, data in good_data.items()}
dataset = pd.DataFrame(close_prices)
dataset

Unnamed: 0_level_0,A,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,37.195491790771484,24.288583755493164,42.39486312866211,36.58060836791992,18.539352416992188,74.62398529052734,72.33999633789062,44.687191009521484,38.306640625,66.32910919189453,...,100.27481079101562,23.921401977539062,126.55622100830078,25.937883377075195,58.96714401245117,33.33534622192383,42.62421798706055,100.4542465209961,77.43000030517578,39.88908386230469
2015-01-05,36.49853515625,23.60433006286621,41.5970344543457,36.588748931884766,18.42841339111328,73.36400604248047,71.9800033569336,43.87453842163086,36.98114776611328,66.28136444091797,...,99.55453491210938,23.921401977539062,124.40232849121094,25.643468856811523,57.35368728637695,31.260644912719727,41.75818634033203,104.20154571533203,76.33999633789062,39.649635314941406
2015-01-06,35.929954528808594,23.606552124023438,41.391136169433594,36.17325973510742,18.46961784362793,72.8348159790039,70.52999877929688,42.84465789794922,36.25212097167969,65.73184204101562,...,99.05936431884766,23.894901275634766,121.74867248535156,25.779905319213867,57.04878616333008,31.07681655883789,41.245628356933594,103.31826782226562,75.79000091552734,39.26279830932617
2015-01-07,36.40681076049805,23.937576293945312,43.06403732299805,36.46654510498047,18.5773868560791,74.36358642578125,71.11000061035156,43.29523468017578,36.797054290771484,66.30525970458984,...,100.47737884521484,23.9677734375,125.32418060302734,26.002519607543945,57.6268424987793,31.321910858154297,42.6124267578125,105.88783264160156,77.72000122070312,40.07328414916992
2015-01-08,37.49811553955078,24.8573055267334,43.5144157409668,37.21607971191406,18.900693893432617,75.4975814819336,72.91999816894531,44.05959701538086,36.259498596191406,67.82630157470703,...,101.62530517578125,24.226139068603516,126.83194732666016,26.304119110107422,58.58599853515625,31.55828285217285,43.354740142822266,107.01203155517578,79.37999725341797,40.69037628173828
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,135.27603149414062,257.57867431640625,175.172607421875,113.12674713134766,92.66999816894531,358.32281494140625,447.94000244140625,216.1610565185547,49.5608024597168,293.42254638671875,...,313.3326721191406,27.83730125427246,88.30902099609375,66.53433990478516,104.49430847167969,117.40975952148438,133.7091827392578,106.55384826660156,395.44000244140625,163.10595703125
2024-12-26,135.00765991210938,258.39666748046875,174.39405822753906,113.62948608398438,92.93000030517578,357.1337890625,450.1600036621094,216.13131713867188,49.541221618652344,294.1846923828125,...,315.1569519042969,27.748645782470703,88.83621215820312,66.50508117675781,104.58269500732422,117.72775268554688,134.69961547851562,106.50418853759766,396.8500061035156,163.91802978515625
2024-12-27,134.71939086914062,254.9749298095703,173.23597717285156,113.35346221923828,92.33999633789062,352.9226379394531,446.4800109863281,215.07078552246094,49.51185607910156,293.14544677734375,...,313.61029052734375,27.571338653564453,88.20955657958984,66.46605682373047,104.57288360595703,116.79364013671875,133.93698120117188,106.1268310546875,389.07000732421875,163.00692749023438
2024-12-30,133.60618591308594,251.59307861328125,171.47451782226562,111.19464111328125,91.88999938964844,349.2663879394531,445.79998779296875,210.6799774169922,49.01262664794922,289.96832275390625,...,310.56646728515625,27.47283363342285,85.37467193603516,65.92955780029297,103.86577606201172,115.55146789550781,132.2433319091797,104.90263366699219,383.8500061035156,160.6697540283203


In [8]:
if not os.path.exists("data/processed/dataset_prices.csv"):
    dataset.to_csv("data/processed/dataset_prices.csv")
    print("Dataset saved!")

In [9]:
dataset = pd.read_csv("data/processed/dataset_prices.csv")

In [10]:
# Calculate daily returns
# First convert string values to float
dataset = dataset.set_index('Date')  # Assuming 'Date' is the first column
dataset = dataset.astype(float)
returns = dataset.pct_change()
returns

Unnamed: 0_level_0,A,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,,,,,,,,,,,...,,,,,,,,,,
2015-01-05,-0.018738,-0.028172,-0.018819,0.000223,-0.005984,-0.016884,-0.004976,-0.018185,-0.034602,-0.000720,...,-0.007183,0.000000,-0.017019,-0.011351,-0.027362,-0.062237,-0.020318,0.037304,-0.014077,-0.006003
2015-01-06,-0.015578,0.000094,-0.004950,-0.011356,0.002236,-0.007213,-0.020145,-0.023473,-0.019713,-0.008291,...,-0.004974,-0.001108,-0.021331,0.005321,-0.005316,-0.005881,-0.012274,-0.008477,-0.007205,-0.009756
2015-01-07,0.013272,0.014023,0.040417,0.008108,0.005835,0.020990,0.008223,0.010517,0.015032,0.008724,...,0.014315,0.003050,0.029368,0.008635,0.010133,0.007887,0.033138,0.024870,0.025465,0.020643
2015-01-08,0.029975,0.038422,0.010458,0.020554,0.017403,0.015249,0.025453,0.017655,-0.014609,0.022940,...,0.011425,0.010780,0.012031,0.011599,0.016644,0.007547,0.017420,0.010617,0.021359,0.015399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.011144,0.011478,0.009025,0.003937,0.006298,0.007972,0.002686,0.014891,0.004364,0.007853,...,0.005248,0.006410,0.007604,0.007236,0.000941,0.005446,0.008516,0.004494,0.012262,0.002557
2024-12-26,-0.001984,0.003176,-0.004444,0.004444,0.002806,-0.003318,0.004956,-0.000138,-0.000395,0.002597,...,0.005822,-0.003185,0.005970,-0.000440,0.000846,0.002708,0.007407,-0.000466,0.003566,0.004979
2024-12-27,-0.002135,-0.013242,-0.006641,-0.002429,-0.006349,-0.011792,-0.008175,-0.004907,-0.000593,-0.003533,...,-0.004908,-0.006390,-0.007054,-0.000587,-0.000094,-0.007935,-0.005662,-0.003543,-0.019604,-0.005558
2024-12-30,-0.008263,-0.013263,-0.010168,-0.019045,-0.004873,-0.010360,-0.001523,-0.020416,-0.010083,-0.010838,...,-0.009706,-0.003573,-0.032138,-0.008072,-0.006762,-0.010636,-0.012645,-0.011535,-0.013417,-0.014338


In [14]:
if not os.path.exists("data/processed/dataset_returns.csv"):
    returns.to_csv('data/processed/dataset_returns.csv')
    print("Dataset saved!")

In [15]:
import numpy as np

# Calculate log returns
log_returns = np.log(dataset / dataset.shift(1))

if not os.path.exists("data/processed/dataset_log_returns.csv"):
    log_returns.to_csv('data/processed/dataset_log_returns.csv')
    print("Log returns dataset saved!")

Log returns dataset saved!


In [17]:
log_returns

Unnamed: 0_level_0,A,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,,,,,,,,,,,...,,,,,,,,,,
2015-01-05,-0.018915,-0.028576,-0.018998,0.000223,-0.006002,-0.017029,-0.004989,-0.018353,-0.035215,-0.000720,...,-0.007209,0.000000,-0.017166,-0.011416,-0.027743,-0.064258,-0.020527,0.036625,-0.014177,-0.006021
2015-01-06,-0.015701,0.000094,-0.004962,-0.011421,0.002233,-0.007239,-0.020350,-0.023753,-0.019910,-0.008325,...,-0.004986,-0.001108,-0.021562,0.005306,-0.005330,-0.005898,-0.012350,-0.008513,-0.007231,-0.009804
2015-01-07,0.013185,0.013925,0.039621,0.008075,0.005818,0.020772,0.008190,0.010462,0.014920,0.008686,...,0.014213,0.003045,0.028945,0.008598,0.010082,0.007856,0.032601,0.024566,0.025146,0.020432
2015-01-08,0.029535,0.037702,0.010404,0.020346,0.017254,0.015134,0.025135,0.017501,-0.014716,0.022681,...,0.011360,0.010722,0.011959,0.011532,0.016507,0.007518,0.017270,0.010561,0.021134,0.015282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.011083,0.011413,0.008985,0.003929,0.006278,0.007940,0.002683,0.014782,0.004355,0.007822,...,0.005235,0.006390,0.007575,0.007210,0.000940,0.005432,0.008480,0.004484,0.012187,0.002553
2024-12-26,-0.001986,0.003171,-0.004454,0.004434,0.002802,-0.003324,0.004944,-0.000138,-0.000395,0.002594,...,0.005805,-0.003190,0.005952,-0.000440,0.000845,0.002705,0.007380,-0.000466,0.003559,0.004966
2024-12-27,-0.002137,-0.013331,-0.006663,-0.002432,-0.006369,-0.011862,-0.008208,-0.004919,-0.000593,-0.003539,...,-0.004920,-0.006410,-0.007079,-0.000587,-0.000094,-0.007966,-0.005678,-0.003549,-0.019799,-0.005574
2024-12-30,-0.008297,-0.013352,-0.010220,-0.019229,-0.004885,-0.010414,-0.001524,-0.020627,-0.010134,-0.010897,...,-0.009753,-0.003579,-0.032666,-0.008105,-0.006785,-0.010693,-0.012726,-0.011602,-0.013507,-0.014442


# Fetch SP500 sectors from Wikipeida

In [31]:
# Fetch S&P 500 sectors from Wikipedia
import pandas as pd

# Read the S&P 500 table from Wikipedia
sp500_wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

sectors_df = sp500_wiki[['Symbol', 'GICS Sector']].set_index('Symbol')
our_tickers = dataset.columns.tolist()

# Create mapping of tickers to sectors for our dataset
sectors_mapping = {}
for ticker in our_tickers:
    if ticker in sectors_df.index:
        sectors_mapping[ticker] = sectors_df.loc[ticker, 'GICS Sector']
    else:
        print(f"Ticker {ticker} not found in S&P 500")
        sectors_mapping[ticker] = 'unknown'

sectors_df = pd.DataFrame.from_dict(sectors_mapping, orient='index', columns=['Sector'])
if not os.path.exists("data/raw/tickers_sectors.csv"):
    sectors_df.to_csv("data/raw/tickers_sectors.csv")
    print("Sectors mapping saved!")

Ticker PARA not found in S&P 500


In [28]:
sectors_df.Sector.unique()

array(['Health Care', 'Information Technology', 'Financials',
       'Consumer Staples', 'Industrials', 'Utilities', 'Materials',
       'Real Estate', 'Consumer Discretionary', 'Energy',
       'Communication Services', 'unknown'], dtype=object)

In [48]:
sectors = pd.read_csv("data/raw/tickers_sectors.csv", index_col=0)
sector_list = sectors.Sector.unique().tolist()

# Create sectoral datasets
for sector in sectors.Sector.unique():
    # Get tickers for this sector
    sector_tickers = sectors[sectors.Sector == sector].index.tolist()
    
    sector_returns = returns[sector_tickers]
    sector_log_returns = log_returns[sector_tickers]
    
    sector_dir = f"data/processed/sectors/{sector.lower().replace(' ', '_')}"
    if not os.path.exists(sector_dir):
        os.makedirs(sector_dir)
        
    # Save datasets
    sector_returns.to_csv(f"{sector_dir}/returns.csv")
    sector_log_returns.to_csv(f"{sector_dir}/log_returns.csv")
    
print("Sectoral datasets created!")

Sectoral datasets created!
