# Setup

## Imports

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

## Download Data

In [1]:
# Sample window and bar size for the price download.
# These names are passed to yf.download() in the download cell.
startDate, endDate = '2010-01-01', '2023-01-01'
interval = '1d'  # one price bar per trading day

In [20]:
# Scrape the S&P 500 constituents from Wikipedia; the code takes the first
# table that read_html finds on the page.
sp500_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
sp500 = pd.read_html(sp500_url)[0]

# yfinance wants share-class tickers written with a dash (BRK-B) rather than
# the period used in the Wikipedia table (BRK.B).
tickers = [symbol.replace('.', '-') for symbol in sp500['Symbol'].tolist()]

# Download the price history for every ticker and keep only the "Close" field,
# giving one column per ticker. Tickers with no data in the window still get a
# column (all NaN, per the recorded output) and are dealt with in the cleaning step.
data = yf.download(
    tickers,
    start=startDate,
    end=endDate,
    interval=interval,
    auto_adjust=True,  # NOTE(review): adjusted prices per yfinance docs -- confirm for installed version
)["Close"]


[*********************100%***********************]  503 of 503 completed

5 Failed downloads:
['SW', 'SOLV', 'VLTO', 'GEV', 'KVUE']: YFPricesMissingError('possibly delisted; no price data found  (1d 2010-01-01 -> 2023-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1262322000, endDate = 1672549200")')


## Clean Data

In [21]:
# Tally missing closing prices column by column (one column per ticker),
# then report only the tickers that have at least one gap.
total_nans = data.isnull().sum()
print("Total NaNs per ticker:")
print(total_nans.loc[total_nans.gt(0)])

Total NaNs per ticker:
Ticker
ABBV     754
ABNB    2754
ALLE     976
AMCR     596
ANET    1113
        ... 
VLTO    3272
VST     1701
WDAY     701
XYL      449
ZTS      775
Length: 75, dtype: int64


In [22]:
# Keep only tickers with a complete price history over the sample window.
# dropna(axis=1) discards every column that contains at least one NaN, so this
# removes more than the failed downloads (all-NaN columns): any ticker with a
# gap anywhere in the window goes too. Expect about 75 of ~500 tickers dropped.
# `data` is rebound rather than mutated with inplace=True, so the frame object
# produced by the download cell is never modified in place.
data = data.dropna(axis=1)
data

Ticker,A,AAPL,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP,ADSK,...,WSM,WST,WTW,WY,WYNN,XEL,XOM,YUM,ZBH,ZBRA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,19.973597,6.431898,18.579700,7.601905,31.649681,37.090000,22.062080,20.791332,26.033308,25.670000,...,7.292858,17.571609,52.258419,9.568549,40.966484,12.367369,38.568726,18.697773,51.989990,28.670000
2010-01-05,19.756628,6.443016,18.429586,7.576549,31.845285,37.700001,22.027250,20.903650,25.893511,25.280001,...,7.495924,17.349014,52.141270,9.771677,43.458027,12.220693,38.719318,18.633827,53.635796,28.620001
2010-01-06,19.686436,6.340531,18.531939,7.543795,32.183819,37.619999,21.985445,20.850796,25.832737,25.340000,...,7.762011,17.179838,52.980679,9.663630,42.887978,12.244164,39.053967,18.500616,53.618473,28.400000
2010-01-07,19.660912,6.328812,18.685461,7.499420,32.153732,36.889999,21.811293,20.632772,25.820585,25.480000,...,8.133129,17.233259,52.824551,9.620412,43.803875,12.191363,38.931271,18.495293,54.848488,27.690001
2010-01-08,19.654533,6.370886,18.780983,7.484628,32.025837,36.689999,21.936680,20.375116,25.784105,26.260000,...,8.007089,17.228804,52.765972,9.531814,43.490044,12.197228,38.775078,18.500616,53.696442,27.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,146.602066,130.173767,102.963440,60.267948,255.427048,338.450012,156.698013,87.373421,228.162613,188.160004,...,57.171524,233.644348,236.393463,28.614380,78.801727,64.514526,99.805176,122.864624,124.053886,248.220001
2022-12-27,146.916428,128.367188,103.334633,60.496162,254.678268,335.089996,155.129517,88.564629,228.247833,186.289993,...,55.659248,235.234222,236.577255,28.596205,82.325935,65.096634,101.191879,123.817795,124.631599,251.000000
2022-12-28,145.482147,124.428223,102.630310,59.526245,252.576050,328.329987,153.293198,86.450012,225.236465,181.899994,...,54.655827,229.778931,233.830185,27.832914,78.186691,64.627747,99.529671,123.255424,123.368431,246.839996
2022-12-29,148.429291,127.952568,104.990723,60.011208,257.625275,337.579987,156.831909,85.988297,227.859558,188.119995,...,55.426231,237.470001,238.023087,28.459906,79.328888,65.085655,100.282707,123.903595,125.170158,257.529999


# Select Pairs

## Perform PCA