In [1]:
import pandas as pd
from pandas_datareader import data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

# Suppress every warning category for the rest of the session (no category or
# module filter is given, so nothing is exempt).
# NOTE(review): this also hides deprecation warnings from pandas and
# pandas_datareader; consider narrowing it to the specific category that is noisy.
filterwarnings('ignore')

  from pandas.util.testing import assert_frame_equal


## Data scraping and prep

We first download the tickers of the S&P 100 index constituents from Wikipedia, and then use them to download the adjusted closing price history of each stock.

In [5]:
# Load the S&P 100 ticker symbols as a plain list of strings.
# A local CSV acts as a cache so Wikipedia is only scraped when the file is missing.
try:
    ticker_table = pd.read_csv('tickers_sp100.csv')
except FileNotFoundError:
    website_data = pd.read_html('https://en.wikipedia.org/wiki/S%26P_100')
    # Index 2 is the ticker table on the page (positional, so it will need
    # updating if the page layout changes).
    # Select with a list so the result stays a one-column DataFrame: both
    # branches must yield the same type, otherwise the `Symbol` lookup below
    # fails on a Series the first time the notebook is run.
    ticker_table = website_data[2][['Symbol']]
    ticker_table.to_csv('tickers_sp100.csv', index=False)

# Separate name for the table so this line does not rebind its own input.
tickers = ticker_table['Symbol'].to_list()

In [11]:
# Date window requested from the data reader for every ticker.
start_date = '2015-05-31'
end_date = '2020-05-31'

# Build `raw_data`: one 'Adj Close' column per ticker.
# A local pickle acts as a cache so the download only happens when it is missing.
try:
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    raw_data = pd.read_pickle('sp100_adj_close.pkl')
except FileNotFoundError:
    raw_data = pd.DataFrame()

    print('Downloading started:')
    print('-'*50)

    for ticker in tickers:
        try:
            col = data.DataReader(ticker, 'yahoo', start_date, end_date)['Adj Close']
            raw_data[ticker] = col
        except Exception as exc:
            # Best-effort download: skip tickers that fail, but report why.
            # Catching `Exception` (not a bare `except`) lets KeyboardInterrupt
            # propagate so a long download can still be aborted.
            print(f'Could not retrieve the data for {ticker}: {exc!r}')

    print('Downloading complete!')
    print('-'*50)

    # Do not cache an empty result: a later run would load the empty pickle
    # and never retry the download.
    if raw_data.empty:
        print('No data was retrieved; not writing the cache file.')
    else:
        raw_data.to_pickle('sp100_adj_close.pkl')
        

In [10]:
# Preview the first rows of the price table as a quick sanity check of the load.
raw_data.head()

Unnamed: 0_level_0,AAPL,ABBV,ABT,ACN,ADBE,AIG,ALL,AMGN,AMT,AMZN,...,UNH,UNP,UPS,USB,V,VZ,WBA,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-06-01,120.306801,53.367821,44.21843,87.960396,79.870003,52.126572,60.783951,137.054474,85.369728,430.920013,...,110.690834,90.819168,84.684486,37.619061,66.648201,39.303658,76.173103,47.192017,66.173225,68.942108
2015-06-02,119.772255,53.120647,44.21843,88.088417,79.82,53.014893,60.974697,136.53952,85.297272,430.98999,...,108.742249,90.525146,84.641922,37.759136,66.580704,39.239769,76.483177,47.30991,65.996109,68.950233
2015-06-03,119.919716,53.43161,44.146141,88.682808,80.129997,54.285191,61.510612,137.141739,83.965469,436.589996,...,109.296326,90.87262,85.637917,38.494534,66.493904,39.19186,76.031357,47.924656,66.314903,68.796333
2015-06-04,119.219307,53.758537,43.983513,87.722641,78.75,53.841042,60.902039,137.979599,85.768349,430.779999,...,107.726387,90.649872,85.21228,38.07431,65.780273,38.409309,75.526375,47.27623,65.65963,68.205154
2015-06-05,118.564957,53.742588,43.775707,87.237999,78.860001,53.743324,60.3843,137.246475,83.820503,426.950012,...,107.61557,91.131012,85.561317,38.512047,65.93457,37.714581,74.321518,47.672012,64.694435,68.253754


Plan of attack:
 - compute the returns of the price series above (maybe log returns?)
 - compute the correlation matrix of the returns and its eigenvalue decomposition
 - compare the eigenvalue spectrum with the Marchenko–Pastur distribution and see what it says here