In [1]:
import yfinance as yf
import pandas as pd
import requests
import bs4
import re
import numpy as np
import datetime as dt
import os
from collections import Counter

In [5]:
# Lookback period string.
# NOTE(review): this value is not passed to yf.download by sp500_datascrape,
# which requests an explicit start/end window instead.
datascrape_range = '1y'
# Bar size that sp500_datascrape passes to yf.download as `interval`.
datascrape_frequency = '1h' #can be changed to as low as every minute
# Calendar date at the time this cell runs; not referenced by the cells visible here.
today = dt.date.today()
#function for metadata scrape
def sp500_metascrape():
    """Scrape ticker, GICS sector and GICS sub-industry of the S&P 500 constituents.

    Downloads the Wikipedia "List of S&P 500 companies" page and reads the
    first table whose class is 'wikitable sortable'. The first table row is
    treated as the header; every following row yields one record.

    Returns:
        pandas.DataFrame with the columns 'Ticker', 'GICS Sector' and
        'GICS Sub-Industry'. Dots in tickers are replaced with dashes to match
        yfinance notation, and surrounding whitespace is stripped from all
        three fields.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if the constituents table is not present in the page.
    """
    columns = ['Ticker', 'GICS Sector', 'GICS Sub-Industry']

    #scrape S&P 500 HTML (fail loudly on HTTP errors instead of parsing an error page)
    resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=30)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.text, 'lxml')

    #isolate relevant table
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError("Could not find the 'wikitable sortable' table on the S&P 500 page")

    rows = table.find_all('tr')
    if not rows:
        return pd.DataFrame(columns=columns)

    #resolve column positions from the header row so that an added or removed
    #column does not silently shift the scraped fields; if any expected header
    #is missing, fall back to the original fixed positions as a whole
    headers = [cell.text.strip() for cell in rows[0].find_all('th')]
    wanted = ('Symbol', 'GICS Sector', 'GICS Sub-Industry')
    if all(label in headers for label in wanted):
        ticker_col, sector_col, subind_col = (headers.index(label) for label in wanted)
    else:
        ticker_col, sector_col, subind_col = 0, 3, 4
    min_cells = max(ticker_col, sector_col, subind_col) + 1

    #collect plain records and build the DataFrame once; growing a DataFrame
    #row by row is quadratic and DataFrame.append no longer exists in pandas 2.x
    records = []
    for row in rows[1:]:
        cells = row.find_all('td')
        if len(cells) < min_cells:
            #not a data row (e.g. spacer or malformed row)
            continue
        records.append({
            #replace . with - to match yfinance notation; strip() removes the
            #trailing newline without cutting a real character when it is absent
            'Ticker': cells[ticker_col].text.strip().replace('.', '-'),
            'GICS Sector': cells[sector_col].text.strip(),
            'GICS Sub-Industry': cells[subind_col].text.strip(),
        })
    return pd.DataFrame(records, columns=columns)

#function for historical market data scrape (explicit start/end window)
def sp500_datascrape(symbols, start='2019-06-07', end='2020-04-12', interval=None):
    """Download historical market data for a set of tickers via yfinance.

    Args:
        symbols: tickers to download. Accepts a pandas Series (such as the
            'Ticker' column produced by sp500_metascrape), any iterable of
            strings, or a single whitespace-separated string.
        start: start of the download window, forwarded to yf.download.
        end: end of the download window, forwarded to yf.download.
        interval: bar size forwarded to yf.download; when None, the
            module-level datascrape_frequency is used.

    Returns:
        The object returned by yf.download for the request, grouped by ticker
        and with auto-adjusted prices.

    NOTE(review): Yahoo limits how far back intraday intervals can be
    requested - confirm the default window is still downloadable at the
    configured frequency.
    """
    #prepare tickers in spaced string
    if isinstance(symbols, str):
        tickers = symbols.split()
    else:
        tickers = [str(symbol).strip() for symbol in symbols]
    ticker_string = ' '.join(ticker for ticker in tickers if ticker)

    if interval is None:
        interval = datascrape_frequency

    #download data
    data = yf.download(
        tickers=ticker_string,
        start=start,
        end=end,
        interval=interval,
        group_by='ticker',
        auto_adjust=True,
        threads=True,
        prepost=False
    )
    return data

In [3]:
# Scrape the constituent metadata, then download market data for every scraped ticker.
# NOTE(review): this cell's execution count (In [3]) is lower than that of the
# cell defining the scrape functions (In [5]), so `data` may have been produced
# by an earlier version of sp500_datascrape - confirm with Restart & Run All.
meta = sp500_metascrape()
data = sp500_datascrape(meta['Ticker'])

[*********************100%***********************]  505 of 505 completed


In [6]:
# Second download using the same call and arguments as the one that produced `data`.
# NOTE(review): on a fresh kernel both calls request the same thing; the recorded
# output of this run lists two failed downloads (VTRS, LUMN). Consider a more
# descriptive name than `data2` or dropping one of the two downloads.
data2 = sp500_datascrape(meta['Ticker'])

[*********************100%***********************]  505 of 505 completed

2 Failed downloads:
- VTRS: No data found for this date range, symbol may be delisted
- LUMN: No data found for this date range, symbol may be delisted


In [7]:
# Preview the first rows of the second download (columns are grouped per ticker).
data2.head()

Unnamed: 0_level_0,OTIS,OTIS,OTIS,OTIS,OTIS,ALXN,ALXN,ALXN,ALXN,ALXN,...,ZBH,ZBH,ZBH,ZBH,ZBH,ATO,ATO,ATO,ATO,ATO
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
2019-06-07 09:30:00-04:00,,,,,,119.410004,119.699997,117.18,117.849998,0.0,...,120.910004,121.110001,120.0,120.379997,0.0,104.690002,105.190002,104.690002,104.949997,0.0
2019-06-07 10:30:00-04:00,,,,,,117.93,118.809998,117.93,118.0,196681.0,...,120.339996,120.800003,120.0,120.084999,122288.0,104.985001,105.080002,104.760002,104.779999,30839.0
2019-06-07 11:30:00-04:00,,,,,,118.110001,118.580002,117.760002,118.574997,116032.0,...,120.059998,120.084999,119.459999,119.970001,109767.0,104.779999,104.849998,104.5,104.82,29672.0
2019-06-07 12:30:00-04:00,,,,,,118.620003,118.989998,118.440002,118.589996,84039.0,...,120.010002,120.129997,119.690002,119.800003,60220.0,104.849998,105.190002,104.849998,105.022697,19462.0
2019-06-07 13:30:00-04:00,,,,,,118.580002,118.699997,118.110001,118.139999,139999.0,...,119.800003,120.494598,119.800003,120.459999,51739.0,105.0,105.0,104.220001,104.32,43613.0


In [8]:
# Preview the last rows of the second download to check the end of the window.
data2.tail()

Unnamed: 0_level_0,OTIS,OTIS,OTIS,OTIS,OTIS,ALXN,ALXN,ALXN,ALXN,ALXN,...,ZBH,ZBH,ZBH,ZBH,ZBH,ATO,ATO,ATO,ATO,ATO
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
2020-04-09 11:30:00-04:00,45.689999,47.419998,45.419998,46.799999,2498734.0,95.989998,96.720001,95.830002,95.989998,361307.0,...,110.610001,112.238998,109.800003,111.769997,217106.0,106.900002,107.800003,106.260002,107.419998,100511.0
2020-04-09 12:30:00-04:00,46.830002,47.48,46.5401,47.195,1751533.0,95.989998,96.220001,95.400497,96.110001,242844.0,...,111.800003,112.989998,111.739998,112.900002,161864.0,107.480003,107.889999,106.43,106.650002,96552.0
2020-04-09 13:30:00-04:00,47.195,47.720001,46.599998,47.268902,1393009.0,96.110001,96.440002,94.230003,94.360001,432677.0,...,112.970001,113.300003,111.129997,111.415001,170387.0,106.629997,106.910004,105.620003,105.669998,119738.0
2020-04-09 14:30:00-04:00,47.259998,48.23,46.700001,47.970001,2926851.0,94.370003,95.93,94.370003,95.867996,372271.0,...,111.199997,112.050003,109.529999,110.650002,374023.0,105.660004,105.910004,103.690002,105.620003,226909.0
2020-04-09 15:30:00-04:00,47.990002,50.25,47.799999,49.610001,2482745.0,95.889999,96.269997,95.019997,96.230003,471499.0,...,110.745003,111.330002,110.160004,110.5,272856.0,105.845001,106.18,104.910004,105.190002,317458.0


In [9]:
# Persist the raw scrape results so later steps can reload them without
# re-downloading. Create the target directory first: pd.to_pickle does not
# create missing parent directories and would fail on a fresh checkout.
os.makedirs('./storage/rawdata', exist_ok=True)
pd.to_pickle(data, './storage/rawdata/data.pickle')
pd.to_pickle(meta, './storage/rawdata/meta.pickle')
pd.to_pickle(data2, './storage/rawdata/data2.pickle')