In [1]:
import bs4 as bs
import datetime as dt
import os
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import pickle 
import requests

def save_sp500_tickers():
    """Scrape the S&P 500 constituent list from Wikipedia and cache it.

    Fetches the "List of S&P 500 companies" page, locates the first table
    whose class is ``wikitable sortable`` and reads the first two cells of
    every data row as (ticker, company name). Dots in a ticker are replaced
    with dashes and surrounding whitespace is stripped from both fields.

    The resulting table is pickled to ``sp500tickers.pickle`` in the current
    working directory and also returned.

    Returns:
        numpy.ndarray: two-column array; column 0 holds the ticker, column 1
        the company name, one row per table row that had at least two cells.

    Raises:
        requests.HTTPError: if the page request comes back with an error
            status.
    """
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})

    tickers = []
    names = []
    # The first <tr> is the header row, so it is skipped.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if len(cells) < 2:
            # Not a data row (e.g. header-only); nothing to record.
            continue
        # strip() removes the trailing newline without risking the loss of a
        # real character when the cell has no trailing whitespace.
        tickers.append(cells[0].text.strip().replace('.', '-'))
        names.append(cells[1].text.strip())

    # Build the table BEFORE pickling so the cache holds the real data
    # rather than an empty placeholder.
    symbols = np.column_stack((tickers, names))
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(symbols, f)
    return symbols
# Scrape the constituent table once and keep it for the cells below.
symbols = save_sp500_tickers()


In [2]:
# Sanity check: number of rows, then the ticker/name table itself.
print(len(symbols), symbols, sep='\n')

505
[['MMM' '3M Company']
 ['ABT' 'Abbott Laboratories']
 ['ABBV' 'AbbVie Inc.']
 ...
 ['ZBH' 'Zimmer Biomet']
 ['ZION' 'Zions Bancorp']
 ['ZTS' 'Zoetis']]


In [3]:

def get_data_from_yahoo(reload_sp500=True, start=None, end=None, data_dir='stock_dfs'):
    """Download one price-history CSV per ticker via pandas-datareader.

    For every row of the ticker table a frame is requested from the
    ``'yahoo'`` data source and written to ``<data_dir>/<ticker>.csv`` with
    two extra columns, ``Name`` and ``Symbol``. Tickers whose CSV already
    exists are skipped (a message is printed), so an interrupted run can be
    resumed by calling the function again. A failing download is not caught:
    the exception propagates and stops the run.

    Args:
        reload_sp500: when true, rebuild the ticker table with
            ``save_sp500_tickers()``; otherwise load it from
            ``sp500tickers.pickle`` in the current working directory.
        start: first date to request; defaults to 2020-01-01.
        end: last date to request; defaults to the time of the call.
        data_dir: directory receiving the CSV files; created if missing.

    Returns:
        None
    """
    if reload_sp500:
        symbols = save_sp500_tickers()
    else:
        # NOTE(security): pickle.load can execute arbitrary code. Only load a
        # cache file that this notebook wrote itself.
        with open("sp500tickers.pickle", "rb") as f:
            symbols = pickle.load(f)

    os.makedirs(data_dir, exist_ok=True)

    if start is None:
        start = dt.datetime(2020, 1, 1)
    if end is None:
        end = dt.datetime.now()

    for row in symbols:
        ticker = row[0]
        csv_path = os.path.join(data_dir, '{}.csv'.format(ticker))

        # Just in case the connection breaks, progress is kept on disk:
        # anything already downloaded is not requested again.
        if os.path.exists(csv_path):
            print('Already have {}'.format(ticker))
            continue

        df = web.DataReader(ticker, 'yahoo', start, end)
        # Make sure the index is written under the 'Date' header.
        df.index.name = 'Date'
        df['Name'] = row[1]
        df['Symbol'] = ticker
        df.to_csv(csv_path)


# Re-scrape the constituents and download any price history not yet on disk.
get_data_from_yahoo(reload_sp500=True)

In [4]:
def compile_data(symbol_table=None, data_dir='stock_dfs', out_path='sp500_2020.csv'):
    """Stack the per-ticker CSV files into one frame and save it.

    Reads ``<data_dir>/<ticker>.csv`` for every row of the ticker table
    (the ticker is the row's first element), using the ``Date`` column as the
    index, concatenates the frames row-wise in table order, prints the first
    rows of the result and writes it to ``out_path``.

    Args:
        symbol_table: sequence of rows whose first element is the ticker.
            When omitted, the notebook-level ``symbols`` variable is used,
            which is what the function always did before this parameter
            existed.
        data_dir: directory holding the per-ticker CSV files.
        out_path: destination of the combined CSV.

    Returns:
        None
    """
    if symbol_table is None:
        # Backward-compatible default: fall back to the notebook global.
        symbol_table = symbols

    # Read everything first and concatenate once; growing a frame with
    # pd.concat inside the loop re-copies all earlier rows on every step.
    frames = [
        pd.read_csv(os.path.join(data_dir, '{}.csv'.format(row[0])), index_col='Date')
        for row in symbol_table
    ]
    # pd.concat rejects an empty list, so keep the empty-frame result the
    # function produces when there are no tickers.
    main_df = pd.concat(frames) if frames else pd.DataFrame()

    print(main_df.head())
    main_df.to_csv(out_path)


# Combine the per-ticker CSVs under stock_dfs/ into sp500_2020.csv and print
# the first rows of the combined frame.
compile_data()

                  High         Low        Open       Close     Volume  \
Date                                                                    
2020-01-02  180.009995  177.139999  177.679993  180.000000  3601700.0   
2020-01-03  178.660004  175.630005  177.020004  178.449997  2466900.0   
2020-01-06  178.710007  176.350006  177.149994  178.619995  1998000.0   
2020-01-07  178.509995  176.820007  178.279999  177.899994  2173000.0   
2020-01-08  181.500000  177.649994  178.000000  180.630005  2758300.0   

             Adj Close        Name Symbol  
Date                                       
2020-01-02  172.119888  3M Company    MMM  
2020-01-03  170.637741  3M Company    MMM  
2020-01-06  170.800308  3M Company    MMM  
2020-01-07  170.111816  3M Company    MMM  
2020-01-08  172.722321  3M Company    MMM  
