In [2]:
import pandas as pd

# to downlaod stock prices and other information 
from yahoofinancials import YahooFinancials
import yfinance as yf

# to handle dates 
from datetime import date, datetime

import os

# to parallelise things 
import ray

# useful util to check the time on running a function 
from timebudget import timebudget

In [34]:
# get tickers 
os.chdir("/Users/safishajjouz/Google Drive")
df_tickers = pd.read_csv('Portfolio-09-03-2021.csv')
df_tickers = df_tickers[['Symbol', 'Name', 'Portfolio', 'Currency']]

df_tickers = df_tickers[~df_tickers['Symbol'].isin(['^FTSE', '^FTAS', '^FTMC', 'DJI'])]
df_tickers = df_tickers[df_tickers['Currency']=='USD']
df_tickers = df_tickers.drop_duplicates('Symbol')
tickers = df_tickers['Symbol'].to_list()

start_date = '1990-01-01'
date_end = date.today()
date_end = date_end.strftime("%Y-%m-%d")
time_interval = ['daily', 'weekly','monthly'][0]

In [58]:
@ray.remote
def get_stock_data(ticker, start_date, end_date, time_interval):
    yahoo_financials = YahooFinancials(ticker)
    data = yahoo_financials.get_historical_price_data(start_date= start_date, 
                                                      end_date=end_date, 
                                                      time_interval=time_interval)
    # to get short name and type 
    temp = yahoo_financials.get_stock_quote_type_data()
    
    df_temp = pd.DataFrame(data[ticker]['prices'])
    
    # fix faulty yahoo data that jumps 100x
    jumps_up   = df_temp['adjclose'] / df_temp['adjclose'].shift() >  50
    jumps_down = df_temp['adjclose'] / df_temp['adjclose'].shift() < .02
    correction_factor = 100.**(jumps_down.cumsum() - jumps_up.cumsum())
    df_temp['adjclose'] *= correction_factor
    
    df_temp['Stock'] = temp[ticker]['shortName']
    df_temp['Type'] = temp[ticker]['quoteType']
    
    # use yahoo finance to get other information 
    tik_info = yf.Ticker(ticker)
    info = tik_info.info
    if "sector" in info:
        df_temp['Sector'] = info["sector"]
    else: 
        df_temp['Sector'] = 'unclassified'
            
    return df_temp


ray.init(ignore_reinit_error=True, num_cpus=5)
@timebudget
def run_complex_operations(operation, input):
    mydata = ray.get([operation.remote(ticker,start_date,date_end, time_interval) for ticker in input]) 
    return mydata

mydata=run_complex_operations(get_stock_data, tickers)
ray.shutdown()

df = pd.concat(mydata)
df = df.rename(columns={'formatted_date':'Date'})
df = df.drop('date', axis = 1)
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df = df.set_index('Date')
df.head()

run_complex_operations took 100.534sec


Unnamed: 0_level_0,high,low,open,close,volume,adjclose,Stock,Type,Sector
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2010-06-29,5.0,3.508,3.8,4.778,93831500.0,4.778,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-06-30,6.084,4.66,5.158,4.766,85935500.0,4.766,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-07-01,5.184,4.054,5.0,4.392,41094000.0,4.392,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-07-02,4.62,3.742,4.6,3.84,25699000.0,3.84,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-07-06,4.0,3.166,4.0,3.222,34334500.0,3.222,"Tesla, Inc.",EQUITY,Consumer Cyclical


Unnamed: 0,date,high,low,open,close,volume,adjclose,formatted_date,Stock,Type,Sector
0,1277818200,5.000000,3.508000,3.800000,4.778000,93831500,4.778000,2010-06-29,"Tesla, Inc.",EQUITY,Consumer Cyclical
1,1277904600,6.084000,4.660000,5.158000,4.766000,85935500,4.766000,2010-06-30,"Tesla, Inc.",EQUITY,Consumer Cyclical
2,1277991000,5.184000,4.054000,5.000000,4.392000,41094000,4.392000,2010-07-01,"Tesla, Inc.",EQUITY,Consumer Cyclical
3,1278077400,4.620000,3.742000,4.600000,3.840000,25699000,3.840000,2010-07-02,"Tesla, Inc.",EQUITY,Consumer Cyclical
4,1278423000,4.000000,3.166000,4.000000,3.222000,34334500,3.222000,2010-07-06,"Tesla, Inc.",EQUITY,Consumer Cyclical
...,...,...,...,...,...,...,...,...,...,...,...
1744,1631021400,272.989990,264.670013,270.000000,269.709991,875400,269.709991,2021-09-07,Wayfair Inc.,EQUITY,Consumer Cyclical
1745,1631107800,270.779999,262.510010,270.000000,263.980011,784100,263.980011,2021-09-08,Wayfair Inc.,EQUITY,Consumer Cyclical
1746,1631194200,269.480011,260.600006,265.510010,265.890015,1050000,265.890015,2021-09-09,Wayfair Inc.,EQUITY,Consumer Cyclical
1747,1631280600,272.000000,262.089996,269.890015,262.230011,724300,262.230011,2021-09-10,Wayfair Inc.,EQUITY,Consumer Cyclical


In [56]:
ray.shutdown()

In [6]:
@timebudget
def download_yahoo_data(tickers,start_date, end_date, time_interval, dataframe_format = 'long'):
    
    yahoo_financials = YahooFinancials(tickers)
    data = yahoo_financials.get_historical_price_data(start_date=start_date, 
                                                     end_date=end_date, 
                                                      time_interval=time_interval)
    temp = yahoo_financials.get_stock_quote_type_data()
    
    data_list = []
    for tik in tickers:
        prices = pd.DataFrame.from_dict(data[tik]['prices'])
        prices['Symbol'] = tik
        prices['Stock'] = temp[tik]['shortName']
        prices['Type'] = temp[tik]['quoteType']
        
        tik_info = yf.Ticker(tik)
        info = tik_info.info
        
        if "sector" in info:
            prices['Sector'] = info["sector"]
        else: 
            prices['Sector'] = 'unclassified'
            
        # fix faulty yahoo data that jumps 100x
        jumps_up   = prices['adjclose'] / prices['adjclose'].shift() >  50
        jumps_down = prices['adjclose'] / prices['adjclose'].shift() < .02
        correction_factor = 100.**(jumps_down.cumsum() - jumps_up.cumsum())
        prices['adjclose'] *= correction_factor
        #print(f"Fixed {sum(correction_factor != 1)}/{len(data)} for ticker {tik}"
        #      f" (min: {prices['adjclose'].min()}, max: {prices['adjclose'].max()})")
        data_list.append(prices)
    
    # collect 
    df = pd.concat(data_list)
    df = df.rename(columns={'formatted_date':'Date'})
    df = df.drop('date', axis = 1)
    df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
    df = df.set_index('Date')
    
    if dataframe_format == 'wide':
        df = df.pivot_table(index=["Date"], 
                    columns='Stock', 
                    values='adjclose')

    
    return df

In [8]:
# # Download Data
tickers = df_tickers['Symbol'].to_list() #['^GSPC'] # VIX


df = download_yahoo_data(tickers= tickers,
                   start_date = start_date, 
                   end_date = date_end, 
                   time_interval = time_interval,
                   dataframe_format = 'long') 
df.head()

download_yahoo_data took 678.463sec


Unnamed: 0_level_0,high,low,open,close,volume,adjclose,Symbol,Stock,Type,Sector
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-06-29,5.0,3.508,3.8,4.778,93831500.0,4.778,TSLA,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-06-30,6.084,4.66,5.158,4.766,85935500.0,4.766,TSLA,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-07-01,5.184,4.054,5.0,4.392,41094000.0,4.392,TSLA,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-07-02,4.62,3.742,4.6,3.84,25699000.0,3.84,TSLA,"Tesla, Inc.",EQUITY,Consumer Cyclical
2010-07-06,4.0,3.166,4.0,3.222,34334500.0,3.222,TSLA,"Tesla, Inc.",EQUITY,Consumer Cyclical
