In [10]:
# Packages
from pynance import *
import pandas as pd
import gc 
from datetime import datetime
import numpy as np

# Directories
# Folder the company CSV is read from. This is a relative path, so it is
# resolved against the kernel's current working directory at read time.
transform = '../../data/transform'


# del my_array
# del my_object
# gc.collect()

# Import a list of stocks

In [2]:
# Load the company table from the transform directory. Its 'ticker' column
# drives the price download later in the notebook.
# NOTE(review): the file name carries a fixed snapshot date (16-Jan-2022).
snp_companies = pd.read_csv(f'{transform}/dim_snp_esg_full_16-Jan-2022.csv')

In [3]:
# # Change column name for easier reading
# NOTE(review): the rename below is left commented out. The frame displayed by
# this cell already shows the target column names (ticker, company_name,
# gics_sector, gics_sub_industry), so it looks unnecessary for this CSV;
# confirm before deleting it.
# snp_companies = snp_companies.rename({'Symbol':'ticker',
#                                       'Security':'company_name',
#                                       'GICS Sector': 'gics_sector',
#                                       'GICS Sub-Industry':'gics_sub_industry'},
#                                        axis=1)
# Display the loaded frame.
snp_companies

Unnamed: 0,ticker,company_name,gics_sector,gics_sub_industry,is_esg
0,A,Agilent Technologies,Health Care,Health Care Equipment,1
1,AAL,American Airlines Group,Industrials,Airlines,1
2,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,0
3,AAPL,Apple,Information Technology,"Technology Hardware, Storage & Peripherals",1
4,ABBV,AbbVie,Health Care,Pharmaceuticals,1
...,...,...,...,...,...
502,YUM,Yum! Brands,Consumer Discretionary,Restaurants,1
503,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,0
504,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,1
505,ZION,Zions Bancorp,Financials,Regional Banks,0


In [4]:
# Data cleaning
# Replace the literal '.' in ticker symbols with '-' (presumably the form the
# price source expects; confirm against pynance).
# The pattern must be matched literally: as a regular expression '.' matches
# ANY character, so with regex=True on pandas >= 2.0 every character of every
# ticker would be turned into '-'. Older pandas only avoided this through a
# deprecated special case for single-character patterns.
snp_companies['ticker'] = snp_companies['ticker'].str.replace('.', '-', regex=False)

## Preparing the dataframe to store daily stock prices

In [5]:
# Backfill window. It is kept separate from the stock price query because the
# date range used when getting stock prices will later be converted to
# incremental loading.
base_start_date = '2019-01-01'
base_end_date = '2022-01-14'

# Every calendar day in the window; this column behaves as an index.
calendar_days = pd.date_range(start=base_start_date, end=base_end_date, freq='D')
stock_prices = pd.DataFrame({'date': pd.Series(calendar_days)})

# Business days (Monday to Friday) in the same window.
business_days = pd.bdate_range(start=base_start_date, end=base_end_date, freq='B')
biz_date = pd.DataFrame(business_days, columns=['biz_date'])

# Left-join so that calendar days with no matching business day get a missing
# 'biz_date'; this is how weekends or hidden holidays are detected.
stock_prices = stock_prices.merge(
    biz_date,
    how='left',
    left_on='date',
    right_on='biz_date',
)

# The day of the week with Monday=0, Sunday=6.
stock_prices['day'] = stock_prices['date'].dt.dayofweek

## Retrieving Stock Prices

In [6]:
# TODO: add a data quality check here to ensure the dates do not go out of range

In [7]:
# Column checker
# Tickers to query. The loop that follows merges one price column per ticker
# into stock_prices, so the expected final column count can be stated up front.
stock_name_list = snp_companies.ticker.to_list()

# list(columns) is used instead of Index.format(), which is deprecated in
# recent pandas; for string column labels the rendered list is the same.
print(f"""
We expect an additional {len(stock_name_list)} columns when the query is done

There are {stock_prices.shape[1]} columns with shape as {stock_prices.shape} in the base dataframe with column names as : {list(stock_prices.columns)}

Therefore, when the data is loaded, there should be {len(stock_name_list) + stock_prices.shape[1]} columns
""")


We expect an additional 507 when the query is done')

There are 3 columns with shape as (1110, 3) in the base dataframe with column names as : ['date', 'biz_date', 'day']

Therefore, when the data is loaded, there should be 510 columns



In [None]:
# Download the 'Close' series for every ticker and left-join it onto the base
# date frame as a column named after the ticker. Tickers whose result has no
# 'Close' column raise KeyError and are collected in error_tickers instead of
# stopping the run.
#
# The progress counter comes from enumerate() so that it advances for failed
# tickers too; a manual increment placed after the merge is skipped whenever a
# ticker raises, which makes "Stock X of N" fall behind.
#
# NOTE(review): this cell is not idempotent. Re-running it merges the same
# ticker columns into stock_prices a second time.
error_tickers = []
for count, stock in enumerate(stock_name_list, start=1):
    try:
        print(f'Processing {stock}')
        print(f'Stock {count} of {len(stock_name_list)}')

        # Keep only the closing price and label the column with the ticker.
        temp_stock_info_df = (pd
                              .DataFrame(
                                  pn.data.get(stock, base_start_date, base_end_date).loc[:,'Close'])
                              .rename({'Close':f'{stock}'},axis=1)
                             )

        print(f'Successfully processed {stock} at {datetime.today()}!')

        # Left join keeps every calendar day; days without a price stay missing.
        stock_prices = (stock_prices.merge(temp_stock_info_df,
                                           how='left',
                                           left_on='date',
                                           right_on='Date'))

        print(f'DataFrame size is {stock_prices.shape}.\n')

        # Reduce RAM consumption
        del temp_stock_info_df
        gc.collect()

    except KeyError:
        # Record the ticker and carry on with the next one.
        error_tickers.append(stock)
        print(f'This ticker {stock} is not found by Pynance\n')

In [None]:
# Debug probe: fetch BSX on its own and select 'Close'. The saved output of
# this cell is KeyError: 'Close', the same exception type the download loop
# catches and records in error_tickers.
pn.data.get('BSX', base_start_date, base_end_date).loc[:,'Close']

KeyError: 'Close'

In [11]:
# Control probe: fetch AAPL over the same date range without selecting a
# column, so the complete result returned by pn.data.get can be inspected.
pn.data.get('AAPL', base_start_date, base_end_date)