In [11]:
# Packages
import pynance as pn
import pandas_datareader.data as web
import pandas as pd
import gc 
from datetime import datetime
from datetime import timedelta
import numpy as np

# Directories
# Folder holding the transformed data files. The path is relative, so it is
# presumably resolved against the notebook's working directory - verify when
# running from elsewhere.
transform_path = '../../data/transform'


# Reference snippet (not executed): how to release large objects manually.
# del my_array
# del my_object
# gc.collect()

# Import a list of stocks

In [2]:
# Load the company dimension table (ticker, company name, GICS sector /
# sub-industry and the is_esg flag) from the transform folder.
# The folder comes from `transform_path`, set in the setup cell; the name
# `transform` used previously is not defined by any cell shown here, so the
# read failed with NameError on a fresh kernel.
snp_companies = pd.read_csv(f'{transform_path}/dim_snp_esg_full_16-Jan-2022.csv')

In [3]:
# Display the loaded company table (the notebook output truncates the middle rows).
snp_companies

Unnamed: 0,ticker,company_name,gics_sector,gics_sub_industry,is_esg
0,A,Agilent Technologies,Health Care,Health Care Equipment,1
1,AAL,American Airlines Group,Industrials,Airlines,1
2,AAP,Advance Auto Parts,Consumer Discretionary,Automotive Retail,0
3,AAPL,Apple,Information Technology,"Technology Hardware, Storage & Peripherals",1
4,ABBV,AbbVie,Health Care,Pharmaceuticals,1
...,...,...,...,...,...
502,YUM,Yum! Brands,Consumer Discretionary,Restaurants,1
503,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,0
504,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,1
505,ZION,Zions Bancorp,Financials,Regional Banks,0


In [4]:
# Filter for ESG companies and the benchmarks
snp_companies_esg = snp_companies[snp_companies['is_esg'] == 1]
snp_companies_etf = snp_companies[snp_companies['gics_sector'] == 'ETF']

# Stack ESG rows and ETF (benchmark) rows into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same
# row-wise stacking and, like append, keeps the original index labels.
snp_companies_esg_etf = pd.concat([snp_companies_esg, snp_companies_etf])

# Data cleaning
# Replace the dot in tickers with a dash (presumably to match the symbol format
# the price source expects - confirm). regex=False makes '.' a literal dot:
# as a regular expression '.' matches ANY character, which on pandas >= 2.0
# would turn every ticker into a string of dashes.
snp_companies_esg_etf['ticker'] = snp_companies_esg_etf['ticker'].str.replace('.', '-', regex=False)

## Preparing the dataframe to store daily stock prices

In [5]:
# Backfill window. It is kept separate from the price query because the query's
# date range is meant to be converted to incremental loading later on.
start_date = '2021-01-01'
end_date = '2022-01-14'

# Yahoo appears to return data starting one day before the requested date, so
# the base table begins a day earlier to line up with what the query returns.
query_start = datetime.strptime(start_date, "%Y-%m-%d")
base_start_date = (query_start - timedelta(days=1)).strftime("%Y-%m-%d")
base_end_date = end_date

# One row per calendar day; the 'date' column is the key every ticker is joined on.
stock_prices = pd.DataFrame(
    {'date': pd.date_range(start=base_start_date, end=base_end_date, freq='D')}
)

# Business days over the same window.
biz_date = pd.DataFrame(
    {'biz_date': pd.bdate_range(start=base_start_date, end=base_end_date, freq='B')}
)

# Left-join the business days onto the calendar: rows where biz_date is missing
# are non-business days (used to detect weekends or hidden holidays).
stock_prices = stock_prices.merge(
    biz_date,
    how='left',
    left_on='date',
    right_on='biz_date',
)

# Day of the week, Monday=0 ... Sunday=6.
stock_prices = stock_prices.assign(day=stock_prices['date'].dt.dayofweek)

## Retrieving Stock Prices

In [6]:
# TODO: add a data-quality check here to ensure the dates do not go out of range

In [7]:
# Column checker
stock_name_list = snp_companies_esg_etf.ticker.to_list()

print(f"""
We expect an additional {len(stock_name_list)} when the query is done

There are {stock_prices.shape[1]} columns with shape as {stock_prices.shape} in the base dataframe with column names as : {stock_prices.columns.format()}

Therefore, when the data is loaded, there should be {len(stock_name_list) + stock_prices.shape[1]} columns
""")


We expect an additional 313 when the query is done

There are 3 columns with shape as (380, 3) in the base dataframe with column names as : ['date', 'biz_date', 'day']

Therefore, when the data is loaded, there should be 316 columns



In [8]:
# Download the daily closing price of every ticker and left-join it onto the
# calendar frame as one column per ticker. Tickers whose download or merge
# raises KeyError are collected in error_tickers instead of aborting the run.
error_tickers = []
total_stocks = len(stock_name_list)

# enumerate drives the progress counter, so it keeps advancing when a ticker
# fails (a counter incremented only on success repeats its number after a failure).
for count, stock in enumerate(stock_name_list, start=1):
    print(f'Processing {stock}')
    print(f'Stock {count} of {total_stocks}')

    # Re-running this cell, or a ticker listed twice, would merge the same
    # column again and leave suffixed duplicates; skip what is already loaded.
    if stock in stock_prices.columns:
        print(f'{stock} is already loaded, skipping.\n')
        continue

    try:
        # Alternative source kept for reference:
        #   pn.data.get(stock, base_start_date, base_end_date).loc[:, 'Close']
        # Keep only the closing price, as a one-column frame named after the ticker.
        temp_stock_info_df = (web.DataReader(stock, 'yahoo', start=start_date, end=end_date)
                                 .loc[:, ['Close']]
                                 .rename({'Close': f'{stock}'}, axis=1))

        print(f'Successfully processed {stock} at {datetime.today()}!')

        # 'Date' is the name of the downloaded frame's index; the left join keeps
        # every calendar day and leaves the price empty where nothing was returned.
        stock_prices = (stock_prices.merge(temp_stock_info_df,
                                           how='left',
                                           left_on='date',
                                           right_on='Date'))

        print(f'DataFrame size is {stock_prices.shape}.\n')

        # No explicit del / gc.collect() is needed: the per-ticker frame is
        # released as soon as the name is rebound on the next iteration.

    except KeyError:
        error_tickers.append(stock)
        print(f'This ticker {stock} is not found.\n')
        continue

Processing A
Stock 1 of 313
Successfully processed A at 2022-01-16 14:52:20.181128!
DataFrame size is (380, 4).

Processing AAL
Stock 2 of 313
Successfully processed AAL at 2022-01-16 14:52:21.634614!
DataFrame size is (380, 5).

Processing AAPL
Stock 3 of 313
Successfully processed AAPL at 2022-01-16 14:52:22.657441!
DataFrame size is (380, 6).

Processing ABBV
Stock 4 of 313
Successfully processed ABBV at 2022-01-16 14:52:23.575415!
DataFrame size is (380, 7).

Processing ABC
Stock 5 of 313
Successfully processed ABC at 2022-01-16 14:52:24.497356!
DataFrame size is (380, 8).

Processing ABT
Stock 6 of 313
Successfully processed ABT at 2022-01-16 14:52:25.583848!
DataFrame size is (380, 9).

Processing ACN
Stock 7 of 313
Successfully processed ACN at 2022-01-16 14:52:26.958393!
DataFrame size is (380, 10).

Processing ADBE
Stock 8 of 313
Successfully processed ADBE at 2022-01-16 14:52:27.882624!
DataFrame size is (380, 11).

Processing ADI
Stock 9 of 313
Successfully processed ADI at 

In [9]:
# my notes
# https://pandas-datareader.readthedocs.io/en/latest/remote_data.html

# In [39]: import pandas_datareader.data as web

# In [40]: import pandas as pd

# In [41]: import datetime as dt

# In [42]: df = web.DataReader('GE', 'yahoo', start='2019-09-10', end='2019-10-09')

## Pynance
# pn.data.get('AAPL', base_start_date, base_end_date)

In [12]:
# Write the date-by-ticker price table to the transform folder, without the row index.
fact_prices_csv = f"{transform_path}/fact_esg_stock_prices.csv"
stock_prices.to_csv(fact_prices_csv, index=False)

In [13]:
# Tickers that raised KeyError in the retrieval loop; an empty list means none failed that way.
print(error_tickers)

[]
