# 1 DataSet Construction

## 1.1 Components Data
### Scraping Index Components From BlackRock ETFs

In [1]:
"""
This notebook extracts historical data on the stock components of several ETFs
issued by _BlackRock_. A long history (from 2006Q3) is publicly available for:
+ SP500 (IVV), 
+ RUSSELL1000 (IWB), 
+ RUSSELL2000 (IWM), 
+ RUSSELL3000 (IWV).
The _Vanguard_ site (the largest global issuer for passive indexing)
does not record historical data of index (ETF) components.
Moreover, its corresponding ETFs were issued much later than the year 2000.
"""

import json
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root URL of the BlackRock product pages; the ETF-specific path segments
# used by index_components_finder are appended to it.
BLACKROCK_CL_URL_ROOT = "https://www.blackrock.com/cl/productos/"

def index_components_finder(ticker="IVV", path="../data/pkl/"):
    """Scrape the historical components of a BlackRock ETF and pickle them.

    The ETF product page is fetched to discover every available as-of date,
    then the holdings JSON is downloaded once per date.

    Args:
        ticker : {'IVV', 'IWB', 'IWM', 'IWV'}, default 'IVV' (S&P 500)
            ETF whose component history is downloaded.
        path : str, default '../data/pkl/'
            Directory where the pickle file is written (created if missing).
            The file is named
            ``<ticker>_historical_components_<last date as YYYYMMDD>.pkl``.

    Returns:
        pandas.Series indexed by as-of date in chronological order. Each value
        is a list of 2-tuples built from the first two fields of every
        holdings row (ticker symbol and company name in the published data).
        Call ``.to_frame(name="components")`` on it if a DataFrame is needed.

    Raises:
        KeyError: if ``ticker`` is not one of the supported ETFs.
        ValueError: if the holdings section or its date list is not found.
        requests.HTTPError: if a request returns an error status code.
    """
    etf_url = {
        "IVV": "239726/ishares-core-sp-500-etf",   # iShares Core S&P 500 ETF
        "IWB": "239707/ishares-russell-1000-etf",  # iShares Russell 1000 ETF
        "IWM": "239710/ishares-russell-2000-etf",  # iShares Russell 2000 ETF
        "IWV": "239714/ishares-russell-3000-etf",  # iShares Russell 3000 ETF
    }
    # Name of the holdings AJAX resource appended to the product URL.
    # NOTE(review): the numeric id is taken as-is from the site's own
    # requests — confirm it is still valid if downloads start failing.
    holdings_resource = "1506433277024.ajax"
    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

    product_url = BLACKROCK_CL_URL_ROOT + etf_url[ticker]

    # One session for all requests so the connection is reused across dates.
    with requests.Session() as session:
        page = session.get(product_url + "#tabsAll", timeout=request_timeout)
        page.raise_for_status()
        # Explicit parser: results no longer depend on which optional parser
        # libraries happen to be installed, and no parser warning is emitted.
        soup = BeautifulSoup(page.content, "html.parser")

        # find available dates
        holdings = soup.find("div", {"id": "holdings"})
        if holdings is None:
            raise ValueError(f"No holdings section found on the {ticker} product page")
        date_lists = holdings.find_all("div", "component-date-list")
        if len(date_lists) < 2:
            raise ValueError(f"No holdings date list found on the {ticker} product page")
        # The second date list is the one the original scraper relied on.
        # NOTE(review): presumably the selector of the "all holdings" tab — confirm.
        dates = [option.attrs["value"] for option in date_lists[1].find_all("option")]
        if not dates:
            raise ValueError(f"No historical dates available for {ticker}")

        # download constituents for each date
        components = []
        for as_of in dates:
            resp = session.get(
                f"{product_url}/{holdings_resource}",
                params={"tab": "all", "fileType": "json", "asOfDate": as_of},
                timeout=request_timeout,
            )
            resp.raise_for_status()
            # "utf-8-sig" drops a leading UTF-8 byte-order mark only when one
            # is present, instead of always discarding the first three bytes.
            payload = json.loads(resp.content.decode("utf-8-sig"))
            components.append([(row[0], row[1]) for row in payload["aaData"]])

    # Build the Series in one go (no per-date enlargement) and sort by date so
    # the result is chronological regardless of the order used by the site.
    constituents = pd.Series(
        components,
        index=pd.to_datetime(dates, format="%Y%m%d"),
        dtype=object,
    ).sort_index()

    # for pickle filename construction and saving (serial date)
    last_date = constituents.index[-1].strftime("%Y%m%d")
    out_dir = Path(path)
    out_dir.mkdir(parents=True, exist_ok=True)
    pklfile_fullpath = out_dir / f"{ticker}_historical_components_{last_date}.pkl"
    constituents.to_pickle(pklfile_fullpath)
    return constituents

### A Pretty Simple Bulk-Downloader Demo


In [2]:
# Download (and pickle) the component history of every supported ETF;
# the results are kept in the same order as `tickers`.
tickers = ["IVV", "IWB", "IWM", "IWV"]
index_components = [index_components_finder(ticker=symbol) for symbol in tickers]

In [3]:
# First entry corresponds to "IVV" (the first ticker requested above), which
# tracks the S&P 500: its component history as returned by index_components_finder.
index_components[0]

Unnamed: 0,components
2006-09-29,"[(PMCS, PMC-SIERRA INC.), (ANDW, ANDREW CORP.)..."
2006-10-31,"[(PMCS, PMC-SIERRA INC.), (PGL, PEOPLES ENERGY..."
2006-11-30,"[(PMCS, PMC-SIERRA INC.), (ADCT, ADC TELECOMMU..."
2006-12-29,"[(PMCS, PMC-SIERRA INC.), (ADCT, ADC TELECOMMU..."
2007-01-31,"[(PMCS, PMC-SIERRA INC.), (PGL, PEOPLES ENERGY..."
...,...
2022-03-31,"[(AAPL, APPLE INC), (MSFT, MICROSOFT CORP), (A..."
2022-04-29,"[(AAPL, APPLE INC), (MSFT, MICROSOFT CORP), (A..."
2022-05-31,"[(AAPL, APPLE INC), (MSFT, MICROSOFT CORP), (A..."
2022-06-30,"[(AAPL, APPLE INC), (MSFT, MICROSOFT CORP), (A..."
