# Extract the economic table and scrape information for each indicator

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from bs4 import BeautifulSoup as bs
from glob import glob
import pandas as pd
from tqdm import tqdm

from selenium import webdriver 
import time
# Seconds to wait after a page is requested, before its HTML is read
# (passed to time.sleep in scrape_page).
sleep_time_sec = 1
# Options handed to every Chrome instance started by scrape_page.
CHROME_OPTIONS = webdriver.chrome.options.Options()
CHROME_OPTIONS.add_argument('--no-sandbox')  # Chrome flag: run without the sandbox
CHROME_OPTIONS.add_argument('--headless')    # Chrome flag: run without a visible window
# Location of the chromedriver executable used to start Chrome.
CHROMEDRIVER_PATH='/usr/local/bin/chromedriver'

In [2]:
def get_importance_stars(tr) -> int:
    """Count the filled importance icons of a calendar row.

    The icons are the ``<i>`` tags found inside the row's ``td`` cell whose
    class is ``left textNum sentiment noWrap``. An icon counts as filled when
    its first CSS class is ``grayFullBullishIcon``.

    Parameters
    ----------
    tr : row element exposing ``find`` (a BeautifulSoup ``<tr>`` tag).

    Returns
    -------
    int
        Number of filled icons in the row.
    """
    sentiment_cell = tr.find('td', class_='left textNum sentiment noWrap')
    return sum(
        1
        for icon in sentiment_cell.find_all('i')
        if icon['class'][0] == 'grayFullBullishIcon'
    )

In [3]:
# Collect one (link, country, importance) tuple per event row of every saved
# calendar page.
files = sorted(glob("./html_calendars/*html"))

href_flag_list = []
for file in files:
    # Open in binary mode so BeautifulSoup detects the page encoding itself
    # instead of decoding with the platform's default encoding: the pages
    # contain non-ASCII text (e.g. country names such as "Türkiye").
    with open(file, 'rb') as fp:
        soup = bs(fp, 'html.parser')
    for tr in soup.find_all('tr', class_='js-event-item'): # Not including holidays rows
        tr_tuple = (
            tr.find('td', class_='left event').a['href'],                       # link to indicator page
            tr.find('td', class_='left flagCur noWrap').find('span')['title'],  # country name
            get_importance_stars(tr)                                            # importance
        )
        href_flag_list.append(tr_tuple)

In [4]:
print("Total number of rows:", len(href_flag_list))
# Deduplicate on the full (link, country, importance) tuple: the same link
# seen with different importance values is therefore counted more than once.
href_flag_set = set(href_flag_list)
print("Number of unique indicators:", len(href_flag_set))

Total number of rows: 7262
Number of unique indicators: 1715


In [5]:
# Build one table row per unique (link, country, importance) tuple.
row_list = []
# Iterate in sorted order: set iteration order is arbitrary, and several
# tuples can share the same ID (same link, different importance). Sorting
# here, together with the stable sort below, makes the order of such rows
# reproducible from run to run (lower importance first for the same link).
for href, flag, importance in sorted(href_flag_set):
    row = {
        'ID': int(href.split('-')[-1]),                           # trailing number of the link
        'Nation': flag,
        'Importance': importance,
        'Title': None,                                            # filled later from the indicator page
        'Name': " ".join(href.split('/')[-1].split('-')[:-1]),    # link slug without the ID
        'URL': href,
    }
    row_list.append(row)
# kind='stable' keeps rows with equal ID in insertion order (the default
# sorting algorithm does not guarantee that).
df = pd.DataFrame(row_list).sort_values(by='ID', kind='stable').reset_index(drop=True)
df

Unnamed: 0,ID,Nation,Importance,Title,Name,URL
0,1,United States,3,,adp nonfarm employment change,https://www.investing.com/economic-calendar/ad...
1,5,New Zealand,1,,anz commodity price index,https://www.investing.com/economic-calendar/an...
2,6,Japan,1,,average cash earnings,https://www.investing.com/economic-calendar/av...
3,7,United Kingdom,2,,average earnings index bonus,https://www.investing.com/economic-calendar/av...
4,8,United States,3,,average hourly earnings,https://www.investing.com/economic-calendar/av...
...,...,...,...,...,...,...
1710,2255,Türkiye,1,,trade ministry trade balance,https://www.investing.com/economic-calendar/tr...
1711,2256,Taiwan,1,,m3 money supply,https://www.investing.com/economic-calendar/m3...
1712,2257,South Africa,1,,leading indicator,https://www.investing.com/economic-calendar/le...
1713,2258,Australia,1,,rba gov bullock speaks,https://www.investing.com/economic-calendar/rb...


In [6]:
# Merge with already existing data
df_old = pd.read_csv('./economic_table.csv')
# Old rows are placed first so that, for an ID present in both tables, the old
# row (which should have the Title already) is the one kept.
df = pd.concat([df_old, df])
df = df.drop_duplicates(subset=['ID'], keep="first")
df = df.sort_values(by='ID')
df = df.reset_index(drop=True)
df

Unnamed: 0,ID,Nation,Importance,Title,Name,URL,Source,Source_link,Currency,Description
0,1,United States,3,United States ADP Nonfarm Employment Change,adp nonfarm employment change,https://www.investing.com/economic-calendar/ad...,Automatic Data Processing (ADP),http://www.adpemploymentreport.com/,USD,The ADP National Employment Report is a measur...
1,5,New Zealand,1,New Zealand ANZ Commodity Price Index MoM,anz commodity price index,https://www.investing.com/economic-calendar/an...,Australia & New Zealand Banking Group,http://www.anz.co.nz/about-us/economic-markets...,NZD,The ANZ Commodity Price Index measures the cha...
2,6,Japan,1,Japan Average Cash Earnings YoY,average cash earnings,https://www.investing.com/economic-calendar/av...,"Japanese Ministry of Health, Labour and Welfare",http://www.mhlw.go.jp/english/database/db-l/in...,JPY,Average Cash Earnings measures the change in e...
3,7,United Kingdom,2,United Kingdom Average Earnings Index +Bonus,average earnings index bonus,https://www.investing.com/economic-calendar/av...,Office for National Statistics,https://www.ons.gov.uk/,GBP,The Average Earnings Index measures change in ...
4,8,United States,3,United States Average Hourly Earnings MoM,average hourly earnings,https://www.investing.com/economic-calendar/av...,"U.S. Bureau of Labor Statistics, Department of...",https://www.bls.gov/news.release/empsit.toc.htm,USD,Average Hourly Earnings measures the change in...
...,...,...,...,...,...,...,...,...,...,...
1657,2255,Türkiye,1,Turkish Trade Ministry Trade Balance,trade ministry trade balance,https://www.investing.com/economic-calendar/tr...,Turkish Statistical Institute,https://www.tuik.gov.tr/,TRY,The Trade Balance index measures the differenc...
1658,2256,Taiwan,1,M3 Money Supply,m3 money supply,https://www.investing.com/economic-calendar/m3...,Central Bank of The Republic of China (Taiwan),https://www.cbc.gov.tw/en/mp-2.html,TWD,M3 Money Supply measures the change in the tot...
1659,2257,South Africa,1,South Africa Leading Indicator,leading indicator,https://www.investing.com/economic-calendar/le...,OECD,https://stats.oecd.org/,ZAR,The Leading Indicators Index is a composite in...
1660,2258,Australia,1,,rba gov bullock speaks,https://www.investing.com/economic-calendar/rb...,,,,


In [7]:
# Write the merged table back to the CSV file it was read from (row index not written).
df.to_csv('./economic_table.csv', index=False)

## Retrieve information from each page


In [8]:
# Try with requests
url = "https://www.investing.com/economic-calendar/french-cpi-112"
# get html
import requests
# Always pass a timeout (seconds): without one, requests can wait forever on
# an unresponsive server and block the notebook.
r = requests.get(url, timeout=10)
# beautify html
soup = bs(r.text, 'html.parser')
soup.find('title').text
# NOTE: not working! It seems that the website is blocking requests

'Attention Required! | Cloudflare'

In [9]:
# Use Selenium on the URL page to extract page-info
def scrape_page(url):
    """Load an indicator page in Chrome and extract its page-info.

    Parameters
    ----------
    url : str
        Address of the indicator page to load.

    Returns
    -------
    res : dict
        Keys 'Title', 'Source', 'Source_link', 'Currency', 'Description'.
        Every value is None when the page could not be loaded or when its
        title is in the blacklist; a single value is None when the matching
        element is not present in the page.
    soup : BeautifulSoup or None
        The parsed page, or None when the page could not be loaded.

    Raises
    ------
    Whatever ``webdriver.Chrome`` raises when the browser cannot be started.
    """
    fields = ['Title', 'Source', 'Source_link', 'Currency', 'Description']
    blacklist = [
        "We're temporarily down for maintenance; Please check back soon..." # Some pages take several seconds to load and then throw this error
    ]
    browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)

    # quit() in `finally` shuts the browser and its driver down on every path,
    # including the early return below and unexpected errors.
    try:
        try:
            browser.get(url)
        except Exception:  # not a bare except: Ctrl-C must still interrupt a long run
            print(f"Skipped: {url} due to browser.get(url) failure")
            return {k: None for k in fields}, None

        time.sleep(sleep_time_sec) # To leave the time for the page to load
        html = browser.page_source
    finally:
        browser.quit()

    soup = bs(html, 'html.parser')

    def labelled_value(label):
        """Return the <span> following the span whose text is `label`, or None."""
        label_span = soup.find('span', string=label)
        if label_span is None:
            return None
        return label_span.find_next_sibling('span')

    res = {}
    title_tag = soup.find('title')
    res['Title'] = title_tag.text.strip() if title_tag is not None else None
    if res['Title'] in blacklist:
        print(f"Skipped: {url} due to `{res['Title']}`")
        return {k: None for k in fields}, soup

    # Source name and link both come from the <a> inside the "Source:" value.
    source_value = labelled_value('Source:')
    source_anchor = source_value.find('a') if source_value is not None else None
    res['Source'] = source_anchor.get('title') if source_anchor is not None else None
    res['Source_link'] = source_anchor.get('href') if source_anchor is not None else None

    currency_value = labelled_value('Currency:')
    res['Currency'] = currency_value.text if currency_value is not None else None

    overViewBox_div = soup.find('div', class_='overViewBox')
    overViewBox_left = (
        overViewBox_div.find('div', class_='left') if overViewBox_div is not None else None
    )
    if overViewBox_left is not None:
        res['Description'] = (
            overViewBox_left
            .get_text()
            .replace("<br>", " ")
            .replace("\n", " ")
        )
    else:
        res['Description'] = None

    return res, soup

# Test: scrape a single indicator page and print every extracted field
url = "https://www.investing.com/economic-calendar/milk-auctions-2244"
res, soup = scrape_page(url)
for k, v in res.items():
    print(f" - {k}: {v}")

 - Title: New Zealand Milk Auctions
 - Source: 
 - Source_link: https://www.dairyglobal.net/industry-and-markets/market-trends/healthy-milk-price-expected-in-new-zealand/
 - Currency: USD
 - Description: Measures the weighted-average price of 9 dairy products sold at auction every 2 weeks. It is viewed as a leading indicator of New Zealand's trade balance because rising commodity prices boost export income. The dairy industry is New Zealand's biggest export earner, accounting for more than 29% by value of the country's exports.


In [10]:
#df = pd.read_csv('./economic_table.csv')

In [11]:
# Run the script for all rows

# Choose which rows to fill with page-info
#df_to_fill = df #all
df_to_fill = df[df.Source.isna()] # Only the ones with missing Source

# `i` is the row's index label, shared by df_to_fill and df, so the scraped
# values are written onto the matching row of df.
for i, row in tqdm(df_to_fill.iterrows(), total=len(df_to_fill)):
    res, soup = scrape_page(row['URL'])
    for k, v in res.items():
        df.loc[i, k] = v
    # The whole table is rewritten after every page (presumably so that an
    # interrupted run keeps what was already scraped).
    df.to_csv('economic_table.csv', index=False)

  2%|▏         | 1/60 [01:02<1:01:30, 62.56s/it]

Skipped: https://www.investing.com/economic-calendar/swiss-cpi-71 due to `We're temporarily down for maintenance; Please check back soon...`


100%|██████████| 60/60 [05:11<00:00,  5.19s/it] 


In [12]:
# Most important indicators to copy in the README
# Keep only importance-3 rows, ordered by ID, then print them as a numbered
# Markdown list of links.
top_indicators = df[df['Importance'] == 3].sort_values(by='ID').reset_index(drop=True)
for i, row in enumerate(top_indicators.itertuples()):
    print(f"{i+1}. [{row.Title} - ID #{row.ID}]({row.URL})")

1. [United States ADP Nonfarm Employment Change - ID #1](https://www.investing.com/economic-calendar/adp-nonfarm-employment-change-1)
2. [United States Average Hourly Earnings MoM - ID #8](https://www.investing.com/economic-calendar/average-hourly-earnings-8)
3. [United States Building Permits - ID #25](https://www.investing.com/economic-calendar/building-permits-25)
4. [United States CB Consumer Confidence - ID #48](https://www.investing.com/economic-calendar/cb-consumer-confidence-48)
5. [United States Core Consumer Price Index (CPI) MoM - ID #56](https://www.investing.com/economic-calendar/core-cpi-56)
6. [United States Core Durable Goods Orders MoM - ID #59](https://www.investing.com/economic-calendar/core-durable-goods-orders-59)
7. [United States Core PCE Price Index MoM - ID #61](https://www.investing.com/economic-calendar/core-pce-price-index-61)
8. [United States Core Retail Sales MoM - ID #63](https://www.investing.com/economic-calendar/core-retail-sales-63)
9. [United Kingdo