# Extract economic table and scrape titles

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from bs4 import BeautifulSoup as bs
from glob import glob
import pandas as pd
from tqdm import tqdm

from selenium import webdriver 
import time
# Location of the chromedriver binary used to launch Chrome.
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'

# Pause (in seconds) between requesting a page and reading its source.
sleep_time_sec = 1

# Chrome launch flags shared by every browser started in this notebook.
CHROME_OPTIONS = webdriver.chrome.options.Options()
CHROME_OPTIONS.add_argument('--no-sandbox')
CHROME_OPTIONS.add_argument('--headless')

In [2]:
def get_importance_stars(tr) -> int:
    """Count the filled importance icons of a calendar row.

    Parameters
    ----------
    tr : bs4 Tag (or any object with a compatible ``find``)
        A table row. Its ``td`` with class ``left textNum sentiment noWrap``
        is searched for ``<i>`` icon elements.

    Returns
    -------
    int
        Number of icons whose first CSS class is ``grayFullBullishIcon``.
        Returns 0 when the row has no sentiment cell.
    """
    sentiment_cell = tr.find('td', class_='left textNum sentiment noWrap')
    if sentiment_cell is None:
        # No sentiment cell in this row: nothing to count.
        return 0
    nfullstars = 0
    for icon in sentiment_cell.find_all('i'):
        # An <i> without a class attribute cannot be a filled icon; skip it
        # instead of raising KeyError.
        classes = icon.get('class') or []
        if classes and classes[0] == 'grayFullBullishIcon':
            nfullstars += 1
    return nfullstars
    

In [3]:
files = sorted(glob("./html_calendars/*html"))

# One (indicator URL, country name, importance) tuple per event row,
# collected across all saved calendar pages.
href_flag_list = []
for file in files:
    # Decode explicitly as UTF-8 so non-ASCII country names (e.g. "Türkiye")
    # do not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as fp:
        soup = bs(fp, 'html.parser')
    for tr in soup.find_all('tr', class_='js-event-item'): # Not including holiday rows
        tr_tuple = (
            tr.find('td', class_='left event').a['href'],                       # link to indicator page
            tr.find('td', class_='left flagCur noWrap').find('span')['title'],  # country name
            get_importance_stars(tr)                                            # importance
        )
        href_flag_list.append(tr_tuple)

In [4]:
# Deduplicate the collected tuples, then show the count before and after.
href_flag_set = set(href_flag_list)
display(len(href_flag_list))
display(len(href_flag_set))

1974

1233

In [5]:
# Build one table row per unique (URL, country, importance) tuple.
# Iterating in sorted order (set iteration order can vary between runs) and
# sorting with a stable algorithm keeps rows that share an ID in the same
# relative order on every run.
row_list = []
for href, flag, importance in sorted(href_flag_set):
    row = {
        'ID': int(href.split('-')[-1]),                           # numeric suffix of the URL
        'Nation': flag,
        'Importance': importance,
        'Title': None,                                            # filled in later by scraping
        'Name': " ".join(href.split('/')[-1].split('-')[:-1]),    # URL slug without the ID
        'URL': href,
    }
    row_list.append(row)
df = (
    # Explicit columns keep the frame well-formed even when no rows were found.
    pd.DataFrame(row_list, columns=['ID', 'Nation', 'Importance', 'Title', 'Name', 'URL'])
    .sort_values(by='ID', kind='mergesort')
    .reset_index(drop=True)
)
df

Unnamed: 0,ID,Nation,Importance,Title,Name,URL
0,5,New Zealand,1,,anz commodity price index,https://www.investing.com/economic-calendar/an...
1,7,United Kingdom,2,,average earnings index bonus,https://www.investing.com/economic-calendar/av...
2,18,United Kingdom,2,,brc retail sales monitor,https://www.investing.com/economic-calendar/br...
3,21,Japan,2,,bsi large manufacturing conditions,https://www.investing.com/economic-calendar/bs...
4,22,Australia,2,,building approvals,https://www.investing.com/economic-calendar/bu...
...,...,...,...,...,...,...
1228,2204,Singapore,1,,cpi nsa,https://www.investing.com/economic-calendar/cp...
1229,2205,Türkiye,1,,"cpi ex e,f,b,t g",https://www.investing.com/economic-calendar/cp...
1230,2206,Türkiye,1,,"cpi ex e,f,b,t g",https://www.investing.com/economic-calendar/cp...
1231,2208,Ukraine,1,,cpi,https://www.investing.com/economic-calendar/cp...


In [6]:
# Combine the previously saved table with the freshly built one.
df_old = pd.read_csv('./economic_table.csv')
df = pd.concat([df_old, df])
# Old rows come first, so keep="first" retains the entry that should
# already carry a Title.
df = df.drop_duplicates(subset=['ID'], keep="first")
df = df.sort_values(by='ID')
df = df.reset_index(drop=True)
df

Unnamed: 0,ID,Nation,Importance,Title,Name,URL
0,5,New Zealand,1,New Zealand ANZ Commodity Price Index MoM,anz commodity price index,https://www.investing.com/economic-calendar/an...
1,7,United Kingdom,2,United Kingdom Average Earnings Index +Bonus,average earnings index bonus,https://www.investing.com/economic-calendar/av...
2,18,United Kingdom,2,United Kingdom BRC Retail Sales Monitor YoY,brc retail sales monitor,https://www.investing.com/economic-calendar/br...
3,21,Japan,2,Japan BSI Large Manufacturing Conditions,bsi large manufacturing conditions,https://www.investing.com/economic-calendar/bs...
4,22,Australia,2,Australia Building Approvals MoM,building approvals,https://www.investing.com/economic-calendar/bu...
...,...,...,...,...,...,...
1225,2204,Singapore,1,,cpi nsa,https://www.investing.com/economic-calendar/cp...
1226,2205,Türkiye,1,"Turkey CPI Ex E,F,B,T&G","cpi ex e,f,b,t g",https://www.investing.com/economic-calendar/cp...
1227,2206,Türkiye,1,"Turkey CPI Ex E,F,B,T&G","cpi ex e,f,b,t g",https://www.investing.com/economic-calendar/cp...
1228,2208,Ukraine,1,Ukraine CPI,cpi,https://www.investing.com/economic-calendar/cp...


In [7]:
# Try with requests
url = "https://www.investing.com/economic-calendar/french-cpi-112"
# Fetch the raw HTML. requests never times out by default, so set a timeout
# to keep this cell from hanging if the server does not answer.
import requests
r = requests.get(url, timeout=10)
# Parse the HTML and show the page title
soup = bs(r.text, 'html.parser')
soup.find('title').text


'Attention Required! | Cloudflare'

In [8]:
# Try with Selenium
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)
url = "https://www.investing.com/economic-calendar/french-cpi-112"
try:
    browser.get(url)
    time.sleep(sleep_time_sec) # To leave the time for the page to load
    html = browser.page_source
finally:
    # quit() ends the whole driver session (close() only closes the window),
    # and the finally clause runs it even when loading the page fails.
    browser.quit()
soup = bs(html,'html.parser')
soup.find('title').text

'France Consumer Price Index (CPI) MoM'

In [9]:
# Page titles that indicate a failed load; they must not be stored as indicator titles.
blacklist = [
    "We're temporarily down for maintenance; Please check back soon...", # Some pages take several seconds to load and then throw this error
    "Attention Required! | Cloudflare", # Cloudflare block page
]

# Only rows that still lack a Title are scraped; rows skipped below stay
# empty and are therefore picked up again on the next run.
df_to_fill = df[df.Title.isna()]
for i, row in tqdm(df_to_fill.iterrows(), total=len(df_to_fill)):
    browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)
    try:
        browser.get(row['URL'])
        time.sleep(sleep_time_sec) # To leave the time for the page to load
        html = browser.page_source
    finally:
        # quit() ends the whole driver session (close() only closes the
        # window); without it every iteration leaves a driver process behind.
        browser.quit()
    soup = bs(html,'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        # Page came back without a <title>: leave the row empty.
        continue
    title = title_tag.text
    if title in blacklist:
        continue
    df.loc[i, 'Title'] = title
    # Save after every successful scrape so an interrupted run keeps its progress.
    df.to_csv('economic_table.csv', index=False)

100%|██████████| 81/81 [08:32<00:00,  6.33s/it]


In [11]:
# Most important indicators to copy in the README
# Emits a numbered Markdown link list of the rows with Importance == 3, ordered by ID.
top_indicators = df.query('Importance == 3').sort_values(by='ID').reset_index(drop=True)
for rank, row in enumerate(top_indicators.itertuples(), start=1):
    print(f"{rank}. [{row.Title} - ID #{row.ID}]({row.URL})")

1. [United States Building Permits - ID #25](https://www.investing.com/economic-calendar/building-permits-25)
2. [United States CB Consumer Confidence - ID #48](https://www.investing.com/economic-calendar/cb-consumer-confidence-48)
3. [United States Core Consumer Price Index (CPI) MoM - ID #56](https://www.investing.com/economic-calendar/core-cpi-56)
4. [United States Core Durable Goods Orders MoM - ID #59](https://www.investing.com/economic-calendar/core-durable-goods-orders-59)
5. [United States Core PCE Price Index MoM - ID #61](https://www.investing.com/economic-calendar/core-pce-price-index-61)
6. [United States Core Retail Sales MoM - ID #63](https://www.investing.com/economic-calendar/core-retail-sales-63)
7. [United Kingdom Consumer Price Index (CPI) YoY - ID #67](https://www.investing.com/economic-calendar/cpi-67)
8. [European Consumer Price Index (CPI) YoY - ID #68](https://www.investing.com/economic-calendar/cpi-68)
9. [United States Consumer Price Index (CPI) MoM - ID #69](