# Extract economic table and scrape titles

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from bs4 import BeautifulSoup as bs
from glob import glob
import pandas as pd
from tqdm import tqdm

from selenium import webdriver 
import time
# --- Scraping configuration -------------------------------------------------
# Pause (in seconds) applied after each page request so the page can finish loading.
sleep_time_sec = 1
# Location of the chromedriver binary handed to Selenium.
CHROMEDRIVER_PATH='/usr/local/bin/chromedriver'
# Chrome is started without a visible window and without the sandbox.
CHROME_OPTIONS = webdriver.chrome.options.Options()
for _chrome_flag in ('--no-sandbox', '--headless'):
    CHROME_OPTIONS.add_argument(_chrome_flag)

In [2]:
# Collect one (event URL, country name) pair for every calendar row found in
# the saved HTML pages under ./html_calendars.
files = sorted(glob("./html_calendars/*html"))

href_flag_list = []
skipped_rows = 0  # rows that lack the event link or the flag title
for file in files:
    # Open in binary mode so BeautifulSoup detects the document encoding itself
    # instead of relying on the platform's default text encoding.
    with open(file, 'rb') as fp:
        soup = bs(fp, 'html.parser')
    for tr in soup.find_all('tr', class_='js-event-item'):
        event_td = tr.find('td', class_='left event')
        flag_td = tr.find('td', class_='left flagCur noWrap')
        link = event_td.a if event_td is not None else None
        flag_span = flag_td.find('span') if flag_td is not None else None
        href = link.get('href') if link is not None else None
        flag = flag_span.get('title') if flag_span is not None else None
        if href is None or flag is None:
            # An incomplete row must not abort the whole scan; count it instead.
            skipped_rows += 1
            continue
        href_flag_list.append((href, flag))

if skipped_rows:
    print(f"Skipped {skipped_rows} calendar rows without an event link or flag title")

In [3]:
# Deduplicate the scraped (URL, country) pairs, then show the count before and
# after deduplication.
href_flag_set = set(href_flag_list)
display(len(href_flag_list))
display(len(href_flag_set))

1009

910

In [4]:
# Build one table row per distinct (URL, country) pair.
# The URL's last path segment has the form "<name-with-dashes>-<numeric id>".
row_list = []
# href_flag_set is already a set, so no extra set() copy is needed. Iterating it
# in sorted order makes the row order reproducible from run to run.
for href, flag in sorted(href_flag_set):
    slug = href.split('/')[-1]
    name_part, _, id_part = slug.rpartition('-')
    row_list.append({
        'ID': int(id_part),
        'Nation': flag,
        'Title': None,  # filled in later from the event page's <title>
        'Name': " ".join(name_part.split('-')),
        'URL': href,
    })
# mergesort is stable, so rows sharing an ID keep their sorted (URL, country) order.
df = pd.DataFrame(row_list).sort_values(by='ID', kind='mergesort').reset_index(drop=True)
df

Unnamed: 0,ID,Nation,Title,Name,URL
0,7,United Kingdom,,average earnings index bonus,https://www.investing.com/economic-calendar/av...
1,21,Japan,,bsi large manufacturing conditions,https://www.investing.com/economic-calendar/bs...
2,25,United States,,building permits,https://www.investing.com/economic-calendar/bu...
3,27,New Zealand,,anz business confidence,https://www.investing.com/economic-calendar/an...
4,29,United States,,business inventories,https://www.investing.com/economic-calendar/bu...
...,...,...,...,...,...
905,2191,France,,cpi nsa,https://www.investing.com/economic-calendar/cp...
906,2192,France,,cpi nsa,https://www.investing.com/economic-calendar/cp...
907,2193,United Kingdom,,gdp estimate,https://www.investing.com/economic-calendar/gd...
908,2194,Hong Kong,,cpi,https://www.investing.com/economic-calendar/cp...


In [5]:
# Merge with already existing data
try:
    df_old = pd.read_csv('./economic_table.csv')
except FileNotFoundError:
    # First run: no table has been saved yet, so there is nothing to merge.
    df_old = df.iloc[:0]

# Old rows go first so that drop_duplicates(keep="first") keeps their Title.
frames = [df_old, df] if len(df_old) else [df]
df = (
    pd.concat(frames)
    .drop_duplicates(subset=['ID'], keep="first") # first is old and should have the Title already
    .sort_values(by='ID')
    .reset_index(drop=True)
)
df

Unnamed: 0,ID,Nation,Title,Name,URL
0,7,United Kingdom,,average earnings index bonus,https://www.investing.com/economic-calendar/av...
1,21,Japan,,bsi large manufacturing conditions,https://www.investing.com/economic-calendar/bs...
2,25,United States,United States Building Permits,building permits,https://www.investing.com/economic-calendar/bu...
3,27,New Zealand,New Zealand ANZ Business Confidence,anz business confidence,https://www.investing.com/economic-calendar/an...
4,29,United States,,business inventories,https://www.investing.com/economic-calendar/bu...
...,...,...,...,...,...
905,2191,France,,cpi nsa,https://www.investing.com/economic-calendar/cp...
906,2192,France,,cpi nsa,https://www.investing.com/economic-calendar/cp...
907,2193,United Kingdom,,gdp estimate,https://www.investing.com/economic-calendar/gd...
908,2194,Hong Kong,Hong Kong CPI NSA,cpi,https://www.investing.com/economic-calendar/cp...


In [6]:
# Try with requests
url = "https://www.investing.com/economic-calendar/french-cpi-112"
# get html
import requests
# Without a timeout requests waits forever on an unresponsive server,
# which would hang the kernel.
r = requests.get(url, timeout=10)
# beautify html
soup = bs(r.text, 'html.parser')
soup.find('title').text


'Attention Required! | Cloudflare'

In [7]:
# Try with Selenium
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)
url = "https://www.investing.com/economic-calendar/french-cpi-112"
try:
    browser.get(url)
    time.sleep(sleep_time_sec) # To leave the time for the page to load
    html = browser.page_source
finally:
    # quit() ends the whole driver session; close() only closes the current
    # window. The finally clause also cleans up when the page load raises.
    browser.quit()
soup = bs(html,'html.parser')
soup.find('title').text

'France Consumer Price Index (CPI) MoM'

In [10]:
# Fill in every missing Title by loading the event page in headless Chrome and
# reading its <title>. The table is saved after each success so an interrupted
# run can be resumed.
blacklist = [
    "We're temporarily down for maintenance; Please check back soon...", # Some pages take several seconds to load and then throw this error
    "Attention Required! | Cloudflare", # Title of the Cloudflare block page; storing it would stop the row from being retried
]

df_to_fill = df[df.Title.isna()]
for i, row in tqdm(df_to_fill.iterrows(), total=len(df_to_fill)):
    browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)
    try:
        browser.get(row['URL'])
        time.sleep(sleep_time_sec)
        html = browser.page_source
    finally:
        # quit() ends the driver session (close() only closes the window), so
        # no browser is left behind per row, even when the page load raises.
        browser.quit()
    soup = bs(html,'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        # Page without a <title>: leave the row empty so a later run retries it.
        continue
    title = title_tag.text
    if title in blacklist:
        continue
    df.loc[i, 'Title'] = title
    df.to_csv('economic_table.csv', index=False)

100%|██████████| 358/358 [28:27<00:00,  4.77s/it] 
