# Extract economic table and scrape titles

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from bs4 import BeautifulSoup as bs
from glob import glob
import pandas as pd
from tqdm import tqdm

from selenium import webdriver 
import time
# --- Scraper configuration ---
# Pause (in seconds) applied after requesting a page, before reading its HTML.
sleep_time_sec = 1

# Path of the chromedriver binary handed to webdriver.Chrome.
CHROMEDRIVER_PATH = '/usr/local/bin/chromedriver'

# Chrome launch flags shared by every browser started in this notebook.
CHROME_OPTIONS = webdriver.chrome.options.Options()
for chrome_flag in ('--no-sandbox', '--headless'):
    CHROME_OPTIONS.add_argument(chrome_flag)

In [2]:
# Walk every saved calendar page and collect one (href, flag title) pair per
# event row. Duplicates are kept here.
files = sorted(glob("./html_calendars/*html"))

href_flag_list = []
for calendar_path in files:
    with open(calendar_path) as handle:
        page = bs(handle, 'html.parser')
    # Each event row carries a link in its "event" cell and a span whose
    # `title` attribute sits in the flag/currency cell.
    href_flag_list.extend(
        (
            event_row.find('td', class_='left event').a['href'],
            event_row.find('td', class_='left flagCur noWrap').find('span')['title'],
        )
        for event_row in page.find_all('tr', class_='js-event-item')
    )

In [3]:
# Deduplicate the (href, flag) pairs, then show the count before and after.
href_flag_set = set(href_flag_list)
display(len(href_flag_list))
display(len(href_flag_set))

583

556

In [4]:
# Build one table row per distinct (href, flag) pair.
# The href ends in "<slug-words>-<numeric id>": the trailing number becomes the
# ID and the remaining slug words, joined with spaces, become the Name.
# Title starts empty and is filled in by the scraping loop further down.
row_list = []
# Iterate in sorted order (not raw set order) so the rows are produced in the
# same order on every kernel; set iteration order of strings is not stable
# across interpreter sessions.
for href, flag in sorted(href_flag_set):
    slug_parts = href.split('/')[-1].split('-')
    row = {
        'ID': int(slug_parts[-1]),
        'Nation': flag,
        'Title': None,
        'Name': " ".join(slug_parts[:-1]),
        'URL': href,
    }
    row_list.append(row)
# A stable sort keeps the deterministic input order for rows that share an ID.
df = (
    pd.DataFrame(row_list)
    .sort_values(by='ID', kind='mergesort')
    .reset_index(drop=True)
)
df

Unnamed: 0,ID,Nation,Name,Title,URL
0,25,United States,building permits,,https://www.investing.com/economic-calendar/bu...
1,27,New Zealand,anz business confidence,,https://www.investing.com/economic-calendar/an...
2,30,United Kingdom,business investment,,https://www.investing.com/economic-calendar/bu...
3,33,United Kingdom,cbi distributive trades realized,,https://www.investing.com/economic-calendar/cb...
4,34,United Kingdom,cbi industrial trends orders,,https://www.investing.com/economic-calendar/cb...
...,...,...,...,...,...
551,2183,Austria,austria cpi nsa,,https://www.investing.com/economic-calendar/au...
552,2187,Denmark,gdp revised,,https://www.investing.com/economic-calendar/gd...
553,2188,Denmark,gdp revised,,https://www.investing.com/economic-calendar/gd...
554,2194,Hong Kong,cpi,,https://www.investing.com/economic-calendar/cp...


In [5]:
# Try with requests
# In the recorded run the title returned here is Cloudflare's
# "Attention Required!" page rather than the event title.
import requests

url = "https://www.investing.com/economic-calendar/french-cpi-112"
# Fetch the page; the timeout stops the cell from hanging indefinitely if the
# server never responds (requests applies no timeout by default).
r = requests.get(url, timeout=30)
# Parse the HTML and show the page <title>.
soup = bs(r.text, 'html.parser')
soup.find('title').text


'Attention Required! | Cloudflare'

In [6]:
# Try with Selenium
# Load the same page in headless Chrome and read the rendered HTML.
url = "https://www.investing.com/economic-calendar/french-cpi-112"
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)
try:
    browser.get(url)
    time.sleep(sleep_time_sec)  # To leave the time for the page to load
    html = browser.page_source
finally:
    # quit() ends the WebDriver session and stops the chromedriver process;
    # close() only closes the current window. Running it in `finally` also
    # releases the browser when the page load raises.
    browser.quit()
soup = bs(html, 'html.parser')
soup.find('title').text

'France Consumer Price Index (CPI) MoM'

In [13]:
# Fill in the Title column for every row that does not have one yet by loading
# each event URL in headless Chrome and reading the page <title>.
# Only rows with a missing Title are visited, so re-running this cell in the
# same kernel resumes where a previous run stopped.
df_to_fill = df[df.Title.isna()]

# One browser is reused for all pages instead of launching Chrome per URL.
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=CHROME_OPTIONS)
try:
    for i, row in tqdm(df_to_fill.iterrows(), total=len(df_to_fill)):
        browser.get(row['URL'])
        time.sleep(sleep_time_sec)  # leave time for the page to load
        soup = bs(browser.page_source, 'html.parser')
        title_tag = soup.find('title')
        if title_tag is None:
            # Leave Title empty so a later run retries this row, and report it
            # without breaking the progress bar.
            tqdm.write(f"No <title> found for {row['URL']}; skipping")
            continue
        df.loc[i, 'Title'] = title_tag.text
        # Checkpoint after every page so an interrupted run keeps its results.
        df.to_csv('economic_table.csv', index=False)
finally:
    # quit() ends the WebDriver session and stops the chromedriver process even
    # when a page load raises part-way through the loop.
    browser.quit()

100%|██████████| 507/507 [34:04<00:00,  4.03s/it] 
