# Extract the economic table and scrape information for each indicator

In [2]:
import os

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from bs4 import BeautifulSoup as bs
from glob import glob
import pandas as pd
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

import time
# Pause (in seconds) used after loading a page and between successive requests.
sleep_time_sec = 1

# Chrome launch options shared by every browser instance created in this notebook.
CHROME_OPTIONS = Options()
for chrome_flag in ('--no-sandbox', '--headless'):
    CHROME_OPTIONS.add_argument(chrome_flag)

## Retrieve information from each page


In [22]:
# Use Selenium on the URL page to extract page-info
def scrape_page(url):
    """Load one economic-calendar page in headless Chrome and extract its fields.

    Args:
        url: Page URL whose last path segment is the numeric indicator id,
            e.g. "https://www.investing.com/economic-calendar/200".

    Returns:
        A ``(res, soup)`` tuple:

        * Browser navigation failed: ``res`` maps 'Title', 'Source',
          'Source_link', 'Currency' and 'Description' to None; ``soup`` is None.
        * Page has no canonical link (page does not exist): ``res`` holds only
          'Index'; ``soup`` is None.
        * Page title is blacklisted (maintenance page): ``res`` is the same
          all-None dict as for a navigation failure; ``soup`` is the parsed page.
        * Otherwise ``res`` holds 'Index', 'URL', 'Name', 'Country', 'Title',
          'Source', 'Source_link', 'Currency', 'Importance' and 'Description'.
          A field whose HTML element is missing is set to None instead of
          raising.

    Raises:
        ValueError: If the last path segment of ``url`` is not an integer.
    """
    blacklist = [
        "We're temporarily down for maintenance; Please check back soon..."  # Some pages take several seconds to load and then throw this error
    ]
    failed_keys = ['Title', 'Source', 'Source_link', 'Currency', 'Description']

    service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=service, options=CHROME_OPTIONS)
    try:
        try:
            browser.get(url)
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still interrupt a long run.
            print(f"Skipped: {url} to browser.get(url) failure")
            return {k: None for k in failed_keys}, None

        time.sleep(sleep_time_sec)  # To leave the time for the page to load
        html = browser.page_source
    finally:
        # quit() ends the whole driver session on every exit path, including the
        # failure return above, so no Chrome/chromedriver process is left behind.
        browser.quit()

    soup = bs(html, 'html.parser')

    res = {}
    res["Index"] = int(url.split("/")[-1])

    # Link to the source, e.g., adp-nonfarm-employment-change-1
    link_tag = soup.find('link', rel='canonical')
    if link_tag is None:  # page not existing e.g., https://www.investing.com/economic-calendar/002
        return res, None

    # Check the blacklist before touching the page body: a maintenance page
    # carries none of the labelled spans parsed below.
    title_tag = soup.find('title')
    title = title_tag.text.strip() if title_tag is not None else None
    if title in blacklist:
        print(f"Skipped: {url} due to `{title}`")
        return {k: None for k in failed_keys}, soup

    res["URL"] = link_tag['href'] + " "  # e.g., "https://www.investing.com/economic-calendar/adp-nonfarm-employment-change-1 " NOTE: leave a space at the end to make it clickable in the csv
    res["Name"] = res["URL"].split("/")[-1].rpartition('-')[0].replace("-", " ")  # e.g., adp nonfarm employment change

    country_label = soup.find('span', string='Country:')
    country_tag = (
        country_label.find_next('i', class_='ceFlags')
        if country_label is not None else None
    )
    res["Country"] = country_tag.get('title') if country_tag is not None else None

    res['Title'] = title

    source_label = soup.find('span', string='Source:')
    source_value = (
        source_label.find_next_sibling('span')
        if source_label is not None else None
    )
    source_anchor = source_value.find('a') if source_value is not None else None
    if source_anchor is not None:
        res['Source'] = source_anchor.get('title')
        href = source_anchor.get('href')
        res['Source_link'] = href + " " if href is not None else None  # NOTE: leave a space at the end to make it clickable in the csv
    else:
        res['Source'] = None
        res['Source_link'] = None

    curr_label = soup.find('span', string='Currency:')
    curr_value = (
        curr_label.find_next_sibling('span')
        if curr_label is not None else None
    )
    res['Currency'] = curr_value.text if curr_value is not None else None

    # Importance is the number of filled "bull" icons found on the page.
    stars = soup.find_all('i', class_='grayFullBullishIcon')
    res["Importance"] = len(stars)

    overview_box = soup.find('div', class_='overViewBox')
    overview_left = (
        overview_box.find('div', class_='left')
        if overview_box is not None else None
    )
    if overview_left is not None:
        res['Description'] = (
            overview_left
            .get_text()
            .replace("<br>", " ")
            .replace("\n", " ")
        )
    else:
        res['Description'] = None

    return res, soup

# Smoke test: scrape one known indicator page and list every extracted field
url = "https://www.investing.com/economic-calendar/200"
res, soup = scrape_page(url)
for field, value in res.items():
    print(f" - {field}: {value}")

 - Index: 200
 - URL: https://www.investing.com/economic-calendar/machine-tool-orders-200 
 - Name: machine tool orders
 - Country: Japan
 - Title: Japan Machine Tool Orders YoY
 - Source: Japan Machine Tool Builders' Association
 - Source_link: http://www.jmtba.or.jp/english/ 
 - Currency: JPY
 - Importance: 1
 - Description: Machine Tool Orders measures the change in the total value of new orders placed with machine tool manufacturers. Two versions of this report are released, preliminary and final. The preliminary report had the biggest impact.A higher than expected reading should be taken as positive/bullish for the JPY, while a lower than expected reading should be taken as negative/bearish for the JPY.


In [23]:
filename = "economic_indicators.csv"

# Resume from an earlier run when the CSV already exists; otherwise create an
# empty table with the expected columns so the file is present from the start.
if os.path.isfile(filename):
    df = pd.read_csv(filename)
else:
    df = pd.DataFrame(columns=['Index', 'URL', 'Name', 'Country', 'Title', 'Source', 'Source_link', 'Importance', 'Currency', 'Description'])
    df.to_csv(filename, index=False)

for i in range(1, 1000):
    url = f"https://www.investing.com/economic-calendar/{i:03d}"

    # Skip ids that already have a row, whether the earlier attempt worked or not.
    row = df[df['Index'] == i]
    if not row.empty:
        if pd.isna(row['URL'].values[0]):
            print(f"{url} failed to scrape")
        else:
            print(f"{url} already scraped as {row['URL'].values[0]}")
        continue

    res, soup = scrape_page(url)
    # ignore_index=True keeps the row labels unique (0..n-1), matching the old
    # df.append(res, ignore_index=True); without it every new row is labelled 0.
    df = pd.concat([df, pd.DataFrame.from_records([res])], ignore_index=True)
    df.to_csv(filename, index=False)  # Rewritten after every page so progress survives an interruption
    if "URL" in res:
        print(f"{res['URL']} scraped")
    else:
        print(f"{url} failed to scrape")
    time.sleep(sleep_time_sec)


https://www.investing.com/economic-calendar/adp-nonfarm-employment-change-1  scraped
https://www.investing.com/economic-calendar/002 failed to scrape
https://www.investing.com/economic-calendar/003 failed to scrape
https://www.investing.com/economic-calendar/all-industries-activity-index.-4  scraped
https://www.investing.com/economic-calendar/anz-commodity-price-index-5  scraped
https://www.investing.com/economic-calendar/average-cash-earnings-6  scraped
https://www.investing.com/economic-calendar/average-earnings-index-bonus-7  scraped
https://www.investing.com/economic-calendar/average-hourly-earnings-8  scraped
https://www.investing.com/economic-calendar/gross-mortgage-approvals-9  scraped
https://www.investing.com/economic-calendar/beige-book-10  scraped
https://www.investing.com/economic-calendar/boc-review-11  scraped
https://www.investing.com/economic-calendar/boc-deputy-governor-duguay-speaks-12  scraped
https://www.investing.com/economic-calendar/boc-deputy-governor-jenkins-sp

In [4]:
# Make the README: a header followed by one numbered list per importance level
readme_text = """# Investing.com Economic Indicators

List of all the economic indicators from [Investing.com](https://www.investing.com/).

Available as a [CSV table](economic_indicators.csv).
"""

filename = "economic_indicators.csv"
df = pd.read_csv(filename)
df = df.sort_values(by=['Country', 'Index'], ascending=True)

# (value of the 'Importance' column, section heading), in output order.
sections = [
    (3, "Most Important Indicators (three stars)"),
    (2, "Middle Importance Indicators (two stars)"),
    (1, "Least Important Indicators (one star)"),
]

parts = [readme_text]
for importance, heading in sections:
    parts.append(f"\n## {heading}\n\n")
    for _, row in df[df['Importance'] == importance].iterrows():
        # URLs are stored with a trailing space in the CSV, hence the strip().
        # Every item is numbered "1."; Markdown renumbers ordered lists itself.
        parts.append(f"1. [{row['Country']} - {row['Title']}]({row['URL'].strip()})\n")
readme_text = "".join(parts)

# Explicit UTF-8 so non-ASCII titles are written identically on every platform.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_text)