In [1]:
# Import selenium packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

# Import other packages
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
import pandas as pd

# Import utils
from etl_utils import load_query

In [2]:
# Load environment variables (python-dotenv), then read the chrome driver path.
# chrome_driver_path is None when CHROME_DRIVER_PATH is not set.
load_dotenv()
chrome_driver_path = os.getenv("CHROME_DRIVER_PATH")

# Read the database settings and fail fast when one is missing. os.getenv
# returns None for an unset variable, and the f-string below would otherwise
# embed the literal text "None" in the connection string, which would only
# surface as a confusing failure once the scrape has already finished.
# An empty value (e.g. a blank password) is still accepted.
db_settings = {
    env_name: os.getenv(env_name)
    for env_name in ("DB_USERNAME", "DB_PASSWORD", "DB_HOST", "DB_PORT")
}
missing_settings = [env_name for env_name, value in db_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_settings)}"
    )

db_username = db_settings["DB_USERNAME"]
db_password = db_settings["DB_PASSWORD"]
db_host = db_settings["DB_HOST"]
db_port = db_settings["DB_PORT"]

# Create db connection string
# NOTE(review): the credentials are interpolated verbatim; a password containing
# characters such as '@' or '/' would need URL-encoding - confirm the .env values.
db_conn_str = f"mysql+pymysql://{db_username}:{db_password}@{db_host}:{db_port}/tsx_composite_index"

# Set url for page on tmx website
tmx_link = "https://money.tmx.com/en/quote/%5ETSX/constituents"

# Set name for table in database
table_name = "ticker_staging_table"

In [3]:
# Accumulator for the scraped data: one list per output column, filled in
# parallel so that index k of each list describes the same company
tsx_constituents = {column: [] for column in ("company_name", "company_symbol")}


In [4]:
# Build the chromedriver service from the configured executable path, then
# start the Chrome session that the scraping cell drives
service = Service(
    executable_path=chrome_driver_path,
)
driver = webdriver.Chrome(
    service=service,
)

In [5]:
# Page counter, used only in the progress and error messages
i = 1

# Walk through every page of the constituents list, collecting company names
# and symbols into tsx_constituents, until the "next page" button is disabled
try:
    driver.get(tmx_link)
    while True:
        # Parse the HTML currently rendered by the browser
        tmx_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Get the constituent list table
        constituent_table = tmx_soup.find('div', 'ConstituentsList__ConstituentsTable-sc-q9ist-1')
        if constituent_table is None:
            # Without this guard the failure shows up as an opaque
            # "'NoneType' object has no attribute 'find_all'"
            raise RuntimeError("Constituents table not found in the page source")

        # Get the company names and tickers from this table
        company_names = constituent_table.find_all('div', 'ConstituentsList__CompanyName-sc-q9ist-6')
        company_symbols = constituent_table.find_all('span', 'ConstituentsList__SymbolLink-sc-q9ist-7')

        # Extract names and symbols pairwise; zip stops at the shorter list
        for name_div, symbol_span in zip(company_names, company_symbols):
            tsx_constituents['company_name'].append(name_div.text)
            tsx_constituents['company_symbol'].append(symbol_span.text)

        # Find the next page button (waits up to 10 seconds for it to appear)
        next_button = WebDriverWait(driver, 10).until(
            lambda x: x.find_element(By.CSS_SELECTOR, "button[data-testid='paginator-next']")
        )

        # A present "disabled" attribute means this was the last page
        if next_button.get_attribute("disabled") is not None:
            print("All symbols have been collected. Terminating data collection and closing driver session.")
            break

        # Click button with execute script to bypass sticky banner ads
        driver.execute_script("arguments[0].click();", next_button)
        print(f"Company data extracted from page {i}")
        i += 1

# Report which page failed, then re-raise so that a "Run All" stops here
# instead of continuing to the upload cell with partial data
except Exception as e:
    print(f"An unexpected error occurred on page {i}: {e}")
    raise

# Always close the browser session, on success and on failure alike, so a
# failed run does not leave a Chrome/chromedriver process behind
finally:
    driver.quit()

Company data extracted from page 1
Company data extracted from page 2
Company data extracted from page 3
Company data extracted from page 4
Company data extracted from page 5
Company data extracted from page 6
Company data extracted from page 7
Company data extracted from page 8
Company data extracted from page 9
Company data extracted from page 10
Company data extracted from page 11
Company data extracted from page 12
Company data extracted from page 13
Company data extracted from page 14
Company data extracted from page 15
Company data extracted from page 16
Company data extracted from page 17
Company data extracted from page 18
Company data extracted from page 19
Company data extracted from page 20
Company data extracted from page 21
All symbols have been collected. Terminating data collection and closing driver session.


In [6]:
# Build the dataframe to upload: one column per key of the scraped dict,
# one row per collected company
df = pd.DataFrame(data=tsx_constituents)

In [7]:
# Upload the scraped dataframe to the staging table through the project's
# load_query helper (called with append=False)
load_query(
    df=df,
    table_name=table_name,
    db_conn_str=db_conn_str,
    append=False,
)

211 rows uploaded successfully to ticker_staging_table.
