In [55]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import numpy as np
import os
from data.datemath import to_ymd

In [15]:
# Input: existing attendance table that is read later and extended with the
# McCormick Place rows.
attend_in = "../data/interim/sports.csv"

# Outputs: JSON cache of the scraped event pages, and the combined attendance
# table written at the end of the notebook.
events_out = "../data/raw/events-2024-mccormick.json"
attend_out = "../data/interim/attendance.csv"

# Scrape

In [16]:
# Site root; prepended to the relative event hrefs found on the listing page.
base_url = "https://tradefest.io"

# Listing page for events held at McCormick Place (same host as base_url).
main_url = f"{base_url}/en/selection/events-at-mccormick-place-convention-center"

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    )
}

In [17]:
def get_event_links(url, timeout=30):
    """Collect absolute event-page URLs from a listing page.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds ``requests.get`` may wait on the server before
            giving up. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        list[str]: ``base_url + href`` for every anchor inside
        ``div.chakra-stack.css-10r84p2`` whose href starts with
        ``/en/event/``, in document order (duplicates are kept).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it and returning [].
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): the css-10r84p2 class looks auto-generated, so this
    # selector may stop matching after a site redeploy; confirm if the
    # function suddenly returns no links.
    anchors = soup.select("div.chakra-stack.css-10r84p2 a[href^='/en/event/']")
    return [base_url + anchor["href"] for anchor in anchors]

def _labelled_text(soup, label):
    """Return the stripped text of the first info paragraph containing ``label``, or "N/A"."""
    element = soup.select_one(f"p.chakra-text.css-1wezswy:-soup-contains('{label}')")
    return element.text.strip() if element else "N/A"


def scrape_event_page(url, timeout=30):
    """Extract Date, Venue, and Attendees from an event page.

    Args:
        url: Address of the event page to download.
        timeout: Seconds ``requests.get`` may wait on the server before
            giving up.

    Returns:
        dict: ``{"url", "date", "venue", "attendees"}``. ``url`` echoes the
        argument; the other three hold the full stripped text of the
        matching paragraph (label included), or ``"N/A"`` when the page has
        no such paragraph.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is not recorded as an all-"N/A" event.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    return {
        "url": url,
        "date": _labelled_text(soup, "Date:"),
        "venue": _labelled_text(soup, "Venue"),
        "attendees": _labelled_text(soup, "Expected number of attendees"),
    }


In [18]:

# Download the listing page and report how many event pages it links to.
event_links = get_event_links(url=main_url)
print(f"Found {len(event_links)} event links.")


Found 30 event links.


In [19]:

# Reuse the saved scrape when it exists; otherwise visit every event page.
if os.path.exists(events_out):
    with open(events_out, "r", encoding="utf-8") as f:
        all_data = json.load(f)
else:
    all_data = []
    total = len(event_links)
    for position, link in enumerate(event_links, start=1):
        print(f"Scraping {position}/{total}: {link}")
        try:
            # A page that fails is reported and skipped, not retried.
            all_data.append(scrape_event_page(link))
        except Exception as e:
            print(f"Error scraping {link}: {e}")
        time.sleep(1)  # Pause between requests to avoid overwhelming the server

In [20]:
# Persist the event records (freshly scraped or just reloaded) as
# human-readable JSON; non-ASCII characters are written as-is.
with open(events_out, mode="w", encoding="utf-8") as f:
    json.dump(obj=all_data, fp=f, ensure_ascii=False, indent=4)

# Clean

In [56]:
# One row per scraped event; columns come from the record keys
# (url, date, venue, attendees).
df = pd.DataFrame.from_records(data=all_data)

In [57]:
# Event slug: everything after the event-page prefix of the URL
# (dots escaped so they only match literal dots).
df['name'] = df['url'].str.extract(r"https://tradefest\.io/en/event/(.*)", expand=False)

# First number in the attendees text. Comma-grouped thousands ("10,000") are
# taken as a single number; a bare (\d+) would stop at the comma and read 10.
# Rows without any number (e.g. "N/A") become NaN.
df['attendees'] = (
    df['attendees']
    .str.extract(r"(\d{1,3}(?:,\d{3})+|\d+)", expand=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

# Sanity check: the listing page should only link to McCormick Place events.
assert df.venue.str.contains('McCormick Place').all()
def extract_date(x: pd.DataFrame) -> pd.DataFrame:
    """Add ``date_from`` and ``date_to`` text columns parsed from ``x.date``.

    Two layouts are recognised:

    * ``"Date: from <start> to <end> (<n> days)"``: start and end are taken
      from the two sides of ``" to "``.
    * ``"Date: <day> (1 day)"``: the single day is used for both columns.

    Rows matching neither layout get NaN in both columns. The columns are
    written onto ``x`` itself, which is also returned so the function can be
    used with ``DataFrame.pipe``.
    """
    raw = x["date"].str

    single_day = raw.extract(r"Date: (.*) \(1 day\)", expand=False)
    first_day = raw.extract(r"Date: from (.*) to .* \(\d+ days?\)", expand=False)
    last_day = raw.extract(r"Date: from .* to (.*) \(\d+ days?\)", expand=False)

    # Prefer the range endpoints; fall back to the single-day value.
    x["date_from"] = first_day.fillna(single_day)
    x["date_to"] = last_day.fillna(single_day)
    return x
df = df.pipe(extract_date)

# Every event needs both endpoints. A date string in an unexpected layout
# leaves NaN here, which would otherwise only fail later when the per-event
# date ranges are built, without saying which event was at fault.
unparsed = df.loc[df[['date_from', 'date_to']].isna().any(axis=1), 'date']
assert unparsed.empty, f"Unparsed event dates: {unparsed.tolist()}"

In [58]:
# One daily DatetimeIndex per event, covering date_from through date_to inclusive.
df['dates'] = df.apply(
    lambda event: pd.date_range(start=event['date_from'], end=event['date_to']),
    axis=1,
)

In [59]:
# One row per (event, day); each day is then passed through to_ymd.
# NOTE(review): to_ymd comes from data.datemath and is not shown here.
# It presumably formats the day as a year-month-day value; confirm it
# matches the date format used in the attendance CSV.
exploded = (
    df.explode('dates')
    .assign(dates=lambda days: days['dates'].apply(to_ymd))
)

In [69]:
# Total expected attendance per calendar day across all events.
# min_count=1 keeps the sum NaN when none of a day's events has a known
# attendee figure. With the default (min_count=0) such days sum to 0.0, so the
# dropna below never removed anything and unknown attendance was stored as 0.
# NOTE(review): the previous rename also mapped 'name' -> 'n_events', but no
# 'name' column is aggregated; add a count aggregation if an event count is
# wanted downstream.
mc_attend = (
    exploded
    .groupby('dates', as_index=False)[['attendees']]
    .sum(min_count=1)
    .rename(columns={'dates': 'date', 'attendees': 'attendance'})
    .assign(team='events', stadium='mccormick place')
    .dropna(subset=['attendance'])
)

# Merge to attendance

In [70]:
# Load the existing attendance table that the McCormick Place rows are appended to.
attend = pd.read_csv(filepath_or_buffer=attend_in)

In [71]:
# Guard against double-appending: no existing 'mccormick place' row may share
# a date with the rows about to be added.
assert not (
    attend.loc[attend['stadium'] == 'mccormick place', 'date']
    .isin(mc_attend['date'])
    .any()
)

In [72]:
# Stack the event rows under the existing table. Columns are aligned by name,
# and the original row labels are kept.
attend = pd.concat(objs=[attend, mc_attend], axis=0)

In [73]:
# Write the combined table; the row index is not written.
attend.to_csv(path_or_buf=attend_out, index=False)