# 🏃 Chicago Marathon Scraping Tutorial

In this notebook, we'll parse a static HTML file saved from the RaceRaves review page for the 2025 Bank of America Chicago Marathon.
This approach avoids browser automation and is ideal for lightweight tutorials and teaching `BeautifulSoup`.


In [1]:
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Load the saved HTML page.
# The page declares <meta charset="utf-8">, so decode it explicitly instead of
# relying on the platform's default text encoding.
with open("/lakehouse/default/Files/2025 Bank of America Chicago Marathon Race Reviews _ Chicago, IL.html", encoding="utf-8") as file:
    html = file.read()

# `html_content` used to stay None, so the preview below rendered nothing.
# Keep the name, but point it at the text that was actually read.
html_content = html
displayHTML(html_content)

# Step 2: Parse the HTML into a BeautifulSoup tree and sanity-check the <title>
soup = BeautifulSoup(html, "html.parser")
print(soup.title)

StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 3, Finished, Available, Finished)

<title>2025 Bank of America Chicago Marathon Race Reviews | Chicago, IL</title>


In [2]:
# Peek at the start of the parsed document to get a feel for its structure.
# Only the first 15,000 characters are printed to keep the output manageable.
pretty_html = soup.prettify()
print(pretty_html[:15000])

StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 4, Finished, Available, Finished)

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1" name="viewport"/>
  <meta content="0217C1162AEBB9DD0DF07648F951AD92" name="msvalidate.01">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <link href="https://raceraves.com/xmlrpc.php" rel="pingback"/>
   <script type="text/javascript">
    window.history.replaceState && window.location.hash === '#_=_' && window.history.replaceState( '','', window.location.href.substr( 0, window.location.href.indexOf( '#' ) ) );
   </script>
   <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
    <style>
     img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }
    </style>
    <!-- This site is optimized with the Yoast SEO plugin v25.2 - https://yoast.com/wordpress/plugins/seo/ -->
    <title>
     2025 Bank of America Chicago Marathon Race Rev

In [3]:
# Print the text of every <h3> heading on the page, one per line.
# No filtering happens here, so any non-review headings are listed as well.
print("🎯 Titles found:")
for heading in soup.find_all("h3"):
    print("-", heading.get_text(strip=True))

StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 5, Finished, Available, Finished)

🎯 Titles found:
- By Distance
- By City
- Chicago, always be Chicago!
- Great Race, Great City. No Headaches.
- Major Marathon
- Great crowd support
- The most FUN race I have ever run
- What do you say, it’s a world major
- Second Major in the Second City
- Amazing experience!
- My new favorite marathon
- Chicago did not disappoint!
- Bucket List Marathon!
- Chicago does it right!
- Running in the Windy City With no wind
- Great, great, great
- Bucket list race
- Amazing race!
- Amazing crowds
- My favorite major (so far)
- Let's do it again
- Chicago Marathon!!


In [4]:
# Show the markup surrounding the first real review title.
# Headings whose text starts with "By " are treated as filter headings,
# not reviews, and are skipped.
review_headings = [
    heading
    for heading in soup.find_all("h3")
    if not heading.get_text(strip=True).startswith("By ")
]

if review_headings:
    first_heading = review_headings[0]
    print(f"\n🔍 Review title: {first_heading.get_text(strip=True)}")

    # The closest enclosing <div> holds the metadata that sits next to the title.
    enclosing_div = first_heading.find_parent("div")
    print(enclosing_div.prettify()[:1500])  # only first 1500 chars of block



StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 6, Finished, Available, Finished)


🔍 Review title: Chicago, always be Chicago!
<div class="comment-metadata">
 <h3 class="race-comment__title display-from-tablet-landscape">
  Chicago, always be Chicago!
 </h3>
 <div class="race-comment__metadata display-from-tablet-landscape">
  <a class="race-comment__subtitle" href="https://raceraves.com/races/bank-of-america-chicago-marathon/#comment-41325">
   <time datetime="2025-04-30T09:01:09-07:00">
    Apr 30, 2025
    <span class="text display-from-tablet-landscape">
     |
    </span>
    <span>
     Marathon
    </span>
   </time>
  </a>
 </div>
 <div class="race-comment__score-title race-comment__score-title--mobile">
  Overall Rating:
 </div>
 <div class="race-comment__score">
  <div class="sneakers">
   <div class="sneakers__nill">
    <span class="sneakers__shoe">
    </span>
    <span class="sneakers__shoe">
    </span>
    <span class="sneakers__shoe">
    </span>
    <span class="sneakers__shoe">
    </span>
    <span class="sneakers__shoe">
    </span>
   </div>
  

🔹 Step 1: Get the first \<h3> (a review title)

In [5]:
# Look the first review title up directly by its CSS class
# rather than filtering headings by their text.
h3 = soup.find("h3", attrs={"class": "race-comment__title"})
first_title = h3.get_text(strip=True)
print(first_title)  # Should print: Chicago, always be Chicago!


StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 7, Finished, Available, Finished)

Chicago, always be Chicago!


In [6]:
reviews = []

# Step 1: Find each full review block
for article in soup.find_all("article", class_="race-comment__comment--full"):

    # Step 2: Combine all <p> tags in this review.
    # Pass " " as the separator: with strip=True alone, get_text() joins the
    # text nodes inside a paragraph with nothing between them, so words on
    # either side of inline tags (<br>, <a>, <em>, ...) would be glued together.
    paragraphs = article.find_all("p")
    review_text = " ".join(p.get_text(" ", strip=True) for p in paragraphs)

    reviews.append({
        "review_text": review_text
    })

# Step 3: Create DataFrame.
# Naming the column explicitly keeps the header even when no reviews are found.
df = pd.DataFrame(reviews, columns=["review_text"])
display(df.head())

# NOTE(review): this is a relative path, so the CSV lands in the kernel's
# current working directory, unlike the other export in this notebook, which
# targets /lakehouse/default/Files/. Confirm that this is intended.
df.to_csv("only_review_texts.csv", index=False)

StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 8, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, f7668efa-8aeb-42da-87f4-644a33a25a9a)

In [7]:
reviewsFull = []

# Step 1: Loop through each review <article>
for article in soup.find_all("article", class_="race-comment__comment--full"):

    # Review text.
    # Pass " " as the separator: with strip=True alone, get_text() joins the
    # text nodes inside a paragraph with nothing between them, so words on
    # either side of inline tags (<br>, <a>, <em>, ...) would be glued together.
    paragraphs = article.find_all("p")
    review_text = " ".join(p.get_text(" ", strip=True) for p in paragraphs)

    # Step 2: Go upward to find the nearest <h3> (title)
    title_tag = article.find_previous("h3")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Step 3: Go upward to find the nearest <time> (date).
    # The <time> element also wraps the "|" separator and the race distance in
    # nested <span> tags, so taking all of its text would produce something like
    # "Apr 30, 2025|Marathon". Only the first text node is the date itself.
    time_tag = article.find_previous("time")
    date = next(time_tag.stripped_strings, None) if time_tag else None

    reviewsFull.append({
        "title": title,
        "date": date,
        "review_text": review_text
    })

# Naming the columns explicitly keeps the header even when no reviews are found.
dfFull = pd.DataFrame(reviewsFull, columns=["title", "date", "review_text"])
display(dfFull.head())
dfFull.to_csv("/lakehouse/default/Files/chicago_reviews_full.csv", index=False)

StatementMeta(, 24d59c39-3995-4eda-ac75-52db00cb0544, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 285917b2-aa6b-4df2-bde7-0b02985fe30e)