In [None]:
# Title: Hockey Team Data Scraping
# Name: Daniel Muthama
# Date: 18 May 2025
# Description: Extracts multi-page hockey team data into a structured CSV.

# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

# Configure headers and base URL
base_url = 'https://www.scrapethissite.com/pages/forms/'
http_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# Seconds to wait for the server; without a timeout a stalled connection
# would block the script indefinitely.
request_timeout = 30

# Column names are taken from the first table that is successfully parsed,
# so page 1 is only requested once. Rows are collected in a plain list and
# converted to a DataFrame once at the end, which avoids re-growing the
# DataFrame on every appended row.
column_headers = None
scraped_rows = []

# Loop through all pages (pagination handling)
page_num = 1
while True:
    # Build URL for each page
    url = base_url if page_num == 1 else f"{base_url}?page_num={page_num}"
    try:
        response = requests.get(url, headers=http_headers, timeout=request_timeout)
    except requests.RequestException as exc:
        # Keep whatever was scraped so far instead of losing it to a
        # network error on a later page.
        print(f"Stopped at page {page_num}. Request failed: {exc}")
        break

    # Break loop if page fetch fails
    if response.status_code != 200:
        print(f"Stopped at page {page_num}. Status code: {response.status_code}")
        break

    # Parse HTML and locate the data table
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', class_='table')
    if not table:
        print(f"No table found on page {page_num}. Exiting loop.")
        break

    if column_headers is None:
        column_headers = [th.text.strip() for th in table.find_all('th')]

    # Extract and clean data rows (the first <tr> is the header row)
    page_rows = []
    for row in table.find_all('tr')[1:]:
        row_data = [col.text.strip() for col in row.find_all('td')]
        # Ignore rows whose cell count does not match the header
        if len(row_data) == len(column_headers):
            page_rows.append(row_data)

    # A page number past the last one may still come back as HTTP 200 with
    # a header-only table; treat "no data rows" as the end of pagination so
    # the loop cannot run forever.
    if not page_rows:
        print(f"No data rows found on page {page_num}. Exiting loop.")
        break

    scraped_rows.extend(page_rows)
    print(f"Page {page_num} scraped successfully.")
    page_num += 1
    time.sleep(1)  # Avoid overwhelming the server

# Fail with a clear message if not even the first page yielded a table
if column_headers is None:
    raise RuntimeError("No table could be scraped from the first page; nothing to export.")

df = pd.DataFrame(scraped_rows, columns=column_headers)

# Clean numeric columns (example: "Wins" and "Losses")
df['Wins'] = df['Wins'].astype(int)
df['Losses'] = df['Losses'].astype(int)

# Save to CSV
df.to_csv('hockey_teams.csv', index=False)
print("Data exported to hockey_teams.csv")

# Display preview
print("\nPreview of the DataFrame:")
print(df.head())

Page 1 scraped successfully.
Page 2 scraped successfully.
Page 3 scraped successfully.
Page 4 scraped successfully.
Page 5 scraped successfully.
