# Assignment

## Brief

Write Python code to answer the following questions.

## Instructions

Paste the answer as Python in the answer code section below each question.

### Question 1

Question: The scraping of `https://www.scrapethissite.com/pages/forms/` in the last section assumes a hardcoded (fixed) number of pages. Can you improve the code by removing the hardcoded number of pages and instead using the `»` button to determine whether there are more pages to scrape? Hint: Use a `while` loop.


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def parse_and_extract_rows(soup: BeautifulSoup):
    """
    Extract table rows from the parsed HTML.

    The first ``<tr>`` in the document is treated as the header row: the
    stripped text of its ``<th>`` cells becomes the dictionary keys. Every
    ``<tr class="team">`` is a data row whose ``<td>`` texts are paired with
    those keys in order.

    Args:
        soup: The parsed HTML.

    Returns:
        An iterator of dictionaries with the data from the current page.
        Nothing is yielded when the document contains no ``<tr>`` at all.
    """
    header_row = soup.find('tr')
    if header_row is None:
        # No table on this page (e.g. an empty or error page): end the
        # generator cleanly instead of failing on ``None.find_all``.
        return

    column_names = [th.text.strip() for th in header_row.find_all('th')]

    for team in soup.find_all('tr', class_='team'):
        # zip() stops at the shorter sequence, so a row with more cells than
        # headers (or fewer) is truncated to the columns that pair up.
        yield {
            name: cell.text.strip()
            for name, cell in zip(column_names, team.find_all('td'))
        }

def has_next_page(soup: BeautifulSoup) -> bool:
    """
    Check if there's a next page by looking for the '»' button.

    The anchor labelled ``aria-label="Next"`` is checked first. If it is
    missing, or does not show the '»' glyph, every other anchor on the page
    is inspected as a fallback.

    Args:
        soup: The parsed HTML.

    Returns:
        bool: True if an anchor whose text contains '»' exists, False otherwise.
    """
    # Preferred: the pagination link explicitly labelled "Next".
    next_button = soup.find('a', {'aria-label': 'Next'})
    if next_button is not None and '»' in next_button.text:
        return True

    # Fallback: any link whose rendered text contains '»'. A substring test on
    # the full text is used because an exact ``string='»'`` match fails when
    # the glyph is padded with whitespace or wrapped in a nested <span>.
    return any('»' in link.text for link in soup.find_all('a'))

def scrape_all_hockey_pages(
    base_url: str = "https://www.scrapethissite.com/pages/forms/",
    *,
    timeout: float = 10,
    delay: float = 1,
) -> list:
    """
    Scrape all pages of hockey team data using dynamic pagination detection.

    Pages are requested one at a time, starting at ``page_num=1``, until one
    of the following happens: the request fails, a page yields no team rows,
    or the page no longer shows a '»' (next) button. Rows collected before a
    failure are still returned (best-effort scraping).

    Args:
        base_url: The base URL for the hockey teams page.
        timeout: Seconds to wait for each HTTP response before giving up.
        delay: Seconds to pause between consecutive page requests.

    Returns:
        list: List of dictionaries containing all team data.
    """
    all_teams = []
    page = 1

    while True:
        print(f"Scraping page {page}...")

        # Make request. The page number goes through ``params`` so it is
        # encoded and merged correctly even if base_url already has a query
        # string; the timeout stops a stalled server from hanging the loop.
        try:
            r = requests.get(base_url, params={"page_num": page}, timeout=timeout)
            r.raise_for_status()  # Raises an HTTPError for bad responses
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break

        # Parse HTML
        soup = BeautifulSoup(r.text, "html.parser")

        # Extract data from current page
        page_teams = list(parse_and_extract_rows(soup))

        # If no teams found on this page, we've gone too far
        if not page_teams:
            print(f"No data found on page {page}. Stopping.")
            break

        # Add teams from this page to our collection
        all_teams.extend(page_teams)
        print(f"Found {len(page_teams)} teams on page {page}")

        # Check if there's a next page using the '»' button
        if not has_next_page(soup):
            print("No more pages found (no '»' button). Scraping complete!")
            break

        # Move to next page
        page += 1

        # Be respectful - pause between requests (skipped when delay <= 0)
        if delay > 0:
            time.sleep(delay)

    return all_teams

# Main execution: scrape every page, summarise the result, and export it.
if __name__ == "__main__":
    print("Starting dynamic hockey team scraping...")

    # Scrape all pages
    hockey_data = scrape_all_hockey_pages()

    # Display results
    print(f"\nScraping complete! Found {len(hockey_data)} total teams.")

    # Convert to DataFrame for analysis
    df = pd.DataFrame(hockey_data)
    print(f"DataFrame shape: {df.shape}")

    if df.empty:
        # Nothing was scraped (e.g. the first request failed). Skip the export
        # so an earlier, good CSV is not overwritten with an empty file.
        print("\nNo data scraped; skipping CSV export.")
    else:
        print("\nFirst few rows:")
        print(df.head())

        # Optional: Save to CSV
        output_path = "hockey_teams_all_pages.csv"
        df.to_csv(output_path, index=False)
        print(f"\nData saved to '{output_path}'")


Starting dynamic hockey team scraping...
Scraping page 1...
Found 25 teams on page 1
Scraping page 2...
Found 25 teams on page 2
Scraping page 3...
Found 25 teams on page 3
Scraping page 4...
Found 25 teams on page 4
Scraping page 5...
Found 25 teams on page 5
Scraping page 6...
Found 25 teams on page 6
Scraping page 7...
Found 25 teams on page 7
Scraping page 8...
Found 25 teams on page 8
Scraping page 9...
Found 25 teams on page 9
Scraping page 10...
Found 25 teams on page 10
Scraping page 11...
Found 25 teams on page 11
Scraping page 12...
Found 25 teams on page 12
Scraping page 13...
Found 25 teams on page 13
Scraping page 14...
Found 25 teams on page 14
Scraping page 15...
Found 25 teams on page 15
Scraping page 16...
Found 25 teams on page 16
Scraping page 17...
Found 25 teams on page 17
Scraping page 18...
Found 25 teams on page 18
Scraping page 19...
Found 25 teams on page 19
Scraping page 20...
Found 25 teams on page 20
Scraping page 21...
Found 25 teams on page 21
Scraping pa