In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import logging

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def extract_jackpot_data(text):
    """
    Extract structured jackpot data from text using regex patterns.
    
    Args:
        text (str): Raw text containing jackpot information
        
    Returns:
        dict: Structured jackpot data
    """
    record = {
        'Date': None,
        'Jackpot': None,
        'Cash Value': None,
        'Place': None,
        'Ticket Info': None
    }
    
    # Extract date
    date_match = re.search(r'\b((?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4})\b', text)
    if date_match:
        record['Date'] = date_match.group(1)
    
    # Extract jackpot amount
    jackpot_match = re.search(r'\$([\d.,]+ (?:million|billion))', text)
    if jackpot_match:
        record['Jackpot'] = jackpot_match.group(0)
    
    # Extract cash value (in parentheses)
    cash_match = re.search(r'\(\$([\d.,]+ (?:million|billion) cash)\)', text)
    if cash_match:
        record['Cash Value'] = f"${cash_match.group(1)}"
    
    # Extract place information
    place_patterns = [
        r'won in ([A-Za-z\s]+);',
        r'won in ([A-Za-z\s]+),',
        r'won in ([A-Za-z\s]+)\.',
        r'won in ([A-Za-z\s]+)$'
    ]
    
    for pattern in place_patterns:
        place_match = re.search(pattern, text)
        if place_match:
            record['Place'] = place_match.group(1).strip()
            break
    
    # Handle special cases like trusts or anonymous winners
    if "won by" in text and not record['Place']:
        trust_match = re.search(r'won by (the .+?Trust|an? .+?Trust) of (.+?)\.', text)
        if trust_match:
            record['Ticket Info'] = trust_match.group(1)
            record['Place'] = trust_match.group(2).strip()
        else:
            anon_match = re.search(r'won by an anonymous player in ([A-Za-z\s]+)', text)
            if anon_match:
                record['Place'] = anon_match.group(1).strip()
                record['Ticket Info'] = "Anonymous winner"
    
    # Extract ticket information
    if not record['Ticket Info']:
        ticket_patterns = [
            r'ticket sold in ([^;,.]+)',
            r'ticket sold at ([^;,.]+)',
            r'ticket purchased at ([^;,.]+)',
            r'ticket purchased in ([^;,.]+)',
            r'ticket purchased online through ([^;,.]+)'
        ]
        
        for pattern in ticket_patterns:
            ticket_match = re.search(pattern, text)
            if ticket_match:
                record['Ticket Info'] = ticket_match.group(1).strip()
                break
    
    return record

def scrape_lottery_by_year(soup, year):
    """
    Scrape lottery jackpot information for a specific year.
    
    Args:
        soup (BeautifulSoup): The parsed HTML
        year (str): The year to scrape
        
    Returns:
        list: List of dictionaries containing jackpot records
    """
    records = []
    
    # Try different ways to find the year section based on the HTML structure
    # 1. Direct h2/h3 with year
    year_header = soup.find(['h2', 'h3'], text=year)
    
    # 2. If not found, look for year in a div
    if not year_header:
        year_header = soup.find('div', text=year)
    
    # 3. As a last resort, find the text node with just the year
    if not year_header:
        for element in soup.find_all(text=True):
            if element.strip() == year:
                year_header = element.parent
                break
    
    if year_header:
        # Find the list items that follow the year header
        # This is a common pattern in HTML for jackpot histories
        current = year_header.next_sibling
        
        # Keep going until we find a list or reach the end
        while current and not current.name == 'ul':
            current = current.next_sibling
        
        if current and current.name == 'ul':
            # Process each list item
            for li in current.find_all('li'):
                text = li.text.strip()
                
                # Skip empty items
                if not text:
                    continue
                
                # Extract jackpot data
                record = extract_jackpot_data(text)
                
                # Only add records that have at least date and jackpot
                if record['Date'] and record['Jackpot']:
                    records.append(record)
    
    return records

def scrape_lottery_jackpot_history(url):
    """
    Scrape lottery jackpot history from a webpage.
    
    Args:
        url (str): URL of the jackpot history page
        
    Returns:
        pandas.DataFrame: Cleaned and structured jackpot history data
    """
    try:
        # Send a request to the website
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
        
        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # List to store all jackpot records
        all_records = []
        
        # Find all year sections (common pattern in jackpot history pages)
        # Usually these are in reverse chronological order
        potential_years = []
        
        # Find all headings that might contain years
        for heading in soup.find_all(['h1', 'h2', 'h3', 'h4']):
            text = heading.text.strip()
            # Look for 4-digit years
            if re.match(r'^\d{4}$', text):
                potential_years.append(text)
        
        # If we found potential years, scrape each year's data
        if potential_years:
            for year in potential_years:
                logger.info(f"Scraping data for year {year}")
                year_records = scrape_lottery_by_year(soup, year)
                all_records.extend(year_records)
                logger.info(f"Found {len(year_records)} records for {year}")
        else:
            # Fallback: look for any list items that might contain jackpot info
            logger.info("No year headers found, trying to find jackpot entries directly")
            for li in soup.find_all('li'):
                text = li.text.strip()
                
                # Skip items that don't contain date patterns
                if not re.search(r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}\b', text):
                    continue
                
                # Extract jackpot data
                record = extract_jackpot_data(text)
                
                # Only add records that have at least date and jackpot
                if record['Date'] and record['Jackpot']:
                    all_records.append(record)
        
        # Create a DataFrame
        df = pd.DataFrame(all_records)
        
        # Sort by date (if possible)
        try:
            df['DateObj'] = pd.to_datetime(df['Date'])
            df = df.sort_values('DateObj', ascending=False)
            df = df.drop('DateObj', axis=1)
        except Exception as e:
            logger.warning(f"Could not sort by date: {e}")
        
        return df
        
    except Exception as e:
        logger.error(f"Error scraping data: {e}")
        raise

def main():
    # URL of the lottery jackpot history page
    url = "https://www.megamillions.com/jackpot-history"  # Replace with the actual URL
    
    try:
        # Scrape the data
        df = scrape_lottery_jackpot_history(url)
        
        # Print information about the scraped data
        logger.info(f"Successfully scraped {len(df)} jackpot records")
        print("\nSample data:")
        print(df.head())
        
        # Print statistics on how many entries have values in each column
        print("\nData Completeness:")
        for column in df.columns:
            filled = df[column].notna().sum()
            percentage = filled / len(df) * 100
            print(f"{column}: {filled}/{len(df)} entries filled ({percentage:.1f}%)")
        
        # Save to CSV
        output_file = 'scraped_lottery_jackpot_history.csv'
        df.to_csv(output_file, index=False)
        logger.info(f"Data saved to '{output_file}'")
        
    except Exception as e:
        logger.error(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

2025-02-27 19:31:14,302 - INFO - Scraping data for year 2025
  year_header = soup.find(['h2', 'h3'], text=year)
  year_header = soup.find('div', text=year)
  for element in soup.find_all(text=True):
2025-02-27 19:31:14,306 - INFO - Found 1 records for 2025
2025-02-27 19:31:14,307 - INFO - Scraping data for year 2024
2025-02-27 19:31:14,310 - INFO - Found 4 records for 2024
2025-02-27 19:31:14,311 - INFO - Scraping data for year 2023
2025-02-27 19:31:14,313 - INFO - Found 10 records for 2023
2025-02-27 19:31:14,314 - INFO - Scraping data for year 2022
2025-02-27 19:31:14,316 - INFO - Found 6 records for 2022
2025-02-27 19:31:14,317 - INFO - Scraping data for year 2021
2025-02-27 19:31:14,319 - INFO - Found 6 records for 2021
2025-02-27 19:31:14,319 - INFO - Scraping data for year 2020
2025-02-27 19:31:14,321 - INFO - Found 5 records for 2020
2025-02-27 19:31:14,321 - INFO - Scraping data for year 2019
2025-02-27 19:31:14,323 - INFO - Found 7 records for 2019
2025-02-27 19:31:14,324 - IN


Sample data:
                 Date         Jackpot           Cash Value  \
0    January 17, 2025    $112 million  $49.95 million cash   
1   December 27, 2024  $1.269 billion  $571.9 million cash   
2  September 10, 2024    $810 million  $409.3 million cash   
3        June 4, 2024    $552 million  $260.2 million cash   
4      March 26, 2024  $1.128 billion  $536.6 million cash   

                                               Place  \
0                                            Arizona   
1                                         California   
2                                  Sugar Land, Texas   
3  Illinois with a ticket purchased online throug...   
4                                         New Jersey   

                                Ticket Info  
0                                     Tempe  
1       Sunshine Food and Gas in Cottonwood  
2                      the Sol Living Trust  
3                          Anonymous winner  
4  ShopRite Liquor #781 in Neptune Township  
