In [140]:
# Importing packages needed for scrape
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from io import StringIO
import time

In [142]:
# Get Roster for specific team
def get_roster(team_code):
    """
    Scrape roster for a single team.
    Args:
        team_code (str): Team code (e.g., 'det' for Detroit Lions)
    Returns:
        pandas.DataFrame: Team roster data
    """
    try:
        # Set up Chrome in headless mode
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        driver = webdriver.Chrome(options=chrome_options)
        
        # Get the page
        url = f'https://www.pro-football-reference.com/teams/{team_code}/2024_roster.htm'
        driver.get(url)
        
        # Wait for roster table to load
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'roster'))
        )
        
        # Get table HTML
        table_html = table.get_attribute('outerHTML')
        
        # Convert to DataFrame
        df = pd.read_html(table_html)[0]
        
        # Clean up DataFrame
        if 'Rk' in df.columns:
            df = df.drop(columns=['Rk'])
        df['Team'] = team_code.upper()
        
        return df
        
    except Exception as e:
        print(f"Error scraping roster for {team_code}: {str(e)}")
        return None
    finally:
        driver.quit()

def get_all_rosters():
    """
    Get rosters for all NFL teams
    """
    # NFL team codes
    teams = {
        'crd': 'Arizona Cardinals',
        'atl': 'Atlanta Falcons',
        'rav': 'Baltimore Ravens',
        'buf': 'Buffalo Bills',
        'car': 'Carolina Panthers',
        'chi': 'Chicago Bears',
        'cin': 'Cincinnati Bengals',
        'cle': 'Cleveland Browns',
        'dal': 'Dallas Cowboys',
        'den': 'Denver Broncos',
        'det': 'Detroit Lions',
        'gnb': 'Green Bay Packers',
        'htx': 'Houston Texans',
        'clt': 'Indianapolis Colts',
        'jax': 'Jacksonville Jaguars',
        'kan': 'Kansas City Chiefs',
        'rai': 'Las Vegas Raiders',
        'sdg': 'Los Angeles Chargers',
        'ram': 'Los Angeles Rams',
        'mia': 'Miami Dolphins',
        'min': 'Minnesota Vikings',
        'nwe': 'New England Patriots',
        'nor': 'New Orleans Saints',
        'nyg': 'New York Giants',
        'nyj': 'New York Jets',
        'phi': 'Philadelphia Eagles',
        'pit': 'Pittsburgh Steelers',
        'sfo': 'San Francisco 49ers',
        'sea': 'Seattle Seahawks',
        'tam': 'Tampa Bay Buccaneers',
        'oti': 'Tennessee Titans',
        'was': 'Washington Commanders'
    }
    
    all_rosters = []
    
    for code, name in teams.items():
        print(f"Scraping roster for {name}...")
        df = get_roster(code)
        if df is not None:
            df['Team Name'] = name
            all_rosters.append(df)
        time.sleep(4)  # Respect rate limiting
    
    if not all_rosters:
        raise ValueError("No rosters were successfully scraped")
    
    return pd.concat(all_rosters, ignore_index=True)

In [148]:
def get_player_info_and_stats(player_name, team_code='det'):
    """
    Scrape player position and career game logs, adapting to their position
    """
    driver = None
    
    try:
        print("Setting up Chrome driver...")
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        chrome_service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
        
        # First get player's URL from roster
        roster_url = f'https://www.pro-football-reference.com/teams/{team_code}/2024_roster.htm'
        print(f"Accessing roster at: {roster_url}")
        driver.get(roster_url)
        
        # Find player on roster
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'roster'))
        )
        player_links = driver.find_elements(By.XPATH, f"//a[contains(text(), '{player_name}')]")
        if not player_links:
            raise ValueError(f"Player {player_name} not found in roster")
        
        # Get player URL and position
        player_url = player_links[0].get_attribute('href')
        player_row = player_links[0].find_element(By.XPATH, "./ancestor::tr")
        position = player_row.find_elements(By.TAG_NAME, "td")[2].text
        
        print(f"Found player URL: {player_url}")
        print(f"Position: {position}")
        
        # Go directly to game log page
        gamelog_url = player_url.replace(".htm", "/gamelog/")
        print(f"Accessing game logs at: {gamelog_url}")
        driver.get(gamelog_url)
        time.sleep(2)
        
        # Print page source for debugging
        print("\nLooking for game log tables...")
        
        # Find all tables on the page
        tables = driver.find_elements(By.TAG_NAME, 'table')
        print(f"Found {len(tables)} tables on page")
        
        game_log_tables = []
        
        for table in tables:
            try:
                table_html = table.get_attribute('outerHTML')
                df = pd.read_html(StringIO(table_html))[0]
                
                # Check if this looks like a game log table
                required_columns = ['Date', 'Tm', 'Opp']
                if all(col in df.columns for col in required_columns):
                    print(f"Found valid game log table with {len(df)} rows")
                    
                    # Clean up the dataframe
                    df = df[df['Date'].notna()]
                    df = df[~df['Date'].str.contains('Date', na=False)]
                    
                    if not df.empty:
                        game_log_tables.append(df)
                        print(f"Added table with {len(df)} games")
                
            except Exception as e:
                print(f"Skipping table: {str(e)}")
                continue
        
        if not game_log_tables:
            raise ValueError("No game logs found")
            
        # Combine all tables
        combined_logs = pd.concat(game_log_tables, ignore_index=True)
        
        # Clean up column names
        combined_logs.columns = combined_logs.columns.str.strip()
        
        # Remove duplicate rows
        combined_logs = combined_logs.drop_duplicates()
        
        # Print column names for debugging
        print("\nAvailable columns:")
        print(combined_logs.columns.tolist())
        
        # Sort by date if possible
        try:
            combined_logs['Date'] = pd.to_datetime(combined_logs['Date'])
            combined_logs = combined_logs.sort_values('Date', ascending=False)
        except:
            print("Couldn't sort by date")
        
        print(f"\nFound {len(combined_logs)} total games")
        
        # Save all columns for now
        return {
            'position': position,
            'game_logs': combined_logs
        }
        
    except Exception as e:
        print(f"Error scraping stats for {player_name}: {str(e)}")
        if driver:
            print("\nPage source:")
            print(driver.page_source[:1000])  # Print first 1000 chars of page source
        return None
        
    finally:
        if driver:
            try:
                driver.quit()
            except:
                pass

In [109]:
# For just the Lions roster:
lions_roster = get_roster('det')
lions_roster.to_csv('lions_roster_2024.csv', index=False)

  df = pd.read_html(table_html)[0]


In [150]:
if __name__ == "__main__":
    player_name = "Alex Anzalone"
    print(f"Getting stats for {player_name}...")
    
    result = get_player_info_and_stats(player_name)
    if result:
        position = result['position']
        game_logs = result['game_logs']
        
        # Save to CSV
        filename = f"{player_name.replace(' ', '_')}_career_stats.csv"
        game_logs.to_csv(filename, index=False)
        print(f"Saved {position} career stats to {filename}")
        
        # Display first few rows
        print("\nFirst few games:")
        print(game_logs.head())

Getting stats for Alex Anzalone...
Setting up Chrome driver...
Accessing roster at: https://www.pro-football-reference.com/teams/det/2024_roster.htm
Found player URL: https://www.pro-football-reference.com/players/A/AnzaAl00.htm
Position: LB
Accessing game logs at: https://www.pro-football-reference.com/players/A/AnzaAl00/gamelog/

Looking for game log tables...
Found 38 tables on page
Error scraping stats for Alex Anzalone: No game logs found

Page source:
<html data-version="klecko-" data-root="/home/pfr/deploy/www" lang="en" class=" js cookies localstorage sessionstorage cors history csspositionsticky no-touchevents pointerevents matchmedia flexwrap desktop is_live" style=""><head><script src="https://widgets.outbrain.com/outbrain.js" async=""></script><script async="" type="text/javascript" src="https://static.criteo.net/js/ld/publishertag.prebid.144.js"></script><script src="https://rules.quantcount.com/rules-p-UeXruRVtZz7w6.js" async=""></script><script src="https://cdn.hadronid.