In [None]:
from bs4 import BeautifulSoup
import time
import requests
import pandas as pd
import ssl
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

pd.set_option('display.max_columns', None)

In [None]:
# Seasons covered by the scrape: 2016 through 2023 inclusive.
years = [*range(2016, 2024)]

# Team identifiers taken from the clustered dataset, de-duplicated in first-seen order,
# plus a lower-cased copy of the same list.
teams_df = pd.read_csv('clustered_df.csv')
teams_upper = pd.unique(teams_df["team"]).tolist()
teams_lower = [name.lower() for name in teams_upper]

In [None]:
# Getting hrefs

def fetch_team_hrefs(url="https://www.pro-football-reference.com/years/2023/#all_team_stats", timeout=30):
    """Fetch the distinct team link prefixes found on a season page.

    Every anchor whose ``href`` contains ``'/teams/'`` is collected; the
    ``'.htm'`` suffix is removed and the last path segment is dropped, so a
    link to one team-season collapses to that team's base path.

    Args:
        url: Page to download and scan for team links.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the notebook cell indefinitely.

    Returns:
        list[str]: Trimmed hrefs in first-seen order without duplicates. The
        first matching anchor on the page is skipped before de-duplication.

    Raises:
        requests.HTTPError: If the response status is an error status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    hrefs = [
        a['href'].replace('.htm', '').rsplit('/', 1)[0]
        for a in soup.find_all('a', href=True)
        if '/teams/' in a['href']
    ]
    # dict.fromkeys keeps insertion order, so this de-duplicates without reordering.
    # The first matching href is treated as irrelevant and dropped.
    return list(dict.fromkeys(hrefs[1:]))

# Performs the HTTP request when this cell runs. The result is kept in a module-level name
# that create_team_abbreviations, scrape_qb_single_season and scrape_career_averages read directly.
team_hrefs = fetch_team_hrefs()

In [None]:
# Helping with team abbreviations

def create_team_abbreviations():
    """Build the team abbreviation lookup table and save it as a CSV.

    Reads 'teams_abbreviations.csv', attaches the module-level ``teams_lower``
    and ``teams_upper`` lists as the 'Lower' and 'Upper' columns, fills a
    'Reference' column for rows whose lower-case name is also the final path
    segment of an entry in ``team_hrefs``, applies the hard-coded per-row
    overrides, and writes 'team_abbreviations_updated.csv'.
    """
    abbreviations = pd.read_csv('teams_abbreviations.csv')
    abbreviations['Lower'] = teams_lower
    abbreviations['Upper'] = teams_upper

    # Last path segment of each scraped team href.
    site_slugs = [href.split('/')[-1] for href in team_hrefs]

    for row in range(len(abbreviations)):
        lower_name = abbreviations.loc[row, 'Lower']
        if lower_name in site_slugs:
            abbreviations.loc[row, 'Reference'] = site_slugs[site_slugs.index(lower_name)]

    # Row label -> reference slug, written after the automatic match so these values always win.
    manual_reference = {
        0: 'crd', 2: 'rav', 11: 'gnb', 12: 'htx', 13: 'clt', 15: 'kan', 16: 'ram',
        17: 'sdg', 18: 'rai', 21: 'nwe', 22: 'nor', 28: 'sfo', 29: 'tam', 30: 'oti',
    }
    for row, slug in manual_reference.items():
        abbreviations.loc[row, 'Reference'] = slug

    abbreviations.to_csv('team_abbreviations_updated.csv', index=False)

# Writes 'team_abbreviations_updated.csv', which both scraping functions below load.
create_team_abbreviations()

In [None]:
# Additional utility functions

def get_team_upper_name(team, year, abbreviations_df):
    """Return the upper-case team label to use for ``team`` in season ``year``.

    The final path segment of ``team`` is looked up in the 'Reference' column.
    For 'rai' in 2016-2019 and for 'sdg' in 2016 the value comes from
    'Backup_Upper_Reference'; in every other case it comes from
    'Upper_Reference'.

    Raises:
        ValueError: If the segment is not present in the 'Reference' column.
    """
    slug = team.rsplit('/', 1)[-1]
    row = abbreviations_df['Reference'].tolist().index(slug)

    needs_backup = (slug == 'rai' and 2016 <= year <= 2019) or (slug == 'sdg' and year == 2016)
    column = 'Backup_Upper_Reference' if needs_backup else 'Upper_Reference'
    return abbreviations_df[column].iloc[row]

def scrape_player_data(url, table_id=None, timeout=30):
    """Download a page and return one of its HTML tables as a DataFrame.

    Args:
        url: Page to download.
        table_id: ``id`` attribute of the wanted ``<table>``. When falsy, the
            first ``<table>`` in the document is used.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        pandas.DataFrame: The first table parsed from the selected element.

    Raises:
        requests.HTTPError: If the response status is an error status.
        ValueError: If the page has no matching ``<table>`` element.
    """
    # Local import so this function does not depend on the notebook's import cell.
    from io import StringIO

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': table_id}) if table_id else soup.find('table')
    if table is None:
        # Previously str(None) was handed to read_html, which hid the real cause.
        wanted = f"table with id '{table_id}'" if table_id else "table"
        raise ValueError(f"No {wanted} found at {url}")
    # read_html expects a path, URL or file-like object; passing raw markup as a
    # plain string is deprecated in recent pandas, so wrap it in StringIO.
    return pd.read_html(StringIO(str(table)))[0]

def filter_year_data(df, year, upper_name):
    """Reduce a per-season stats table to the single row for ``year``.

    The 'Year' column of ``df`` is normalised in place to its first four
    characters (so trailing markers after the year are ignored). The first
    row whose normalised year equals ``year`` is selected. If that row's 'Tm'
    value is a multi-team marker (a count followed by 'TM', e.g. '2TM'), the
    following ``count`` rows are searched for ``upper_name`` and, when found,
    that row's values from the third column onward overwrite the selected
    row in ``df``.

    Args:
        df: Table with 'Year' and 'Tm' columns and a default 0..n-1 index.
            It is modified in place.
        year: Season to keep, as an int.
        upper_name: Team label to prefer inside a multi-team season.

    Returns:
        pandas.DataFrame: A one-row frame with a fresh index for the matching
        season, or ``df`` itself when no row matches ``year``.
    """
    df['Year'] = df['Year'].apply(lambda x: str(x)[:4] if pd.notna(x) else x)
    # Non-numeric labels (e.g. a truncated 'Career') become NaN instead of making
    # int() raise, so rows after them are still examined and the fallback is reachable.
    year_numbers = pd.to_numeric(df['Year'], errors='coerce')

    for i in range(len(df)):
        if year_numbers.loc[i] != year:
            continue

        team_label = str(df.loc[i, 'Tm'])
        stint_count = team_label[:-2]
        if team_label.endswith('TM') and stint_count.isdigit():
            # Never look past the last row, even if the marker promises more rows than exist.
            last = min(i + int(stint_count), len(df) - 1)
            for j in range(i + 1, last + 1):
                if str(df.loc[j, 'Tm']) == upper_name:
                    df.iloc[i, 2:] = df.iloc[j, 2:]
                    break
        return df.loc[[i]].reset_index(drop=True)
    return df

In [None]:
# Scraping qb information

# Scraping single-season statistics for each quarterback
def scrape_qb_single_season():
    """Scrape single-season quarterback stats per team-season and merge them into the dataset.

    For each of the first 32 entries of the module-level ``team_hrefs`` and
    each season in ``years`` that is linked from the team page, the function
    picks a player href at a fixed offset after the season link, scrapes that
    player's default table, 'rushing_and_receiving' table and fantasy page,
    reduces each to the requested season, converts the counting stats to
    per-game values by dividing by 'Qb_G', and writes the resulting columns
    into the rows of 'clustered_df.csv' that match the team, season and
    ``QB_Position == 1``.

    The merged frame is kept in memory for the whole run and written to
    'clustered_df_qb_single_season.csv' after every team-season, so the file
    always holds everything scraped so far. (Re-reading 'clustered_df.csv'
    inside the loop would reset the frame and leave only the most recent
    team-season in the output.)

    Reads the module-level ``team_hrefs`` and ``years``. Returns nothing.
    """
    abbreviations_df = pd.read_csv('team_abbreviations_updated.csv')
    # {position of the team in team_hrefs: {season: offset}}. The offset is how many hrefs
    # after the season link the quarterback link sits; 6 is used when no entry exists.
    qb_exceptions = {
        6: {2018: 7}, 9: {2020: 7}, 10: {2016: 7, 2021: 7}, 11: {2022: 7},
        13: {2021: 7, 2023: 7}, 14: {2022: 7}, 15: {2023: 7}, 18: {2017: 7},
        19: {2019: 7}, 20: {2020: 7}, 21: {2018: 7}, 26: {2020: 7},
        27: {2019: 7, 2022: 7, 2023: 7}, 29: {2016: 7}
    }

    # Loaded once so updates from every team-season accumulate in the same frame.
    full_df = pd.read_csv('clustered_df.csv')

    for num, team in enumerate(team_hrefs[:32]):
        # A timeout keeps a stalled connection from blocking the whole scrape.
        response = requests.get(f"https://www.pro-football-reference.com/{team}/", timeout=30)
        response.raise_for_status()
        time.sleep(5)
        soup = BeautifulSoup(response.content, 'html.parser')
        href_list = [a['href'].replace('.htm', '') for a in soup.find_all('a', href=True)]
        # For every season linked from the page: the season href plus the next 8 hrefs.
        relevant_hrefs = [href for year in years if f'{team}/{year}' in href_list
                          for href in href_list[href_list.index(f'{team}/{year}'):][:9]]

        for year in years:
            if f'{team}/{year}' not in href_list:
                continue
            idx = relevant_hrefs.index(f'{team}/{year}')
            offset = qb_exceptions.get(num, {}).get(year, 6)
            qb_href = relevant_hrefs[idx + offset]

            # Scrape QB base stats
            qb_base_df = scrape_player_data(f'https://www.pro-football-reference.com{qb_href}.htm')
            qb_base_df = qb_base_df.iloc[:-1]  # drop the table's final row before filtering
            upper_name = get_team_upper_name(team, year, abbreviations_df)
            qb_base_df = filter_year_data(qb_base_df, year, upper_name)

            columns_to_drop = ['Tm', 'Pos', 'No.', 'GS', 'QBrec', 'Lng', 'Y/A', 'AY/A', 'Y/C',
                              'Yds.1', 'NY/A', 'ANY/A', '4QC', 'GWD', 'AV', 'Awards', 'Y/G']
            qb_base_df = qb_base_df.drop(columns=[col for col in columns_to_drop if col in qb_base_df.columns])
            # Positional rename: raises if the remaining column count is not exactly 14.
            qb_base_df.columns = ['Year', 'Qb_Age', 'Qb_G', 'Cmp', 'Pass_Att', 'Cmp%', 'Pass_Yds',
                                 'Pass_TD', 'TD%', 'Int', 'Rate', 'Sk', '1D_Passing', 'Pass_Succ%']

            # Scrape QB rushing stats
            time.sleep(5)
            qb_rushing_df = scrape_player_data(f'https://www.pro-football-reference.com{qb_href}.htm',
                                             'rushing_and_receiving')
            qb_rushing_df.columns = qb_rushing_df.columns.droplevel()
            qb_rushing_df = filter_year_data(qb_rushing_df, year, upper_name)
            # Columns from position 7 onward get a 'Qb_' prefix; the first seven keep their names.
            qb_rushing_df.columns = [col if i < 7 else f'Qb_{col}' for i, col in enumerate(qb_rushing_df.columns)]
            qb_rushing_df = qb_rushing_df.drop(columns=[col for col in qb_rushing_df.columns
                                                       if col not in ['Year', 'Qb_Rush_Att', 'Qb_Rush_Yds',
                                                                      'Qb_Rush_Td', 'Qb_Rush_1D', 'Qb_Rush_Succ%',
                                                                      'Qb_Touches', 'Qb_Fmb']])

            # Scrape QB fantasy stats
            time.sleep(5)
            qb_fantasy_df = scrape_player_data(f'https://www.pro-football-reference.com{qb_href}/fantasy/')
            qb_fantasy_df.columns = qb_fantasy_df.columns.droplevel().droplevel()
            qb_fantasy_df = qb_fantasy_df.fillna(0)
            qb_fantasy_df = filter_year_data(qb_fantasy_df, year, upper_name)
            # .copy() so the column assignment below writes to an independent frame, not a slice.
            qb_fantasy_df = qb_fantasy_df.iloc[:, [-9, -4]].copy()
            qb_fantasy_df.columns = ['Qb_Snap_Percentage', 'Qb_FantPt']
            qb_fantasy_df['Qb_Snap_Percentage'] = qb_fantasy_df['Qb_Snap_Percentage'].str.replace('%', '').astype(float) / 100

            # Combine and adjust stats
            qb_df = pd.concat([qb_base_df, qb_rushing_df.drop('Year', axis=1), qb_fantasy_df], axis=1)
            per_game_stats = ['Cmp', 'Pass_Att', 'Pass_TD', 'Pass_Yds', 'Int', '1D_Passing', 'Sk',
                             'Qb_Rush_Att', 'Qb_Rush_Yds', 'Qb_Rush_Td', 'Qb_Rush_1D', 'Qb_Touches',
                             'Qb_Fmb', 'Qb_FantPt']
            games = int(qb_df['Qb_G'].iloc[0])
            for stat in per_game_stats:
                qb_df[stat] = qb_df[stat].astype(float) / games

            # Update main DataFrame (in memory; accumulated across all team-seasons)
            for r in range(len(full_df)):
                if full_df.loc[r, 'team'] == upper_name and full_df.loc[r, 'season'] == year and full_df.loc[r, 'QB_Position'] == 1:
                    # Re-label the single scraped row so it can be read with the target row label.
                    qb_df.index = [r]
                    for col in qb_df.columns:
                        full_df.loc[r, col] = qb_df.loc[r, col]
            # Checkpoint after every team-season so an interrupted run keeps its progress.
            full_df.to_csv('clustered_df_qb_single_season.csv', index=False)
            print(f"{year} quarterback scraped for {team}.")

# Starts the full scrape when this cell runs. Every team-season issues three player-page
# requests separated by 5-second sleeps, so this call is long-running and needs network access.
scrape_qb_single_season()

In [None]:
# Scraping positional statistics

# Scraping career averages for players
def scrape_career_averages(position, hrefs, num_range, filename):
    """Scrape career stat lines for the most-used players at ``position`` and merge them.

    For every team index in ``num_range`` (an index into the module-level
    ``team_hrefs``) and every season in ``years``, the team's snap-count table
    is scraped, the players listed at ``position`` are ranked by the value in
    the table's third column, and the top 1 (QB/TE), 2 (RB) or 3 (WR) are kept.
    For each kept player the 'Career' row of their stats table is taken (or
    replaced by the team-specific row after it whose 'Tm' equals the team's
    upper-case label), stamped with the season and the player's age in that
    season, renamed and trimmed per position, and written into every row of
    the main frame matching the team and season.

    The main frame is loaded once from 'clustered_df_careers_all_positions.csv'
    and kept in memory, so updates accumulate; it is written to ``filename``
    after every player as a checkpoint.

    Args:
        position: One of 'QB', 'RB', 'WR', 'TE'.
        hrefs: Player hrefs, indexed by row position in the snap-count table.
        num_range: Iterable of indices into ``team_hrefs``.
        filename: CSV path the merged frame is written to.

    Raises:
        KeyError: If ``position`` is not one of the four supported values.
    """
    abbreviations_df = pd.read_csv('team_abbreviations_updated.csv')
    # Loaded once: re-reading this inside the loops would throw away every earlier
    # update and leave only the last player's values in the output file.
    full_df = pd.read_csv('clustered_df_careers_all_positions.csv')

    for num in num_range:
        team = team_hrefs[num]
        for year in years:
            snap_df = scrape_player_data(f'https://www.pro-football-reference.com{team}/{year}-snap-counts.htm', 'snap_counts')
            # NOTE(review): hrefs[i] uses the row's position in the whole snap table, so `hrefs`
            # must line up with this team-season's table row for row - confirm against the caller.
            players = [(str(row[0]), str(row[1]), int(row[2]), hrefs[i])
                       for i, row in enumerate(snap_df.itertuples(index=False)) if row[1] == position]
            counts = [p[2] for p in players]
            max_indices = sorted(range(len(counts)), key=lambda i: counts[i], reverse=True)[:{'QB': 1, 'RB': 2, 'WR': 3, 'TE': 1}[position]]
            pos_hrefs = [players[i][3].replace('.htm', '') for i in max_indices]

            for i, href in enumerate(pos_hrefs, 1):
                if position == 'QB' and href == '/players/H/HillTa00':
                    # NOTE(review): this disables HTTPS certificate verification for the whole
                    # process from here on, not just for this one download.
                    ssl._create_default_https_context = ssl._create_unverified_context
                    tables = pd.read_html(f"https://www.pro-football-reference.com{href}.htm")
                    base_df = tables[2]
                else:
                    # The second test used to repeat `position == 'QB'`, which made the
                    # 'rushing_and_receiving' branch unreachable. The RB column map below puts
                    # rushing columns first, which matches that table.
                    if position == 'QB':
                        table_id = 'passing'
                    elif position == 'RB':
                        table_id = 'rushing_and_receiving'
                    else:
                        table_id = 'receiving_and_rushing'
                    base_df = scrape_player_data(f"https://www.pro-football-reference.com{href}.htm", table_id)

                upper_name = get_team_upper_name(team, year, abbreviations_df)
                # Find the career row BEFORE 'Year' is cut to four characters; afterwards the
                # label reads 'Care' and a comparison with 'Career' can never match.
                career_idx = base_df[base_df['Year'] == 'Career'].index[0]
                base_df['Year'] = base_df['Year'].apply(lambda x: str(x)[:4] if pd.notna(x) and x != '*' else x)
                # Non-numeric labels become NaN rather than crashing int(), so the default age
                # of 0 is actually used when the season is absent from the table.
                season_numbers = pd.to_numeric(base_df['Year'], errors='coerce')
                season_rows = base_df.index[season_numbers == year]
                age = int(base_df.loc[season_rows[0], 'Age']) if len(season_rows) else 0
                # Prefer the team-specific row listed after the career row, when there is one.
                for x in range(career_idx + 1, len(base_df)):
                    if base_df.loc[x, 'Tm'] == upper_name:
                        base_df.iloc[career_idx, 1:] = base_df.iloc[x, 1:]
                        break
                base_df = base_df.loc[[career_idx]].reset_index(drop=True)
                base_df.loc[0, 'Age'] = age
                base_df.loc[0, 'Year'] = year

                # Column renaming and dropping based on position.
                # QB renames by column name; the other positions rename by column position,
                # and RB/WR append the player's rank (1-based) to each new name.
                columns_map = {
                    'QB': {'Cmp': 'Qb_Cmp', 'Att': 'Qb_Att', 'Cmp%': 'Qb_Cmp%', 'Yds': 'Qb_Yds', 'TD': 'Qb_Pass_Td',
                           'TD%': 'Qb_Td%', 'Int': 'Qb_Int', 'Int%': 'Qb_Int%', '1D': 'Qb_Pass_1D', 'Succ%': 'Qb_Pass_Succ%',
                           'Rate': 'Qb_Rate', 'Sk': 'Qb_Sk', 'Sk%': 'Qb_Sk%'},
                    'RB': lambda u: {7: f'Rush_Att_{u}', 8: f'Rush_Yds_{u}', 9: f'Rush_Tds_{u}', 10: f'Rush_1D_{u}',
                                    11: f'Rush_Succ%_{u}', 16: f'Rb_Tgt_{u}', 17: f'Rb_Rec_{u}', 18: f'Rb_Rec_Yds_{u}',
                                    20: f'Rb_Rec_Td_{u}', 21: f'Rb_Rec_1D_{u}', 22: f'Rb_Rec_Succ%_{u}', 28: f'Rb_Touch_{u}',
                                    32: f'Rb_Fmb_{u}'},
                    'WR': lambda u: {7: f'Rec_Rush_Att_{u}', 8: f'Rec_Rush_Yds_{u}', 9: f'Rec_Rush_Tds_{u}',
                                    10: f'Rec_Rush_1D_{u}', 11: f'Rec_Rush_Succ%_{u}', 16: f'Rec_Tgt_{u}', 17: f'Wr_Rec_{u}',
                                    18: f'Rec_Yds_{u}', 20: f'Rec_Tds_{u}', 21: f'Rec_1D_{u}', 22: f'Rec_Succ%_{u}',
                                    28: f'Rec_Touch_{u}', 32: f'Rec_Fumb_{u}'},
                    'TE': {7: 'Te_Rec_Tgt', 8: 'Te_Rec', 9: 'Te_Rec_Yds', 11: 'Te_Rec_Tds', 12: 'Te_Rec_1D',
                          13: 'Te_Rec_Succ%', 19: 'Te_Rec_Rush_Att', 20: 'Te_Rec_Rush_Yds', 21: 'Te_Rec_Rush_Tds',
                          22: 'Te_Rec_Rush_1D', 23: 'Te_Rec_Rush_Succ%', 28: 'Te_Rec_Touch', 32: 'Te_Rec_Fumb'}
                }
                drop_cols = {
                    'QB': ['Tm', 'Pos', 'No.', 'GS', 'QBrec', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Yds.1', 'NY/A', 'ANY/A',
                          '4QC', 'GWD', 'AV', 'Awards', 'Y/G', 'QBR'],
                    'RB': ['Tm', 'Pos', 'No.', 'GS', 'Lng', 'Y/A', 'Y/G', 'Y/R', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt',
                          'Y/Tch', 'YScm', 'RRTD', 'AV', 'A/G', 'Awards'],
                    'WR': ['Tm', 'Pos', 'No.', 'GS', 'Lng', 'Y/A', 'Y/G', 'Y/R', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt',
                          'Y/Tch', 'YScm', 'RRTD', 'AV', 'A/G', 'Awards'],
                    'TE': ['Tm', 'Pos', 'No.', 'GS', 'Y/R', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt', 'Lng', 'Y/A', 'Y/G',
                          'A/G', 'Y/Tch', 'YScm', 'RRTD', 'AV', 'Awards']
                }

                if position == 'QB':
                    base_df.columns = [columns_map['QB'].get(col, col) for col in base_df.columns]
                    if href == '/players/H/HillTa00':
                        # For this one player the 'AV' column is kept rather than dropped.
                        drop_cols['QB'].remove('AV')
                else:
                    base_cols = base_df.columns.tolist()
                    col_map = columns_map[position](i) if position in ['RB', 'WR'] else columns_map[position]
                    base_df.columns = [col_map.get(idx, col) for idx, col in enumerate(base_cols)]
                base_df = base_df.drop(columns=[col for col in drop_cols[position] if col in base_df.columns])

                if position == 'QB':
                    # Quarterbacks also get their season rushing line from a second table.
                    rushing_df = scrape_player_data(f"https://www.pro-football-reference.com{href}.htm",
                                                  'rushing_and_receiving')
                    rushing_df.columns = rushing_df.columns.droplevel()
                    rushing_df = filter_year_data(rushing_df, year, upper_name)
                    rushing_df.columns = [f'Qb_{col}' if i >= 7 else col for i, col in enumerate(rushing_df.columns)]
                    rushing_df = rushing_df.drop(columns=[col for col in rushing_df.columns
                                                         if col not in ['Year', 'Qb_Rush_Att', 'Qb_Rush_Yds',
                                                                        'Qb_Rush_Td', 'Qb_Rush_1D', 'Qb_Rush_Succ%',
                                                                        'Qb_Touches', 'Qb_Fmb']])
                    final_df = pd.concat([base_df, rushing_df.drop('Year', axis=1)], axis=1)
                else:
                    final_df = base_df

                # The first three columns are renamed by position to Year / <POS>_Age / <POS>_G.
                final_df.columns = ['Year' if i == 0 else f'{position}_Age' if i == 1 else f'{position}_G' if i == 2
                                   else col for i, col in enumerate(final_df.columns)]
                final_df = final_df.fillna(0)

                # Update main DataFrame (in memory; accumulated across all players)
                for r in range(len(full_df)):
                    if full_df.loc[r, 'team'] == upper_name and full_df.loc[r, 'season'] == year:
                        final_df.index = [r]
                        for col in final_df.columns:
                            full_df.loc[r, col] = final_df.loc[r, col]
                # Checkpoint after every player so an interrupted run keeps its progress.
                full_df.to_csv(filename, index=False)
                print(f"{year}, {i if position in ['RB', 'WR'] else ''} string {position.lower()} scraped for {team}.")

# Execute career scraping for each position.
# Each entry is (position, team indices into team_hrefs, output CSV). The snap-count page of the
# first team in the first season is downloaded again for every position to build the href list.
# NOTE(review): the comprehension unpacks every table row into exactly two values and then
# indexes the first one - confirm that the parsed snap-count table really has that shape.
for _position, _team_indices, _out_csv in (
    ('QB', range(32), 'clustered_df_careers_qb.csv'),
    ('RB', range(30, 32), 'clustered_df_careers_rb.csv'),
    ('WR', range(30, 32), 'clustered_df_careers_wr.csv'),
    ('TE', range(31, 32), 'clustered_df_careers_te.csv'),
):
    _snap_page = requests.get(
        f'https://www.pro-football-reference.com{team_hrefs[0]}/{years[0]}-snap-counts.htm'
    )
    _position_hrefs = [href for _, href in pd.read_html(_snap_page.text)[0].itertuples(index=False)
                       if _[1] == _position]
    scrape_career_averages(_position, _position_hrefs, _team_indices, _out_csv)

In [None]:
# Convert all career statistics to per-game
def convert_to_per_game():
    """Divide each listed career stat by its games-played column and save the result.

    'stats.csv' supplies, per position slot, the names of the stat columns to
    convert. Every listed name that exists in
    'clustered_df_careers_all_positions.csv' is replaced by that column divided
    by the slot's games column (both cast to float). Missing values in the
    final frame are filled with 0 and the frame is written to
    'career_per_game.csv' without the index.
    """
    careers = pd.read_csv('clustered_df_careers_all_positions.csv')
    stat_lists = pd.read_csv('stats.csv')

    # (column of stats.csv holding the stat names, games-played column in the careers frame)
    slots = (
        ('Qb_Stats', 'Qb_G'),
        ('Rb_1_Stats', 'Rb_G_1'),
        ('Rb_2_Stats', 'Rb_G_2'),
        ('Wr_1_Stats', 'Wr_G_1'),
        ('Wr_2_Stats', 'Wr_G_2'),
        ('Wr_3_Stats', 'Wr_G_3'),
        ('Te_Stats', 'Te_G'),
    )
    # Resolve every name list up front, before any column is touched.
    groups = [(stat_lists[names_col].tolist(), games_col) for names_col, games_col in slots]

    for stat_names, games_col in groups:
        for name in stat_names:
            if name not in careers.columns:
                continue
            careers[name] = careers[name].astype(float) / careers[games_col].astype(float)

    per_game = careers.fillna(0)
    per_game.to_csv('career_per_game.csv', index=False)

# Writes 'career_per_game.csv', which predict_nfl_tendencies() loads as its training data.
convert_to_per_game()

In [None]:
# Predicting NFL tendency data based on past career average data
def predict_nfl_tendencies():
    """Fit a random forest on career per-game data and predict tendency columns for 2024.

    Columns 6-41 (by position) of 'career_per_game.csv' are the targets and
    every column from position 42 onward is a feature. The data is split
    80/20 with a fixed seed, the test-set mean squared error is printed, and
    the fitted model fills the target columns of
    'clustered_df_careers_all_positions_2024.csv', which is then saved as
    'tendency_predictions_2024.csv'.
    """
    career_df = pd.read_csv('career_per_game.csv')
    all_columns = career_df.columns.tolist()

    # Positional split: tendency targets first, everything after them is model input.
    target = all_columns[6:42]
    features = all_columns[42:]

    X_train, X_test, y_train, y_test = train_test_split(
        career_df[features], career_df[target], test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(random_state=42)
    forest.fit(X_train, y_train)

    # Hold-out evaluation
    mse = mean_squared_error(y_test, forest.predict(X_test))
    print(f'Mean Squared Error for tendency prediction: {mse}')

    # Score the 2024 frame and overwrite/fill its target columns with the predictions.
    upcoming_df = pd.read_csv('clustered_df_careers_all_positions_2024.csv')
    upcoming_df[target] = forest.predict(upcoming_df[features])

    upcoming_df.to_csv('tendency_predictions_2024.csv', index=False)
    print("Tendency predictions for 2024 saved to 'tendency_predictions_2024.csv'")

# Training model on QB dataframe with fantasy results
def predict_qb_fantasy():
    """Fit a random forest on single-season QB data and predict fantasy points.

    Features are the columns at positions 6-42 of
    'clustered_df_qb_single_season.csv', with the target 'Qb_FantPt' removed
    if it falls inside that range. The data is split 80/20 with a fixed seed
    and the test-set mean squared error is printed. The fitted model then
    fills 'Qb_FantPt' for the rows of 'qb_fantasy_predictions.csv'; the result
    is sorted by predicted points, highest first, and saved as
    'qb_fantasy_predictions_updated.csv'.
    """
    # Load QB single-season data
    qb_df = pd.read_csv('clustered_df_qb_single_season.csv')
    columns = qb_df.columns.tolist()

    # Define features and target.
    # The slice is positional, so the target is excluded explicitly: training on the
    # label itself would leak the answer and would also require the value being
    # predicted to already exist in the new data.
    target = 'Qb_FantPt'
    features = [col for col in columns[6:43] if col != target]

    X = qb_df[features]
    y = qb_df[target]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the RandomForestRegressor
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)

    # Make predictions and evaluate
    y_pred = rf_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error for QB fantasy prediction: {mse}')

    # Predict fantasy points for new QB data
    new_df = pd.read_csv('qb_fantasy_predictions.csv')
    X_new = new_df[features]
    predictions = rf_model.predict(X_new)
    new_df[target] = predictions

    # Sort by fantasy points and save
    new_df = new_df.sort_values(by='Qb_FantPt', ascending=False)
    new_df.to_csv('qb_fantasy_predictions_updated.csv', index=False)
    print("QB fantasy predictions saved to 'qb_fantasy_predictions_updated.csv'")

# Execute prediction functions.
# Each call trains on a CSV written earlier in this notebook ('career_per_game.csv' and
# 'clustered_df_qb_single_season.csv'). The files they score
# ('clustered_df_careers_all_positions_2024.csv' and 'qb_fantasy_predictions.csv') are not
# written by any code visible here and must already exist.
predict_nfl_tendencies()
predict_qb_fantasy()