In [None]:
import ssl
import time
from io import StringIO
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', None)

In [None]:
# Seasons covered by the scrape: 2016 through 2023 inclusive

years = [season for season in range(2016, 2024)]
urls = []

teams_df = pd.read_csv('clustered_df.csv')

# Distinct values of the "team" column, kept in first-appearance order
teams_upper = []
for team in teams_df["team"]:
    if team not in teams_upper:
        teams_upper.append(team)

# Lowercase twin of every entry in teams_upper, same order
teams_lower = [code.lower() for code in teams_upper]

In [None]:
# Collecting the per-team link prefixes from the 2023 season page

url = "https://www.pro-football-reference.com/years/2023/#all_team_stats"

# Download the page and stop on any HTTP error status
response = requests.get(url)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Every href on the page, in document order
anchors = soup.find_all('a', href=True)
href_list = [anchor['href'] for anchor in anchors]

# Keep each link containing '/teams/' once, preserving first-seen order
team_hrefs = list(dict.fromkeys(href for href in href_list if '/teams/' in href))

# The first match is discarded (presumably a non-team index link - TODO confirm); every remaining
# link loses its '.htm' suffix and then its final path segment.
team_hrefs = [href.replace('.htm', '').rsplit('/', 1)[0] for href in team_hrefs[1:]]

team_hrefs

In [None]:
# Building the team abbreviation CSV that later cells use as a lookup table

teams_abbreviation = pd.read_csv('teams_abbreviations.csv')
teams_abbreviation['Lower'] = teams_lower
teams_abbreviation['Upper'] = teams_upper

# Last path segment of every scraped team link
reference_abbreviations = [reference.split('/')[-1] for reference in team_hrefs]

# When a team's lowercase code is itself one of the scraped codes, record it as the reference code
for r, lowercase in enumerate(teams_abbreviation['Lower'].tolist()):
    if lowercase not in reference_abbreviations:
        continue
    index = reference_abbreviations.index(lowercase)
    teams_abbreviation.loc[r, 'Reference'] = reference_abbreviations[index]

# Hand-entered reference codes, keyed by row label.
# NOTE(review): these labels assume the CSV row order never changes - confirm before reordering the file.
manual_reference_codes = {
    0: 'crd',
    2: 'rav',
    11: 'gnb',
    12: 'htx',
    13: 'clt',
    15: 'kan',
    16: 'ram',
    17: 'sdg',
    18: 'rai',
    21: 'nwe',
    22: 'nor',
    28: 'sfo',
    29: 'tam',
    30: 'oti',
}
for row_label, reference_code in manual_reference_codes.items():
    teams_abbreviation.loc[row_label, 'Reference'] = reference_code

teams_abbreviation.to_csv('teams_abbreviations.csv', index = False)

In [None]:
# Scraping single-season statistics for each team's quarterback and merging them into the clustered data set

PFR_BASE_URL = 'https://www.pro-football-reference.com'
REQUEST_PAUSE_SECONDS = 5     # pause after every request to pace the scraping
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging forever on a stalled connection

# Team index (position in team_hrefs) -> seasons in which the quarterback link sits one slot later
# than usual in the franchise page's link sequence. The original notes attribute the shift to a
# coach being fired during the season.
LATE_QB_LINK_SEASONS = {
    6: {2018},               # Browns
    9: {2020},               # Texans
    10: {2016, 2021},        # Jaguars
    11: {2022},              # Colts
    13: {2021, 2023},        # Raiders
    14: {2022},              # Broncos
    15: {2023},              # Chargers
    18: {2017},              # Giants
    19: {2019},              # Commanders
    20: {2020},              # Lions
    21: {2018},              # Packers
    26: {2020},              # Falcons
    27: {2019, 2022, 2023},  # Panthers
    29: {2016},              # Rams
}

# Team index -> seasons for which the 'Backup_Upper_Reference' code is matched against the
# player tables' 'Tm' column instead of 'Upper_Reference'.
BACKUP_CODE_SEASONS = {
    13: set(range(2016, 2020)),
    15: {2016},
}

BASE_REQUIRED_DROPS = ['Tm', 'Pos', 'No.', 'GS', 'QBrec', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Yds.1',
                       'NY/A', 'ANY/A', '4QC', 'GWD', 'AV', 'Y/G']
BASE_RENAMES = {4: 'Pass_Att', 6: 'Pass_Yds', 7: 'Pass_TD', 11: '1D_Passing', 12: 'Pass_Succ%'}
RUSHING_RENAMES = {7: 'Qb_Rush_Att', 8: 'Qb_Rush_Yds', 9: 'Qb_Rush_Td', 10: 'Qb_Rush_1D',
                   11: 'Qb_Rush_Succ%', 28: 'Qb_Touches', 32: 'Qb_Fmb'}
RUSHING_DROP_POSITIONS = [*range(0, 7), *range(12, 28), 29, 30, 31]
PER_GAME_STATS = ['Cmp', 'Pass_Att', 'Pass_TD', 'Pass_Yds', 'Int', '1D_Passing', 'Sk', 'Qb_Rush_Att',
                  'Qb_Rush_Yds', 'Qb_Rush_Td', 'Qb_Rush_1D', 'Qb_Touches', 'Qb_Fmb', 'Qb_FantPt']


def _fetch_soup(page_url):
    """GET page_url, raise on an HTTP error status, pause, and return the parsed page."""
    page = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()
    time.sleep(REQUEST_PAUSE_SECONDS)
    return BeautifulSoup(page.content, 'html.parser')


def _first_frame(table_tag, description):
    """Convert a <table> tag into a DataFrame; raise ValueError if the tag was not found."""
    if table_tag is None:
        raise ValueError(f"No {description} table found on the page.")
    # StringIO: passing literal HTML strings to read_html is deprecated in recent pandas.
    return pd.read_html(StringIO(str(table_tag)))[0]


def _rename_positions(frame, names_by_position):
    """Return a copy of frame whose columns at the given positions carry the given names."""
    labels = frame.columns.tolist()
    for position, label in names_by_position.items():
        labels[position] = label
    renamed = frame.copy()
    renamed.columns = labels
    return renamed


def _season_row(stats_df, season, team_code, first_copied_col, team_count=None):
    """Return (one-row frame, team_count) for the first row whose 'Year' starts with `season`.

    When that row is followed by per-team rows (its 'Tm' label looks like '2TM', or `team_count`
    is passed explicitly), the values from column `first_copied_col` onward are taken from the
    following row whose 'Tm' equals `team_code`, if there is one.

    Raises:
        ValueError: no row for `season` exists in `stats_df`.
    """
    stats_df = stats_df.reset_index(drop=True)
    for i in range(len(stats_df)):
        raw_year = stats_df.loc[i, 'Year']
        if pd.isna(raw_year):
            continue
        year_text = str(raw_year)[:4]  # drops suffix characters after the four-digit year
        if not year_text.isdigit() or int(year_text) != season:
            continue

        if team_count is None:
            team_label = str(stats_df.loc[i, 'Tm'])
            team_count = int(team_label[0]) if team_label[1:] == 'TM' else 0

        row = stats_df.iloc[[i]].copy()
        for step in range(1, team_count + 1):
            if i + step >= len(stats_df):
                break
            if str(stats_df.loc[i + step, 'Tm']) == team_code:
                # Positional copy, so duplicate column labels cannot misalign the values.
                row.iloc[0, first_copied_col:] = stats_df.iloc[i + step, first_copied_col:].to_numpy()
                break
        return row.reset_index(drop=True), team_count

    raise ValueError(f"No {season} row found in the table.")


# Both lookups are loop-invariant, so they are read once instead of once per season.
abbreviations = pd.read_csv('teams_abbreviations.csv')
reference_codes = abbreviations['Reference'].tolist()

# Loaded once so every team-season is merged into the SAME frame. Re-reading 'clustered_df.csv'
# before each save made every write to 'clustered_df_qb.csv' discard the seasons merged earlier.
full_clustered_df = pd.read_csv('clustered_df.csv')

for num in range(0, 32):
    team = team_hrefs[num]
    team_row = reference_codes.index(team.split('/')[-1])
    cluster_team_code = abbreviations['Upper'].iloc[team_row]

    # Franchise page: `team` already starts with '/', so no extra slash is inserted.
    franchise_soup = _fetch_soup(f'{PFR_BASE_URL}{team}/')
    href_list = [anchor['href'] for anchor in franchise_soup.find_all('a', href=True)]

    # For each season present on the page, keep the season link and the eight links after it.
    relevant_hrefs = []
    for year in years:
        season_link = f'{team}/{year}.htm'
        if season_link in href_list:
            start = href_list.index(season_link)
            relevant_hrefs.extend(href_list[start:start + 9])
    relevant_hrefs = [href.replace('.htm', '') for href in relevant_hrefs]

    qb_years = []

    for year in years:
        season_start = relevant_hrefs.index(f'{team}/{year}')
        link_offset = 7 if year in LATE_QB_LINK_SEASONS.get(num, ()) else 6
        qb_href = relevant_hrefs[season_start + link_offset]

        code_column = 'Backup_Upper_Reference' if year in BACKUP_CODE_SEASONS.get(num, ()) else 'Upper_Reference'
        stats_team_code = abbreviations[code_column].iloc[team_row]

        # One download serves both tables: the passing and rushing URLs differ only by their
        # '#fragment', which is never sent to the server.
        player_soup = _fetch_soup(f'{PFR_BASE_URL}{qb_href}.htm')

        # Passing stats: first table on the page, with its final row discarded.
        qb_base_df = _first_frame(player_soup.find('table'), 'passing').iloc[:-1]
        qb_base_df, base_team_count = _season_row(qb_base_df, year, stats_team_code, 2)
        # 'Awards' is not present in every table, so it is dropped only when it exists.
        qb_base_df = qb_base_df.drop(columns=BASE_REQUIRED_DROPS).drop(columns=['Awards'], errors='ignore')
        qb_base_df = _rename_positions(qb_base_df, BASE_RENAMES)

        # Rushing stats: two-level header flattened to its lower level.
        qb_rushing_df = _first_frame(player_soup.find('table', {'id': 'rushing_and_receiving'}),
                                     'rushing and receiving')
        qb_rushing_df.columns = qb_rushing_df.columns.droplevel()
        qb_rushing_df, rushing_team_count = _season_row(qb_rushing_df, year, stats_team_code, 2)
        qb_rushing_df = _rename_positions(qb_rushing_df, RUSHING_RENAMES)
        qb_rushing_df = qb_rushing_df.drop(qb_rushing_df.columns[RUSHING_DROP_POSITIONS], axis=1)

        # The fantasy table reuses the team count found above rather than parsing its own 'Tm' label.
        number_teams = rushing_team_count or base_team_count

        # Fantasy stats: three-level header flattened to its lowest level, NaN replaced by 0.
        fantasy_soup = _fetch_soup(f'{PFR_BASE_URL}{qb_href}/fantasy/')
        qb_fantasy_df = _first_frame(fantasy_soup.find('table'), 'fantasy')
        qb_fantasy_df.columns = qb_fantasy_df.columns.droplevel().droplevel()
        qb_fantasy_df = qb_fantasy_df.reset_index(drop=True).fillna(0)
        qb_fantasy_df = _rename_positions(qb_fantasy_df, {0: 'Year'})
        # Every per-team row is checked here; previously a misplaced `else` gave up after the first one.
        qb_fantasy_df, _ = _season_row(qb_fantasy_df, year, stats_team_code, 1, team_count=number_teams)
        qb_fantasy_df = _rename_positions(qb_fantasy_df.iloc[:, [-9, -4]],
                                          {0: 'Qb_Snap_Percentage', 1: 'Qb_FantPt'})
        # str() keeps this working when fillna(0) left a numeric 0 in place of a 'NN%' string.
        snap_text = str(qb_fantasy_df.loc[0, 'Qb_Snap_Percentage']).replace('%', '')
        qb_fantasy_df['Qb_Snap_Percentage'] = float(snap_text) / 100

        qb_df = pd.concat([qb_base_df, qb_rushing_df, qb_fantasy_df], axis = 1)

        # Converting the counting statistics to per-game values (scalars are read explicitly;
        # int()/float() on a one-element Series is deprecated).
        games = int(qb_df.loc[0, 'G'])
        for stat in PER_GAME_STATS:
            qb_df[stat] = float(qb_df.loc[0, stat]) / games

        # Drop the leading 'Year' column, then give the generic labels a Qb_ prefix.
        qb_df = _rename_positions(qb_df.iloc[:, 1:], {0: 'Qb_Age', 1: 'Qb_G', 7: 'Qb_TD%'})
        qb_df = qb_df.reset_index(drop=True)

        # Write the stats into every clustered row for this team, season and QB_Position == 1.
        target_rows = (
            (full_clustered_df['team'] == cluster_team_code)
            & (full_clustered_df['season'] == year)
            & (full_clustered_df['QB_Position'] == 1)
        )
        if target_rows.any():
            for position, column in enumerate(qb_df.columns):
                full_clustered_df.loc[target_rows, column] = qb_df.iat[0, position]

        # Saved after every season so an interrupted run still leaves the progress on disk.
        full_clustered_df.to_csv('clustered_df_qb.csv', index  = False)
        qb_years.append(qb_df)
        print(f"{year} quarterback scraped for {team}.")

In [None]:
# SCRAPING CAREER AVERAGES FOR A PLAYER FOR A TEAM'S SEASON

for num in range(0, 32):
    team = team_hrefs[num]

    for j, year in enumerate(years):
        url = f'https://www.pro-football-reference.com{team}/{year}-snap-counts.htm'

        # Send a request to the website
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the first table
        table = soup.find('table', {'id': 'snap_counts'})

        # Extract table headers
        headers = [th.getText() for th in table.find('thead').findAll('th')]
        rows = table.find('tbody').findAll('tr')

        hrefs = []
        for row in rows:
            first_col = row.find('th') 
            if first_col:
                link = first_col.find('a')
                if link:
                    hrefs.append(link['href'])

        # Extract table rows
        data = []
        for row in rows:
            cols = row.findAll('td')
            if cols:
                data.append([col.getText() for col in row.findAll(['th', 'td'])])

        # Create a DataFrame
        df = pd.DataFrame(data)

        receiver_hrefs = []
        runningback_hrefs = []
        tightend_hrefs = []
        quarterback_hrefs = []

        receiver_counts = []
        runningback_counts = []
        tightend_counts = []
        quarterback_counts = []

        for i in range(len(df)):
            player = str(df.iloc[i, 0])
            position = str(df.iloc[i, 1])
            snap_counts = int(df.iloc[i, 2])
            if position == 'RB':
                runningback_hrefs.append(hrefs[i])
                runningback_counts.append(snap_counts)
            elif position == 'WR':
                receiver_hrefs.append(hrefs[i])
                receiver_counts.append(snap_counts)
            elif position == 'TE':
                tightend_hrefs.append(hrefs[i])
                tightend_counts.append(snap_counts)
            elif position == 'QB':
                quarterback_hrefs.append(hrefs[i])
                quarterback_counts.append(snap_counts)
            

        # Getting the appropriate number of players for each position
        receiver_max_indices = sorted(range(len(receiver_counts)), key=lambda i: receiver_counts[i], reverse=True)[:3]
        runningback_max_indices = sorted(range(len(runningback_counts)), key=lambda i: runningback_counts[i], reverse=True)[:2]
        tightend_max_indices = sorted(range(len(tightend_counts)), key=lambda i: tightend_counts[i], reverse=True)[:1]
        quarterback_max_indices = sorted(range(len(quarterback_counts)), key=lambda i: quarterback_counts[i], reverse=True)[:1]

        # Getting the hrefs for all players that are going to be used
        receiver_hrefs = [receiver_hrefs[i] for i in receiver_max_indices]
        receiver_hrefs = [href.replace('.htm', '') for href in receiver_hrefs]

        runningback_hrefs = [runningback_hrefs[i] for i in runningback_max_indices]
        runningback_hrefs = [href.replace('.htm', '') for href in runningback_hrefs]

        tightend_hrefs = [tightend_hrefs[i] for i in tightend_max_indices]
        tightend_hrefs = [href.replace('.htm', '') for href in tightend_hrefs]

        quarterback_hrefs = [quarterback_hrefs[i] for i in quarterback_max_indices]
        quarterback_hrefs = [href.replace('.htm', '') for href in quarterback_hrefs]

        for href in quarterback_hrefs:
            qb_data = []

            # Dealing with passing link for QBs

            # Special case for Tayson Hill
            if href == '/players/H/HillTa00':
                ssl._create_default_https_context = ssl._create_unverified_context

                # Load the URL into a DataFrame
                url = f"https://www.pro-football-reference.com{href}.htm"
                tables = pd.read_html(url)

                # Extract the 4th table
                fourth_table = tables[2]
                qb_base_df = fourth_table

            else:
                qb_passing_link = f"https://www.pro-football-reference.com{href}.htm#all_passing"

                # Dealing with passing link for quarterbacks

                time.sleep(5)

                response = requests.get(qb_passing_link)
                response.raise_for_status()

                soup = BeautifulSoup(response.content, 'html.parser')

                table = soup.find('table')

                qb_base_df = pd.read_html(str(table))[0]

            qb_rushing_link = f"https://www.pro-football-reference.com{href}.htm#all_rushing_and_receiving"

            # qb_base_df.columns = qb_base_df.columns.droplevel()

            all_abbreviations = pd.read_csv('teams_abbreviations.csv')
            all_reference_abbreviation = all_abbreviations['Reference'].tolist()
            all_lower_abbreviation = all_abbreviations['Lower'].tolist()
            all_main_upper_abbreviation = all_abbreviations['Upper_Reference'].tolist()
            all_backup_upper_abbreviation = all_abbreviations['Backup_Upper_Reference'].tolist()

            team_name = team.split('/')[-1]
            index = all_reference_abbreviation.index(team_name)

            if num == 13:
                if year in range(2016, 2020):
                    upper_name = all_backup_upper_abbreviation[index]
                else:
                    upper_name = all_main_upper_abbreviation[index]

            elif num == 15:
                if year == 2016:
                    upper_name = all_backup_upper_abbreviation[index]
                else:
                    upper_name = all_main_upper_abbreviation[index]

            else:
                upper_name = all_main_upper_abbreviation[index]

            qb_age = 0

            for i in range(len(qb_base_df)):
                year_value = qb_base_df.loc[i, 'Year']
                
                if pd.notna(year_value) and year_value != '*':  # Check if the value is not NaN
                    qb_base_df.loc[i, 'Year'] = str(year_value)[:4]
                    df_year = int(qb_base_df.loc[i, 'Year'])

                    if df_year == year:
                        df_year_index = i
                        teams = str(qb_base_df.loc[i, 'Tm'])
                        team_team = teams[1:]
                        age = int(qb_base_df.loc[i, 'Age'])
                        qb_age = age
                        break

            # Finds where player played on signified team and moves that row up to the Career row
            for i in range(len(qb_base_df)):
                if (qb_base_df.loc[i, 'Year']) == 'Career':
                    if (len(qb_base_df) - i) != 0:
                        for x in range(1, len(qb_base_df) - i):
                            if (qb_base_df.loc[x + i, 'Tm']) == upper_name:
                                qb_base_df.iloc[i, 1:] = qb_base_df.iloc[i + x, 1:]
                                break
                    else:
                        break
                    break

            # Makes the base qb data frame the career row
            for i in range(len(qb_base_df)):
                if (qb_base_df.loc[i, 'Year'] == 'Career'):
                    qb_base_df = qb_base_df.loc[[i]]
                    break

            qb_base_df = qb_base_df.reset_index(drop = True)

            qb_base_df.loc[0, 'Age'] = qb_age
            qb_base_df.loc[0, 'Year'] = year

            if href == '/players/H/HillTa00':
                columns_to_drop = ['Tm', 'Pos', 'No.', 'GS', 'QBrec', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Yds.1', 'NY/A', 'ANY/A', '4QC', 'GWD', 'Awards', 'Y/G', 'QBR']
            else:
                columns_to_drop = ['Tm', 'Pos', 'No.', 'GS', 'QBrec', 'Lng', 'Y/A', 'AY/A', 'Y/C', 'Yds.1', 'NY/A', 'ANY/A', '4QC', 'GWD', 'AV', 'Awards', 'Y/G', 'QBR']
            # qb_base_df = qb_base_df.drop(columns_to_drop, axis = 1)
            try:
                qb_base_df = qb_base_df.drop(columns = columns_to_drop, axis = 1)
            except KeyError:
                # 'Awards' not in columns, remove 'Awards' from the list and drop again
                columns_to_drop.remove('Awards')
                qb_base_df = qb_base_df.drop(columns = columns_to_drop, axis = 1)

            qb_base_df = qb_base_df.reset_index(drop = True)

            qb_base_columns = qb_base_df.columns.tolist()
            qb_base_columns[3] = 'Qb_Cmp'
            qb_base_columns[4] = 'Qb_Att'
            qb_base_columns[5] = 'Qb_Cmp%'
            qb_base_columns[6] = 'Qb_Yds'
            qb_base_columns[7] = 'Qb_Pass_Td'
            qb_base_columns[8] = 'Qb_Td%'
            qb_base_columns[9] = 'Qb_Int'
            qb_base_columns[10] = 'Qb_Int%'
            qb_base_columns[11] = 'Qb_Pass_1D'
            qb_base_columns[12] = 'Qb_Pass_Succ%'
            qb_base_columns[13] = 'Qb_Rate'
            qb_base_columns[14] = 'Qb_Sk'
            qb_base_columns[15] = 'Qb_Sk%'
            qb_base_df.columns = qb_base_columns

            qb_base_df = qb_base_df.reset_index(drop=True)

            qb_data.append(qb_base_df)

            time.sleep(5)

            # Working with the Qb rushing url
            response = requests.get(qb_rushing_link)

            soup = BeautifulSoup(response.content, 'html.parser')

            table = soup.find('table', {'id': 'rushing_and_receiving'})

            qb_rushing_df = pd.read_html(str(table))[0]

            qb_rushing_df.columns = qb_rushing_df.columns.droplevel()

            qb_age = 0

            for i in range(len(qb_rushing_df)):
                year_value = qb_rushing_df.loc[i, 'Year']
                
                if pd.notna(year_value) and year_value != '*':  # Check if the value is not NaN
                    qb_rushing_df.loc[i, 'Year'] = str(year_value)[:4]
                    df_year = int(qb_rushing_df.loc[i, 'Year'])

                    if df_year == year:
                        df_year_index = i
                        teams = str(qb_rushing_df.loc[i, 'Tm'])
                        team_team = teams[1:]
                        age = int(qb_rushing_df.loc[i, 'Age'])
                        qb_age = age
                        break

            # Finds where player played on signified team and moves that row up to the Career row
            for i in range(len(qb_rushing_df)):
                if (qb_rushing_df.loc[i, 'Year']) == 'Career':
                    if (len(qb_rushing_df) - i) != 0:
                        for x in range(1, len(qb_rushing_df) - i):
                            if (qb_rushing_df.loc[x + i, 'Tm']) == upper_name:
                                qb_rushing_df.iloc[i, 1:] = qb_rushing_df.iloc[i + x, 1:]
                                break
                    else:
                        break
                    break

            # Makes the base qb data frame the career row
            for i in range(len(qb_rushing_df)):
                if (qb_rushing_df.loc[i, 'Year'] == 'Career'):
                    qb_rushing_df = qb_rushing_df.loc[[i]]
                    break

            qb_rushing_df = qb_rushing_df.reset_index(drop = True)

            qb_rushing_df.loc[0, 'Age'] = qb_age
            qb_rushing_df.loc[0, 'Year'] = year

            qb_rushing_columns = qb_rushing_df.columns.tolist()
            qb_rushing_columns[7] = 'Qb_Rush_Att'
            qb_rushing_columns[8] = 'Qb_Rush_Yds'
            qb_rushing_columns[9] = 'Qb_Rush_Td'
            qb_rushing_columns[10] = 'Qb_Rush_1D'
            qb_rushing_columns[11] = 'Qb_Rush_Succ%'
            qb_rushing_columns[28] = 'Qb_Touches'
            qb_rushing_columns[32] = 'Qb_Fmb'
            qb_rushing_df.columns = qb_rushing_columns

            drop_columns = [0, 1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31]
            qb_rushing_df = qb_rushing_df.drop(qb_rushing_df.columns[drop_columns], axis = 1)

            qb_rushing_df = qb_rushing_df.reset_index(drop=True)

            qb_data.append(qb_rushing_df)

            qb_df = pd.concat(qb_data, axis = 1)

            qb_df_columns = qb_df.columns.tolist()

            qb_df_columns[0] = 'Qb_Year'
            qb_df_columns[1] = 'Qb_Age'
            qb_df_columns[2] = 'Qb_G'

            qb_df.columns = qb_df_columns

            # Applying all qb dfs into the overall df
            full_clustered_df = pd.read_csv('clustered_df_careers.csv')
            all_abbreviations = pd.read_csv('teams_abbreviations.csv')
            all_reference_abbreviation = all_abbreviations['Reference'].tolist()
            all_lower_abbreviation = all_abbreviations['Lower'].tolist()
            all_upper_abbreviation = all_abbreviations['Upper'].tolist()

            for r in range(len(full_clustered_df)):
                cluster_team_name = full_clustered_df.loc[r, 'team']
                team_name = team.split('/')[-1]
                season = full_clustered_df.loc[r, 'season']
                index = all_reference_abbreviation.index(team_name)
                upper_name = all_upper_abbreviation[index]

                all_qb_columns = qb_df.columns.tolist()

                if cluster_team_name == upper_name:
                    if season == year:
                        qb_df = qb_df.rename(index={qb_df.index[0]: r})
                        row_index = qb_df.index[0]
                        for col in all_qb_columns:
                            full_clustered_df.loc[r, col] = qb_df.loc[row_index, col]
            
            full_clustered_df.to_csv('clustered_df_careers.csv', index  = False)
            print(f"{year} quarterback scraped for {team}.")

        # Uncomment the break below to stop after the first year of a franchise (quick test run)
        # break

    # Uncomment the break below to stop after the first franchise (quick test run)
    # break

In [None]:
# SCRAPES CAREER AVERAGES FOR ALL RUNNING BACKS
#
# For each selected franchise and season this cell:
#   1. reads the team's snap-count page and keeps the two RBs with the highest
#      snap count (third column of the snap_counts table),
#   2. loads each back's player page and reduces its first table to the 'Career' row
#      (overwritten by the matching team row when one is listed below it),
#   3. renames/drops columns and copies the row into clustered_df_careers_rb.csv.
#
# Reads `team_hrefs` and `years` defined in earlier cells.

# Local import so the cell runs on its own; it can be moved to the notebook's import cell.
from io import StringIO

# The abbreviation lookup does not change while scraping, so it is read once
# instead of twice per player.
all_abbreviations = pd.read_csv('teams_abbreviations.csv')
all_reference_abbreviation = all_abbreviations['Reference'].tolist()
all_upper_abbreviation = all_abbreviations['Upper'].tolist()
all_main_upper_abbreviation = all_abbreviations['Upper_Reference'].tolist()
all_backup_upper_abbreviation = all_abbreviations['Backup_Upper_Reference'].tolist()

# Positional renames for the two column layouts handled below. Keys are column
# positions after the top header level is dropped; the string suffix `_{u}` is
# appended per player. Layout is detected from the label at position 7.
rb_layout_when_att_first = {
    7: 'Rush_Att', 8: 'Rush_Yds', 9: 'Rush_Tds', 10: 'Rush_1D', 11: 'Rush_Succ%',
    16: 'Rb_Tgt', 17: 'Rb_Rec', 18: 'Rb_Rec_Yds', 20: 'Rb_Rec_Td', 21: 'Rb_Rec_1D',
    22: 'Rb_Rec_Succ%', 28: 'Rb_Touch', 32: 'Rb_Fmb',
}
rb_layout_when_tgt_first = {
    7: 'Rb_Tgt', 8: 'Rb_Rec', 9: 'Rb_Rec_Yds', 11: 'Rb_Rec_Td', 12: 'Rb_Rec_1D',
    13: 'Rb_Rec_Succ%', 19: 'Rush_Att', 20: 'Rush_Yds', 21: 'Rush_Tds', 22: 'Rush_1D',
    23: 'Rush_Succ%', 28: 'Rb_Touch', 32: 'Rb_Fmb',
}

# Columns removed after renaming (same set for both layouts). 'Awards' is optional:
# it is only dropped when the table actually has it.
rb_required_drop_columns = ['Tm', 'Pos', 'No.', 'GS', 'Lng', 'Y/A', 'Y/G', 'Y/R', 'R/G', 'Ctch%',
                            'Y/Tgt', 'Y/Tch', 'YScm', 'RRTD', 'AV', 'A/G']

for num in range(30, 32):
    # team = team_hrefs[10]
    team = team_hrefs[num]

    # Both lookups depend only on the franchise, so they are resolved once per team.
    team_name = team.split('/')[-1]
    index = all_reference_abbreviation.index(team_name)
    cluster_team_name = all_upper_abbreviation[index]

    for year in years:
        url = f'https://www.pro-football-reference.com{team}/{year}-snap-counts.htm'

        # Send a request to the website (timeout so a stalled connection cannot hang the run)
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the snap-counts table
        table = soup.find('table', {'id': 'snap_counts'})
        if table is None:
            raise ValueError(f"No snap_counts table found at {url}")

        # Collect (snap count, href) for every RB row. The href is taken from the
        # same row as the position/snap values so the two can never get out of step
        # when a row has no player link.
        runningback_entries = []
        for row in table.find('tbody').findAll('tr'):
            if not row.findAll('td'):
                continue  # header/separator rows carry no data cells
            cells = [cell.getText() for cell in row.findAll(['th', 'td'])]
            if len(cells) < 3 or cells[1] != 'RB':
                continue
            first_col = row.find('th')
            link = first_col.find('a') if first_col else None
            if link is None:
                print(f"Skipping RB '{cells[0]}' ({year}, {team}): no player link in snap counts.")
                continue
            runningback_entries.append((int(cells[2]), link['href']))

        # Keep the two backs with the most snaps (stable sort: ties keep page order)
        runningback_entries.sort(key=lambda entry: entry[0], reverse=True)
        runningback_hrefs = [player_href.replace('.htm', '') for _, player_href in runningback_entries[:2]]

        # Franchise indices 13 and 15 use the backup abbreviation for early seasons.
        # NOTE(review): presumably these are franchises whose site abbreviation
        # changed - confirm against team_hrefs.
        use_backup_abbreviation = (num == 13 and year in range(2016, 2020)) or (num == 15 and year == 2016)
        if use_backup_abbreviation:
            upper_name = all_backup_upper_abbreviation[index]
        else:
            upper_name = all_main_upper_abbreviation[index]

        # u distinguishes first and second string running backs (1, 2)
        for u, href in enumerate(runningback_hrefs, start=1):
            rb_base_link = f"https://www.pro-football-reference.com{href}.htm"

            # Dealing with the base df for runningbacks

            time.sleep(5)  # throttle requests to the site

            response = requests.get(rb_base_link, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            table = soup.find('table')

            # StringIO: passing literal HTML to read_html is deprecated in pandas >= 2.1
            rb_base_df = pd.read_html(StringIO(str(table)))[0]
            rb_base_df.columns = rb_base_df.columns.droplevel()
            rb_base_df = rb_base_df.reset_index(drop=True)

            # Find the player's age in the scraped season. Only rows whose 'Year'
            # starts with four digits are parsed, so footer rows such as 'Career'
            # no longer raise ValueError when the season is missing from the table.
            rb_age = 0
            for year_value, age_value in zip(rb_base_df['Year'], rb_base_df['Age']):
                season_text = str(year_value)[:4]
                if pd.isna(year_value) or not season_text.isdecimal():
                    continue
                if int(season_text) == year:
                    rb_age = int(age_value)
                    break
            else:
                print(f"Warning: no {year} row found for {href}; age left as 0.")

            # Locate the first 'Career' row. If a row below it belongs to the signified
            # team, copy that row's values (all columns but the first) into the Career
            # row, then keep only the Career row.
            career_positions = [pos for pos, value in enumerate(rb_base_df['Year']) if value == 'Career']
            if career_positions:
                career_pos = career_positions[0]
                team_labels = rb_base_df['Tm'].tolist()
                for team_pos in range(career_pos + 1, len(rb_base_df)):
                    if team_labels[team_pos] == upper_name:
                        rb_base_df.iloc[career_pos, 1:] = rb_base_df.iloc[team_pos, 1:]
                        break
                rb_base_df = rb_base_df.iloc[[career_pos]]

            # Rename stat columns by position according to the detected layout
            rb_base_columns = rb_base_df.columns.tolist()

            if rb_base_columns[7] == 'Att':
                layout = rb_layout_when_att_first
            elif rb_base_columns[7] == 'Tgt':
                layout = rb_layout_when_tgt_first
            else:
                layout = None  # unknown layout: columns are left untouched, as before

            if layout is not None:
                for position, stat_name in layout.items():
                    rb_base_columns[position] = f'{stat_name}_{u}'
                rb_base_df.columns = rb_base_columns

                columns_to_drop = list(rb_required_drop_columns)
                if 'Awards' in rb_base_df.columns:
                    columns_to_drop.append('Awards')
                # Still raises KeyError if any non-optional column is missing
                rb_base_df = rb_base_df.drop(columns = columns_to_drop)

            rb_df = rb_base_df.reset_index(drop=True)

            rb_df_columns = rb_df.columns.tolist()
            rb_df_columns[0] = f'Year_{u}'
            rb_df_columns[1] = f'Rb_Age_{u}'
            rb_df_columns[2] = f'Rb_G_{u}'
            rb_df.columns = rb_df_columns

            rb_df.loc[0, f'Year_{u}'] = year
            rb_df.loc[0, f'Rb_Age_{u}'] = rb_age

            # Applying all rb dfs into the overall df. The CSV is re-read and
            # re-written per player so progress survives an interrupted run.
            full_clustered_df = pd.read_csv('clustered_df_careers_rb.csv')

            is_target_row = (full_clustered_df['team'] == cluster_team_name) & (full_clustered_df['season'] == year)
            all_rb_columns = rb_df.columns.tolist()
            for r in full_clustered_df.index[is_target_row]:
                for col in all_rb_columns:
                    full_clustered_df.loc[r, col] = rb_df.loc[0, col]

            full_clustered_df.to_csv('clustered_df_careers_rb.csv', index  = False)
            print(f"{year}, {u} string runningback scraped for {team}.")

        # Uncomment the break below to stop after the first year of a franchise
        # break

    # Uncomment the break below to stop after the first franchise
    # break

In [None]:
# SCRAPES CAREER AVERAGES FOR ALL WIDE RECEIVERS
#
# For each selected franchise and season this cell:
#   1. reads the team's snap-count page and keeps the three WRs with the highest
#      snap count (third column of the snap_counts table),
#   2. loads each receiver's player page and reduces the stats table to the 'Career'
#      row (overwritten by the matching team row when one is listed below it),
#   3. renames/drops columns and copies the row into clustered_df_careers_wr.csv.
#
# Reads `team_hrefs` and `years` defined in earlier cells.

# Local import so the cell runs on its own; it can be moved to the notebook's import cell.
from io import StringIO


def drop_column_level_if_possible(df):
    """Drop the top header level when `df` has MultiIndex columns.

    The frame's columns are modified in place and the same frame is returned;
    frames with a single header level are returned unchanged.
    """
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.droplevel()
    return df


# The abbreviation lookup does not change while scraping, so it is read once
# instead of twice per player.
all_abbreviations = pd.read_csv('teams_abbreviations.csv')
all_reference_abbreviation = all_abbreviations['Reference'].tolist()
all_upper_abbreviation = all_abbreviations['Upper'].tolist()
all_main_upper_abbreviation = all_abbreviations['Upper_Reference'].tolist()
all_backup_upper_abbreviation = all_abbreviations['Backup_Upper_Reference'].tolist()

# Players whose stats are taken from the third table on the page (tables[2])
# instead of the first one, and whose table has no 'AV' column to drop.
# NOTE(review): presumably their first tables hold other stats (e.g. kick
# returns) - confirm on the player pages.
kick_return_hrefs = ['/players/M/McClRa00', '/players/A/AgneJa00']
third_table_hrefs = ['/players/P/PryoTe00'] + kick_return_hrefs

# Positional renames for the two column layouts handled below. Keys are column
# positions after the top header level is dropped; the string suffix `_{u}` is
# appended per player. Position 7 == 'Att' selects the first layout, anything
# else the second.
wr_layout_when_att_first = {
    7: 'Rec_Rush_Att', 8: 'Rec_Rush_Yds', 9: 'Rec_Rush_Tds', 10: 'Rec_Rush_1D', 11: 'Rec_Rush_Succ%',
    16: 'Rec_Tgt', 17: 'Wr_Rec', 18: 'Rec_Yds', 20: 'Rec_Tds', 21: 'Rec_1D',
    22: 'Rec_Succ%', 28: 'Rec_Touch', 32: 'Rec_Fumb',
}
wr_layout_otherwise = {
    7: 'Rec_Tgt', 8: 'Wr_Rec', 9: 'Rec_Yds', 11: 'Rec_Tds', 12: 'Rec_1D',
    13: 'Rec_Succ%', 19: 'Rec_Rush_Att', 20: 'Rec_Rush_Yds', 21: 'Rec_Rush_Tds', 22: 'Rec_Rush_1D',
    23: 'Rec_Rush_Succ%', 28: 'Rec_Touch', 32: 'Rec_Fumb',
}

# Columns removed after renaming. 'AV' is added for regular players and
# 'Awards' only when the table actually has it.
wr_required_drop_columns = ['Tm', 'Pos', 'No.', 'GS', 'Y/R', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt',
                            'Y/A', 'A/G', 'Y/Tch', 'YScm', 'RRTD']

for num in range(30, 32):
    # team = team_hrefs[10]
    team = team_hrefs[num]

    # Both lookups depend only on the franchise, so they are resolved once per team.
    team_name = team.split('/')[-1]
    index = all_reference_abbreviation.index(team_name)
    cluster_team_name = all_upper_abbreviation[index]

    for year in years:
        url = f'https://www.pro-football-reference.com{team}/{year}-snap-counts.htm'

        # Send a request to the website (timeout so a stalled connection cannot hang the run)
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the snap-counts table
        table = soup.find('table', {'id': 'snap_counts'})
        if table is None:
            raise ValueError(f"No snap_counts table found at {url}")

        # Collect (snap count, href) for every WR row. The href is taken from the
        # same row as the position/snap values so the two can never get out of step
        # when a row has no player link.
        receiver_entries = []
        for row in table.find('tbody').findAll('tr'):
            if not row.findAll('td'):
                continue  # header/separator rows carry no data cells
            cells = [cell.getText() for cell in row.findAll(['th', 'td'])]
            if len(cells) < 3 or cells[1] != 'WR':
                continue
            first_col = row.find('th')
            link = first_col.find('a') if first_col else None
            if link is None:
                print(f"Skipping WR '{cells[0]}' ({year}, {team}): no player link in snap counts.")
                continue
            receiver_entries.append((int(cells[2]), link['href']))

        # Keep the three receivers with the most snaps (stable sort: ties keep page order)
        receiver_entries.sort(key=lambda entry: entry[0], reverse=True)
        receiver_hrefs = [player_href.replace('.htm', '') for _, player_href in receiver_entries[:3]]

        # Franchise indices 13 and 15 use the backup abbreviation for early seasons.
        # NOTE(review): presumably these are franchises whose site abbreviation
        # changed - confirm against team_hrefs.
        use_backup_abbreviation = (num == 13 and year in range(2016, 2020)) or (num == 15 and year == 2016)
        if use_backup_abbreviation:
            upper_name = all_backup_upper_abbreviation[index]
        else:
            upper_name = all_main_upper_abbreviation[index]

        # u distinguishes first, second, and third string wide receivers (1, 2, 3)
        for u, href in enumerate(receiver_hrefs, start=1):
            if href == '/players/P/PattCo00':
                wr_base_link = f'https://www.pro-football-reference.com{href}.htm#all_rushing_and_receiving'
            else:
                wr_base_link = f'https://www.pro-football-reference.com{href}.htm#all_receiving_and_rushing'

            # Dealing with the base df for wide receivers

            time.sleep(5)  # throttle requests to the site

            response = requests.get(wr_base_link, timeout=30)
            response.raise_for_status()

            if href in third_table_hrefs:
                # Take the third table from the page that was just downloaded. This
                # replaces a second, unthrottled download through pd.read_html(url)
                # that also disabled HTTPS certificate verification for the whole kernel.
                wr_base_df = pd.read_html(StringIO(response.text))[2]
            else:
                soup = BeautifulSoup(response.content, 'html.parser')
                table = soup.find('table')
                # StringIO: passing literal HTML to read_html is deprecated in pandas >= 2.1
                wr_base_df = pd.read_html(StringIO(str(table)))[0]

            # Drops level if df has more than one level
            wr_base_df = drop_column_level_if_possible(wr_base_df)

            # Find the player's age in the scraped season. Only rows whose 'Year'
            # starts with four digits are parsed, so footer rows such as 'Career'
            # no longer raise ValueError when the season is missing from the table.
            wr_age = 0
            for year_value, age_value in zip(wr_base_df['Year'], wr_base_df['Age']):
                season_text = str(year_value)[:4]
                if pd.isna(year_value) or not season_text.isdecimal():
                    continue
                if int(season_text) == year:
                    wr_age = int(age_value)
                    break
            else:
                print(f"Warning: no {year} row found for {href}; age left as 0.")

            # Locate the first 'Career' row. If a row below it belongs to the signified
            # team, copy that row's values (all columns but the first) into the Career
            # row, then keep only the Career row.
            career_positions = [pos for pos, value in enumerate(wr_base_df['Year']) if value == 'Career']
            if career_positions:
                career_pos = career_positions[0]
                team_labels = wr_base_df['Tm'].tolist()
                for team_pos in range(career_pos + 1, len(wr_base_df)):
                    if team_labels[team_pos] == upper_name:
                        wr_base_df.iloc[career_pos, 1:] = wr_base_df.iloc[team_pos, 1:]
                        break
                wr_base_df = wr_base_df.iloc[[career_pos]]

            # Rename stat columns by position according to the detected layout
            wr_base_columns = wr_base_df.columns.tolist()

            if wr_base_columns[7] == 'Att':
                layout = wr_layout_when_att_first
            else:
                layout = wr_layout_otherwise

            for position, stat_name in layout.items():
                wr_base_columns[position] = f'{stat_name}_{u}'
            wr_base_df.columns = wr_base_columns

            columns_to_drop = list(wr_required_drop_columns)
            if href not in third_table_hrefs:
                columns_to_drop.append('AV')
            if 'Awards' in wr_base_df.columns:
                columns_to_drop.append('Awards')
            # Still raises KeyError if any non-optional column is missing
            wr_base_df = wr_base_df.drop(columns = columns_to_drop)

            wr_df = wr_base_df.reset_index(drop=True)

            wr_df_columns = wr_df.columns.tolist()
            wr_df_columns[0] = f'Year_{u}'
            wr_df_columns[1] = f'Wr_Age_{u}'
            wr_df_columns[2] = f'Wr_G_{u}'
            wr_df.columns = wr_df_columns

            wr_df = wr_df.fillna(0)

            wr_df.loc[0, f'Year_{u}'] = year
            wr_df.loc[0, f'Wr_Age_{u}'] = wr_age

            # Applying all wr dfs into the overall df. The CSV is re-read and
            # re-written per player so progress survives an interrupted run.
            full_clustered_df = pd.read_csv('clustered_df_careers_wr.csv')

            is_target_row = (full_clustered_df['team'] == cluster_team_name) & (full_clustered_df['season'] == year)
            all_wr_columns = wr_df.columns.tolist()
            for r in full_clustered_df.index[is_target_row]:
                for col in all_wr_columns:
                    full_clustered_df.loc[r, col] = wr_df.loc[0, col]

            full_clustered_df.to_csv('clustered_df_careers_wr.csv', index  = False)

            print(f"{year}, {u} string wide receiver scraped for {team}.")

            # Uncomment the break below to stop after the first receiver of a season
            # break

        # Uncomment the break below to stop after the first year of a franchise
        # break

    # Uncomment the break below to stop after the first franchise
    # break

In [None]:
# SCRAPES CAREER AVERAGES FOR ALL TIGHT ENDS

for num in range(31, 32):
    # team = team_hrefs[10]
    team = team_hrefs[num]

    for j, year in enumerate(years):
        url = f'https://www.pro-football-reference.com{team}/{year}-snap-counts.htm'

        # Send a request to the website
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the snap-counts table (looked up by its id, not by position)
        table = soup.find('table', {'id': 'snap_counts'})

        # Extract table headers
        headers = [th.getText() for th in table.find('thead').findAll('th')]
        rows = table.find('tbody').findAll('tr')

        hrefs = []
        for row in rows:
            first_col = row.find('th') 
            if first_col:
                link = first_col.find('a')
                if link:
                    hrefs.append(link['href'])

        # Extract table rows
        data = []
        for row in rows:
            cols = row.findAll('td')
            if cols:
                data.append([col.getText() for col in row.findAll(['th', 'td'])])

        # Create a DataFrame
        df = pd.DataFrame(data)

        receiver_hrefs = []
        runningback_hrefs = []
        tightend_hrefs = []
        quarterback_hrefs = []

        receiver_counts = []
        runningback_counts = []
        tightend_counts = []
        quarterback_counts = []

        for i in range(len(df)):
            player = str(df.iloc[i, 0])
            position = str(df.iloc[i, 1])
            snap_counts = int(df.iloc[i, 2])
            if position == 'RB':
                runningback_hrefs.append(hrefs[i])
                runningback_counts.append(snap_counts)
            elif position == 'WR':
                receiver_hrefs.append(hrefs[i])
                receiver_counts.append(snap_counts)
            elif position == 'TE':
                tightend_hrefs.append(hrefs[i])
                tightend_counts.append(snap_counts)
            elif position == 'QB':
                quarterback_hrefs.append(hrefs[i])
                quarterback_counts.append(snap_counts)
            

        # Getting the appropriate number of players for each position
        receiver_max_indices = sorted(range(len(receiver_counts)), key=lambda i: receiver_counts[i], reverse=True)[:3]
        runningback_max_indices = sorted(range(len(runningback_counts)), key=lambda i: runningback_counts[i], reverse=True)[:2]
        tightend_max_indices = sorted(range(len(tightend_counts)), key=lambda i: tightend_counts[i], reverse=True)[:1]
        quarterback_max_indices = sorted(range(len(quarterback_counts)), key=lambda i: quarterback_counts[i], reverse=True)[:1]

        # Getting the hrefs for all players that are going to be used
        receiver_hrefs = [receiver_hrefs[i] for i in receiver_max_indices]
        receiver_hrefs = [href.replace('.htm', '') for href in receiver_hrefs]

        runningback_hrefs = [runningback_hrefs[i] for i in runningback_max_indices]
        runningback_hrefs = [href.replace('.htm', '') for href in runningback_hrefs]

        tightend_hrefs = [tightend_hrefs[i] for i in tightend_max_indices]
        tightend_hrefs = [href.replace('.htm', '') for href in tightend_hrefs]

        quarterback_hrefs = [quarterback_hrefs[i] for i in quarterback_max_indices]
        quarterback_hrefs = [href.replace('.htm', '') for href in quarterback_hrefs]

        for href in tightend_hrefs:
            te_data = []

            te_base_link = f"https://www.pro-football-reference.com{href}.htm#all_receiving_and_rushing"
                
            # Dealing with the base df for tight ends

            time.sleep(5)

            response = requests.get(te_base_link)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            table = soup.find('table')

            te_base_df = pd.read_html(str(table))[0]

            te_base_df.columns = te_base_df.columns.droplevel()

            all_abbreviations = pd.read_csv('teams_abbreviations.csv')
            all_reference_abbreviation = all_abbreviations['Reference'].tolist()
            all_lower_abbreviation = all_abbreviations['Lower'].tolist()
            all_main_upper_abbreviation = all_abbreviations['Upper_Reference'].tolist()
            all_backup_upper_abbreviation = all_abbreviations['Backup_Upper_Reference'].tolist()

            team_name = team.split('/')[-1]
            index = all_reference_abbreviation.index(team_name)

            if num == 13:
                if year in range(2016, 2020):
                    upper_name = all_backup_upper_abbreviation[index]
                else:
                    upper_name = all_main_upper_abbreviation[index]

            elif num == 15:
                if year == 2016:
                    upper_name = all_backup_upper_abbreviation[index]
                else:
                    upper_name = all_main_upper_abbreviation[index]

            else:
                upper_name = all_main_upper_abbreviation[index]

            number_teams = 0

            te_age = 0

            for i in range(len(te_base_df)):
                year_value = te_base_df.loc[i, 'Year']
                
                if pd.notna(year_value) and year_value != '*':  # Check if the value is not NaN
                    te_base_df.loc[i, 'Year'] = str(year_value)[:4]
                    df_year = int(te_base_df.loc[i, 'Year'])

                    if df_year == year:
                        df_year_index = i
                        teams = str(te_base_df.loc[i, 'Tm'])
                        team_team = teams[1:]
                        age = int(te_base_df.loc[i, 'Age'])
                        te_age = age
                        break

            # Finds where player played on signified team and moves that row up to the Career row
            for i in range(len(te_base_df)):
                if (te_base_df.loc[i, 'Year']) == 'Career':
                    if (len(te_base_df) - i) != 0:
                        for x in range(1, len(te_base_df) - i):
                            if (te_base_df.loc[x + i, 'Tm']) == upper_name:
                                te_base_df.iloc[i, 1:] = te_base_df.iloc[i + x, 1:]
                                break
                    else:
                        break
                    break

            # Reduces the base te data frame to just its Career row
            for i in range(len(te_base_df)):
                if (te_base_df.loc[i, 'Year'] == 'Career'):
                    te_base_df = te_base_df.loc[[i]]
                    break

            te_base_columns = te_base_df.columns.tolist()

            te_base_columns[7] = 'Te_Rec_Tgt'
            te_base_columns[8] = 'Te_Rec'
            te_base_columns[9] = 'Te_Rec_Yds'
            te_base_columns[11] = 'Te_Rec_Tds'
            te_base_columns[12] = 'Te_Rec_1D'
            te_base_columns[13] = 'Te_Rec_Succ%'
            te_base_columns[19] = 'Te_Rec_Rush_Att'
            te_base_columns[20] = 'Te_Rec_Rush_Yds'
            te_base_columns[21] = 'Te_Rec_Rush_Tds'
            te_base_columns[22] = 'Te_Rec_Rush_1D'
            te_base_columns[23] = 'Te_Rec_Rush_Succ%'
            te_base_columns[28] = 'Te_Rec_Touch'
            te_base_columns[32] = 'Te_Rec_Fumb'

            te_base_df.columns = te_base_columns

            te_base_df = te_base_df.reset_index(drop=True)

            columns_to_drop = ['Tm', 'Pos', 'No.', 'GS', 'Y/R', 'Lng', 'R/G', 'Y/G', 'Ctch%', 'Y/Tgt', 'Lng', 'Y/A', 'Y/G', 'A/G', 'Y/Tch', 'YScm', 'RRTD', 'AV', 'Awards']
            try:
                te_base_df = te_base_df.drop(columns = columns_to_drop, axis = 1)
            except KeyError:
                # 'Awards' not in columns, remove 'Awards' from the list and drop again
                columns_to_drop.remove('Awards')
                te_base_df = te_base_df.drop(columns = columns_to_drop, axis = 1)

            te_data.append(te_base_df)

            time.sleep(5)

            te_df = pd.concat(te_data, axis = 1)

            te_df = te_df.reset_index(drop=True)

            te_df_columns = te_df.columns.tolist()
            te_df_columns[0] = 'Te_Year'
            te_df_columns[1] = 'Te_Age'
            te_df_columns[2] = 'Te_G'
            te_df.columns = te_df_columns

            te_df = te_df.fillna(0)

            te_df.loc[0, 'Te_Year'] = year
            te_df.loc[0, 'Te_Age'] = te_age

            # Applying the te df to the overall df
            full_clustered_df = pd.read_csv('clustered_df_careers_te.csv')
            all_abbreviations = pd.read_csv('teams_abbreviations.csv')
            all_reference_abbreviation = all_abbreviations['Reference'].tolist()
            all_lower_abbreviation = all_abbreviations['Lower'].tolist()
            all_upper_abbreviation = all_abbreviations['Upper'].tolist()

            for r in range(len(full_clustered_df)):
                cluster_team_name = full_clustered_df.loc[r, 'team']
                team_name = team.split('/')[-1]
                season = full_clustered_df.loc[r, 'season']
                index = all_reference_abbreviation.index(team_name)
                upper_name = all_upper_abbreviation[index]

                all_te_columns = te_df.columns.tolist()

                if cluster_team_name == upper_name:
                    if season == year:
                        te_df = te_df.rename(index={te_df.index[0]: r})
                        row_index = te_df.index[0]
                        for col in all_te_columns:
                            full_clustered_df.loc[r, col] = te_df.loc[row_index, col]
            
            full_clustered_df.to_csv('clustered_df_careers_te.csv', index  = False)
            print(f"{year} tight end scraped for {team}.")

            # Remove break statement to do all tight ends of one year of a franchise
            # break

        # Remove break statement to do all years of a franchise
        # break

    # Remove break statement to do all franchises
    # break

In [None]:
# Convert all career statistics to per-game
#
# Every counting stat listed in stats.csv is divided by the games-played column
# of the roster slot it belongs to. Columns that are not listed in stats.csv
# (percentages, ratings, identifiers, ...) are left untouched.

dirty_df = pd.read_csv('clustered_df_careers_all_positions.csv')

# stats.csv is a .csv containing all statistics that need to be converted to per-game statistics (leaving alone statistics like percentages and ratings)
stat_df = pd.read_csv('stats.csv')

qb_stats = stat_df['Qb_Stats'].tolist()
rb_stats_1 = stat_df['Rb_1_Stats'].tolist()
rb_stats_2 = stat_df['Rb_2_Stats'].tolist()
wr_stats_1 = stat_df['Wr_1_Stats'].tolist()
wr_stats_2 = stat_df['Wr_2_Stats'].tolist()
wr_stats_3 = stat_df['Wr_3_Stats'].tolist()
te_stats = stat_df['Te_Stats'].tolist()

# (stat names, games-played column) pairs, in priority order: a column is
# divided by the games column of the FIRST list that contains its name.
stat_groups = [
    (qb_stats, 'Qb_G'),
    (rb_stats_1, 'Rb_G_1'),
    (rb_stats_2, 'Rb_G_2'),
    (wr_stats_1, 'Wr_G_1'),
    (wr_stats_2, 'Wr_G_2'),
    (wr_stats_3, 'Wr_G_3'),
    (te_stats, 'Te_G'),
]

# Snapshot the denominators once, before anything is divided, so a games column
# that is itself listed in stats.csv cannot change the divisor of later columns.
# Zero games is turned into NaN: dividing by 0 would otherwise give +/-inf,
# which fillna(0) does not replace, whereas NaN is filled with 0 downstream.
games_played = {}
for _, games_col in stat_groups:
    if games_col in dirty_df.columns:
        games = dirty_df[games_col].astype(float)
        games_played[games_col] = games.where(games != 0)

for col in dirty_df.columns:
    for stat_names, games_col in stat_groups:
        if col in stat_names:
            # Replace the whole column instead of writing through .loc[:, col]:
            # the result is float even when the csv column was read as integers.
            dirty_df[col] = dirty_df[col].astype(float) / games_played[games_col]
            break

In [None]:
# Replacing every missing value in the per-game frame with 0

dirty_df = dirty_df.fillna(value=0)

# Saving the filled frame (without the index column) for the modelling cells
dirty_df.to_csv('career_per_game.csv', index=False)

In [None]:
# Predicting NFL tendency data based on past career average data.

# NOTE: the string below is a placeholder, not a real file name. Point it at the
# csv holding the career averages for past seasons before running this cell.
df = pd.read_csv('Read in career average df for past seasons')

columns = df.columns

# Column positions 6..41 are the values to predict; every column from
# position 42 onward is used as model input.
target = columns[6:42]
features = columns[42:]

X = df.loc[:, features]
y = df.loc[:, target]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Build and train the random forest in one step (fit returns the estimator)
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the error on the held-out rows
print(f'Mean Squared Error: {mse}')

In [None]:
# Putting in tendency predictions into another csv
#
# Relies on `features`, `target` and `rf_model` defined by the tendency-model
# training cell, so that cell has to be run first.

new_df = pd.read_csv('clustered_df_careers_all_positions_2024.csv')

# Select the model inputs by the same column labels used for training
X_new = new_df.loc[:, features]

# One predicted value per target column for every row of the new data
predictions = rf_model.predict(X_new)

# Write the predictions into the target columns
new_df[target] = predictions

# Store the completed frame in a separate csv, without the index column
new_df.to_csv('new_data_with_predictions.csv', index=False)

In [None]:
# Training model on qb data frame with fantasy results

qb_df = pd.read_csv('clustered_df_qb copy.csv')

# NOTE: `columns` is the column index of the career-average frame loaded in the
# tendency-model cell, not of qb_df, so that cell must have been run and qb_df
# must contain columns with these same names.
features = columns[6:43]
target = 'Qb_FantPt'

X = qb_df.loc[:, features]
y = qb_df[target]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Build and train the random forest in one step (fit returns the estimator).
# This rebinds `rf_model`, replacing the tendency model from the earlier cell.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the error on the held-out rows
print(f'Mean Squared Error: {mse}')

# The rows to predict are read from the same csv that is overwritten at the end
new_df = pd.read_csv('qb_fantasy_predictions.csv')

# Select the model inputs by the same column labels used for training
X_new = new_df.loc[:, features]

# Predict fantasy points for every row and store them in the target column
predictions = rf_model.predict(X_new)
new_df[target] = predictions

# Highest predicted fantasy points first
new_df = new_df.sort_values(by='Qb_FantPt', ascending=False)

# Write the ranked predictions back to the csv, without the index column
new_df.to_csv('qb_fantasy_predictions.csv', index=False)