# Fantasy Sidelines 
### Code to Scrape, Clean, Save, and Analyze NFL injury, player stats, team stats, and player snaps data.

## Libraries used in code

In [1]:
import dash, requests, os, time
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import pandas as pd
import datetime as dt
from datetime import date, timedelta
from bs4 import BeautifulSoup
import numpy as np
from dotenv import load_dotenv
load_dotenv()
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

## Section 1 Scrape Data

### 1.1 - Team Weekly Stats Scraped From www.stathead.com

In [None]:
def team_data_scrape(Season,url):
    # scrape weekly team stats from www.stathead.com
    start = time.time()
    # page to start scrape at
    page = 0
    # login payload information pulled from a .env file
    stat_login_url = "https://stathead.com/users/login.cgi"
    stat_user_name = os.environ.get('statheadusername')
    stat_password = os.environ.get('statheadpassword')
    stat_payload = {
        'username': stat_user_name,
        'password': stat_password
    }
    
    # lots of team data, could not fit into one pull from stathead, needed to use 2 different url's, this allows the function to iterate through both
    if url == 1:
        stat_url = 'https://stathead.com/football/tgl_finder.cgi?request=1&temperature_gtlt=lt&game_num_max=99&week_num_max=99&order_by=points&match=game&year_max={Season}&order_by_asc=0&week_num_min=0&game_type=E&game_num_min=0&year_min={Season}&cstat[1]=all_td_team&ccomp[1]=gt&cval[1]=0&cstat[2]=third_down_att&ccomp[2]=gt&cval[2]=0&cstat[3]=vegas_line&ccomp[3]=gt&cval[3]=-50&cstat[4]=penalties&ccomp[4]=gt&cval[4]=0&cstat[5]=rush_att&ccomp[5]=gt&cval[5]=0&cstat[6]=tot_yds&ccomp[6]=gt&cval[6]=0&cstat[7]=first_down&ccomp[7]=gt&cval[7]=0&cstat[8]=punt&ccomp[8]=gt&cval[8]=0&cstat[9]=pass_cmp&ccomp[9]=gt&cval[9]=0&offset={page}'
    elif url == 2:
        stat_url = 'https://stathead.com/football/tgl_finder.cgi?request=1&temperature_gtlt=lt&game_num_max=99&week_num_max=99&order_by=all_td_opp&match=game&year_max={Season}&order_by_asc=0&week_num_min=0&game_type=R&game_num_min=0&year_min={Season}&cstat[1]=tot_yds_opp&ccomp[1]=gt&cval[1]=0&cstat[2]=rush_yds_diff&ccomp[2]=gt&cval[2]=-500&cstat[3]=score_diff_thru_1&ccomp[3]=gt&cval[3]=-500&cstat[4]=rush_att_opp&ccomp[4]=gt&cval[4]=0&cstat[5]=kick_ret_td_tgl&ccomp[5]=gt&cval[5]=0&cstat[6]=pass_cmp_opp&ccomp[6]=gt&cval[6]=0&cstat[7]=first_down_opp&ccomp[7]=gt&cval[7]=0&cstat[8]=score_diff_1_qtr&ccomp[8]=gt&cval[8]=-500&cstat[9]=third_down_att_opp&ccomp[9]=gt&cval[9]=0&offset={page}'
    elif url != 1 or 2:
        print("Please select 1 or 2.")
    
    # open logged in session for scraping
    with requests.Session() as session:

        s = session.post(stat_login_url, data=stat_payload)
        
        # beginning the scrape and stopping the scrape when page number reaches 100k
        try:

            while page < 100000:
                
                # pulling the website and scraping it
                website = session.get(stat_url.format(Season=Season,page=page)).text
                soup = BeautifulSoup(website, 'html')
                table = soup.find('table', attrs={'class': 'sortable', 'id': 'results'})
                
                # pull headers and rows out of the data
                table_headers = [header.text for header in table.find('thead').find_all('th')]
                table_rows = table.find_all('tr')
                
                # final location for complete data
                final_data = []
                
                # create row for each line of data in table
                for tr in table_rows:
                    td = tr.find_all('td')
                    row = [tr.text for tr in td]
                    final_data.append(row)
                
                # create the dataframe in panadas excluding the blank row and matching headers with the rows
                df = pd.DataFrame(final_data[1:], columns=table_headers[12:])
                
                # writting dataframe to csv, continuous appending just incase the function fails, data will be saved
                if url == 1:
                    df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\nflteam_data_1_{Season}_raw.csv',mode='a',index=False)
                else:
                    df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\nflteam_data_2_{Season}_raw.csv',mode='a',index=False)
                
                # progress through the websites
                page += 100
            
        except:
            # notifying the scrape has completed.
            end = time.time()
            print(f'Done: Team {Season}, {url}, {page}',f'Time to complete: {end-start}',sep='\n')

### 1.2 - Player Weekly Snap Data Scraped From www.fantasypros.com

In [None]:
def player_snap_scrape(Season):

    # starting the weekly player snap scrape
    start = time.time()
    
    # two different sides of the ball, editing the url to allow to grab both sides
    sides = ['','defense.php']
    # create the range of weeks needed for the scrape
    weeks = []
    for wk in range(1,18):
        weeks.append(wk)
    
    ENDPOINT = "https://www.fantasypros.com/nfl/reports/snap-count-analysis/{side}?year={Season}&week={week}&snaps=0&range=week"
    
    # create final storage place for data
    final_df = pd.DataFrame()

    # scrape pages for both sides of the ball
    for side in sides:

        # scrape pages for each week of the season
        for week in weeks:

            # opening the webpage and storing data into lists then to dataframe
            res = requests.get(ENDPOINT.format(Season=Season,side=side,week=week))

            soup = BeautifulSoup(res.content, 'lxml')

            table = soup.find('table', {'id': 'data'})
            table_headers = [header.text for header in table.find('thead').find_all('th')]
            table_rows = table.find_all('tr')

            final_data = []

            for tr in table_rows:
                td = tr.find_all('td')
                row = [tr.text for tr in td]
                final_data.append(row)
            
            # create dataframe from current data
            df = pd.DataFrame(final_data[1:], columns=table_headers)
            df['Season'] = Season
            df['Week'] = week
            
            # add current data to final dataframe
            final_df = pd.concat([final_df, df])
    
    # write dataframe to csv
    final_df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\snapcounts_{Season}_raw.csv',index=False)
    end = time.time()
    print(f'Done: Snaps {Season}',f'Time to complete: {end-start}',sep='\n')

### 1.3 - NFL Weekly Injury Reports Scraped From www.pro-football-reference.com

In [None]:
def injury_reports_scrape(Season):

    # scrape weekly nfl injury reports
    start = time.time()
    # list of team abbreviations from pro-football-reference for url purposes
    teams = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gnb','htx','clt','jax','kan',
             'sdg','ram','mia','min','nor','nwe','nyg','nyj','rai','phi','pit','sea','sfo','tam','oti','was']

    ENDPOINT = 'https://www.pro-football-reference.com/teams/{team}/{Season}_injuries.htm'
    
    # creating final place to store data
    final_df = pd.DataFrame()

    # scrape pages for all teams
    for team in teams:
        
        # open webpage and scrape contents into lists, then to a dataframe
        res = requests.get(ENDPOINT.format(Season=Season,team=team))

        soup = BeautifulSoup(res.content, 'lxml')

        table = soup.find('table', attrs={'class': 'sortable', 'id': 'team_injuries'})
        table_rows = table.find_all('tr')

        final_data = []
        for tr in table_rows:
            td = tr.find_all(['th','td'])
            row = [tr['data-tip'] if tr.has_attr("data-tip") else tr.text for tr in td]
            final_data.append(row)

        dfdata = final_data[1:]
        data_body = [[dfdata[j][i] for j in range(len(dfdata))] for i in range(len(dfdata[0]))]

        df = pd.DataFrame(data_body,final_data[0]).T
        # adding team and season columns for identification
        df.insert(loc=1,column='Team',value=team)
        df.insert(loc=2,column='Season',value=Season)

        # combine current data with final dataframe
        final_df = pd.concat([final_df, df])
    
    # rename column
    final_df.rename(columns={'PlayerÂ ':'Player'},inplace=True)
    
    # write final data to csv file
    final_df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\nfl_injuryreport_{Season}_raw.csv',index=False)
    end = time.time()
    print(f'Done: Injury Reports {Season}',f'Time to complete: {end-start}',sep='\n')

### 1.4 - Player Weekly Stats Scraped From www.stathead.com

In [None]:
def player_stats_scape(Season):
    
    # scrape weekly player stats from statehead.com
    start = time.time()
    # page/location for monitoring progress and continuing through pages
    page = 0
    location = 2000
    
    # stathead login info pulled from .env file
    stat_login_url = "https://stathead.com/users/login.cgi"
    stat_user_name = os.environ.get('statheadusername')
    stat_password = os.environ.get('statheadpassword')
    stat_payload = {
        'username': stat_user_name,
        'password': stat_password
    }
    stat_url = "https://stathead.com/football/pgl_finder.cgi?request=1&game_num_max=99&week_num_max=99&order_by=all_td&season_start=1&qb_gwd=0&order_by_asc=0&qb_comeback=0&week_num_min=0&game_num_min=0&year_min={Season}&match=game&year_max={Season}&season_end=-1&age_min=0&game_type=R&age_max=99&positions[]=qb&positions[]=rb&positions[]=wr&positions[]=te&positions[]=e&positions[]=t&positions[]=g&positions[]=c&positions[]=ol&positions[]=dt&positions[]=de&positions[]=dl&positions[]=ilb&positions[]=olb&positions[]=lb&positions[]=cb&positions[]=s&positions[]=db&positions[]=k&positions[]=p&cstat[1]=punt_ret&ccomp[1]=gt&cval[1]=0&cstat[2]=sacks&ccomp[2]=gt&cval[2]=0&cstat[3]=fumbles&ccomp[3]=gt&cval[3]=0&cstat[4]=rush_att&ccomp[4]=gt&cval[4]=0&cstat[5]=pass_defended&ccomp[5]=gt&cval[5]=0&cstat[6]=pass_cmp&ccomp[6]=gt&cval[6]=0&cstat[7]=targets&ccomp[7]=gt&cval[7]=0&cstat[8]=kick_ret&ccomp[8]=gt&cval[8]=0&offset={page}"

    # logging into session to begin scrape
    with requests.Session() as session:

        s = session.post(stat_login_url, data=stat_payload)
        
        # begin scrape, once it fails, stop scrape
        try:

            # scrape webpages up to 100k contents
            while page < 100000:

                # opening webpage and storing data into list then dataframe
                website = session.get(stat_url.format(Season=Season,page=page)).text
                soup = BeautifulSoup(website, 'html')
                table = soup.find('table', attrs={'class': 'sortable', 'id': 'results'})

                table_headers = [header.text for header in table.find('thead').find_all('th')]
                table_rows = table.find_all('tr')

                final_data = []

                for tr in table_rows:
                    td = tr.find_all('td')
                    row = [tr.text for tr in td]
                    final_data.append(row)

                df = pd.DataFrame(final_data[1:], columns=table_headers[11:])
                df.rename(columns={'Year':'Season'},inplace=True)
                
                # appendings csv with data from current dataframe to prevent loss if code fails or connection drops
                df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\player_stats_{Season}_raw.csv',mode='a',index=False)

                # continue through webpages and update where the location of the scrape is
                if page > location:
                    print('Player Stats on page:',(page-100))
                    location += 2000

                page += 100
                
        except:
            # end of the scrape notificaiton
            end = time.time()
            print(f'Done: Stats {Season}, {page}',f'Time to complete: {end-start}',sep='\n')

## Section 2 - Clean data

### 2.1 - Clean Team Weekly Stats

In [None]:
def team_data_clean(Season):
    # begin cleaning team data
    start = time.time()
    # opening both csv files into dataframes for cleaning and combining
    team1 = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\nflteam_data_1_{Season}_raw.csv')
    team2 = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\nflteam_data_2_{Season}_raw.csv')
    # drop all blank rows
    team1.dropna(thresh=10,inplace=True)
    team2.dropna(thresh=10,inplace=True)
    # drop useless columns
    team1.drop('LTime',axis=1,inplace=True)
    team2.drop(['LTime'],axis=1,inplace=True)
    # rename columns
    team1.rename(columns={'Tm':'Team','Unnamed: 5':'Away_Home','PF':'Points_For','PA':'Points_Against','PC':'Points_Comb',\
                         'vs. Line':'Vs_Line','Cmp':'TPass_Cmp','Att':'TPass_Att','Cmp%':'TPass_Cmp%','Yds':'TPass_Yds',\
                          'TD':'TPass_TD','Int':'TPass_Int','Sk':'TSack','Yds.1':'TSack_Yds','Rate':'TQB_Rate',\
                          'Att.1':'TRush_Att','Yds.2':'TRush_Yds','Y/A':'TRush_Y/A','TD.1':'TRush_TD','Tot':'TTot_Yds',\
                          'Ply':'TO_Play#','Y/P':'TO_Y/P','DPly':'TD_Play#','DY/P':'TD_Y/P','TO':'TTot_TO','ToP':'TO_ToP',\
                          'Time.1':'TGame_Dur','Yds.3':'TPen_Yds','OppPen':'TOpp_Pen','OppYds':'TOpp_Pen_Yds',\
                          'CombPen':'TComb_Pen','CombPenYds':'TComb_Pen_Yds','1stD':'T1st_Downs','Rsh':'T1st_by_Rsh',\
                          'Pass':'T1st_by_Pass','Pen.1':'T1st_by_Pen','3DAtt':'T3rd_Down_Att','3DConv':'T3rd_Down_Conv',\
                          '3D%':'T3rd_Down%','4DAtt':'T4th_Down_Att','4DConv':'T4th_Down_Conv','4D%':'T4th_Down%',\
                          'TD.2':'TTot_TD','XPA':'TXP_Att','XPM':'TXP_Made','FGA':'TFG_Att','FGM':'TFG_Made','2PA':'T2Pt_Att',\
                          '2PM':'T2Pt_Made','Sfty':'TSfty','Pnt':'TTimes_Punted','Yds.4':'TPunt_Yds','Y/P.1':'TPunt_Yds_Avg','Year':'Season'},inplace=True)
    team2.rename(columns={'Tm':'Team','Unnamed: 5':'Away_Home','TD':'TOpp_Tot_TD','XPA':'TOpp_XP_Att','XPM':'TOpp_XP_Made',\
                          'Att':'TOpp_FG_Att','Md':'TOpp_FG_Made','Sfty':'TOpp_Sfty','Cmp':'TOpp_Pass_Cmp','Att.1':'TOpp_Pass_Att',\
                          'Cmp%':'TOpp_Pass_Cmp%','Yds':'TOpp_Pass_Yds','TD.1':'TOpp_Pass_TD','Int':'TOpp_Pass_Int','Sk':'TOpp_Sk',\
                          'Yds.1':'TOpp_Sk_Yds','Rate':'TOpp_QB_Rate','Att.2':'TOpp_Rush_Att','Yds.2':'TOpp_Rush_Yds',\
                          'Y/A':'TOpp_Rush_Y/A','TD.2':'TOpp_Rush_TD','Tot':'TOpp_Tot_Yds','TO':'TOpp_Tot_TO',\
                          '1stDOpp':'TOpp_1st_Downs','Rush':'TOpp_1st_by_Rsh','Pass':'TOpp_1st_by_Pass','Pen':'TOpp_1st_by_Pen',\
                          'Opp3DAtt':'TOpp_3rd_Down_Att','Opp3DConv':'TOpp_3rd_Down_Conv','Opp3D%':'TOpp_3rd_Down%',\
                          'Opp4DAtt':'TOpp_4th_Down_Att','Opp4DConv':'TOpp_4th_Down_Conv','Opp4D%':'TOpp_4th_Down%',\
                          'Rush.1':'TMargin_Rush','Pass.1':'TMargin_Pass','Tot.1':'TMargin_TotYds','TO.1':'TTO_TD',\
                          'KR':'TKR_TD','PR':'TPR_TD','IR':'TInt_TD','FR':'TFmb_TD','OR':'TOtherRet_TD',\
                          'RetTD':'TAll_Ret_TD','Q1':'TMar_Thru_Q1','Q2':'TMar_Thru_Q2','Q3':'TMar_Thru_Q3',\
                          'Q1.1':'TScore_Diff_Q1','Q2.1':'TScore_Diff_Q2','Q3.1':'TScore_Diff_Q3',\
                          'Q4':'TScore_Diff_Q4','1stHalf':'TScore_Diff_1stHalf','2ndHalf':'TScore_Diff_2ndHalf','Year':'Season'},inplace=True)
    
    # merge the two dataframes based on team, season, date, time, away/home, opp, week, g#, dat, result, and ot
    team = pd.merge(left=team1,right=team2,\
                     how='outer',\
                     on=['Team','Season','Date','Time','Away_Home','Opp','Week','G#','Day','Result','OT'])
    team.set_index('Team',inplace=True)
    # drop all rows that have headers in them
    team.drop('Tm',inplace=True)
    team.reset_index(inplace=True)

    # create a list of all the column names in the team dataframe
    team_cols = []
    
    for col in team.columns:
        team_cols.append(col)

    # cleaning the away/home column to show away or home instead of @
    team.replace({'Away_Home':{'@':'Away',None:'Home'}},inplace=True)
    team[team_cols[11:]] = team[team_cols[11:]].fillna(value=0)
    # change datatype of the columns
    team[['TPass_Cmp','TPass_Att','T3rd_Down_Att','T3rd_Down_Conv','T4th_Down_Att',\
          'T4th_Down_Conv','TOpp_Pass_Cmp','TOpp_Pass_Att','TOpp_3rd_Down_Att',\
          'TOpp_3rd_Down_Conv','TOpp_4th_Down_Att','TOpp_4th_Down_Conv']] = team[['TPass_Cmp','TPass_Att','T3rd_Down_Att',\
                                                                                  'T3rd_Down_Conv','T4th_Down_Att',\
                                                                                  'T4th_Down_Conv','TOpp_Pass_Cmp',\
                                                                                  'TOpp_Pass_Att','TOpp_3rd_Down_Att',\
                                                                                  'TOpp_3rd_Down_Conv','TOpp_4th_Down_Att',\
                                                                                  'TOpp_4th_Down_Conv']].astype(float)
    # calculating the columns to show apporpriate values
    team['TPass_Cmp%'] = team['TPass_Cmp']/team['TPass_Att']
    team['T3rd_Down%'] = team['T3rd_Down_Conv']/team['T3rd_Down_Att']
    team['T4th_Down%'] = team['T4th_Down_Conv']/team['T4th_Down_Att']
    team['TOpp_Pass_Cmp%'] = team['TOpp_Pass_Cmp']/team['TOpp_Pass_Att']
    team['TOpp_3rd_Down%'] = team['TOpp_3rd_Down_Conv']/team['TOpp_3rd_Down_Att']
    team['TOpp_4th_Down%'] = team['TOpp_4th_Down_Conv']/team['TOpp_4th_Down_Att']
    # changing dates/time to datetime values
    team['Date'] = pd.to_datetime(team['Date'],errors='coerce',format='%Y-%m-%d')
    team['TGame_Dur'] = team['TGame_Dur']+':00'
    team['TO_ToP'] = '00:'+team['TO_ToP']
    team['TGame_Dur'] = pd.to_timedelta(team['TGame_Dur'],errors='coerce')
    team['TGame_Dur'] = team['TGame_Dur'].dt.total_seconds()
    team['TO_ToP'] = pd.to_timedelta(team['TO_ToP'],errors='coerce')
    team['TO_ToP'] = team['TO_ToP'].dt.total_seconds()
    #changing datatypes of columns
    team[team_cols[11:16]] = team[team_cols[11:16]].astype(float)
    team[team_cols[17]] = team[team_cols[17]].astype(float)
    team[team_cols[19:]] = team[team_cols[19:]].astype(float)
    # adding month column
    team.insert(loc=9,column='Month',value=team['Date'].dt.month)
    # chaning week datatype and removing any week larger than week 17
    team['Week'] = team['Week'].astype(int)
    team = team[team['Week']<=17]
    team['Week'] = team['Week'].astype(str)
    # saving cleaned data to csv
    team.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\nflteam_data_{Season}_clean.csv',index=False)
    
    end = time.time()
    print(f'Team data cleaned. {Season}',f'Time to complete: {end-start}',sep='\n')

### 2.2 - Clean Player Weekly Snap

In [None]:
def player_snaps_clean(Season):
    
    start = time.time()
    snaps = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\snapcounts_{Season}_raw.csv')
    # drop unnecessary columns
    snaps.drop(['Fantasy Pts','Pts/100 Snaps','Rush %','Tgt %','Touch %','Util %','Tackle %','Sack %','QB Hit %','Snaps/Gm'],axis=1,inplace=True)
    # change datatypes and calculate columns approriately
    snaps['Season'] = snaps['Season'].astype(str)
    snaps['Snap %'] = snaps['Snap %']/100
    snaps['Week'] = snaps['Week'].astype(str)
    # change team names to be consistent
    snaps.replace({'Team':{'FA':'','GB':'GNB','JAC':'JAX','KC':'KAN','NE':'NWE','NO':'NOR','SF':'SFO','TB':'TAM'}},inplace=True)
    # cleaning player names to be consistent
    snaps['Player'] = snaps['Player']+' '
    snaps.replace({'Player':{' Jr. ':'',' Jr ':'',' Sr. ':'',' Sr ':'',' III ':'',' II ':'',' IV ':'',' V ':''}},regex=True,inplace=True)
    snaps['Player'] = snaps['Player'].str.strip(' ')
    snaps['Player'] = snaps['Player'].str.replace('.','')
    # save cleaned snap data
    snaps.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\snapcounts_{Season}_clean.csv',index=False)
    end = time.time()

    print(f'Snap data cleaned. {Season}',f'Time to complete: {end-start}',sep='\n')

### 2.3 - Clean NFL Weekly Injury Reports

In [None]:
def nfl_injury_clean(Season):
    
    start = time.time()
    injury = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\nfl_injuryreport_{Season}_raw.csv',low_memory=False)
    # separate file to help calculate week numbers
    nfl_weeks = pd.read_csv(r'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\NFL Week Dates.csv')
    # melt columns to break each week into a separate row
    injury = pd.melt(injury,id_vars=['Player','Team','Season'],var_name='Date', value_name='Status')
    # create 2 columns based off a header
    injury[['Date','Opp']] = injury.Date.str.split('vs. ',expand=True)
    injury[['Month','Day']] = injury.Date.str.split('/',expand=True)
    injury[['Status','Injury']] = injury.Status.str.split(":",expand=True)
    # drop all values that are empty after the melting/splitting
    injury.dropna(axis=0,subset=['Status','Injury'],how='all',inplace=True)
    # change datatypes for cleaning
    injury[['Season','Month','Day']] = injury[['Season','Month','Day']].astype(int)
    # combine datat to create a date for the game and change to datetime value
    injury['Date'] = injury['Date']+'/'+(np.where(injury['Month']<=2,injury['Season']+1,injury['Season'])).astype(str)
    injury['Date'] = pd.to_datetime(injury['Date'])
    nfl_weeks['Week'] = nfl_weeks['Week'].astype(str)
    # create function to calculate what nfl week the date occured
    def pre_thu(d):
        days_behind = 3 - d.weekday()
        if days_behind > 0:
            days_behind -= 7
        return d + dt.timedelta(days_behind)
    injury['week_start_nfl'] = injury['Date'].apply(pre_thu)
    nfl_weeks['Start Date'] = pd.to_datetime(nfl_weeks['Start Date'])
    # merge injury data with the nfl week data to pull the weeks into current datagrame
    injury = pd.merge(left=injury,right=nfl_weeks,how='left',left_on='week_start_nfl',right_on='Start Date')
    # update team names for consistency
    injury.replace({'Team':\
                       {'crd':'ARI', 'atl':'ATL', 'rav':'BAL', 'buf':'BUF', 'car':'CAR', 'chi':'CHI', 'cin':'CIN',\
                        'cle':'CLE', 'dal':'DAL', 'den':'DEN', 'det':'DET', 'gnb':'GNB','htx':'HOU','clt':'IND',\
                        'jax':'JAX','kan':'KAN','sdg':'LAC','ram':'LAR','mia':'MIA','min':'MIN','nor':'NOR','nwe':'NWE',\
                        'nyg':'NYG','nyj':'NYJ','rai':'OAK','phi':'PHI','pit':'PIT','sea':'SEA','sfo':'SFO','tam':'TAM',\
                        'oti':'TEN','was':'WAS'}},inplace=True)
    # clean injury data for easier management and searching of data
    injury['Injury'] = injury['Injury'].str.strip(' ')
#     injury.replace({'Injury':{'right':'','left':'','Right':'','Left':'','Biceps':'Bicep',\
#                               'Triceps':'Tricep','Ankles':'Ankle','hip':'Hip','Hips':'Hip','Knees':'Knee',\
#                               'Virus':'Illness','Triceps':'Tricep','Oblique':'Abdomen',\
#                               'NotInjuryRelated':'Not Injury Related','MedicalIllness':'Illness',\
#                               'LowerLeg':'Lower Leg','CoreMuscle':'Abdomen','Abdominal':'Abdomen'}},\
#                                regex=True,inplace=True)
    # clean player names for consistency
    injury['Player'] = injury['Player']+' '
    injury.replace({'Player':{' Jr. ':'',' Jr ':'',' Sr. ':'',' Sr ':'',' III ':'',' II ':'',' IV ':'',' V ':''}},regex=True,inplace=True)
    injury['Player'] = injury['Player'].str.strip(' ')
    injury['Player'] = injury['Player'].str.replace('.','')
    # drop unnecessary columns
    injury.drop(['Month','Day','week_start_nfl','Start Date'],axis=1,inplace=True)
    # drop all empty week values
    injury.dropna(subset=['Week'],inplace=True)
    # change datatypes
    injury[['Player','Week','Season']] = injury[['Player','Week','Season']].astype(str)
    # reorganize columns
    injury = injury[['Player','Team','Opp','Date','Season','Week','Status','Injury']]
    # add two empty columns for data input
    injury[['Specific_Inj','Side']] = None
    injury.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\injury_report_{Season}_clean.csv',index=False)
    end = time.time()
    
    print(f'Injury data cleaned. {Season}',f'Time to complete: {end-start}',sep='\n')

### 2.4 - Clean Player Weekly Stats

In [None]:
def player_stats_clean(Season):
    
    start = time.time()
    stats = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Raw\player_stats_{Season}_raw.csv')
    # drop empty rows
    stats.dropna(how='all',inplace=True)
    # drop all rows with header information
    stats.drop(stats[stats['Player'] == 'Player'].index, inplace = True)
    stats.drop('Lg',axis=1,inplace=True)
    # rename columns for better management
    stats.rename(columns={'Tm':'Team','Unnamed: 6':'Away_Home','Cmp':'IPass_Cmp','Att':'IPass_Att','Cmp%':'IPass_Cmp%','Yds':'IPass_Yds',\
                 'TD':'IPass_TD','Int':'IPass_Int','Rate':'IQB_Rate','Sk':'I_Sk','Yds.1':'ISk_Yds','Y/A':'IPass_Y/A',\
                 'AY/A':'IPass_AdjY/A','Att.1':'IRush_Att','Yds.2':'IRush_Yds','Y/A.1':'IRush_Y/A','TD.1':'IRush_TD',\
                 'Tgt':'IRec_Tgt','Rec':'IRec_Rec','Yds.3':'IRec_Yds','Y/R':'IRec_Y/R','TD.2':'IRec_TD','Ctch%':'IRec_Ctch%',\
                 'Y/Tgt':'IRec_Y/Tgt','XPM':'IXP_Made','XPA':'IXP_Att','XP%':'IXP%','FGM':'IFG_Made','FGA':'IFG_Att',\
                 'FG%':'IFG%','2PM':'I2pt_Made','Sfty':'ISfty','TD.3':'ITot_TD','Pts':'ITot_Pts','Rt':'IKR_Rt','Yds.4':'IKR_Yds',\
                 'Y/Rt':'IKR_Y/Rt','TD.4':'IKR_TD','Ret':'IPR_Rt','Yds.5':'IPR_Yds','Y/R.1':'IPR_Y/Rt','TD.5':'IPR_TD',\
                 'Sk.1':'ITack_Sk','Solo':'ITack_Solo','Ast':'ITack_Ast','Comb':'ITack_Tot','TFL':'ITack_TFL',\
                 'QBHits':'ITack_QBHits','Int.1':'IDef_Int','Yds.6':'IDef_IntYds','TD.6':'IDef_IntTD','PD':'IDef_PD',\
                 'Fmb':'IFmb_Fmb','FL':'IFmb_Lost','FF':'IFmb_Forced','FR':'IFmb_Recov','Yds.7':'IFmb_Yds','TD.7':'IFmb_TD'},\
                 inplace=True)
    # create a list of the columns
    stats_cols = []
    for col in stats.columns:
        stats_cols.append(col)
    # replace blanks and @ with home and away
    stats.replace({'Away_Home':{'@':'Away',None:'Home'}},inplace=True)
    # change datatypes and calculate data appropriately
    stats[['IPass_Cmp','IPass_Att','IRec_Rec','IRec_Tgt','IXP_Made','IXP_Att','IFG_Made','IFG_Att']] = stats[['IPass_Cmp','IPass_Att','IRec_Rec','IRec_Tgt','IXP_Made','IXP_Att','IFG_Made','IFG_Att']].astype(float)
    stats['IPass_Cmp%'] = stats['IPass_Cmp']/stats['IPass_Att']
    stats['IRec_Ctch%'] = stats['IRec_Rec']/stats['IRec_Tgt']
    stats['IXP%'] = stats['IXP_Made']/stats['IXP_Att']
    stats['IFG%'] = stats['IFG_Made']/stats['IFG_Att']
    # change datatypes
    stats[stats_cols[11:]] = stats[stats_cols[11:]].astype(float)
    stats[stats_cols[11:]] = stats[stats_cols[11:]].fillna(value=0)
    stats['Date'] = pd.to_datetime(stats['Date'],errors='coerce',format='%Y-%m-%d')
    # insert a column for season
    stats.insert(loc=4,column='Season',value=Season)
    stats['Player'] = stats['Player'].astype(str)
    stats['Week'] = stats['Week'].astype(str)
    stats['Season'] = stats['Season'].astype(str)
    # clean player names for consistency
    stats['Player'] = stats['Player']+' '
    stats.replace({'Player':{' Jr. ':'',' Jr ':'',' Sr. ':'',' Sr ':'',' III ':'',' II ':'',' IV ':'',' V ':''}},regex=True,inplace=True)
    stats['Player'] = stats['Player'].str.strip(' ')
    stats['Player'] = stats['Player'].str.replace('.','')
    stats.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\player_stats_{Season}_clean.csv',index=False)
    end = time.time()
    
    print(f'Player stats data cleaned. {Season}',f'Time to complete: {end-start}',sep='\n')

## Section 3 - Grouped Run Fucntions

### 3.1 - Scrape All Data

In [None]:
def run_scrape(Start,End):
    # run all web scrape functions at once for multiple season
    start = time.time()
    season = []
    for yr in range(Start,End+1):
        season.append(yr)
    for s in season:
        team_data_scrape(s,1)
        team_data_scrape(s,2)
        player_snap_scrape(s)
        injury_reports_scrape(s)
        player_stats_scape(s)
    end = time.time()
    
    print(f'Scrape complete.',f'Time to complete season {s}: {end-start}',sep='\n')

### 3.2 - Clean All Data

In [None]:
def run_clean(Start,End):
    # run all data clean functions at once for multiple seasons
    start = time.time()
    season = []
    for yr in range(Start,End+1):
        season.append(yr)
    for s in season:
        team_data_clean(s)
        player_snaps_clean(s)
        nfl_injury_clean(s)
        player_stats_clean(s)
    end = time.time()
    
    print(f'Data clean complete.',f'Time to compelte season {s}: {end-start}',sep='\n')

In [None]:
# run_scrape(2016,2020)

In [None]:
# run_clean(2016,2020)

## Section 4 - Custom Fantasy Scoring

### 4.1 - Accept Custom Settings

In [None]:
# collect custom scoring format for each league
def custom_scoring():
    while True:
        print("Please enter in custom scoring format: ")
        lg_pass_yds = 1/float(input('Passing yards per 1 point league value: '))
        lg_pass_tds = float(input('Passing touchdowns league value: '))
        lg_int = float(input('Offensive interceptions league value: '))
        lg_yds = 1/float(input('Non-passing yards per 1 point league value: '))
        lg_tds = float(input('Non-passing touchdowns league value: '))
        lg_rec_rec = float(input('Receptions league value: '))
        lg_2pt = float(input('2-point conversion league value: '))
        lg_fmb = float(input('Fumbles lost league value: '))
        lg_kicker_xp = float(input('Point after attempt made league value: '))
        lg_fg_yds = 1/float(input('Field goal yards per 1 point league value: '))
        lg_def_sk = float(input('Defensive sack league value: '))
        lg_def_int = float(input('Defensive interceptions league value: '))
        lg_def_fmb = float(input('Defensive fumble recovery league value: '))
        lg_def_td = float(input('Defensive touchdown league value: '))
        lg_def_sfty = float(input('Defensive saftey league value: '))
        lg_def_block = float(input('Defensive blocked kick league value: '))
        lg_spec_td = float(input('Special teams touchdown league value: '))
        lg_def_0pts = float(input('Defensive allowed points 0 league value: '))
        lg_def_6pts = float(input('Defensive allowed points 1-6 league value: '))
        lg_def_13pts = float(input('Defensive allowed points 7-13 league value: '))
        lg_def_20pts = float(input('Defensive allowed points 14-20 league value: '))
        lg_def_27pts = float(input('Defensive allowed points 21-27 league value: '))
        lg_def_34pts = float(input('Defensive allowed points 28-34 league value: '))
        lg_def_35pts = float(input('Defensive allowed points 35+ league value: '))
        lg_spec_xpreturn = float(input('Special teams extra point returned league value: '))
        idp_solo_tck = float(input('IDP points per solo tackle league value: '))
        idp_ass_tck = float(input('IDP points per assisted tackle league value: '))
        idp_tck_loss = float(input('IDP points per tackle for loss league value: '))
        idp_sack = float(input('IDP points per sack league value: '))
        idp_int = float(input('IDP points per interception league value: '))
        idp_ffmb  = float(input('IDP points per forced fumble league value: '))
        idp_fmbrec = float(input('IDP points per fumble recovery league value: '))
        idp_td = float(input('IDP points per defensive touchdown league value: '))
        idp_sfty = float(input('IDP points per defensive saftey league value: '))
        idp_passd = float(input('IDP points per pass defended league value: '))

        cust_scoring = {'lg_pass_yds':lg_pass_yds,
                        'lg_pass_tds':lg_pass_tds,
                        'lg_int':lg_int,
                        'lg_yds':lg_yds,
                        'lg_tds':lg_tds,
                        'lg_rec_rec':lg_rec_rec,
                        'lg_2pt':lg_2pt,
                        'lg_fmb':lg_fmb,
                        'lg_kicker_xp':lg_kicker_xp,
                        'lg_fg_yds':lg_fg_yds,
                        'lg_def_sk':lg_def_sk,
                        'lg_def_int':lg_def_int,
                        'lg_def_fmb':lg_def_fmb,
                        'lg_def_td':lg_def_td,
                        'lg_def_sfty':lg_def_sfty,
                        'lg_def_block':lg_def_block,
                        'lg_spec_td':lg_spec_td,
                        'lg_def_0pts':lg_def_0pts,
                        'lg_def_6pts':lg_def_6pts,
                        'lg_def_13pts':lg_def_13pts,
                        'lg_def_20pts':lg_def_20pts,
                        'lg_def_27pts':lg_def_27pts,
                        'lg_def_34pts':lg_def_34pts,
                        'lg_def_35pts':lg_def_35pts,
                        'lg_spec_xpreturn':lg_spec_xpreturn,
                        'idp_solo_tck':idp_solo_tck,
                        'idp_ass_tck':idp_ass_tck,
                        'idp_tck_loss':idp_tck_loss,
                        'idp_sack':idp_sack,
                        'idp_int':idp_int,
                        'idp_ffmb':idp_ffmb,
                        'idp_fmbrec':idp_fmbrec,
                        'idp_td':idp_td,
                        'idp_sfty':idp_sfty,
                        'idp_passd':idp_passd
                       }
        # save custom scoring to a text file for use later
        file = open(r'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Custom_Scoring.txt','w')
        for k in cust_scoring:
            file.write(f'{k},{cust_scoring[k]}\n')
        file.close()
        break

In [None]:
# custom_scoring()

## Section 5 - Prepare for Analysis

### 5.1 - Combine all files of one season into a single file

In [None]:
# combine data
def combine_season(Season):
    
    # pull custom scoring format from txt file
    file = open(r'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Custom_Scoring.txt','r')
    score_format = {}
    for line in file:
        column = line.split(',')
        score_format[column[0]] = float(column[1])
    file.close()
    
    # open csv files for combining data
    inj = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\clean\injury_report_{Season}_clean.csv')
    stat = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\clean\player_stats_{Season}_clean.csv')
    snp = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\clean\snapcounts_{Season}_clean.csv')
    tm = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\clean\nflteam_data_{Season}_clean.csv')
    
    # combine stat and snap data
    ss = pd.merge(left=stat,right=snp,how='outer',on=['Player','Season','Week'])
    ss['Team_x'] = ss['Team_x'].fillna(ss['Team_y'])
    ss['Pos_x'] = ss['Pos_y'].fillna(ss['Pos_x'])
    ss.drop(['Team_y','Pos_y'],axis=1,inplace=True)
    ss.rename(columns={'Team_x':'Team','Pos_x':'Pos'},inplace=True)
    
    # combine stat_snap data and injury data
    ssj = pd.merge(left=ss,right=inj,how='outer',on=['Player','Season','Week'])
    ssj['Date_x'] = ssj['Date_x'].fillna(ssj['Date_y'])
    ssj['Team_x'] = ssj['Team_x'].fillna(ssj['Team_y'])
    ssj['Opp_x'] = ssj['Opp_x'].fillna(ssj['Opp_y'])
    ssj.drop(['Date_y','Team_y','Opp_y','Games'],axis=1,inplace=True)
    ssj.rename(columns={'Date_x':'Date','Team_x':'Team','Opp_x':'Opp'},inplace=True)
    ssj['Player'] = ssj['Player'].str.replace('.','')
    
    # calculate fantasy points and drop columns no longer needed
    """
    Need to edit the numbers to reflect what the user selects as the value using the saved file for scoring format
    """
    ssj['Fntsy_Pts'] = (ssj['IPass_Yds']*(1/25))+(ssj['IPass_Int']*-1)+(ssj['IPass_TD']*4)+(ssj['IRush_Yds']*(1/10))+(ssj['IRush_TD']*6)+(ssj['IRec_Rec']*(1/2))+(ssj['IRec_Yds']*(1/10))+(ssj['IRec_TD']*6)+(ssj['IKR_TD']*6)+(ssj['IPR_TD']*6)+(ssj['IFmb_Lost']*-2)+(ssj['IFmb_TD']*6)+(ssj['I2pt_Made']*2)+(ssj['ISfty']*10)+(ssj['ITack_Sk']*4)+(ssj['ITack_Solo']*1.5)+(ssj['ITack_Ast']*0.75)+(ssj['ITack_TFL']*2)+(ssj['IDef_Int']*6)+(ssj['IDef_PD']*1.5)+(ssj['IFmb_Forced']*4)+(ssj['IFmb_Recov']*4)+(ssj['IDef_IntTD']*6)
    ssj = ssj[['Player','Age','Pos','Team','Date','Day','Season','Week','G#','Away_Home','Opp','Result','Fntsy_Pts','Snaps','Snap %','Status','Injury']]

    # combine team data into the individual data to fill in some blanks
    ssjt = pd.merge(left=ssj,right=tm,how='outer',on=['Team','Season','Week'])
    ssjt['Date_x'] = ssjt['Date_x'].fillna(ssjt['Date_y'])
    ssjt['Away_Home_x'] = ssjt['Away_Home_x'].fillna(ssjt['Away_Home_y'])
    ssjt['Opp_x'] = ssjt['Opp_x'].fillna(ssjt['Opp_y'])
    ssjt['G#_x'] = ssjt['G#_x'].fillna(ssjt['G#_y'])
    ssjt['Day_x'] = ssjt['Day_x'].fillna(ssjt['Day_y'])
    ssjt['Result_x'] = ssjt['Result_x'].fillna(ssjt['Result_y'])
    ssjt.drop(['Date_y','Away_Home_y','Opp_y','G#_y','Day_y','Result_y'],axis=1,inplace=True)
    ssjt.rename(columns={'Date_x':'Date','Away_Home_x':'Away_Home','Opp_x':'Opp','G#_x':'G#','Day_x':'Day','Result_x':'Result'},inplace=True)
    ssjt = ssjt[['Player','Age','Pos','Team','Date','Day','Season','Week','G#','Away_Home','Opp','Result','Fntsy_Pts','Snaps','Snap %','Status','Injury']]
    
    # sort columns for easier viewing
    ssjt.sort_values(by=['Player','Season','Week'],inplace=True)
    
    # give/change all player positions to players with positions listed for the current year
    ssjt.replace({'Pos':{'0':None}},inplace=True)
    key = ssjt[['Player','Pos']]
    key = key.drop_duplicates()
    key = key.dropna()
    key.replace({'Pos':{'DT':'DL','CB':'DB','FB':'RB','OLB':'LB','S':'DB','G':'OL','DE':'DL','FS':'DB','RT':'OL','T':'OL','RG':'OL','LT':'OL','C':'OL','LS':'LongSnap','LDE':'DL','RILB':'LB','LCB':'DB','ILB':'LB',\
                       'MLB':'LB','TB':'RB','LDT':'DL','NT':'DL','WILL':'LB','SS':'DB','RDE':'DL','SAM':'LB','HB':'RB','WR/RS':'WR','G/T':'OL','C/G':'OL','G/C':'OL','OT':'OL','CB/RS':'DB','DB/LB':'LB','T/G':'OL',\
                        'RB/WR':'RB','LB/FB':'LB','DE/LB':'LB','DT/DE':'DL','TE/DE':'TE','K/P':'K','OS':'OL','LG':'OL','MIKE':'LB','LILB':'LB','WLB':'LB','G/OT':'OL','SLB':'LB','RCB':'DB'}},inplace=True)
    ssjt = pd.merge(ssjt,key,how='left',left_on='Player',right_on='Player')
    ssjt['Pos_x'] = ssjt['Pos_y']
    ssjt.drop(['Pos_y'],axis=1,inplace=True)
    ssjt.rename(columns={'Pos_x':'Pos'},inplace=True)
    ssjt.drop_duplicates(subset=['Player','Season','Week'],keep='first',inplace=True)
    player = ssjt['Player'].unique().tolist()
    player = player*17
    player.sort()
    df2 = pd.DataFrame(player,columns=['Player'])
    df2['Season'] = Season
    df2['Week'] = df2.groupby('Player').cumcount()+1
    ssjt = pd.merge(ssjt,df2,how='outer')
    
    # write data frame to csv
    ssjt.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\clean\nfl_{Season}.csv',index=False)
    

def combine_all_season(Start,End):
    start = time.time()
    for yr in range(Start,End+1):
        combine_season(yr)
    end = time.time()
    print(f'Data clean combined and cleaned further.',f'Time to complete: {end-start}',sep='\n')

In [None]:
combine_all_season(2016,2020)

### 5.2 - Split data after the player sustained an injury for the remainder of the season

In [None]:
def injured_database(Start,End):
    # create a database of all athletes that sustained an injury and what their production was post injury for the remainder of the season
    start = time.time()
    
    season = []
    for yr in range(Start,End+1):
        season.append(yr)
        
    for s in season:
        df = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\nfl_{s}.csv',low_memory=False)
        df.sort_values(['Player','Week'],inplace=True)
        df.reset_index(inplace=True)
        df.drop(['index'],axis=1,inplace=True)

        # create list of status's to be able to identify were to split the rows
        status = df['Status'].dropna().unique().tolist()

        # create a database to located the index of the first instance of an injury
        Player = df.loc[df['Status'].isin(status)].drop_duplicates(subset=['Player']).index.unique().tolist()

        # split out the rows then save them to a separate dataframe and export to csv
        inj_df = pd.DataFrame()

        for i in range(0,len(Player)):
            idx = df[df['Player']==df['Player'][Player[i]]].tail(1)
            inj = df.loc[Player[i]-1:idx.index.values.astype(int)[0]]
            inj_df = pd.concat([inj_df,inj])

        inj_df['Inj_Week'] = 0
        inj_df['Inj_Week'] = inj_df.groupby('Player').cumcount()
        inj_df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\Analysis_DB\injury_db_{s}.csv',index=False)
    end = time.time()
    print(f'Injury databases created for seasons {Start} to {End}.',f'Time to complete: {end-start}',sep='\n')

In [None]:
# injured_database(2016,2020)

### 5.3 - Split data before the player sustained an injury or for full season healthy players

In [None]:
def healthy_database(Start,End):
    # create a database of all athletes that were healthy or healthy until sustaining an injury
    start = time.time()
    
    season = []
    for yr in range(Start,End+1):
        season.append(yr)
        
    for s in season:
        df = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\nfl_{s}.csv',low_memory=False)
        df.sort_values(['Player','Week'],inplace=True)
        df.reset_index(inplace=True)
        df.drop(['index'],axis=1,inplace=True)
        
        # create list of status's to be able to identify were to split the rows
        status = df['Status'].dropna().unique().tolist()
        
        # create a database to locate the index of the first instance of an injury
        Player = df.loc[df['Status'].isin(status)].drop_duplicates(subset=['Player']).index.unique().tolist()
        
        # split out the rows then save them to a separate dataframe and export to csv
        healthy_df = pd.DataFrame()
        
        for i in range(0,len(Player)):
            idx = df[df['Player']==df['Player'][Player[i]]].head(1)
            healthy = df.loc[idx.index.values.astype(int)[0]:Player[i]-1]
            healthy_df = pd.concat([healthy_df,healthy])

        healthy_df['Healthy_Week'] = 0
        healthy_df['Healthy_Week'] = healthy_df.groupby('Player').cumcount()+1
        healthy_df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\Analysis_DB\healthy_db_{s}.csv',index=False)
    end = time.time()
    print(f'Healthy databases created for seasons {Start} to {End}.',f'Time to complete: {end-start}',sep='\n')

In [None]:
# healthy_database(2016,2020)

## Section 6 - Analysis

### 6.1 - 

In [31]:
inj = pd.DataFrame()
hthy = pd.DataFrame()

for s in range(2016,2021):
    i= pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\Analysis_DB\injury_db_{s}.csv',low_memory=False)
    h = pd.read_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\nfl_data\Clean\Analysis_DB\healthy_db_{s}.csv',low_memory=False)
    i['Date'] = pd.to_datetime(i['Date'],format='%Y-%m-%d')
    h['Date'] = pd.to_datetime(h['Date'],format='%Y-%m-%d')
    inj = pd.concat([inj,i])
    hthy = pd.concat([hthy,h])
    
inj.replace({'Fntsy_Pts':{0:np.nan}},inplace=True)
hthy.replace({'Fntsy_Pts':{0:np.nan}},inplace=True)

pos = ['DB','DL','LB','QB','RB','TE','WR']
inj = inj.loc[inj['Pos'].isin(pos)]
hthy = hthy.loc[hthy['Pos'].isin(pos)]

# Choose how to filter the NFL data
filter_snap_counts = int(input('Filter by how many snap counts, minimum? (Recommended: 15) '))
filtered_inj = inj.query(f'Snaps >= {filter_snap_counts}')
filtered_hthy = hthy.query(f'Snaps >= {filter_snap_counts}')
filtered_inj['Injury'] = filtered_inj['Injury'].str.lower()
filtered_inj['Injury'] = filtered_inj['Injury'].str.strip()
filtered_inj.replace({'Injury':
                     {'biceps':'bicep','abdominal':'abdomen','core':'abdomen',"broken fibula":'fibula','fractured forearm':'forearm',\
                      'general medical issue':'illness','quadriceps':'quadricep','ribs':'rib','rib cage':'rib','glute':'hip',\
                      'sprained shoulder':'sprained ac joint','thumb sprain':'thumb','nack':'neck','nack, groin':'neck, groin',\
                      'torn acl':'acl','torn mcl':'mcl','oblique':'abdomen'}},inplace=True)
filtered_inj = filtered_inj.filter(['Player','Pos','Team','Date','Day','Away_Home','Opp','Fntsy_Pts','Snaps','Status','Injury','Inj_Week',])

Filter by how many snap counts, minimum? (Recommended: 15)  15


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [33]:
pd.pivot_table(filtered_inj,index=['Pos','Injury','Inj_Week'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Fntsy_Pts,Snaps
Pos,Injury,Inj_Week,Unnamed: 3_level_1,Unnamed: 4_level_1
DB,abdomen,1,9.0,51.25
DB,abdomen,3,3.0,27.0
DB,abdomen,8,9.75,27.0
DB,achilles,1,12.0,46.5
DB,achilles,2,4.5,50.0
DB,achilles,3,6.0,50.0
DB,achilles,4,13.125,47.0
DB,achilles,5,3.75,35.0
DB,achilles,6,9.0,84.0
DB,achilles,7,8.25,55.333333


In [None]:
# pd.pivot_table(filtered_hthy,index=['Season','Team','Pos'])
# pd.pivot_table(filtered_inj,index=['Player','Season','Pos'])

In [None]:
# pd.pivot_table(filtered_hthy,index=['Player','Season','Pos'])