In [1]:
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import pandas as pd
import datetime as dt
from bs4 import BeautifulSoup
import requests
import os
from dotenv import load_dotenv
load_dotenv()

def team_data_scrape(Start_Year,End_Year,url):
    """Scrape Stathead team game-log result pages and append them to a CSV.

    Logs in to stathead.com with the credentials stored in the
    ``statheadusername`` / ``statheadpassword`` environment variables, then
    walks the paginated results table 100 rows at a time, appending each page
    to ``nflteam_data_{Start_Year}_{End_Year}.csv``.

    Parameters
    ----------
    Start_Year : int
        First season to request (sent as ``year_min``).
    End_Year : int
        Last season to request (sent as ``year_max``).
    url : int
        Query template selector: ``1`` or ``2``. Any other value prints a
        message and returns without touching the network.

    Returns
    -------
    None
    """
    # Pick the query template first so an invalid selector fails fast instead
    # of logging in and then hitting an unbound ``stat_url``.
    if url == 1:
        stat_url = 'https://stathead.com/football/tgl_finder.cgi?request=1&temperature_gtlt=lt&game_num_max=99&week_num_max=99&order_by=points&match=game&year_max={End_Year}&order_by_asc=0&week_num_min=0&game_type=E&game_num_min=0&year_min={Start_Year}&cstat[1]=all_td_team&ccomp[1]=gt&cval[1]=0&cstat[2]=third_down_att&ccomp[2]=gt&cval[2]=0&cstat[3]=vegas_line&ccomp[3]=gt&cval[3]=-50&cstat[4]=penalties&ccomp[4]=gt&cval[4]=0&cstat[5]=rush_att&ccomp[5]=gt&cval[5]=0&cstat[6]=tot_yds&ccomp[6]=gt&cval[6]=0&cstat[7]=first_down&ccomp[7]=gt&cval[7]=0&cstat[8]=punt&ccomp[8]=gt&cval[8]=0&cstat[9]=pass_cmp&ccomp[9]=gt&cval[9]=0&offset={page}'
    elif url == 2:
        stat_url = 'https://stathead.com/football/tgl_finder.cgi?request=1&temperature_gtlt=lt&game_num_max=99&week_num_max=99&order_by=all_td_opp&match=game&year_max={End_Year}&order_by_asc=0&week_num_min=0&game_type=R&game_num_min=0&year_min={Start_Year}&cstat[1]=tot_yds_opp&ccomp[1]=gt&cval[1]=0&cstat[2]=rush_yds_diff&ccomp[2]=gt&cval[2]=-500&cstat[3]=score_diff_thru_1&ccomp[3]=gt&cval[3]=-500&cstat[4]=rush_att_opp&ccomp[4]=gt&cval[4]=0&cstat[5]=kick_ret_td_tgl&ccomp[5]=gt&cval[5]=0&cstat[6]=pass_cmp_opp&ccomp[6]=gt&cval[6]=0&cstat[7]=first_down_opp&ccomp[7]=gt&cval[7]=0&cstat[8]=score_diff_1_qtr&ccomp[8]=gt&cval[8]=-500&cstat[9]=third_down_att_opp&ccomp[9]=gt&cval[9]=0&offset={page}'
    else:
        print("Please select 1 or 2.")
        return

    page = 0
    location = 1000
    stat_login_url = "https://stathead.com/users/login.cgi"
    stat_payload = {
        'username': os.environ.get('statheadusername'),
        'password': os.environ.get('statheadpassword')
    }

    with requests.Session() as session:

        # The session keeps the login cookies for the result requests below.
        session.post(stat_login_url, data=stat_payload)

        while page < 100000:

            website = session.get(stat_url.format(Start_Year=Start_Year,End_Year=End_Year,page=page)).text
            soup = BeautifulSoup(website, 'html')
            table = soup.find('table', attrs={'class': 'sortable', 'id': 'results'})

            # No results table on the page: nothing left to scrape.
            if table is None:
                break

            table_headers = [header.text for header in table.find('thead').find_all('th')]

            # One list of cell texts per <tr>; rows made only of <th> cells
            # come out as empty lists.
            final_data = [[cell.text for cell in tr.find_all('td')] for tr in table.find_all('tr')]

            # The first <tr> is skipped, as before. Stop once a page carries
            # no data cells at all.
            data_rows = final_data[1:]
            if not any(data_rows):
                break

            df = pd.DataFrame(data_rows, columns=table_headers)
            # NOTE: mode='a' appends, and the column header line is written
            # again with every page.
            df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\player_stats\nflteam_data_{Start_Year}_{End_Year}.csv',mode='a',index=False)

            # Progress report roughly every 1000 rows.
            if page > location:
                print(page)
                location += 1000

            # Advance on every iteration; previously this was nested under the
            # progress check and therefore never executed.
            page += 100

        print('Done')

def player_snap_scrape(Start_Year,End_Year):
    """Download FantasyPros snap-count tables and save them as one CSV.

    For every season in ``Start_Year..End_Year`` (inclusive) the offense page
    (empty ``side``) and the ``defense.php`` page are requested, the table
    with ``id="data"`` is parsed, a ``Year`` column is added, and all frames
    are concatenated into ``snapcounts_{Start_Year}_{End_Year}.csv``.

    Parameters
    ----------
    Start_Year : int
        First season to fetch.
    End_Year : int
        Last season to fetch (inclusive).

    Returns
    -------
    None
    """
    # Local import: read_html expects a file-like object rather than a raw
    # HTML string in current pandas releases.
    from io import StringIO

    sides = ['','defense.php']

    ENDPOINT = "https://www.fantasypros.com/nfl/reports/snap-counts/{side}?year={year}"

    # Collect per-page frames and concatenate once at the end.
    frames = []

    for year in range(Start_Year, End_Year+1):

        for side in sides:

            res = requests.get(ENDPOINT.format(year=year,side=side))

            soup = BeautifulSoup(res.content, 'html.parser')

            table = soup.find('table', {'id': 'data'})

            # Skip pages without the expected table instead of crashing and
            # losing everything scraped so far.
            if table is None:
                print(f'No snap-count table found for {year} {side}; skipping.')
                continue

            df = pd.read_html(StringIO(str(table)))[0]

            # Keep the first three and last two labels; prefix the ones in
            # between with "Week ".
            df.columns = df.columns[:3].tolist() + [f'Week {i}' for i in df.columns[3:-2]] + df.columns[-2:].tolist()

            df['Year'] = year

            # Move the freshly added Year column to sit right after the first
            # three columns.
            cols = df.columns[:3].tolist() + df.columns[-1:].tolist() + df.columns[3:-1].tolist()
            frames.append(df[cols])
            print(year,side)

    final_df = pd.concat(frames) if frames else pd.DataFrame()
    final_df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\snapcounts\snapcounts_{Start_Year}_{End_Year}.csv')
    print('Done')

def player_snap_clean(Start_Year,End_Year):
    """Reshape per-season snap-count CSVs and merge them with team game data.

    Reads ``FantasyPros_Fantasy_Football_{year}_{Offense|Defense}_Snap_Counts.csv``
    for every season in the range, melts the week columns into
    ``Week``/``Snaps`` rows, normalises team abbreviations, then outer-merges
    the result with ``nflteamsnaps_{Start_Year}_{End_Year}.csv`` on
    ``Team``/``Year``/``Week`` and writes
    ``player_snaps_{Start_Year}_{End_Year}.csv``. All paths are relative to
    the current working directory.

    Parameters
    ----------
    Start_Year : int
        First season to process.
    End_Year : int
        Last season to process (inclusive).

    Returns
    -------
    None
    """
    data = {}

    # Read each file exactly once and stamp it with its own season, rather
    # than deriving the season from a running counter over the dict order.
    for num in range(Start_Year,End_Year+1):
        for side in ('Offense','Defense'):
            key = f'{num}_{side}'
            frame = pd.read_csv(f'FantasyPros_Fantasy_Football_{key}_Snap_Counts.csv')
            frame.insert(loc=3,column='Year',value=num,allow_duplicates=True)
            frame['Year'] = frame['Year'].astype(int)
            data[key] = frame

    for key, frame in data.items():
        # Wide (one column per week) -> long (one row per player-week).
        melted = pd.melt(frame,id_vars=['Player','Pos','Team','Year'],var_name='Week', value_name='Snaps')
        melted = melted.replace({'Team':\
                  {'GB':'GNB','JAC':'JAX','KC':'KAN','NE':'NWE','NO':'NOR','SF':\
                   'SFO','TB':'TAM','Multi':None,'LV':'OAK',},'Snaps':{'bye':None}})
        data[key] = melted.dropna(thresh=3)

    player = pd.concat(data.values(), ignore_index=True)

    team = pd.read_csv(f'nflteamsnaps_{Start_Year}_{End_Year}.csv')
    team.dropna(thresh=5,inplace=True)
    team.drop(['LTime','Y/P'],axis=1,inplace=True)
    team.rename(columns={'Unnamed: 5':'Away_Home','Time.1':'ToG','Tm':'Team'},inplace=True)
    team['Date'] = pd.to_datetime(team['Date'])
    team['ToG'] = pd.to_datetime(team['ToG'])
    # Assign the results back: in-place calls on a selected column operate on
    # a temporary under pandas Copy-on-Write and would leave `team` unchanged.
    team['Away_Home'] = team['Away_Home'].fillna(value='Home')
    team['Away_Home'] = team['Away_Home'].replace('@','Away')
    team['Away_Home'] = team['Away_Home'].replace('nan','Home')
    team['TO'] = team['TO'].fillna(value=0)
    team.insert(loc=2,column='Month',value=team['Date'].dt.month)
    # "MM:SS" text -> total seconds.
    df_bridge = team['ToP'].str.split(":",expand=True)
    team['ToP'] = (df_bridge[0].astype(int)*60)+df_bridge[1].astype(int)
    # Week labels coming out of the melt are strings, so match that here.
    team['Week'] = team['Week'].astype(str)

    player_snaps = pd.merge(left=player,right=team,how='outer',left_on=['Team','Year','Week'],right_on=['Team','Year','Week'])

    player_snaps.to_csv(f'player_snaps_{Start_Year}_{End_Year}.csv',index=False)
    print("Done")

def injury_reports_scrape(Start_Year,End_Year):
    """Scrape Pro-Football-Reference team injury tables into a single CSV.

    For every season in ``Start_Year..End_Year`` (inclusive) and every team
    code in the list below, the ``team_injuries`` table is fetched and turned
    into a DataFrame. A cell's ``data-tip`` attribute is used when present,
    otherwise its text. ``Team`` and ``Year`` columns are inserted and all
    frames are written to ``nlf_injuryreport_{Start_Year}_{End_Year}.csv``.

    Parameters
    ----------
    Start_Year : int
        First season to fetch.
    End_Year : int
        Last season to fetch (inclusive).

    Returns
    -------
    None
    """
    teams = ['crd', 'atl', 'rav', 'buf', 'car', 'chi', 'cin', 'cle', 'dal', 'den', 'det', 'gnb','htx','clt','jax','kan',
             'sdg','ram','mia','min','nor','nwe','nyg','nyj','rai','phi','pit','sea','sfo','tam','oti','was']

    ENDPOINT = 'https://www.pro-football-reference.com/teams/{team}/{year}_injuries.htm'

    # Collect per-team frames and concatenate once at the end.
    frames = []

    for year in range(Start_Year, End_Year+1):

        for team in teams:

            res = requests.get(ENDPOINT.format(year=year,team=team))

            soup = BeautifulSoup(res.content, 'lxml')

            table = soup.find('table', attrs={'class': 'sortable', 'id': 'team_injuries'})

            # The output file is only written after the loops, so a missing
            # table must not abort the run and discard everything collected.
            if table is None:
                print(f'No injury table found for {team} {year}; skipping.')
                continue

            final_data = []
            for tr in table.find_all('tr'):
                cells = tr.find_all(['th','td'])
                final_data.append([cell['data-tip'] if cell.has_attr("data-tip") else cell.text for cell in cells])

            # First row holds the column labels; the rest are player rows.
            if len(final_data) < 2:
                print(f'Injury table for {team} {year} has no rows; skipping.')
                continue

            df = pd.DataFrame(final_data[1:], columns=final_data[0])

            df.insert(loc=1,column='Team',value=team)
            df.insert(loc=2,column='Year',value=year)

            frames.append(df)
            print(team,year)

    final_df = pd.concat(frames) if frames else pd.DataFrame()
    final_df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\injury_reports\nlf_injuryreport_{Start_Year}_{End_Year}.csv')
    print('Done')

def injury_reports_clean(Start_Year,End_Year):
    """Reshape per-team injury report CSVs into one long-format file.

    Reads ``{team}_{year}_injuryreport.csv`` from the current working
    directory for every team code and season, melts the per-game columns into
    ``Date``/``Status`` rows, splits the game label on ``"vs. "`` into
    ``Date`` and ``Opp``, splits the status on the first ``":"`` into
    ``Status`` and ``Injury``, maps team codes to upper-case abbreviations and
    writes ``NFL_{Start_Year}_{End_Year}_Injuryreport.csv``.

    Parameters
    ----------
    Start_Year : int
        First season to process.
    End_Year : int
        Last season to process (inclusive).

    Returns
    -------
    None
    """
    teams = ['crd','atl','rav','buf','car','chi','cin','cle','dal','den','det','gnb','htx','clt','jax','kan',
                     'sdg','ram','mia','min','nor','nwe','nyg','nyj','rai','phi','pit','sea','sfo','tam','oti','was']

    team_names = {'crd':'ARI', 'atl':'ATL', 'rav':'BAL', 'buf':'BUF', 'car':'CAR', 'chi':'CHI', 'cin':'CIN',
                  'cle':'CLE', 'dal':'DAL', 'den':'DEN', 'det':'DET', 'gnb':'GNB','htx':'HOU','clt':'IND',
                  'jax':'JAX','kan':'KAN','sdg':'LAC','ram':'LAR','mia':'MIA','min':'MIN','nor':'NOR','nwe':'NWE',
                  'nyg':'NYG','nyj':'NYJ','rai':'OAK','phi':'PHI','pit':'PIT','sea':'SEA','sfo':'SFO','tam':'TAM',
                  'oti':'TEN','was':'WAS'}

    # Read every file exactly once, in team-then-year order.
    data = {}
    for team in teams:
        for year in range(Start_Year,End_Year+1):
            key = f'{team}_{year}_injuryreport'
            data[key] = pd.read_csv(f'{key}.csv')

    for key, frame in data.items():
        report = frame.drop('Unnamed: 0',axis=1)
        # Wide (one column per game) -> long (one row per player-game).
        report = pd.melt(report,id_vars=['Player','Team','Year'],var_name='Date', value_name='Status')
        report[['Date','Opp']] = report.Date.str.split("vs. ",expand=True)
        # Split on the first colon only so an injury description that itself
        # contains ':' stays intact and still yields exactly two columns.
        report[['Status','Injury']] = report.Status.str.split(":",n=1,expand=True)
        report['Date'] = report['Date'].astype(str)+'/'+report['Year'].astype(str)
        report['Date'] = pd.to_datetime(report['Date'])
        report = report.replace({'Team': team_names})
        data[key] = report.dropna(thresh=3)

    nfl_injury = pd.concat(data.values(),ignore_index=True)
    nfl_injury.to_csv(f'NFL_{Start_Year}_{End_Year}_Injuryreport.csv',index=False)
    print('Done')

def player_stats_scape(Start_Year,End_Year):
    """Scrape Stathead player game-log result pages and append them to a CSV.

    Logs in to stathead.com with the credentials stored in the
    ``statheadusername`` / ``statheadpassword`` environment variables, then
    walks the paginated results table 100 rows at a time, appending each page
    to ``player_stats_{Start_Year}_{End_Year}.csv``.

    Parameters
    ----------
    Start_Year : int
        First season to request (sent as ``year_min``).
    End_Year : int
        Last season to request (sent as ``year_max``).

    Returns
    -------
    None
    """
    page = 0
    location = 1000
    stat_login_url = "https://stathead.com/users/login.cgi"
    stat_payload = {
        'username': os.environ.get('statheadusername'),
        'password': os.environ.get('statheadpassword')
    }
    stat_url = "https://stathead.com/football/pgl_finder.cgi?request=1&game_num_max=99&week_num_max=99&order_by=all_td&season_start=1&qb_gwd=0&order_by_asc=0&qb_comeback=0&week_num_min=0&game_num_min=0&year_min={Start_Year}&match=game&year_max={End_Year}&season_end=-1&age_min=0&game_type=R&age_max=99&positions[]=qb&positions[]=rb&positions[]=wr&positions[]=te&positions[]=e&positions[]=t&positions[]=g&positions[]=c&positions[]=ol&positions[]=dt&positions[]=de&positions[]=dl&positions[]=ilb&positions[]=olb&positions[]=lb&positions[]=cb&positions[]=s&positions[]=db&positions[]=k&positions[]=p&cstat[1]=punt_ret&ccomp[1]=gt&cval[1]=0&cstat[2]=sacks&ccomp[2]=gt&cval[2]=0&cstat[3]=fumbles&ccomp[3]=gt&cval[3]=0&cstat[4]=rush_att&ccomp[4]=gt&cval[4]=0&cstat[5]=pass_defended&ccomp[5]=gt&cval[5]=0&cstat[6]=pass_cmp&ccomp[6]=gt&cval[6]=0&cstat[7]=targets&ccomp[7]=gt&cval[7]=0&cstat[8]=kick_ret&ccomp[8]=gt&cval[8]=0&offset={page}"

    with requests.Session() as session:

        # The session keeps the login cookies for the result requests below.
        session.post(stat_login_url, data=stat_payload)

        while page < 100000:

            website = session.get(stat_url.format(Start_Year=Start_Year,End_Year=End_Year,page=page)).text
            soup = BeautifulSoup(website, 'html')
            table = soup.find('table', attrs={'class': 'sortable', 'id': 'results'})

            # No results table on the page: nothing left to scrape.
            if table is None:
                break

            table_headers = [header.text for header in table.find('thead').find_all('th')]

            # One list of cell texts per <tr>; rows made only of <th> cells
            # come out as empty lists.
            final_data = [[cell.text for cell in tr.find_all('td')] for tr in table.find_all('tr')]

            # The first <tr> is skipped, as before. Stop once a page carries
            # no data cells at all.
            data_rows = final_data[1:]
            if not any(data_rows):
                break

            # The first 11 <th> labels are dropped so the remaining labels
            # line up with the <td> cells of each row.
            df = pd.DataFrame(data_rows, columns=table_headers[11:])
            # NOTE: mode='a' appends, and the column header line is written
            # again with every page.
            df.to_csv(rf'C:\Users\cudde\OneDrive\Podcasting\Fantasy Sidelines\Injury Data Python\Data_Collect_Clean\player_stats\player_stats_{Start_Year}_{End_Year}.csv',mode='a',index=False)

            # Progress report roughly every 1000 rows.
            if page > location:
                print(page)
                location += 1000

            page += 100

        print('Done')

In [2]:
# Season range (inclusive) passed as Start_Year / End_Year to the scrape and
# clean functions invoked in the cells below.
Start = 2017
End = 2019

In [None]:
# Scrape team game logs for the configured seasons using query template 1.
team_data_scrape(Start,End,1)

In [None]:
# Disabled: the call is wrapped in a string literal, so running this cell only
# evaluates a str and does not invoke player_snap_scrape.
'player_snap_scrape(Start,End)'

In [None]:
# Disabled: the call is wrapped in a string literal, so running this cell only
# evaluates a str and does not invoke injury_reports_scrape.
'injury_reports_scrape(Start,End)'

In [None]:
#player_snap_clean(Start,End)

In [None]:
#injury_reports_clean(Start,End)

In [None]:
# Disabled: the call is wrapped in a string literal, so running this cell only
# evaluates a str and does not invoke player_stats_scape.
'player_stats_scape(Start,End)'

5100
