br

### Import libraries.

In [10]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Get the injury probabilities from Sports Injury Predictor.

### Make a list of the URLs we'll use.

In [11]:
# Position codes used by Sports Injury Predictor; the position is the only part of the URL that varies.
pos = ['qb', 'rb', 'wr', 'te']

# Fill each position code into the shared URL template to get one URL per position.
url_list = [
    'http://sportsinjurypredictor.com/injury-predictor/injury-predictor/search-by-position/{}/low-to-high'.format(code)
    for code in pos
]

### Make a big data frame with all the players' data.

In [12]:
# Scrape one Sports Injury Predictor page per position.
# Produces one frame per position (df_inj_qb, df_inj_rb, df_inj_wr, df_inj_te)
# and one combined frame (df_injury), each with the columns 'Name' and 'Injury'.

# Per-position frames, keyed by position code. They are combined once after the loop:
# growing a frame inside the loop is quadratic, and DataFrame.append no longer exists in pandas 2.x.
inj_frames = {}

# pos and url_list were built in parallel, so zip pairs every URL with its position code.
for current_position, url in zip(pos, url_list):

    # Download the page. Fail loudly on an HTTP error instead of parsing an error page,
    # and do not hang forever if the site stops responding.
    req = requests.get(url, timeout=30)
    req.raise_for_status()

    # Assign the text of the page to a variable and parse the HTML.
    page = req.text
    soup = BeautifulSoup(page, "lxml")

    # Every player is one "player-card" div inside the "page" div.
    rows = soup.find("div", "page").find_all("div", "player-card margin-top-20")

    # Player names.
    names_list = [row.find("span", "h3 face-myers c005b99").get_text() for row in rows]
    df_names_list = pd.DataFrame(names_list, columns=['Name'])

    # Injury probability: strip the surrounding whitespace and the % sign so only the number is left,
    # convert it to float so it can be used in arithmetic, and divide by 100 to turn percent into a fraction.
    percentages_list = [
        float(row.find('div', 'h1 face-myers inline c4a9f00').get_text().strip(' \t\r\n%')) / 100
        for row in rows
    ]
    df_percentages_list = pd.DataFrame(percentages_list, columns=['Injury'])

    # Put names and probabilities side by side.
    df_this_position = pd.concat([df_names_list, df_percentages_list], axis=1)
    inj_frames[current_position] = df_this_position

# Later cells refer to the per-position frames by these names.
df_inj_qb = inj_frames['qb']
df_inj_rb = inj_frames['rb']
df_inj_wr = inj_frames['wr']
df_inj_te = inj_frames['te']

# One big frame with every player. Each position keeps its own 0..n-1 row labels.
df_injury = pd.concat([inj_frames[position] for position in pos])

### Make a dictionary for reference later.

In [13]:
# Build a flat {player name: injury probability} lookup, e.g. dict_injury['Cam Newton'].
# Calling to_dict() on the whole frame nests the result under the column label
# (dict_injury['Injury']['Cam Newton']); selecting the 'Injury' column first avoids that extra level.
dict_injury = df_injury.set_index('Name')['Injury'].to_dict()

# Get the projections from Fantasy Pros.

In [14]:
# All the URLs are the same except for the position, so we can create a list of strings for that.
pos = ['qb','rb','wr','te']

# Use list comprehension to iterate through that list and complete the URL, making a final list of URLs to use.
url_list = ['https://www.fantasypros.com/nfl/projections/'+string+'.php' for string in pos]

## I want to keep the positions separate, so we won't be making one large dataframe like we did for injuries.

# ___________________FOR NON-PPR SCORING:
# Note: the half-PPR cell further down assigns the same df_proj_* names,
# so after running the whole notebook those names hold the half-PPR frames.

# Per-position projection frames, keyed by position code.
proj_frames = {}

# pos and url_list were built in parallel, so zip pairs every URL with its position code.
for current_position, url in zip(pos, url_list):

    # Download the page. Fail loudly on an HTTP error instead of parsing an error page,
    # and do not hang forever if the site stops responding.
    req = requests.get(url, timeout=30)
    req.raise_for_status()

    # Assign the text of the page to a variable.
    page = req.text

    # Parse the HTML code. Name the parser explicitly (the same one the other cells use)
    # so the result does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(page, "lxml")

    # Take every "tr" (row) of the table with id "data" and drop the first two, which are headers.
    # The rows used to share the class "mpb-available", but the website changed its format and
    # now every row has a different class, so they can no longer be selected by class.
    rows = soup.find('table', id='data').find_all('tr')[2:]

    # Get the row names by looking at the text in the "a" tag for each row.
    names_list = [row.find('a').get_text() for row in rows]
    df_names_list = pd.DataFrame(names_list, columns=['Name'])

    # All we're interested in is the projected points for each player: the last "td" of each row.
    # The number of columns differs by position, so index from the end rather than by a fixed position.
    # Remove thousands separators first because float() cannot parse "1,000".
    projections_list = [
        float(row.find_all('td')[-1].get_text().replace(',', ''))
        for row in rows
    ]
    df_projections_list = pd.DataFrame(projections_list, columns=['Projection'])

    # Put names and projections side by side.
    df_this_position = pd.concat([df_names_list, df_projections_list], axis=1)
    proj_frames[current_position] = df_this_position

# Later cells refer to the per-position frames by these names.
df_proj_qb = proj_frames['qb']
df_proj_rb = proj_frames['rb']
df_proj_wr = proj_frames['wr']
df_proj_te = proj_frames['te']

# ___________________FOR HALF-PPR SCORING:

### For RBs, WRs, and TEs, we have to do some math with the raw projection data because Fantasy Pros does its calculations for regular (non-PPR) scoring.

In [15]:
def _stat_column(cells_by_row, column):
    """Return one table column as a list of floats, one value per player row.

    cells_by_row is a list with one entry per row, each entry being that row's list of "td" tags.
    column is the index of the wanted cell within each row (negative indexes count from the end).
    Thousands separators are removed first because float() cannot parse "1,000".
    """
    return [float(cells[column].get_text().replace(',', '')) for cells in cells_by_row]


# Per-position projection frames, keyed by position code.
proj_frames = {}

# pos and url_list were built in parallel, so zip pairs every URL with its position code.
for current_position, url in zip(pos, url_list):

    # Download the page. Fail loudly on an HTTP error instead of parsing an error page,
    # and do not hang forever if the site stops responding.
    req = requests.get(url, timeout=30)
    req.raise_for_status()

    # Assign the text of the page to a variable and parse the HTML.
    page = req.text
    soup = BeautifulSoup(page, "lxml")

    # Take every "tr" (row) of the table with id "data" and drop the first two, which are headers.
    # The rows used to share the class "mpb-available", but the website changed its format and
    # now every row has a different class, so they can no longer be selected by class.
    rows = soup.find('table', id='data').find_all('tr')[2:]

    # Get the row names by looking at the text in the "a" tag for each row.
    names_list = [row.find('a').get_text() for row in rows]
    df_names_list = pd.DataFrame(names_list, columns=['Name'])

    # Split every row into its cells once, instead of once per statistic.
    cells_by_row = [row.find_all('td') for row in rows]

    # Half-PPR scoring used below: 1 point per 10 yards, 6 per touchdown,
    # 0.5 per reception and -2 per fumble.
    # The column positions differ by position page (e.g. the TE page has no rushing columns);
    # the RB and WR pages list the same stats in the same order.
    # NOTE(review): the column indexes reflect the site layout when this was written - confirm if the site changes.
    if current_position == 'qb':
        # QB scoring is unaffected by PPR, so use the site's own total: the last cell of each row.
        projections_list = _stat_column(cells_by_row, -1)

    elif current_position == 'rb' or current_position == 'wr':
        rushyards = [x/10 for x in _stat_column(cells_by_row, 2)]
        rushtds = [x*6 for x in _stat_column(cells_by_row, 3)]
        rec = [x/2 for x in _stat_column(cells_by_row, 4)]
        recyards = [x/10 for x in _stat_column(cells_by_row, 5)]
        rectds = [x*6 for x in _stat_column(cells_by_row, 6)]
        fumbles = [x*(-2) for x in _stat_column(cells_by_row, 7)]

        projections_list = [
            ry + rt + r + cy + ct + f
            for ry, rt, r, cy, ct, f in zip(rushyards, rushtds, rec, recyards, rectds, fumbles)
        ]

    elif current_position == 'te':
        rec = [x/2 for x in _stat_column(cells_by_row, 1)]
        recyards = [x/10 for x in _stat_column(cells_by_row, 2)]
        rectds = [x*6 for x in _stat_column(cells_by_row, 3)]
        fumbles = [x*(-2) for x in _stat_column(cells_by_row, 4)]

        projections_list = [
            r + cy + ct + f
            for r, cy, ct, f in zip(rec, recyards, rectds, fumbles)
        ]

    else:
        # Without this, an unexpected position would silently reuse the previous position's numbers.
        raise ValueError('No half-PPR scoring rule for position: ' + current_position)

    # Put names and projections side by side.
    df_projections_list = pd.DataFrame(projections_list, columns=['Projection'])
    df_this_position = pd.concat([df_names_list, df_projections_list], axis=1)
    proj_frames[current_position] = df_this_position

# Later cells refer to the per-position frames by these names.
df_proj_qb = proj_frames['qb']
df_proj_rb = proj_frames['rb']
df_proj_wr = proj_frames['wr']
df_proj_te = proj_frames['te']

# Match the injury data and the projection data to each other.

### Fix the boring errors where the player names don't match or they aren't listed in both data sets.

In [16]:
# If you get an error when running this program it might be because the player names don't match.
# For example, "Odell Beckham" vs "Odell Beckham Jr."
# Fix it case by case here.

# (name in the injury data, name in the Fantasy Pros data)
name_fixes = [
    ('Odell Beckham', 'Odell Beckham Jr.'),
    ('Robert Griffin III', 'Robert Griffin'),
]
for injury_name, projection_name in name_fixes:
    # Only rename when the old key is still there, so re-running this cell does not raise KeyError.
    if injury_name in dict_injury:
        dict_injury[projection_name] = dict_injury.pop(injury_name)

# For some reason, many players don't show up in the Sports Injury Predictor website...
# ... so I'm going to make their injury probability the average of the other players at their position.

# Calculate the averages. Select the numeric 'Injury' column explicitly:
# averaging the whole frame would also try to average the 'Name' strings,
# which raises TypeError on pandas 2.x.
avg_qb = float(df_inj_qb['Injury'].mean())
avg_rb = float(df_inj_rb['Injury'].mean())
avg_wr = float(df_inj_wr['Injury'].mean())
avg_te = float(df_inj_te['Injury'].mean())

# Players that have a projection but no injury data, grouped by position.
missing_qbs = [
    'Ryan Fitzpatrick',
    'Shaun Hill',
    'Trevor Siemian',
    'Case Keenum',
]

missing_rbs = [
    'Jeremy Langford',
    'Dion Lewis',
    'Darren Sproles',
    'Javorius Allen',
    'DeAndre Washington',
    'Spencer Ware',
    'Chris Thompson',
    'Shaun Draughn',
    'Dan Herron',
    'Tim Hightower',
    'Chris Johnson',
    'Benny Cunningham',
    'Cameron Artis-Payne',
    'Paul Perkins',
    'Charcandrick West',
    'Khiry Robinson',
    'Robert Kelley',
    'Josh Ferguson',
    'Tyler Ervin',
    'Mike Gillislee',
    'Wendell Smallwood',
]

missing_wrs = [
    'John Brown',
    'Devante Parker',
    'Stefon Diggs',
    'Tyler Lockett',
    'Willie Snead',
    'Josh Gordon',
    'Kamar Aiken',
    'Sammie Coates',
    'Phillip Dorsett',
    'Jermaine Kearse',
    'Ted Ginn',
    'Rishard Matthews',
    'Tajae Sharpe',
    'Davante Adams',
    'Chris Hogan',
    'Cole Beasley',
    'Seth Roberts',
]

missing_tes = [
    'Coby Fleener',
    'Gary Barnidge',
    'Zach Miller',
    'Clive Walford',
    'Will Tye',
    'Crockett Gillmore',
    'Vance McDonald',
    'Richard Rodgers',
    'Cameron Brate',
    'Virgil Green',
    'Jesse James',
    'Lance Kendricks',
    'Garrett Celek',
    'Ryan Griffin',
]

# Give every listed player the average injury probability of his position.
# This overwrites any value already stored under the same name.
for missing_names, position_average in [
    (missing_qbs, avg_qb),
    (missing_rbs, avg_rb),
    (missing_wrs, avg_wr),
    (missing_tes, avg_te),
]:
    for player_name in missing_names:
        dict_injury[player_name] = position_average

### I don't know how to do this for each variable name so I'm going to repeat each line of code 4 times, one for each position. It's inelegant but it's simple and it works.

In [17]:
# Keep only the top N projected players at each position and attach their injury probability.
# Trimming gets rid of irrelevant data and makes it easier to match the two data sets,
# because they might not list the same players (some list every QB in the league, even free agents).
# N is deliberately larger than the number of starters in the league: a player ranked just outside
# the starters by projection can move up once injury risk is taken into account.

# Number of players to keep at each position:
N_qb = 12*3
N_rb = 80
N_wr = 80
N_te = 12*3


def _injury_column(df_proj, n_players, injury_by_name):
    """Return the injury probability of each of the first n_players rows of df_proj.

    df_proj must have a 'Name' column; injury_by_name maps player name -> injury probability.
    If df_proj has fewer than n_players rows, all of its rows are used.

    Raises:
        KeyError: if any of those players has no entry in injury_by_name. The message lists
            every missing name, so they can all be fixed in the name-matching cell in one pass.
    """
    names = df_proj['Name'].iloc[:n_players]
    missing = [name for name in names if name not in injury_by_name]
    if missing:
        raise KeyError('No injury data for: ' + ', '.join(missing))
    return [injury_by_name[name] for name in names]


# Make a column of injury data that matches the names of the players in the projection data frame.
injcolumn_qb = _injury_column(df_proj_qb, N_qb, dict_injury)
injcolumn_rb = _injury_column(df_proj_rb, N_rb, dict_injury)
injcolumn_wr = _injury_column(df_proj_wr, N_wr, dict_injury)
injcolumn_te = _injury_column(df_proj_te, N_te, dict_injury)

# Make a data frame for that.
df_injcolumn_qb = pd.DataFrame(injcolumn_qb, columns=['Injury'])
df_injcolumn_rb = pd.DataFrame(injcolumn_rb, columns=['Injury'])
df_injcolumn_wr = pd.DataFrame(injcolumn_wr, columns=['Injury'])
df_injcolumn_te = pd.DataFrame(injcolumn_te, columns=['Injury'])

# Create a data frame in Pandas by concatenating that with the projection data frames.
df_qb = pd.concat([df_proj_qb[0:N_qb], df_injcolumn_qb], axis=1)
df_rb = pd.concat([df_proj_rb[0:N_rb], df_injcolumn_rb], axis=1)
df_wr = pd.concat([df_proj_wr[0:N_wr], df_injcolumn_wr], axis=1)
df_te = pd.concat([df_proj_te[0:N_te], df_injcolumn_te], axis=1)

# Export to a CSV spreadsheet.
df_qb.to_csv('FFL Injuries - QB.csv')
df_rb.to_csv('FFL Injuries - RB.csv')
df_wr.to_csv('FFL Injuries - WR.csv')
df_te.to_csv('FFL Injuries - TE.csv')

### Calculate the average score for a backup of each position. This will be used in the spreadsheet formula to calculate how many points you get when the player is injured. 

It doesn't make sense to assume you'll get zero points if your player gets injured for a month. You'll have a backup.

In [18]:
# Average projected score over the slice of players treated as "backups" at each position:
# rows 8-15 for QB and TE, rows 20-39 for RB and WR (positions in the projection table, end excluded).
# Select the numeric 'Projection' column explicitly: averaging the whole frame would also
# try to average the 'Name' strings, which raises TypeError on pandas 2.x.
avgbackup_qb = float(df_proj_qb['Projection'].iloc[8:16].mean())
avgbackup_rb = float(df_proj_rb['Projection'].iloc[20:40].mean())
avgbackup_wr = float(df_proj_wr['Projection'].iloc[20:40].mean())
avgbackup_te = float(df_proj_te['Projection'].iloc[8:16].mean())

# int() drops the fractional part; the printed values are whole points.
print('\n', 'Avg Backup QB: ', int(avgbackup_qb))
print('\n', 'Avg Backup RB: ', int(avgbackup_rb))
print('\n', 'Avg Backup WR: ', int(avgbackup_wr))
print('\n', 'Avg Backup TE: ', int(avgbackup_te))


 Avg Backup QB:  257

 Avg Backup RB:  142

 Avg Backup WR:  161

 Avg Backup TE:  123
