# NFL Scrape
To get some of the data for our model, we will scrape it from pro-football-reference.com.

In [1]:
from lxml import html
import requests
import pandas as pd

In [2]:
# scraping NFL 2014-2017 seasons
#
# For every season the schedule page is downloaded, the matched nodes are
# flattened into one long list of cells, and that list is cut into game rows.
# Rows are gathered in plain lists and turned into a DataFrame once per season:
# DataFrame.append (used previously) was removed in pandas 2.0 and re-copied
# the whole frame for every single game.

# Number of scraped cells that normally make up one game row.
CELLS_PER_GAME = 19

# Rows cut from the end of every season's table
# (presumably the postseason games -- TODO confirm).
TRAILING_ROWS_TO_DROP = 11

# Tie games scrape imperfectly. At each (season, cell index) listed here a
# 'None' placeholder is inserted into the current row and the remainder that
# marks a row boundary is switched to the mapped value from that cell onwards.
TIE_FIXUPS = {
    (2014, 1600): 17,
    (2014, 1606): 17,
    (2016, 1999): 17,
    (2016, 2005): 17,
    (2016, 2054): 15,
    (2016, 2060): 15,
}

GAMES_XPATH = '//*[(@id = "games")]//a | //*[(@id = "games")]//strong | //td | //*[contains(concat( " ", @class, " " ), concat( " ", "right", " " ))]'

season_frames = []

for year in range(2014, 2018):
    url = 'https://www.pro-football-reference.com/years/%d/games.htm' % year
    page = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page into rows.
    page.raise_for_status()

    # build the DOM Tree
    tree = html.fromstring(page.content)

    # get data
    data = tree.xpath(GAMES_XPATH)

    rows = []
    row = []
    r = 0

    for x, cell in enumerate(data):
        # dealing with imperfect scraping because of tie games
        boundary = TIE_FIXUPS.get((year, x))
        if boundary is not None:
            row.append('None')
            r = boundary
        # A row boundary: close the current row, tagging it with its season.
        if x % CELLS_PER_GAME == r and x > 0:
            row.append(year)
            rows.append(row)
            row = []
        row.append(cell.text)
    # Cells collected after the last boundary are discarded on purpose, exactly
    # as before (the trailing partial row was never stored).

    # Every row keeps the index label 0, as the append-based version produced,
    # so the displayed tables and the saved CSV keep the same index column.
    season_df = pd.DataFrame(rows, index=[0] * len(rows))
    season_frames.append(season_df.iloc[:-TRAILING_ROWS_TO_DROP])

df = pd.concat(season_frames)

In [3]:
# Preview the first rows of the raw scraped table (columns are still numbered).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1,Thu,September 4,8:42PM,,,Seattle Seahawks,,,Green Bay Packers,,boxscore,,36,16,398,1,255,1,2014
0,1,Sun,September 7,1:00PM,,,Minnesota Vikings,@,,St. Louis Rams,,boxscore,,34,6,355,0,318,2,2014
0,1,Sun,September 7,1:00PM,,,Buffalo Bills,@,,Chicago Bears,,boxscore,,23,20,360,1,427,3,2014
0,1,Sun,September 7,1:02PM,,,Pittsburgh Steelers,,,Cleveland Browns,,boxscore,,30,27,503,1,389,0,2014
0,1,Sun,September 7,1:02PM,,,New York Jets,,,Oakland Raiders,,boxscore,,19,14,402,2,158,0,2014


Many columns do not have any use to us, so they will be dropped.

In [4]:
# Discard the scraped columns that are not needed downstream.
unused_columns = [1, 2, 3, 4, 5, 8, 10, 11, 12]
df = df.drop(columns=unused_columns)

We will rename the columns to make them readable. 

In [5]:
# Readable names for the remaining columns, in left-to-right order.
readable_names = [
    'Week',
    'Win_team',
    'Home_Away',
    'Lose_team',
    'Win_pts',
    'Lose_pts',
    'Win_yds',
    'Win_TO',
    'Lose_yds',
    'Lose_TO',
    'Season',
]
df.columns = readable_names


The scraped data comes in the form of winning/losing teams and their stats, so we will convert it into home and away data to match our other dataset.

In [6]:
# The winning team is always listed first, and then followed by the Home_Away column, which indicates who is the 
# home team using an @ symbol. If there is an @ symbol the losing team is the home team.

def home(x):
    """Return the home team of one game.

    Parameters
    ----------
    x : sequence of three values, in order: winning team, home/away marker,
        losing team (e.g. a row of ``df[['Win_team', 'Home_Away', 'Lose_team']]``).

    Returns
    -------
    The losing team when the marker is ``'@'``, otherwise the winning team.
    """
    # Unpack by position rather than x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    wteam, ha, lteam = x
    return lteam if ha == '@' else wteam

def away(x):
    """Return the away team of one game.

    Parameters
    ----------
    x : sequence of three values, in order: winning team, home/away marker,
        losing team (e.g. a row of ``df[['Win_team', 'Home_Away', 'Lose_team']]``).

    Returns
    -------
    The winning team when the marker is ``'@'``, otherwise the losing team.
    """
    # Unpack by position rather than x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    wteam, ha, lteam = x
    return wteam if ha == '@' else lteam

# Both helpers read the same three columns, so select them once.
matchup = df[['Win_team', 'Home_Away', 'Lose_team']]
df['Home_team'] = matchup.apply(home, axis=1)
df['Away_team'] = matchup.apply(away, axis=1)

Now that we know who the home or away team is, we convert the winning or losing team's stats to home or away stats.

In [7]:
def home_cols(x):
    """Pick the home team's value out of a winner/loser pair of statistics.

    Parameters
    ----------
    x : sequence of six values, in order: winning team, losing team, home team,
        away team, the winner's statistic, the loser's statistic.

    Returns
    -------
    The winner's statistic when the winner is the home team, the loser's
    statistic when the loser is the home team.

    Raises
    ------
    ValueError
        If the home team is neither the winner nor the loser (previously this
        surfaced as an ``UnboundLocalError``).
    """
    # Positional unpacking instead of x[0]..x[5]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    wt, lt, ht, _at, wcol, lcol = x
    if wt == ht:
        return wcol
    if lt == ht:
        return lcol
    raise ValueError('home team %r is neither the winner %r nor the loser %r' % (ht, wt, lt))

def away_cols(x):
    """Pick the away team's value out of a winner/loser pair of statistics.

    Parameters
    ----------
    x : sequence of six values, in order: winning team, losing team, home team,
        away team, the winner's statistic, the loser's statistic.

    Returns
    -------
    The winner's statistic when the winner is the away team, the loser's
    statistic when the loser is the away team.

    Raises
    ------
    ValueError
        If the away team is neither the winner nor the loser (previously this
        surfaced as an ``UnboundLocalError``).
    """
    # Positional unpacking instead of x[0]..x[5]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    wt, lt, _ht, at, wcol, lcol = x
    if wt == at:
        return wcol
    if lt == at:
        return lcol
    raise ValueError('away team %r is neither the winner %r nor the loser %r' % (at, wt, lt))

# For each statistic, turn the winner/loser pair into home/away columns.
# One side's "against" value is simply the other side's "for" value.
team_columns = ['Win_team', 'Lose_team', 'Home_team', 'Away_team']

for stat in ('pts', 'yds', 'TO'):
    stat_inputs = df[team_columns + ['Win_' + stat, 'Lose_' + stat]]
    home_values = stat_inputs.apply(home_cols, axis=1)
    away_values = stat_inputs.apply(away_cols, axis=1)

    # Same column creation order as before: Home for/against, Away for/against.
    df['Home_%s_for' % stat] = home_values
    df['Home_%s_against' % stat] = away_values
    df['Away_%s_for' % stat] = away_values
    df['Away_%s_against' % stat] = home_values

We change the data type of the columns so that we can create rolling averages for these statistics throughout each season.

In [8]:
# Per-game statistics must be numeric before they can be summed or averaged.
stat_columns = [
    'Home_pts_for', 'Home_pts_against', 'Away_pts_for', 'Away_pts_against',
    'Home_yds_for', 'Home_yds_against', 'Away_yds_for', 'Away_yds_against',
    'Home_TO_for', 'Home_TO_against', 'Away_TO_for', 'Away_TO_against',
]
df[stat_columns] = df[stat_columns].astype(float)

# Week and Season are compared numerically later on, so store them as integers.
calendar_columns = ['Week', 'Season']
df[calendar_columns] = df[calendar_columns].astype(int)

We add columns to indicate wins and losses for each team throughout the season prior to the game being played. 

In [9]:
def wins(x):
    """Count a team's earlier games in a season where it is listed as the winner.

    Reads the module-level ``df`` (columns ``Win_team``, ``Season``, ``Week``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    int
        Number of rows of ``df`` with ``Win_team == team``, the same season and
        ``Week < week``.
    """
    # NOTE(review): a tied game presumably still lists one side under Win_team,
    # in which case it is counted as a win here -- confirm this is intended.
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x
    earlier_wins = (df['Win_team'] == team) & (df['Season'] == season) & (df['Week'] < week)
    # Counting the True entries avoids materialising a filtered copy of df.
    return int(earlier_wins.sum())

def loss(x):
    """Count a team's earlier games in a season where it is listed as the loser.

    Reads the module-level ``df`` (columns ``Lose_team``, ``Season``, ``Week``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    int
        Number of rows of ``df`` with ``Lose_team == team``, the same season and
        ``Week < week``.
    """
    # NOTE(review): a tied game presumably still lists one side under Lose_team,
    # in which case it is counted as a loss here -- confirm this is intended.
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x
    earlier_losses = (df['Lose_team'] == team) & (df['Season'] == season) & (df['Week'] < week)
    # Counting the True entries avoids materialising a filtered copy of df.
    return int(earlier_losses.sum())

# Season-to-date record going into each game, for the home and the away side.
for side in ('Home', 'Away'):
    record_inputs = df[[side + '_team', 'Week', 'Season']]
    df[side + '_wins'] = record_inputs.apply(wins, axis=1)
    df[side + '_losses'] = record_inputs.apply(loss, axis=1)

This column indicates whether the home team won the game. It will not be used in the model, but it will be used in our pre-model exploratory analysis to show which features describe a winning team in general. If it were used in the model, it would be a source of data leakage, giving away information that our model should not and would not have prior to prediction.

In [10]:
def home_win(x):
    """Flag whether the home team is the team listed as the winner.

    Parameters
    ----------
    x : sequence of two values, in order: winning team, home team.

    Returns
    -------
    int
        ``1`` when the two teams are equal, otherwise ``0``.
    """
    # Positional unpacking instead of x[0]/x[1]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    wt, ht = x
    return 1 if wt == ht else 0

# 1 when the home team is the team listed as the winner, else 0 (row by row).
df['Home_win']=df[['Win_team','Home_team']].apply(home_win,axis=1)

Now that we've converted our data to home/away data, we can drop the win/lose data.

In [11]:
# The winner/loser view has been converted to home/away columns; remove it.
win_lose_columns = [
    'Win_team', 'Home_Away', 'Lose_team',
    'Win_pts', 'Lose_pts',
    'Win_yds', 'Win_TO',
    'Lose_yds', 'Lose_TO',
]
df = df.drop(columns=win_lose_columns)

Using several functions, we will create columns that represent the rolling (season-to-date) averages of points per game for, points per game against, yards per game for, and yards per game against, together with the season-to-date totals of turnovers for and turnovers against, for both the home and away teams. For each row, these values cover only the games played earlier in that season, before the game in question; a team with no earlier games that season gets 0.

In [12]:
# functions for rolling averages 

def r_a_ppg_for(x):
    """Average points scored per game by a team earlier in the same season.

    Reads the module-level ``df`` (columns ``Home_team``, ``Away_team``,
    ``Season``, ``Week``, ``Home_pts_for``, ``Away_pts_for``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    Mean of the team's own points over its games with ``Week < week`` in
    ``season``; ``0`` when there are no such games.
    """
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x

    involved = (df['Home_team'] == team) | (df['Away_team'] == team)
    prior = df[involved & (df['Season'] == season) & (df['Week'] < week)]

    # Explicit guard for the no-earlier-games case; this replaces a bare
    # ``except:`` around the division that hid every kind of error.
    if len(prior) == 0:
        return 0

    total = 0
    # Every row in ``prior`` has the team at home or away, so "not at home"
    # means "away". Iterating the columns avoids resetting the index in place.
    for home_team, home_value, away_value in zip(
            prior['Home_team'], prior['Home_pts_for'], prior['Away_pts_for']):
        total += home_value if home_team == team else away_value
    return total / len(prior)

def r_a_ppg_against(x):
    """Average points allowed per game by a team earlier in the same season.

    Reads the module-level ``df`` (columns ``Home_team``, ``Away_team``,
    ``Season``, ``Week``, ``Home_pts_for``, ``Away_pts_for``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    Mean of the opponents' points over the team's games with ``Week < week``
    in ``season``; ``0`` when there are no such games.
    """
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x

    involved = (df['Home_team'] == team) | (df['Away_team'] == team)
    prior = df[involved & (df['Season'] == season) & (df['Week'] < week)]

    # Explicit guard for the no-earlier-games case; this replaces a bare
    # ``except:`` around the division that hid every kind of error.
    if len(prior) == 0:
        return 0

    total = 0
    # At home the opponent's points are the away points, and vice versa.
    # Iterating the columns avoids resetting the index in place.
    for home_team, home_value, away_value in zip(
            prior['Home_team'], prior['Home_pts_for'], prior['Away_pts_for']):
        total += away_value if home_team == team else home_value
    return total / len(prior)

# Points-per-game columns, created in the order Home/Away "for", then
# Home/Away "against".
for suffix, averager in (('ppg_for', r_a_ppg_for), ('ppg_against', r_a_ppg_against)):
    for side in ('Home', 'Away'):
        team_week_season = df[[side + '_team', 'Week', 'Season']]
        df[side + '_' + suffix] = team_week_season.apply(averager, axis=1)



def r_a_ypg_for(x):
    """Average yards gained per game by a team earlier in the same season.

    Reads the module-level ``df`` (columns ``Home_team``, ``Away_team``,
    ``Season``, ``Week``, ``Home_yds_for``, ``Away_yds_for``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    Mean of the team's own yards over its games with ``Week < week`` in
    ``season``; ``0`` when there are no such games.
    """
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x

    involved = (df['Home_team'] == team) | (df['Away_team'] == team)
    prior = df[involved & (df['Season'] == season) & (df['Week'] < week)]

    # Explicit guard for the no-earlier-games case; this replaces a bare
    # ``except:`` around the division that hid every kind of error.
    if len(prior) == 0:
        return 0

    total = 0
    # Every row in ``prior`` has the team at home or away, so "not at home"
    # means "away". Iterating the columns avoids resetting the index in place.
    for home_team, home_value, away_value in zip(
            prior['Home_team'], prior['Home_yds_for'], prior['Away_yds_for']):
        total += home_value if home_team == team else away_value
    return total / len(prior)

def r_a_ypg_against(x):
    """Average yards allowed per game by a team earlier in the same season.

    Reads the module-level ``df`` (columns ``Home_team``, ``Away_team``,
    ``Season``, ``Week``, ``Home_yds_for``, ``Away_yds_for``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    Mean of the opponents' yards over the team's games with ``Week < week``
    in ``season``; ``0`` when there are no such games.
    """
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x

    involved = (df['Home_team'] == team) | (df['Away_team'] == team)
    prior = df[involved & (df['Season'] == season) & (df['Week'] < week)]

    # Explicit guard for the no-earlier-games case; this replaces a bare
    # ``except:`` around the division that hid every kind of error.
    if len(prior) == 0:
        return 0

    total = 0
    # At home the opponent's yards are the away yards, and vice versa.
    # Iterating the columns avoids resetting the index in place.
    for home_team, home_value, away_value in zip(
            prior['Home_team'], prior['Home_yds_for'], prior['Away_yds_for']):
        total += away_value if home_team == team else home_value
    return total / len(prior)

# Yards-per-game columns, created in the order Home/Away "for", then
# Home/Away "against".
for suffix, averager in (('ypg_for', r_a_ypg_for), ('ypg_against', r_a_ypg_against)):
    for side in ('Home', 'Away'):
        team_week_season = df[[side + '_team', 'Week', 'Season']]
        df[side + '_' + suffix] = team_week_season.apply(averager, axis=1)



def to_for(x):
    """Season-to-date total of a team's own turnover column before a given week.

    Reads the module-level ``df`` (columns ``Home_team``, ``Away_team``,
    ``Season``, ``Week``, ``Home_TO_for``, ``Away_TO_for``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    Sum of ``Home_TO_for`` (games at home) and ``Away_TO_for`` (games away)
    over the team's games with ``Week < week`` in ``season``; ``0`` when there
    are no such games.
    """
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x

    involved = (df['Home_team'] == team) | (df['Away_team'] == team)
    prior = df[involved & (df['Season'] == season) & (df['Week'] < week)]

    to = 0
    # Every row in ``prior`` has the team at home or away, so "not at home"
    # means "away". Iterating the columns avoids resetting the index in place.
    for home_team, home_value, away_value in zip(
            prior['Home_team'], prior['Home_TO_for'], prior['Away_TO_for']):
        to += home_value if home_team == team else away_value
    return to

def to_against(x):
    """Season-to-date total of the opponents' turnover column before a given week.

    Reads the module-level ``df`` (columns ``Home_team``, ``Away_team``,
    ``Season``, ``Week``, ``Home_TO_for``, ``Away_TO_for``).

    Parameters
    ----------
    x : sequence of three values, in order: team, week, season.

    Returns
    -------
    Sum of ``Away_TO_for`` (games at home) and ``Home_TO_for`` (games away)
    over the team's games with ``Week < week`` in ``season``; ``0`` when there
    are no such games.
    """
    # Positional unpacking instead of x[0]/x[1]/x[2]: integer keys on a
    # label-indexed Series are deprecated as positional lookups in pandas 2.x.
    team, week, season = x

    involved = (df['Home_team'] == team) | (df['Away_team'] == team)
    prior = df[involved & (df['Season'] == season) & (df['Week'] < week)]

    to = 0
    # At home the opponent's value is the away column, and vice versa.
    # Iterating the columns avoids resetting the index in place.
    for home_team, home_value, away_value in zip(
            prior['Home_team'], prior['Home_TO_for'], prior['Away_TO_for']):
        to += away_value if home_team == team else home_value
    return to

# Turnover totals, created in the order Home/Away "for", then Home/Away "against".
for suffix, totaller in (('TO_for_total', to_for), ('TO_against_total', to_against)):
    for side in ('Home', 'Away'):
        team_week_season = df[[side + '_team', 'Week', 'Season']]
        df[side + '_' + suffix] = team_week_season.apply(totaller, axis=1)

After creating the rolling average columns, we can drop the statistics for each game. Like the Home_win column, these columns would be a point of data leakage as they represent statistics at the conclusion of the given game. 

In [13]:
# Remove the per-game result columns: every Home/Away x pts/yds/TO x for/against
# combination (twelve columns in total).
game_stat_columns = [
    '%s_%s_%s' % (side, stat, direction)
    for stat in ('pts', 'yds', 'TO')
    for side in ('Home', 'Away')
    for direction in ('for', 'against')
]
df = df.drop(columns=game_stat_columns)

In order to match our other data set, we map the full team names to their abbreviations. 

In [14]:
# Full team name -> abbreviation. The St. Louis and Los Angeles Rams share 'LAR',
# and the San Diego and Los Angeles Chargers share 'LAC'.
abr = {
    'Seattle Seahawks': 'SEA',
    'Chicago Bears': 'CHI',
    'St. Louis Rams': 'LAR',
    'Kansas City Chiefs': 'KAN',
    'Baltimore Ravens': 'BAL',
    'Philadelphia Eagles': 'PHI',
    'Pittsburgh Steelers': 'PIT',
    'New York Jets': 'NYJ',
    'Atlanta Falcons': 'ATL',
    'Miami Dolphins': 'MIA',
    'Houston Texans': 'HOU',
    'Tampa Bay Buccaneers': 'TAM',
    'Dallas Cowboys': 'DAL',
    'Denver Broncos': 'DEN',
    'Detroit Lions': 'DET',
    'Arizona Cardinals': 'ARI',
    'Buffalo Bills': 'BUF',
    'Cleveland Browns': 'CLE',
    'Tennessee Titans': 'TEN',
    'New York Giants': 'NYG',
    'Carolina Panthers': 'CAR',
    'Washington Redskins': 'WAS',
    'Minnesota Vikings': 'MIN',
    'Cincinnati Bengals': 'CIN',
    'San Diego Chargers': 'LAC',
    'Oakland Raiders': 'OAK',
    'Green Bay Packers': 'GNB',
    'San Francisco 49ers': 'SFO',
    'Indianapolis Colts': 'IND',
    'Jacksonville Jaguars': 'JAX',
    'New England Patriots': 'NWE',
    'New Orleans Saints': 'NOR',
    'Los Angeles Rams': 'LAR',
    'Los Angeles Chargers': 'LAC',
}

# Replace full team names with abbreviations; names missing from `abr` become NaN.
for team_column in ('Home_team', 'Away_team'):
    df[team_column] = df[team_column].map(abr)

We add a turnover difference column for the home and away teams that will hopefully improve our model. 

In [15]:
# Turnover differential per side: the "for" total minus the "against" total.
for side in ('Home', 'Away'):
    for_total = df[side + '_TO_for_total']
    against_total = df[side + '_TO_against_total']
    df[side + '_TO_diff'] = for_total - against_total

df.head()

Unnamed: 0,Week,Season,Home_team,Away_team,Home_wins,Home_losses,Away_wins,Away_losses,Home_win,Home_ppg_for,...,Home_ypg_for,Away_ypg_for,Home_ypg_against,Away_ypg_against,Home_TO_for_total,Away_TO_for_total,Home_TO_against_total,Away_TO_against_total,Home_TO_diff,Away_TO_diff
0,1,2014,SEA,GNB,0,0,0,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2014,LAR,MIN,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2014,CHI,BUF,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2014,PIT,CLE,0,0,0,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2014,NYJ,OAK,0,0,0,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This dataset is ready and will be saved as a csv file so it can easily be merged with the other data we have. 

In [16]:
# Write the finished table to NFL_scrape.csv in the working directory. The row
# index is written as the first, unnamed column (pandas default index=True).
df.to_csv('NFL_scrape.csv')