# NFLCorrel Model

In [1]:
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
%matplotlib inline

## Data Preparation
Data was previously compiled in notebook 1.0.

In [2]:
boxscores = pd.read_csv('../data/boxscores2.csv')
boxscores.head()

Unnamed: 0,gsis_id,gamekey,start_time,week,season_year,season_type,finished,home_team,home_score,home_score_q5,...,away_receiving_yds,away_rushing_att,away_rushing_loss,away_rushing_tds,away_rushing_yds,home_passing_ypa,away_passing_ypa,home_rushing_ypa,away_rushing_ypa,neutral_site
0,2014122106,56403,2014-12-21 18:00:00,16,2014,Regular,True,PIT,20,0,...,317.0,14.0,0.0,0.0,39.0,8.8,6.891304,2.615385,2.785714,False
1,2015102500,56595,2015-10-25 13:30:00,7,2015,Regular,True,JAC,34,0,...,298.0,28.0,0.0,0.0,115.0,6.275862,7.095238,3.75,4.107143,True
2,2014122107,56404,2014-12-21 18:00:00,16,2014,Regular,True,TB,3,0,...,318.0,31.0,0.0,1.0,121.0,5.653846,7.95,1.142857,3.903226,False
3,2014122108,56405,2014-12-21 21:05:00,16,2014,Regular,True,STL,27,0,...,391.0,34.0,0.0,1.0,128.0,9.0625,12.21875,5.3,3.764706,False
4,2014122109,56406,2014-12-21 21:25:00,16,2014,Regular,True,DAL,42,0,...,235.0,10.0,0.0,0.0,1.0,12.428571,5.340909,3.175,0.1,False


In [3]:
boxscores.columns

Index(['gsis_id', 'gamekey', 'start_time', 'week', 'season_year',
       'season_type', 'finished', 'home_team', 'home_score', 'home_score_q5',
       'away_team', 'away_score', 'away_score_q5', 'home_defense_ffum',
       'home_defense_frec', 'home_defense_int', 'home_defense_pass_def',
       'home_defense_qbhit', 'home_defense_sk', 'home_defense_tkl_loss',
       'home_fumbles_forced', 'home_fumbles_lost', 'home_fumbles_notforced',
       'home_passing_att', 'home_passing_int', 'home_passing_sk',
       'home_passing_tds', 'home_passing_yds', 'home_puntret_yds',
       'home_receiving_yds', 'home_rushing_att', 'home_rushing_loss',
       'home_rushing_tds', 'home_rushing_yds', 'away_defense_ffum',
       'away_defense_frec', 'away_defense_int', 'away_defense_pass_def',
       'away_defense_qbhit', 'away_defense_sk', 'away_defense_tkl_loss',
       'away_fumbles_forced', 'away_fumbles_lost', 'away_fumbles_notforced',
       'away_passing_att', 'away_passing_int', 'away_passing_sk',
 

### Data Description

The following objects were created based on this data:

- $team\_home$ zero-indexed integer for home team
- $team\_away$ zero-indexed integer for away team
- $season$ zero-indexed integer for season
- $per\_won_{home}$ Percent the home team won per the logistic regression

In [4]:
teams = boxscores['home_team'].unique()
teams = pd.DataFrame(teams, columns=['team'])
teams['i'] = teams.index
num_teams = len(teams.team)

boxscores = pd.merge(boxscores, teams, left_on='home_team', right_on='team', how='left')
boxscores = boxscores.rename(columns = {'i': 'i_home'}).drop('team', 1)
boxscores = pd.merge(boxscores, teams, left_on='away_team', right_on='team', how='left')
boxscores = boxscores.rename(columns = {'i': 'i_away'}).drop('team', 1)

team_home = boxscores.i_home.values
team_away = boxscores.i_away.values

seasons = boxscores['season_year'].unique()
seasons = pd.DataFrame(seasons, columns=['season'])
seasons.sort_values('season',inplace=True)
seasons = pd.DataFrame(seasons.season.values, columns=['season']) # to reset index
seasons['j'] = seasons.index
num_seasons = len(seasons.season)

boxscores = pd.merge(boxscores, seasons, left_on='season_year', right_on='season', how='left')
boxscores = boxscores.rename(columns = {'j': 'j_season'}).drop('season', 1)

season = boxscores.j_season.values

For the moment, the following formula is used as a stand-in for home_perc_won. This will be replaced a better model in the future.

$$(1/2) ( 1 + \frac{WinnerPoints - LoserPoints}{WinnerPoints + LoserPoints} )$$

In [5]:
boxscores['home_perc_won'] = (boxscores['home_score'] - boxscores['away_score'])/(boxscores['home_score'] + boxscores['away_score'])
boxscores['home_perc_won'] = (boxscores['home_perc_won'] + 1) / 2

per_won_home = boxscores.home_perc_won

num_games = len(per_won_home)

## Building the Model
### Model Description
Objects in model.

- $strength_{i,j}$ is a 2-d array describing team $i$ strength in year $j$
    - Each element of $strength$ is normally distributed with (arbitrary) mean 500 and sd 150.
- For each game, create performance $perf_{home}$ and $perf_{away}$ normally distributed with mean $strength_{i,j}$ and some standard deviation.
    - How do we set the standard deviation of performance? I'm not sure. This parameter may need to be tuned.
- For each game, the percent the home team won is a deterministic function: $per\_won_{home} = perf_{home} / (perf_{home} + perf_{away})$

In [10]:
model = pm.Model()
with pm.Model() as model:
    strength = pm.Normal('strength',
                        mu = 500,
                        tau = 1/(150**2),
                        shape=(num_teams, num_seasons))
    
    def get_mus(side, strength=strength, season=season, team_home=team_home, team_away=team_away):
        mus = []
        for i in range(num_games):
            if side == 'home':
                mu = strength[team_home[i], season[i]]
            elif side == 'away':
                mu = strength[team_away[i], season[i]]
            mus.append(mu)
        return mus
    
    perf_home = pm.MvNormal('perf_home',
                            mu=pm.DensityDist('home_mus', get_mus('home')),
                            tau=(1/(150**2))*np.eye(num_games),
                            shape=num_games)
    
    perf_away = pm.MvNormal('perf_away',
                            mu=pm.DensityDist('away_mus', get_mus('away')),
                            tau=(1/(150**2))*np.eye(num_games),
                            shape=num_games)
    
    per_won_home = pm.Deterministic('per_won_home', perf_home() / (perf_home() + perf_away()))

TypeError: 'list' object is not callable

In [None]:
print(range(num_games))

In [None]:
with model:
    
    start = pm.find_MAP()
    step = pm.NUTS(state=start)
    trace = pm.sample(2000, step, start=start)
    
    pm.traceplot(trace)