In [1]:
# Load relevant libraries.

%pylab inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.api import abline_plot
import patsy


Populating the interactive namespace from numpy and matplotlib


In [2]:
# Provenance notes for runningback.csv:
#   * The included dataset is intentionally truncated to provide a small
#     example set for review.
#   * It cannot be used to predict actual performance in any week after
#     football week 10, 2015.
#   * A real weekly dataset needs to incorporate each Thursday's NFL.com
#     adjusted release.
data = pd.read_csv("runningback.csv")

# List the available columns so the model formulas below can be checked.
print(data.columns)

Index([u'GID', u'Unnamed: 1', u'Pos', u'Name', u'Team', u'Opponent',
       u'Home/Away', u'Salary', u'Salary Change', u'Points', u'GP',
       u'Pts_Game', u'Pts_G_$', u'Pts_G(alt)', u'Bye week', u'PLAYS', u'RUN',
       u'RYD', u'RTD', u'TARGETS', u'REC', u'PYDS', u'PTD', u'FUM', u'INT',
       u'PERF', u'LAST', u'Connections'],
      dtype='object')


In [3]:
# Baseline model: ordinary least squares of PERF on all eleven candidate
# predictors, fitted on the frame loaded above.
RBmod = smf.ols(
    formula='PERF ~ PLAYS + RUN + RYD + RTD + TARGETS + REC + PYDS + PTD + FUM + LAST + Connections',
    data=data,
).fit()

print("")  # blank separator line before the summary table
print(RBmod.summary())


                            OLS Regression Results                            
Dep. Variable:                   PERF   R-squared:                       0.785
Model:                            OLS   Adj. R-squared:                  0.767
Method:                 Least Squares   F-statistic:                     44.73
Date:                Wed, 16 Dec 2015   Prob (F-statistic):           1.27e-39
Time:                        09:58:11   Log-Likelihood:                 55.814
No. Observations:                 147   AIC:                            -87.63
Df Residuals:                     135   BIC:                            -51.74
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept      -0.0594      0.057     -1.035     

As shown above, the adjusted R^2 of 0.767 is relatively high (the model accounts for roughly 77% of the variability in PERF) and is very close to the unadjusted R^2 of 0.785, which suggests that the chosen variables are meaningful rather than merely inflating the fit.

However, PERF is a binary outcome variable (outperform = TRUE or FALSE), so a logit model, which estimates the probability of outperforming and keeps predictions between 0 and 1, gives us a better representation.

In [4]:
# Logistic regression of the binary PERF outcome on ten predictors
# (the OLS predictor set without LAST).
RBlogitmod = smf.logit(
    formula='PERF ~ PLAYS + RUN + RYD + RTD + TARGETS + REC + PYDS + PTD + FUM + Connections',
    data=data,
).fit()

print("")  # blank separator line before the summary table
print(RBlogitmod.summary())

Optimization terminated successfully.
         Current function value: 0.194123
         Iterations 8

                           Logit Regression Results                           
Dep. Variable:                   PERF   No. Observations:                  147
Model:                          Logit   Df Residuals:                      136
Method:                           MLE   Df Model:                           10
Date:                Wed, 16 Dec 2015   Pseudo R-squ.:                  0.5401
Time:                        09:58:16   Log-Likelihood:                -28.536
converged:                       True   LL-Null:                       -62.051
                                        LLR p-value:                 1.654e-10
                  coef    std err          z      P>|z|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept      -4.0618      1.462     -2.779      0.005        -6.926    -1.197
PLAYS           2.0577   

In [5]:
# Using Adrian Peterson's data to predict performance in Week 10 as a test.
#
# NOTE(review): the 10-predictor logit model fitted above also estimates a
# 'Connections' coefficient, but no Connections value is supplied here, so
# that term is left out of the linear predictor exactly as before. Confirm
# the value and add a ('Connections', <value>) entry below.
peterson_inputs = [
    ('PLAYS', 199),
    ('RUN', 180),
    ('RYD', 810),
    ('RTD', 4),
    ('TARGETS', 19),
    ('REC', 18),
    ('PYDS', 141),
    ('PTD', 0),
    ('FUM', 1),
]

# Linear predictor (log-odds): intercept plus coefficient * value per term,
# accumulated left to right in the same order as the hand-written sum.
log_odds = RBlogitmod.params['Intercept']
for term, value in peterson_inputs:
    log_odds = log_odds + RBlogitmod.params[term] * value

# Logistic function written as 1 / (1 + exp(-z)). The algebraically equal
# exp(z) / (1 + exp(z)) evaluates to inf / inf = nan once z exceeds ~709,
# which inputs of this size can reach; this form saturates cleanly at 1.0.
print(1.0 / (1.0 + np.exp(-log_odds)))

1.0


As shown above, Adrian Peterson is correctly predicted to outperform the 12-point mark in Week 10 (predicted probability of 1.0).

Removing, in a single step, all of the variables that appear to contribute only marginally (RYD, REC, PYDS and Connections) reduces the model to 6 decision variables.

In [3]:
# Reduced logistic regression: only the six predictors kept after pruning.
# This rebinds RBlogitmod, so any cell run afterwards sees this reduced
# model rather than the ten-predictor one.
RBlogitmod = smf.logit(
    formula='PERF ~ PLAYS + RUN + RTD + TARGETS + PTD + FUM',
    data=data,
).fit()

print("")  # blank separator line before the summary table
print(RBlogitmod.summary())

Optimization terminated successfully.
         Current function value: 0.205109
         Iterations 8

                           Logit Regression Results                           
Dep. Variable:                   PERF   No. Observations:                  147
Model:                          Logit   Df Residuals:                      140
Method:                           MLE   Df Model:                            6
Date:                Wed, 16 Dec 2015   Pseudo R-squ.:                  0.5141
Time:                        10:17:27   Log-Likelihood:                -30.151
converged:                       True   LL-Null:                       -62.051
                                        LLR p-value:                 7.578e-12
                 coef    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -5.3409      1.075     -4.967      0.000        -7.448    -3.233
PLAYS          2.2079      1

Six variables is a reasonable number to ask a user to enter; the values entered below are then fed into the reduced model to produce a prediction.

In [4]:
# One shared helper replaces six copy-pasted prompt loops, so adding or
# removing a model variable is a single line in the list of calls below.
def _prompt_int(prompt):
    """Prompt on stdin until the reply parses as an integer, then return it.

    prompt -- text passed to raw_input() and shown to the user.

    When int() raises ValueError, "Please enter a number" is printed and the
    question is asked again. Any other exception propagates to the caller.
    """
    while True:
        try:
            return int(raw_input(prompt))
        except ValueError:
            print("Please enter a number")


# The six inputs of the reduced logit model, asked in the same order and
# stored under the same names as before.
PLAYS = _prompt_int("Number of plays? ")
RUN = _prompt_int("Number of running plays? ")
RTD = _prompt_int("Number of rushing touchdowns? ")
TARGETS = _prompt_int("Number of targets? ")
PTD = _prompt_int("Number of receiving touchdowns? ")
FUM = _prompt_int("Number of fumbles? ")

Number of plays? 199
Number of running plays? 180
Number of rushing touchdowns? 4
Number of targets? 19
Number of receiving touchdowns? 0
Number of fumbles? 1


In [6]:
# Predicted probability of outperforming for the values entered above.
# Example inputs: 199, 180, 4, 19, 0, 1 (Adrian Peterson).
user_inputs = [
    ('PLAYS', PLAYS),
    ('RUN', RUN),
    ('RTD', RTD),
    ('TARGETS', TARGETS),
    ('PTD', PTD),
    ('FUM', FUM),
]

# Linear predictor (log-odds): intercept plus coefficient * value per term,
# accumulated left to right in the same order as the hand-written sum.
log_odds = RBlogitmod.params['Intercept']
for term, value in user_inputs:
    log_odds = log_odds + RBlogitmod.params[term] * value

# Logistic function written as 1 / (1 + exp(-z)). The algebraically equal
# exp(z) / (1 + exp(z)) evaluates to inf / inf = nan once z exceeds ~709,
# which large user-entered counts can reach; this form saturates at 1.0.
print(1.0 / (1.0 + np.exp(-log_odds)))

0.634151719877
