In [1]:
import json  # a built-in Python module for dealing with JSON data
import gzip  # a built-in Python module for dealing with gzipped files
from pprint import pprint

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
import scipy


In [2]:
# Location of the gzipped file; each line is parsed as one JSON document (one stint).
f = '../data/nba_pxp/matchups-2015-final.json.gz'

# Decode every line of the archive into its own record.
with gzip.open(f) as fp:
    data = [json.loads(j) for j in fp]

In [3]:
# Pretty-print the first loaded record to inspect its layout: the printed
# output shows team-name keys whose values carry 'entered', 'exited', 'on'
# and 'stats' entries.
d = data[0]
pprint(d)

{'Lakers': {'entered': ['Robert Sacre'],
            'exited': ['Jordan Hill'],
            'on': ['Ronnie Price',
                   'Wayne Ellington',
                   'Wesley Johnson',
                   'Carlos Boozer',
                   'Robert Sacre'],
            'stats': {'dreb': 1,
                      'drebx': 0,
                      'fg2m': 1,
                      'fg3m': 3,
                      'fgm': 4,
                      'fgx': 0,
                      'foul': 0,
                      'fta': 0,
                      'ftm': 0,
                      'non_steal_tov': 0,
                      'oreb': 0,
                      'orebx': 0,
                      'poss': 4,
                      'pts': 11,
                      'team_tov': 0,
                      'time': 0,
                      'tov': 0}},
 'Warriors': {'entered': ['Andre Iguodala'],
              'exited': ['Marreese Speights'],
              'on': ['Stephen Curry',
                     'Klay Thompson

In [4]:
# Flatten every stint record into one row that holds, for the home and the
# away side, the possessions played, the on-court unit, the points scored
# and the offensive rating.
rows = []
for d in data:
    row = {}
    for side in ['home', 'away']:
        team = d[side]  # d['home'] / d['away'] hold the key of that side's sub-record
        stats = d[team]['stats']
        poss = stats['poss']
        pts = stats['pts']
        row[side + '_poss'] = poss
        # Indicator dict {player name: 1} for the players listed under 'on'.
        row[side + '_unit'] = {name: 1 for name in d[team]['on']}
        row[side + '_pts'] = pts
        # Points per 100 possessions; NaN when the side had no possession.
        row[side + '_ortg'] = 100 * pts / poss if poss > 0 else np.nan
    rows.append(row)

df = pd.DataFrame(rows)
df['netposs'] = netposs = df['home_poss'] + df['away_poss']
df['home_netpts'] = netpts = df['home_pts'] - df['away_pts']
# Guard the division the same way the per-side ratings are guarded: a stint
# with zero total possessions gets NaN instead of +/-inf (points can still be
# scored in such a stint, so the numerator is not necessarily zero).
df['home_netrtg'] = 100 * netpts / netposs.where(netposs > 0)

In [5]:
# Show the first rows of the per-stint frame as a quick sanity check.
df.head()

Unnamed: 0,away_ortg,away_poss,away_pts,away_unit,home_ortg,home_poss,home_pts,home_unit,netposs,home_netpts,home_netrtg
0,100.0,4,4,"{'Stephen Curry': 1, 'Klay Thompson': 1, 'Andr...",275.0,4,11,"{'Ronnie Price': 1, 'Wayne Ellington': 1, 'Wes...",8,7,87.5
1,,0,0,"{'Ray McCallum': 1, 'Ben McLemore': 1, 'Nik St...",0.0,1,0,"{'Jarrett Jack': 1, 'Alan Anderson': 1, 'Joe J...",1,0,0.0
2,170.0,10,17,"{'Stephen Curry': 1, 'Klay Thompson': 1, 'Harr...",90.0,10,9,"{'Ronnie Price': 1, 'Wayne Ellington': 1, 'Wes...",20,-8,-40.0
3,0.0,2,0,"{'Stephen Curry': 1, 'Klay Thompson': 1, 'Harr...",200.0,3,6,"{'Ronnie Price': 1, 'Wayne Ellington': 1, 'Wes...",5,6,120.0
4,100.0,1,1,"{'Stephen Curry': 1, 'Klay Thompson': 1, 'Harr...",,0,2,"{'Ronnie Price': 1, 'Wayne Ellington': 1, 'Wes...",1,1,100.0


In [6]:
# Two independent encoders, so every player gets one column in the offensive
# block and one column in the defensive block.
off_vectorizer = DictVectorizer(sparse=False)
def_vectorizer = DictVectorizer(sparse=False)

# A side contributes an observation only for stints where it had a possession.
home_mask = df['home_poss'] > 0
away_mask = df['away_poss'] > 0
n_home = int(home_mask.sum())
n_away = int(away_mask.sum())

# Observations are stacked with the home-on-offense rows first, followed by
# the away-on-offense rows; the defending unit is the opposite side's unit.
offense_units = df.loc[home_mask, 'home_unit'].tolist() + df.loc[away_mask, 'away_unit'].tolist()
defense_units = df.loc[home_mask, 'away_unit'].tolist() + df.loc[away_mask, 'home_unit'].tolist()
X_off = off_vectorizer.fit_transform(offense_units)
X_def = def_vectorizer.fit_transform(defense_units)

# Home-court indicators, one column each: 1 where the home team is on offense
# (X_hc_off) and 1 where the home team is on defense (X_hc_def).
X_hc_off = np.vstack([np.ones((n_home, 1)), np.zeros((n_away, 1))])
X_hc_def = np.vstack([np.zeros((n_home, 1)), np.ones((n_away, 1))])

In [7]:
# Column layout of the design matrix:
#   [ home-court (offense) | offensive players | -home-court (defense) | -defensive players ]
# The defensive blocks are negated, so a positive coefficient there lowers the
# predicted offensive rating of the opposing unit.
X = np.hstack([X_hc_off, X_off, -X_hc_def, -X_def])
# Target: offensive rating of the side on offense, home rows first, then away rows.
y = np.hstack([df['home_ortg'].loc[home_mask], df['away_ortg'].loc[away_mask]])
# Sample weights: possessions behind each observation. The `np.float` alias
# was removed in NumPy 1.24; the builtin `float` yields the same float64 dtype.
w = np.hstack([df['home_poss'].loc[home_mask], df['away_poss'].loc[away_mask]]).astype(float)

In [14]:
# Candidate regularisation strengths searched by cross-validation.
ridge_alphas = np.array([0.01, 0.1, 1.0, 10., 100., 500., 750., 1000., 1500., 2000., 5000.])

# Ridge regression with 5-fold CV over the alpha grid, weighting each
# observation by the sample weights in `w`.
clf = linear_model.RidgeCV(alphas=ridge_alphas, cv=5, fit_intercept=True)
clf.fit(X, y, sample_weight=w)

RidgeCV(alphas=array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02,   5.00000e+02,   7.50000e+02,   1.00000e+03,
         1.50000e+03,   2.00000e+03,   5.00000e+03]),
    cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False)

In [15]:
# Regularisation strength selected by the cross-validated fit.
print(clf.alpha_)

1500.0


In [16]:
# Column 0 of X is the offensive home-court indicator; the defensive
# home-court indicator sits right after the block of offensive player columns.
n_off = X_off.shape[1]
print(clf.coef_[0], clf.coef_[n_off + 1])

1.20752828193 1.20752828193


In [29]:
# Offensive player coefficients: skip column 0 (offensive home-court term)
# and take one coefficient per offensive player column.
n_off = X_off.shape[1]
orapm = clf.coef_[1:n_off + 1]
# Player indices ordered from the largest coefficient to the smallest.
off_idx = np.argsort(orapm)[::-1]

# Print the K highest-rated offensive players.
K = 20
off_names = off_vectorizer.feature_names_
for k in range(K):
    idx = off_idx[k]
    name = off_names[idx]
    print("# {:>2}: {:<20} {:4.2f} orapm".format(k, name, orapm[idx]))

#  0: James Harden         7.96 orapm
#  1: Stephen Curry        6.20 orapm
#  2: Kyle Korver          6.07 orapm
#  3: LeBron James         5.44 orapm
#  4: Russell Westbrook    5.26 orapm
#  5: Chris Paul           5.21 orapm
#  6: Carmelo Anthony      5.03 orapm
#  7: LaMarcus Aldridge    4.60 orapm
#  8: Kelly Olynyk         4.49 orapm
#  9: Ty Lawson            4.47 orapm
# 10: Tyreke Evans         4.31 orapm
# 11: George Hill          4.29 orapm
# 12: Luol Deng            4.28 orapm
# 13: Anthony Davis        4.26 orapm
# 14: Dirk Nowitzki        4.25 orapm
# 15: Lou Williams         4.18 orapm
# 16: Danny Green          4.13 orapm
# 17: Kyrie Irving         4.00 orapm
# 18: DeAndre Jordan       3.88 orapm
# 19: Klay Thompson        3.78 orapm


In [31]:
# Defensive player coefficients: everything after the offensive home-court
# column, the offensive player columns and the defensive home-court column.
n_off = X_off.shape[1]
drapm = clf.coef_[n_off + 2:]
# Player indices ordered from the largest coefficient to the smallest.
def_idx = np.argsort(drapm)[::-1]

# Print the K highest-rated defensive players.
K = 20
def_names = def_vectorizer.feature_names_
for k in range(K):
    idx = def_idx[k]
    name = def_names[idx]
    print("# {:>2}: {} {:4.2f} drapm".format(k, name, drapm[idx]))

#  0: Draymond Green 5.43 drapm
#  1: DeMarcus Cousins 4.69 drapm
#  2: Tony Allen 4.59 drapm
#  3: Nick Calathes 4.53 drapm
#  4: Darrell Arthur 4.43 drapm
#  5: Khris Middleton 4.37 drapm
#  6: Thabo Sefolosha 4.20 drapm
#  7: Kawhi Leonard 4.19 drapm
#  8: Michael Kidd-Gilchrist 4.12 drapm
#  9: Markieff Morris 4.07 drapm
# 10: C.J. Miles 3.66 drapm
# 11: Jusuf Nurkic 3.65 drapm
# 12: Zach Randolph 3.60 drapm
# 13: Tim Duncan 3.52 drapm
# 14: Alan Anderson 3.51 drapm
# 15: Wesley Matthews 3.51 drapm
# 16: Anthony Davis 3.42 drapm
# 17: Dwight Howard 3.27 drapm
# 18: Cory Jefferson 3.23 drapm
# 19: Kosta Koufos 3.19 drapm
