In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import Window
import configparser

In [3]:
from tqdm import tqdm_notebook

In [4]:
# Read the AWS key pair stored under the "personal" profile of the local
# credentials file so it can be handed to the Spark session below.
credentials_file = os.path.expanduser("~/.aws/credentials")

config = configparser.ConfigParser()
config.read(credentials_file)

access_key, secret_key = (
    config.get('personal', option)
    for option in ("aws_access_key_id", "aws_secret_access_key")
)

In [5]:
# Session options, applied in the order listed: extra jars for S3A access,
# the S3A key pair read from the credentials file, and the driver heap size.
spark_options = [
    ("spark.jars.packages",
     "org.apache.hadoop:hadoop-aws:2.7.3,com.amazonaws:aws-java-sdk:1.7.4"),
    ("fs.s3a.access.key", access_key),
    ("fs.s3a.secret.key", secret_key),
    ('spark.driver.memory', '16g'),
]

session_builder = SparkSession.builder.appName("myApp")
for option_name, option_value in spark_options:
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [7]:
from pyspark.sql.types import *

In [8]:
# Explicit Spark schema for the play-by-play event files: every column is a
# nullable long except the trailing EventType, which is a nullable string.
long_columns = [
    'EventID',
    'Season',
    'DayNum',
    'WTeamID',
    'LTeamID',
    'WPoints',
    'LPoints',
    'ElapsedSeconds',
    'EventTeamID',
    'EventPlayerID',
]

schema = StructType(
    [StructField(column, LongType(), True) for column in long_columns] +
    [StructField('EventType', StringType(), True)])

In [6]:
# Load only the 2010 play-by-play events file into pandas for ad-hoc inspection.
events_2010 = pd.read_csv(
    './data/mens-machine-learning-competition-2019/raw/PlayByPlay/Events_{}.csv'
    .format(2010))

In [9]:
# Load only the 2010 players file into pandas for ad-hoc inspection.
players_2010 = pd.read_csv(
    './data/mens-machine-learning-competition-2019/raw/PlayByPlay/Players_{}.csv'
    .format(2010))

In [22]:
# events_2010.set_index(['EventID'], inplace=True)

In [53]:
# Select every event of a single game. The raw Events CSV carries no `gameid`
# column, so the key is rebuilt here as Season + zero-padded 3-digit DayNum +
# WTeamID + LTeamID instead of relying on a column added by some earlier,
# no-longer-present cell. The key is attached to the result so `onegame`
# still exposes a `gameid` column.
gameid_2010 = (
    events_2010.Season.astype(str) +
    events_2010.DayNum.astype(str).str.zfill(3) +
    events_2010.WTeamID.astype(str) +
    events_2010.LTeamID.astype(str))
onegame = events_2010.loc[gameid_2010 == '201000713141198'].assign(
    gameid='201000713141198')

In [57]:
# Keep the first 23 events of this game attributed to team 1314;
# reset_index() moves the original row labels into an `index` column.
oneteam = onegame.loc[onegame.EventTeamID == 1314].reset_index().iloc[:23].copy()

In [59]:
# Distinct player IDs appearing in those first 23 team events.
oneteam.EventPlayerID.unique()

array([603071, 603060, 603064, 603059, 603063])

In [None]:
# Stack the yearly play-by-play event files for seasons 2010 through 2018
# into one pandas frame (row labels restart in each year's chunk).
events_csv = './data/mens-machine-learning-competition-2019/raw/PlayByPlay/Events_{}.csv'
yearly_events = [
    pd.read_csv(events_csv.format(year)) for year in range(2010, 2019)
]
events = pd.concat(yearly_events)

In [8]:
# Convert the combined pandas events frame to a Spark DataFrame using the
# explicit schema defined above.
sdf = spark.createDataFrame(events, schema=schema)

In [9]:
# Persist the events as parquet, one partition directory per Season;
# mode='overwrite' replaces any previous output at this path.
sdf.write.partitionBy('Season').parquet(
    './data/mens-machine-learning-competition-2019/processed/events.parquet/',
    mode='overwrite',
)

In [22]:
from tqdm import tqdm_notebook

In [10]:
# Stack the yearly player files for seasons 2010 through 2018 into one
# pandas frame (row labels restart in each year's chunk).
players_csv = './data/mens-machine-learning-competition-2019/raw/PlayByPlay/Players_{}.csv'
yearly_players = [
    pd.read_csv(players_csv.format(year)) for year in range(2010, 2019)
]
players = pd.concat(yearly_players)

In [12]:
# Convert the combined players frame to Spark; no schema is passed, so Spark
# infers the column types from the pandas data.
players_sdf = spark.createDataFrame(players)

In [13]:
# Persist the players as a single (unpartitioned) parquet dataset;
# mode='overwrite' replaces any previous output at this path.
players_sdf.write.parquet(
    './data/mens-machine-learning-competition-2019/processed/players.parquet/',
    mode='overwrite',
)

In [11]:
# Reload the events from the parquet output written above; this rebinds `sdf`
# to the on-disk dataset rather than the in-memory pandas conversion.
sdf = spark.read.parquet('./data/mens-machine-learning-competition-2019/processed/events.parquet/')

In [12]:
# Derive a per-game key by concatenating Season, DayNum (zero-padded to three
# digits), WTeamID and LTeamID, in that order.
gameid_parts = [
    F.col('Season'),
    F.format_string('%03d', F.col('DayNum')),
    F.col('WTeamID'),
    F.col('LTeamID'),
]
sdf = sdf.withColumn('gameid', F.concat(*gameid_parts))

In [15]:
# Columns shown when inspecting events: game context first, then the
# details of the individual event.
game_context_cols = ['Season', 'DayNum', 'WPoints', 'LPoints', 'ElapsedSeconds']
event_detail_cols = ['EventTeamID', 'EventPlayerID', 'EventType']
cols = game_context_cols + event_detail_cols

In [18]:
# Show season-2012 events where at least one point is on the board,
# in EventID order, restricted to the inspection columns.
season_2012 = sdf.filter(sdf.Season == 2012)
with_points = season_2012.filter(F.col('WPoints') + F.col('LPoints') > 0)
with_points.orderBy('EventID').select(cols).show()

+------+------+-------+-------+--------------+-----------+-------------+----------+
|Season|DayNum|WPoints|LPoints|ElapsedSeconds|EventTeamID|EventPlayerID| EventType|
+------+------+-------+-------+--------------+-----------+-------------+----------+
|  2012|     7|      0|      2|            50|       1434|       615312| made2_lay|
|  2012|     7|      0|      3|           112|       1434|       615322|made1_free|
|  2012|     7|      1|      3|           133|       1112|       610642|made1_free|
|  2012|     7|      2|      3|           133|       1112|       610642|made1_free|
|  2012|     7|      2|      5|           160|       1434|       615316|made2_jump|
|  2012|     7|      4|      5|           225|       1112|       610652|made2_jump|
|  2012|     7|      6|      5|           305|       1112|       610642|made2_jump|
|  2012|     7|      7|      5|           319|       1112|       610642|made1_free|
|  2012|     7|      7|      7|           361|       1434|       615315|made

In [10]:
# Frequency of each EventType, most common first (show() prints the top 20).
sdf.groupBy('EventType').count().orderBy(F.col('count').desc()).show()

+----------+-------+
| EventType|  count|
+----------+-------+
|   sub_out|3003532|
|    sub_in|2995018|
|   reb_def|2334422|
| foul_pers|1807806|
|made1_free|1394454|
|  turnover|1269209|
|    assist|1255898|
|miss3_jump|1220610|
|miss2_jump|1084121|
|   reb_off|1046828|
| made2_lay| 931981|
| miss2_lay| 723871|
|made3_jump| 643679|
|     steal| 610154|
|made2_jump| 603199|
|miss1_free| 527440|
|timeout_tv| 347186|
|     block| 333282|
|   timeout| 319349|
|  reb_dead| 288413|
+----------+-------+
only showing top 20 rows



In [40]:
# List the competition CSV files available in the input directory.
print(os.listdir("../input/datafiles/"))

['NCAATourneyCompactResults.csv', 'SecondaryTourneyTeams.csv', 'NCAATourneyDetailedResults.csv', 'TeamConferences.csv', 'Teams.csv', 'RegularSeasonDetailedResults.csv', 'SecondaryTourneyCompactResults.csv', 'NCAATourneySlots.csv', 'Seasons.csv', 'Cities.csv', 'RegularSeasonCompactResults.csv', 'Conferences.csv', 'NCAATourneySeeds.csv', 'TeamSpellings.csv', 'GameCities.csv', 'ConferenceTourneyGames.csv', 'TeamCoaches.csv', 'NCAATourneySeedRoundSlots.csv']


In [None]:
# teams = pd.read_csv('../input/datafiles/Teams.csv')
# teams2 = pd.read_csv('../input/datafiles/TeamSpellings.csv', encoding='latin-1')
# season_cresults = pd.read_csv('../input/datafiles/RegularSeasonCompactResults.csv')
# season_dresults = pd.read_csv('../input/datafiles/RegularSeasonDetailedResults.csv')
# tourney_cresults = pd.read_csv('../input/datafiles/NCAATourneyCompactResults.csv')
# tourney_dresults = pd.read_csv('../input/datafiles/NCAATourneyDetailedResults.csv')
# slots = pd.read_csv('../input/datafiles/NCAATourneySlots.csv')
# seeds = pd.read_csv('../input/datafiles/NCAATourneySeeds.csv')

In [None]:
# SecondaryTourneyCompactResults = pd.read_csv('../input/datafiles/SecondaryTourneyCompactResults.csv')
# ConferenceTourneyGames = pd.read_csv('../input/datafiles/ConferenceTourneyGames.csv')
# GameCities = pd.read_csv('../input/datafiles/GameCities.csv')
# SecondaryTourneyTeams = pd.read_csv('../input/datafiles/SecondaryTourneyTeams.csv')

In [2]:
# Load the Teams.csv lookup table from the competition data files.
teams = pd.read_csv('../input/datafiles/Teams.csv')

In [8]:
import pandas as pd
from patsy.contrasts import Sum
from scipy.sparse import csr_matrix, hstack
from scipy.sparse.linalg import lsqr


def load_compact_data(data_dir='../input/datafiles'):
    """Load all compact game results as a symmetric team1/team2 table.

    Reads the regular-season, NCAA-tourney and secondary-tourney compact
    result files, tags each row with its source in ``crtype``, lower-cases
    the column names and renames winner/loser columns to team1/team2. Every
    game then appears twice: once with the winner as ``team1`` and once with
    the loser as ``team1`` (with the location flipped accordingly).

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the three ``*CompactResults.csv`` files. Defaults
        to the original hard-coded ``'../input/datafiles'``.

    Returns
    -------
    pandas.DataFrame
        Two rows per game. Only columns common to all three files are kept
        (``join='inner'``). ``team1loc`` is 'H'/'A'/'N' from team1's point of
        view and ``team1loc_num`` encodes it as 1/-1/0.
    """
    sources = (
        ('RegularSeasonCompactResults.csv', 'Regular'),
        ('NCAATourneyCompactResults.csv', 'NCAA'),
        ('SecondaryTourneyCompactResults.csv', 'Secondary'),
    )

    frames = []
    for filename, result_type in sources:
        frame = pd.read_csv('{}/{}'.format(data_dir, filename))
        frame['CRType'] = result_type
        frames.append(frame)

    # Winner's perspective: winner becomes team1, loser becomes team2.
    compact1 = pd.concat(
        frames, ignore_index=True, join='inner').rename(columns=str.lower)
    compact1 = compact1.rename(
        columns={
            'wteamid': 'team1',
            'wscore': 'score1',
            'lteamid': 'team2',
            'lscore': 'score2',
            'wloc': 'team1loc'
        })

    # Loser's perspective: swap the team/score labels and flip the location.
    compact2 = compact1.rename(
        columns={
            'team1': 'team2',
            'score1': 'score2',
            'team2': 'team1',
            'score2': 'score1',
        })
    compact2['team1loc'] = compact2.team1loc.map({
        'H': 'A',
        'A': 'H',
        'N': 'N',
    })

    # concat aligns on column names, so the swapped column order is harmless.
    compact = pd.concat([compact1, compact2], ignore_index=True, join='inner')

    compact['team1loc_num'] = compact.team1loc.map({
        'H': 1,
        'A': -1,
        'N': 0,
    })

    return compact


def _detailed_rename_map(winner_suffix, loser_suffix):
    """Build the winner/loser -> numbered-team column rename mapping.

    Winner columns (``w*``) receive ``winner_suffix`` and loser columns
    (``l*``) receive ``loser_suffix``; ``wloc`` always becomes ``team1loc``.
    """
    box_stats = [
        'fgm', 'fga', 'fgm3', 'fga3', 'ftm', 'fta', 'or', 'dr', 'ast', 'to',
        'stl', 'blk', 'pf'
    ]
    mapping = {
        'wteamid': 'team' + winner_suffix,
        'wscore': 'score' + winner_suffix,
        'lteamid': 'team' + loser_suffix,
        'lscore': 'score' + loser_suffix,
        'wloc': 'team1loc',
    }
    for stat in box_stats:
        mapping['w' + stat] = stat + winner_suffix
        mapping['l' + stat] = stat + loser_suffix
    return mapping


def load_detailed_data(data_dir='../input/datafiles'):
    """Load detailed box scores as a symmetric team1/team2 table with metrics.

    Reads the regular-season and NCAA-tourney detailed result files, flags
    tournament rows in ``tourney``, lower-cases the column names and renames
    winner/loser columns to ``*1``/``*2``. Every game appears twice: once
    from the winner's perspective and once from the loser's (location
    flipped). Per-game derived columns are then added: ``poss``, ``oe*``,
    ``de*``, ``eFG*``, ``topct*``, ``orpct*``, ``ftrate*``, ``shotopp*``,
    ``tspct*``, ``tempo`` and ``team1loc_num``.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the two ``*DetailedResults.csv`` files. Defaults to
        the original hard-coded ``'../input/datafiles'``.

    Returns
    -------
    pandas.DataFrame
        Two rows per game. ``team1``, ``team2``, ``teamseason1``,
        ``teamseason2``, ``team1loc``, ``season`` and ``team1win`` are
        categorical; ``team1loc_num`` is a plain numeric 1/-1/0 column.
    """
    season_dresults = pd.read_csv(
        '{}/{}'.format(data_dir, 'RegularSeasonDetailedResults.csv'))
    tourney_dresults = pd.read_csv(
        '{}/{}'.format(data_dir, 'NCAATourneyDetailedResults.csv'))

    season_dresults['tourney'] = False
    tourney_dresults['tourney'] = True

    raw = pd.concat([
        season_dresults,
        tourney_dresults,
    ],
                    ignore_index=True,
                    join='inner').rename(columns=str.lower)

    # Winner's perspective (winner -> *1) and loser's perspective (loser -> *1).
    detailed1 = raw.rename(columns=_detailed_rename_map('1', '2'))
    detailed2 = raw.copy().rename(columns=_detailed_rename_map('2', '1'))

    # In the mirrored rows team1 is the loser, so the location is flipped.
    detailed2['team1loc'] = detailed2.team1loc.map({
        'H': 'A',
        'A': 'H',
        'N': 'N',
    })

    detailed = pd.concat([
        detailed1,
        detailed2,
    ],
                         ignore_index=True,
                         join='inner')

    # "<season>:<team>" keys identify a team within a specific season.
    detailed['teamseason1'] = detailed.season.astype(str).str.cat(
        detailed.team1.astype(str), sep=':')
    detailed['teamseason2'] = detailed.season.astype(str).str.cat(
        detailed.team2.astype(str), sep=':')

    for column in ('team1', 'team2', 'teamseason1', 'teamseason2', 'team1loc',
                   'season'):
        detailed[column] = pd.Categorical(detailed[column])
    detailed['team1win'] = pd.Categorical(
        detailed['score1'] > detailed['score2'])

    # Possessions: sum of each side's (FGA + 0.475 * FTA - OR + TO) / 2.
    detailed['poss'] = (
        (detailed.fga1 + 0.475 * detailed.fta1 - detailed.or1 + detailed.to1) /
        2 + (detailed.fga2 + 0.475 * detailed.fta2 - detailed.or2 +
             detailed.to2) / 2)

    # Points per 100 possessions; one side's defence is the other's offence.
    detailed['oe1'] = 100 * detailed.score1 / detailed.poss
    detailed['oe2'] = 100 * detailed.score2 / detailed.poss
    detailed['de1'] = detailed['oe2']
    detailed['de2'] = detailed['oe1']

    # eFG%  = (.5*3FGM + FGM) / FGA
    detailed['eFG1'] = (.5 * detailed.fgm31 + detailed.fgm1) / detailed.fga1
    detailed['eFG2'] = (.5 * detailed.fgm32 + detailed.fgm2) / detailed.fga2

    # TO% = TO / Possessions
    detailed['topct1'] = detailed.to1 / detailed.poss
    detailed['topct2'] = detailed.to2 / detailed.poss

    # OR% = OR / (OR + DRopp)
    detailed['orpct1'] = detailed.or1 / (detailed.or1 + detailed.dr2)
    detailed['orpct2'] = detailed.or2 / (detailed.or2 + detailed.dr1)

    # FTRate = FTA / FGA
    detailed['ftrate1'] = detailed.fta1 / detailed.fga1
    detailed['ftrate2'] = detailed.fta2 / detailed.fga2

    # shotopp1 = (fga1 + 0.475 * fta1) / poss
    detailed['shotopp1'] = (
        (detailed.fga1 + 0.475 * detailed.fta1) / detailed.poss)
    detailed['shotopp2'] = (
        (detailed.fga2 + 0.475 * detailed.fta2) / detailed.poss)

    # tspct1 = score1 / (2 * (fga1 + 0.475 * fta1))
    detailed['tspct1'] = (
        detailed.score1 / (2 * (detailed.fga1 + 0.475 * detailed.fta1)))
    detailed['tspct2'] = (
        detailed.score2 / (2 * (detailed.fga2 + 0.475 * detailed.fta2)))

    # Possessions rescaled to 40 minutes (each overtime adds 5 minutes).
    detailed['tempo'] = 40 * detailed.poss / (40 + 5 * detailed.numot)

    # Map through an object view: mapping the categorical column directly
    # yields another categorical, which breaks numeric use downstream.
    detailed['team1loc_num'] = detailed.team1loc.astype(object).map({
        'H': 1,
        'A': -1,
        'N': 0,
    })

    return detailed


def design_matrix(games):
    """Assemble the sparse sum-coded design matrix for team-season effects.

    The coding matrix is built from the categories of ``games.teamseason1``
    using patsy's ``Sum().code_with_intercept``. The team1 block keeps every
    coding column; the team2 block drops the first column (presumably the
    intercept, so it is not counted twice -- confirm against patsy docs).
    Each game row is the sum of its team1 coding row (left-hand columns) and
    its team2 coding row (right-hand columns).

    Note: ``teamseason2`` codes are looked up in a matrix built from the
    ``teamseason1`` categories, so both columns are assumed to share the
    same category ordering.

    Returns
    -------
    (X, contrast_matrix_team1)
        ``X`` has one row per game; ``contrast_matrix_team1`` is the team1
        coding matrix zero-padded to the same column count as ``X``.
    """
    team_seasons = games.teamseason1.cat.categories.tolist()
    coding = csr_matrix(Sum().code_with_intercept(team_seasons).matrix)

    team1_block = coding
    team2_block = coding[:, 1:]

    # Pad each block with zeros so both occupy disjoint column ranges.
    contrast_matrix_team1 = hstack(
        (team1_block, csr_matrix(team2_block.shape)), format='csr')
    padded_team2 = hstack(
        (csr_matrix(team1_block.shape), team2_block), format='csr')

    team1_rows = contrast_matrix_team1[games.teamseason1.cat.codes.tolist()]
    team2_rows = padded_team2[games.teamseason2.cat.codes.tolist()]
    return team1_rows + team2_rows, contrast_matrix_team1


def calc_adj_metrics(games):
    """Estimate adjusted offence, defence and tempo for every team-season.

    Builds the design matrix for ``games`` (team-season coding from
    ``design_matrix`` plus one trailing column holding ``team1loc_num``),
    solves a least-squares problem with ``lsqr`` for each of ``oe1``,
    ``de1`` and ``tempo``, and projects the coefficients through the team1
    contrast matrix to obtain one value per team-season.

    Parameters
    ----------
    games : pandas.DataFrame
        Game table with categorical ``teamseason1``/``teamseason2`` columns
        and ``team1loc_num``, ``oe1``, ``de1`` and ``tempo`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``teamseason``, ``AdjOE``, ``AdjDE`` and ``AdjTempo``, one
        row per category of ``games.teamseason1``.
    """
    X, contrast_matrix_team1 = design_matrix(games)

    # Cast to float so a categorical/int location column still yields a
    # plain 2-D numeric array for hstack.
    location = games.team1loc_num.astype(float).values.reshape(-1, 1)
    X = hstack([X, location])

    # Zero column: the location coefficient must not leak into team values.
    contrast_matrix_team1 = hstack([
        contrast_matrix_team1,
        csr_matrix((contrast_matrix_team1.shape[0], 1))
    ])

    # All inputs come from `games` (the argument), never from a global frame.
    adjusted = pd.DataFrame({
        'teamseason': games.teamseason1.cat.categories.tolist()
    })
    targets = (
        ('oe1', 'AdjOE'),  # Adjusted Offensive Efficiency
        ('de1', 'AdjDE'),  # Adjusted Defensive Efficiency
        ('tempo', 'AdjTempo'),  # Adjusted Tempo
    )
    for response, label in targets:
        coefficients = lsqr(X, games[response].values)[0]
        adjusted[label] = contrast_matrix_team1.dot(coefficients)

    return adjusted

In [9]:
# Build the symmetric (two rows per game) compact and detailed game tables.
compact = load_compact_data()
detailed = load_detailed_data()

In [10]:
# Least-squares adjusted offence/defence/tempo per team-season.
adj_metrics = calc_adj_metrics(detailed)

In [11]:
# Display the detailed game table to eyeball the derived columns.
detailed

Unnamed: 0,season,daynum,team1,score1,team2,score2,team1loc,numot,fgm1,fga1,...,orpct1,orpct2,ftrate1,ftrate2,shotopp1,shotopp2,tspct1,tspct2,tempo,team1loc_num
0,2003,10,1104,68,1328,62,N,0,27,58,...,0.388889,0.294118,0.310345,0.415094,0.905442,0.863265,0.510894,0.488574,73.500000,0
1,2003,10,1272,70,1393,63,N,0,26,62,...,0.375000,0.416667,0.306452,0.298507,1.032903,1.112525,0.492784,0.411765,68.762500,0
2,2003,11,1266,73,1437,61,N,0,24,58,...,0.435897,0.543860,0.500000,0.315068,1.106785,1.294140,0.508534,0.363420,64.850000,0
3,2003,11,1296,56,1457,50,N,0,18,38,...,0.230769,0.472222,0.815789,0.306122,0.902439,0.960633,0.531057,0.445434,58.425000,0
4,2003,11,1400,77,1208,71,N,0,30,61,...,0.531250,0.488372,0.213115,0.435484,1.049609,1.169141,0.573130,0.474440,64.000000,0
5,2003,11,1458,81,1186,55,H,0,26,57,...,0.352941,0.200000,0.473684,0.369565,1.042942,0.807692,0.580021,0.508553,66.950000,1
6,2003,12,1161,80,1236,62,H,0,23,55,...,0.382353,0.333333,0.709091,0.682927,0.962212,0.710617,0.544033,0.570902,76.412500,1
7,2003,12,1186,75,1457,61,N,0,28,62,...,0.342105,0.186047,0.338710,0.389831,0.929309,0.902841,0.521014,0.436182,77.450000,0
8,2003,12,1194,71,1156,66,N,0,28,58,...,0.257143,0.371429,0.310345,0.519231,0.879273,0.856482,0.533434,0.509063,75.687500,0
9,2003,12,1458,84,1296,56,H,0,32,67,...,0.378378,0.290323,0.283582,0.230769,1.128595,0.856560,0.552450,0.485269,67.362500,1


In [4]:
import pymc3 as pm



In [7]:
import sqlite as sqlite_backend

In [9]:
# Rebuild the design matrix at notebook level: team-season coding plus one
# trailing location column, with the contrast matrix zero-padded to match.
X, contrast_matrix_team1 = design_matrix(detailed)

location_column = detailed.team1loc_num.values.reshape(-1, 1)
X = hstack([X, location_column])

zero_column = csr_matrix((contrast_matrix_team1.shape[0], 1))
contrast_matrix_team1 = hstack([contrast_matrix_team1, zero_column])

In [11]:
# Record the PyMC3 version the models below were run with.
print('Running on PyMC3 v{}'.format(pm.__version__))

Running on PyMC3 v3.6


In [222]:
# Restrict to the 2010 season; .copy() so later column assignments do not
# touch `detailed`.
oneseason = detailed.loc[detailed.season == 2010].copy()

In [223]:
# Drop team categories that do not occur in this season so that the
# categorical codes are dense (0..n_teams-1) for use as index vectors.
for team_column in ('team1', 'team2'):
    oneseason[team_column] = (
        oneseason[team_column].cat.remove_unused_categories())

In [224]:
# Team labels and the per-row integer codes that index into them,
# separately for the team1 and team2 sides of each game row.
team1_cat, team2_cat = oneseason.team1.cat, oneseason.team2.cat

categories1, categories2 = team1_cat.categories, team2_cat.categories
codes1, codes2 = team1_cat.codes, team2_cat.codes

In [226]:
# Normal model of offensive efficiency (oe1) for one season:
#   oe1 ~ Normal(alpha + off[team1] + def[team2] + loc * homeAdv, sigma)
basic_model = pm.Model()

with basic_model:

    # Priors for unknown model parameters
    alpha = pm.Normal('alpha', mu=0, sd=10)
    homeAdv = pm.Normal('homeAdv', mu=2, sd=5)
    # betaOff is indexed by team1 codes and betaDef by team2 codes, so each
    # vector is sized from its own category list.
    betaOff = pm.Normal('betaOff', mu=0, sd=10, shape=len(categories1))
    betaDef = pm.Normal('betaDef', mu=0, sd=10, shape=len(categories2))
    sigma = pm.InverseGamma('sigma', alpha=2, beta=24)

    # Centre the team effects (sum-to-zero). The tensor's own .mean() is used
    # so the cell does not depend on a `tensor` module that is never imported
    # in this notebook.
    betaOff_ = pm.Deterministic('betaOff_', betaOff - betaOff.mean())
    betaDef_ = pm.Deterministic('betaDef_', betaDef - betaDef.mean())

    # Expected value of outcome
    mu = alpha + betaOff_[np.array(codes1)] + betaDef_[np.array(
        codes2)] + oneseason.team1loc_num.values * homeAdv

    # Likelihood (sampling distribution) of observations
    Y_obs = pm.Normal('Y_obs', mu=mu, sd=sigma, observed=oneseason.oe1.values)

In [227]:
with basic_model:
    # Only these variables are written to the 'AdjOffensiveEff' text backend
    # (the centred effects rather than the raw betaOff/betaDef).
    tracked_vars = [alpha, homeAdv, betaOff_, betaDef_, sigma]
    db = pm.backends.Text('AdjOffensiveEff', vars=tracked_vars)

    # draw 500 posterior samples
    trace = pm.sample(500, trace=db)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, betaDef, betaOff, homeAdv, alpha]
Sampling 4 chains: 100%|██████████| 4000/4000 [00:44<00:00, 90.06draws/s] 


In [231]:
# Posterior mean of each centred offensive effect (averaged over draws).
trace.get_values('betaOff_').mean(axis=0)

array([-4.84609489e+00,  2.54427237e+00,  5.76140651e+00, -1.45838915e+01,
       -8.72310679e+00, -6.41643338e+00, -2.04921455e+01, -8.17324582e+00,
        6.33750656e+00,  5.26418412e+00,  9.52251938e+00, -3.33327479e+00,
       -8.30670015e+00,  3.99174018e+00, -2.67917028e+00, -1.21388431e+01,
        6.34972804e+00,  1.89866073e+00, -8.44863243e+00,  1.56157680e+01,
        1.02180276e+00, -8.17284566e+00, -5.36392133e+00, -6.15195645e-01,
        8.09972945e+00,  2.48599826e-01, -2.37850973e+00,  3.22907449e+00,
       -2.02623459e+00, -2.20481400e+01, -4.01075416e+00,  3.44534474e+00,
        8.50154390e+00,  1.54909945e+01, -1.28308366e+00, -2.27655205e-01,
        1.70645900e+01, -1.02516661e+00, -2.00519626e+00, -9.52228050e+00,
       -7.72965864e+00, -9.55012018e+00, -8.23232465e+00,  6.28184252e+00,
       -6.06958747e+00, -1.72416745e+01,  4.29244771e+00, -1.42592410e+00,
        8.21532127e+00,  2.67460071e+00,  1.31099292e+00,  5.74919452e+00,
       -4.31459805e+00,  

In [234]:
# Posterior mean of each centred defensive effect (averaged over draws).
trace.get_values('betaDef_').mean(axis=0)

array([  4.07154189,  -2.72953126,  -6.56590525,   5.22195093,
         3.93072176,   6.00355648,  11.83727347,   4.22001694,
         2.06062366,  -3.39649423,  -9.01583414,   7.52009568,
        -2.60823982,  -1.76051563,  -0.23564772,  -4.43185696,
        -0.26639371,   4.7247726 ,  -2.28943565,  -8.06413499,
        -3.05210091,   4.04780904,   6.74364934,  -1.48633788,
        -3.10916459,  -3.16159718,   2.18802228,  -1.11418854,
         9.14681636,   8.33762294,   3.28779132,   2.94295131,
       -12.79783027,  -9.88327294,   0.67141863,   8.54553892,
        -5.7743679 ,  -1.92264132,   1.28925029,   4.61488437,
        11.58470234,   4.890397  ,   4.36103528,  -0.86180456,
         2.64216143,   7.80758137,  -7.55456718,   0.86456086,
       -12.08749685,   1.15704417,  -2.74264494,   4.41608264,
         7.21245424,   1.80487374,  -3.67127801,   2.14153076,
        -9.51659975,   6.47645911,  -1.52294382,   0.25531352,
         6.27792963,   3.16306708,   1.06520007,   4.04

In [240]:
# Columns available as candidate response variables for the models below.
oneseason.columns

Index(['season', 'daynum', 'team1', 'score1', 'team2', 'score2', 'team1loc',
       'numot', 'fgm1', 'fga1', 'fgm31', 'fga31', 'ftm1', 'fta1', 'or1', 'dr1',
       'ast1', 'to1', 'stl1', 'blk1', 'pf1', 'fgm2', 'fga2', 'fgm32', 'fga32',
       'ftm2', 'fta2', 'or2', 'dr2', 'ast2', 'to2', 'stl2', 'blk2', 'pf2',
       'tourney', 'teamseason1', 'teamseason2', 'team1win', 'poss', 'oe1',
       'oe2', 'de1', 'de2', 'eFG1', 'eFG2', 'topct1', 'topct2', 'orpct1',
       'orpct2', 'ftrate1', 'ftrate2', 'shotopp1', 'shotopp2', 'tspct1',
       'tspct2', 'tempo', 'team1loc_num'],
      dtype='object')

In [381]:
# Inspect the turnover-rate response (TO / possessions) for this season.
oneseason.topct1

34074     0.182753
34075     0.311191
34076     0.083218
34077     0.257077
34078     0.139421
34079     0.259253
34080     0.270668
34081     0.143627
34082     0.132939
34083     0.214554
34084     0.162536
34085     0.100125
34086     0.217687
34087     0.210970
34088     0.174165
34089     0.243325
34090     0.212061
34091     0.261663
34092     0.180485
34093     0.165850
34094     0.190801
34095     0.161172
34096     0.187421
34097     0.257152
34098     0.167627
34099     0.154679
34100     0.186983
34101     0.216986
34102     0.211856
34103     0.159046
            ...   
165612    0.313794
165613    0.184988
165614    0.161261
165615    0.238901
165616    0.105601
165617    0.194580
165618    0.156500
165619    0.167119
165620    0.188558
165621    0.084418
165622    0.281974
165623    0.191879
165624    0.156667
165625    0.116497
165626    0.139023
165627    0.279883
165628    0.187628
165629    0.240915
165630    0.288214
165631    0.173085
165632    0.109056
165633    0.

In [256]:
# Sanity check of the link function: invlogit(0) evaluates to 0.5.
pm.invlogit(0.0).eval()

array(0.5)

In [362]:
# Beta regression of eFG1 for one season with a logit link:
#   mean  = invlogit(intercept + off[team1] + def[team2] + loc * homeAdv)
#   eFG1 ~ Beta(mean * phi, (1 - mean) * phi)
eFG_model = pm.Model()

with eFG_model:

    # Priors for unknown model parameters
    intercept = pm.Normal('intercept', mu=-0.0364, sd=3e-3)

    homeAdv = pm.Normal('homeAdv', mu=0.0613, sd=3.5e-3)

    betaOff = pm.Normal('betaOff', mu=0, sd=3.5e-3, shape=len(categories1))
    betaDef = pm.Normal('betaDef', mu=0, sd=3.5e-3, shape=len(categories2))

    # Centre the team effects (sum-to-zero). The tensor's own .mean() is used
    # so the cell does not depend on a `tensor` module that is never imported
    # in this notebook.
    betaOff_ = pm.Deterministic('betaOff_', betaOff - betaOff.mean())
    betaDef_ = pm.Deterministic('betaDef_', betaDef - betaDef.mean())

    # Precision of the Beta likelihood, modelled on the log scale.
    lphi = pm.Normal('lphi', mu=3.5, sd=1.33e-2)
    phi = pm.Deterministic('phi', pm.math.exp(lphi))

    # Expected value of outcome
    mu = pm.invlogit(intercept + betaOff_[np.array(codes1)] +
                     betaDef_[np.array(codes2)] +
                     oneseason.team1loc_num.values * homeAdv)

    # Mean/precision parameterisation of the Beta distribution.
    alpha = mu * phi
    beta = (1 - mu) * phi

    Y_obs = pm.Beta(
        'Y_obs', alpha=alpha, beta=beta, observed=oneseason.eFG1.values)

In [355]:
# Maximum a posteriori point estimate for the eFG model.
with eFG_model:
    mp = pm.find_MAP()

logp = 14,239, ||grad|| = 0.034801: 100%|██████████| 12/12 [00:00<00:00, 282.56it/s]  


In [364]:
with eFG_model:
    # Only these variables are written to the 'AdjeFG' text backend
    # (centred effects and phi rather than the raw betas and lphi).
    tracked_vars = [intercept, homeAdv, betaOff_, betaDef_, phi]
    db = pm.backends.Text('AdjeFG', vars=tracked_vars)

    # draw 500 posterior samples
    trace = pm.sample(500, trace=db)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [lphi, betaDef, betaOff, homeAdv, intercept]
Sampling 4 chains: 100%|██████████| 4000/4000 [03:26<00:00, 19.38draws/s]


In [382]:
# Beta regression of topct1 (turnovers / possessions) with a logit link:
#   mean    = invlogit(intercept + off[team1] + def[team2] + loc * homeAdv)
#   topct1 ~ Beta(mean * phi, (1 - mean) * phi)
topct_model = pm.Model()

with topct_model:

    # Priors for unknown model parameters
    intercept = pm.Normal('intercept', mu=-2.5e-1, sd=3e-3)

    homeAdv = pm.Normal('homeAdv', mu=3.7e-2, sd=3.3e-3)

    betaOff = pm.Normal('betaOff', mu=0, sd=1.6e-4, shape=len(categories1))
    betaDef = pm.Normal('betaDef', mu=0, sd=1.6e-4, shape=len(categories2))

    # Centre the team effects (sum-to-zero). The tensor's own .mean() is used
    # so the cell does not depend on a `tensor` module that is never imported
    # in this notebook.
    betaOff_ = pm.Deterministic('betaOff_', betaOff - betaOff.mean())
    betaDef_ = pm.Deterministic('betaDef_', betaDef - betaDef.mean())

    # Precision of the Beta likelihood, modelled on the log scale.
    lphi = pm.Normal('lphi', mu=2, sd=8e-3)
    phi = pm.Deterministic('phi', pm.math.exp(lphi))

    # Expected value of outcome
    mu = pm.invlogit(intercept + betaOff_[np.array(codes1)] +
                     betaDef_[np.array(codes2)] +
                     oneseason.team1loc_num.values * homeAdv)

    # Mean/precision parameterisation of the Beta distribution.
    alpha = mu * phi
    beta = (1 - mu) * phi

    Y_obs = pm.Beta(
        'Y_obs', alpha=alpha, beta=beta, observed=oneseason.topct1.values)

In [383]:
with topct_model:
    # Only these variables are written to the 'AdjTOpct' text backend
    # (centred effects and phi rather than the raw betas and lphi).
    tracked_vars = [intercept, homeAdv, betaOff_, betaDef_, phi]
    db = pm.backends.Text('AdjTOpct', vars=tracked_vars)

    # draw 500 posterior samples
    trace = pm.sample(500, trace=db)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [lphi, betaDef, betaOff, homeAdv, intercept]
Sampling 4 chains: 100%|██████████| 4000/4000 [03:02<00:00, 21.93draws/s] 


In [411]:
# Beta regression of orpct1 (OR / (OR + opponent DR)) with a logit link:
#   mean    = invlogit(intercept + off[team1] + def[team2] + loc * homeAdv)
#   orpct1 ~ Beta(mean * phi, (1 - mean) * phi)
orpct_model = pm.Model()

with orpct_model:

    # Priors for unknown model parameters
    # NOTE(review): these prior constants are identical to the turnover-rate
    # model's -- confirm they are appropriate for offensive-rebound rate.
    intercept = pm.Normal('intercept', mu=-2.5e-1, sd=3e-3)

    homeAdv = pm.Normal('homeAdv', mu=3.7e-2, sd=3.3e-3)

    betaOff = pm.Normal('betaOff', mu=0, sd=1.6e-4, shape=len(categories1))
    betaDef = pm.Normal('betaDef', mu=0, sd=1.6e-4, shape=len(categories2))

    # Centre the team effects (sum-to-zero). The tensor's own .mean() is used
    # so the cell does not depend on a `tensor` module that is never imported
    # in this notebook.
    betaOff_ = pm.Deterministic('betaOff_', betaOff - betaOff.mean())
    betaDef_ = pm.Deterministic('betaDef_', betaDef - betaDef.mean())

    # Precision of the Beta likelihood, modelled on the log scale.
    lphi = pm.Normal('lphi', mu=2, sd=8e-3)
    phi = pm.Deterministic('phi', pm.math.exp(lphi))

    # Expected value of outcome
    mu = pm.invlogit(intercept + betaOff_[np.array(codes1)] +
                     betaDef_[np.array(codes2)] +
                     oneseason.team1loc_num.values * homeAdv)

    # Mean/precision parameterisation of the Beta distribution.
    alpha = mu * phi
    beta = (1 - mu) * phi

    # Observe the offensive-rebound rate; this previously observed topct1,
    # a leftover from copying the turnover-rate model.
    # NOTE(review): a game with zero offensive rebounds gives orpct1 == 0,
    # which lies outside the Beta support -- confirm no exact 0/1 values.
    Y_obs = pm.Beta(
        'Y_obs', alpha=alpha, beta=beta, observed=oneseason.orpct1.values)

0.4905

In [424]:
# Share of posterior draws in which element 13 of betaOff_ is below zero.
np.mean(1 * (trace.get_values('betaOff_')[:, 13] < 0))

0.496