In [1]:
import pandas as pd
import numpy as np
import theano as tt
from sklearn import preprocessing

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [5]:
import pymc3 as pm
import scipy
np.__version__, scipy.__version__, tt.__version__, pm.__version__

('1.16.4', '1.2.1', '1.0.4', '3.6')

In [6]:
NUM_GAMES = 380
NUM_WEEKS = 38

csv_name = "https://www.football-data.co.uk/mmz4281/1819/E0.csv"
prev_perf_name = '../data/prev_perf.csv'

df = pd.read_csv(csv_name)
df_prev = pd.read_csv(prev_perf_name, header=None)
df_prev.columns = ["team", "points", "goals_scored", "goals_conceded"]

In [7]:
uniq_team_lst = list(df["HomeTeam"].unique())
uniq_team_lst.sort()

df["home_team_ids"] = df["HomeTeam"].apply(
    lambda x: uniq_team_lst.index(x)
) 


df["away_team_ids"] = df.AwayTeam.apply(
    lambda x: uniq_team_lst.index(x) 
)

df["score_diff"] = df["FTHG"] - df["FTAG"]

In [8]:
home_team_ids = df["home_team_ids"]
away_team_ids = df["away_team_ids"]

In [9]:
home_team_wk = [0] * NUM_GAMES
away_team_wk = [0] * NUM_GAMES

for i in range(NUM_GAMES):
    home_team_wk[i] = (
        sum(home_team_ids[:i+1] == home_team_ids[i]) + 
        sum(away_team_ids[:i+1] == home_team_ids[i]) - 1
    )
    
    away_team_wk[i] = (
        sum(home_team_ids[:i+1] == away_team_ids[i]) + 
        sum(away_team_ids[:i+1] == away_team_ids[i]) - 1
    )

df["home_team_wk"] = home_team_wk
df["away_team_wk"] = away_team_wk

In [10]:
#prev_perf = preprocessing.minmax_scale(df_prev["goals_scored"], feature_range=(-1,1))

In [11]:
nteams = len(uniq_team_lst)
ngames = df.shape[0]
nweeks = round(2 * ngames/nteams)

In [12]:
home_week_idx = df["home_team_wk"].values
home_team_idx = df["home_team_ids"].values

away_week_idx = df["away_team_wk"].values
away_team_idx = df["away_team_ids"].values

In [None]:
y = df["score_diff"]

with pm.Model() as epl_model:
    b_home = pm.Normal("b_home", mu=1, sd=1)
    sigma_y = pm.HalfCauchy("sigma_y", 5)
    nu = pm.Gamma("nu", 2, 0.1)    
    a = pm.Normal("a", mu=0, sd=1, shape=(nweeks, nteams))
    
    y_obs = pm.StudentT(
        'y_obs',
        mu=b_home + a[home_week_idx, home_team_idx] - a[away_week_idx, away_team_idx],
        sd=sigma_y,
        nu=nu,
        observed=y
    )

    prior = pm.sample_prior_predictive()
    posterior = pm.sample(tune=1000, sample=2500, target_accept=0.85)
    posterior_pred = pm.sample_ppc(posterior)

  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
  rval = inputs[0].__getitem__(inputs[1:])
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [a, nu, sigma_y, b_home]
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
Sampling 4 chains:   1%|          | 54/6000 [00:00<01:42, 58.02draws/s]

In [None]:
pm.traceplot(posterior, varnames=["b_home", "sigma_y", "nu"]);

In [None]:
pm.summary(posterior, varnames=["b_home", "sigma_y", "nu"])

In [None]:
np.median(posterior.get_values("a")[:, :, 0], axis=0)