In [1]:
import csv

bl_prep = []

with open("bundesliga_15_16.csv", 'r') as bl_csv:
    reader = csv.reader(bl_csv, delimiter=',')
    for row in reader:
        bl_prep.append(row)

bl_prep = bl_prep[1:]
bl_prep[:5]

teams = []
with open("bundesliga_15_16_teams.csv", 'r') as teams_csv:
    reader = csv.reader(teams_csv, delimiter = ',')
    for row in reader:
        teams.extend(row)

teams[:5]

['VfL Wolfsburg',
 'Borussia Dortmund',
 'FC Augsburg',
 'Hamburger SV',
 'Bayer Leverkusen']

In [68]:
home_goals = [int(row[2]) for row in bl_prep]
away_goals = [int(row[3]) for row in bl_prep]
team_vecs = [[0 for i in range(len(teams))] for j in range(len(teams))]

for j in range(len(team_vecs)):
    team_vecs[j][j] = 1
    
team_dict = { teams[i]: team_vecs[i] for i in range(len(teams))}

home_teams = [team_dict[row[0]] for row in bl_prep]
away_teams = [team_dict[row[1]] for row in bl_prep]

In [69]:
from tqdm import *

def nice_out(ts, a, b, g):
    print("Home advantage: " + str(g))
    for row in zip(ts, a, b):
        print(row)

In [70]:
import numpy
import theano
import theano.tensor as T

rnd = numpy.random
x = T.vector("x")
y = T.vector("y")
ht = T.matrix("ht")
at = T.matrix("at")

t = len(teams)
#need to initialize with positive numbers
a = rnd.rand(t)
#normalize
a[-1] = t-a[:-1].sum()
#verify normalization
la = numpy.log(a)
print(numpy.exp(la).sum())
lb = rnd.randn(t)
lg = numpy.exp(rnd.randn())
print("Initial model:")
attack = numpy.exp(la)
defense = numpy.exp(lb)
advantage = numpy.exp(lg)
nice_out(teams, attack, defense, advantage)

#initialize rho (naively)
rho = 0

18.0
Initial model:
Home advantage: 4.72071576695
('VfL Wolfsburg', 0.48069010051622896, 0.2876559810761245)
('Borussia Dortmund', 0.9922646194066993, 1.1379378426496514)
('FC Augsburg', 0.339537332901509, 0.93677868067135428)
('Hamburger SV', 0.79078996915595212, 0.44921070938320118)
('Bayer Leverkusen', 0.61271164653919463, 0.62113681566313406)
('1899 Hoffenheim', 0.39917112484492789, 1.0581281319960734)
('FC Schalke 04', 0.47827402648888617, 0.27788881307972113)
('Bayern M\xc3\xbcnchen', 0.60916774230341963, 1.0085704947587553)
('1. FSV Mainz 05', 0.42470814501079612, 0.56883418846135636)
('Hannover 96', 0.80518819570624633, 0.33938173897396801)
('Eintracht Frankfurt', 0.86677547567696867, 1.8307512985932668)
('Bor. M\xc3\xb6nchengladbach', 0.0032790910366652377, 1.3102282397260432)
('VfB Stuttgart', 0.82521303797363288, 8.9509994002846405)
('Hertha BSC', 0.28472030008462546, 0.3202161110180699)
('FC Ingolstadt 04', 0.56683516156390923, 1.9045333251343115)
('Werder Bremen', 0.787679

In [87]:
training_steps = 1000

x = numpy.array(home_goals)
y = numpy.array(away_goals)
ht = numpy.matrix(home_teams)
at = numpy.matrix(away_teams)
htat = numpy.dot(ht.T, at)
atht = htat.T
htx = numpy.dot(ht.T, x)
aty = numpy.dot(at.T, y)
atx = numpy.dot(at.T, x)
hty = numpy.dot(ht.T, y)
const1 = htx + aty
const2 = atx + hty
x_sum = x.sum()

In [88]:
def log_llh(log_alpha, log_beta, log_gamma):
    alpha = numpy.exp(log_alpha)
    beta = numpy.exp(log_beta)
    gamma = numpy.exp(log_gamma)
    
    ret = numpy.dot(htx, log_alpha.T).sum() + numpy.dot(atx, log_beta.T).sum() + x_sum*log_gamma + (
     numpy.dot(aty, log_alpha.T).sum() + numpy.dot(hty, log_beta.T).sum() - gamma*numpy.dot(alpha, numpy.dot(htat, beta.T).T).sum() -
     numpy.dot(alpha, numpy.dot(atht, beta.T).T).sum() )
    #print(ret.shape)
    return ret

In [89]:
def log_llh_der(log_alpha, log_beta, log_gamma):
    alpha = numpy.exp(log_alpha)
    beta = numpy.exp(log_beta)
    gamma = numpy.exp(log_gamma)
    
    dfdla = (const1 - (numpy.eye(18)*alpha*numpy.dot(htat*gamma + atht, beta).T).T).T
    dfdlb = (const2 - (numpy.eye(18)*beta*numpy.dot(htat + gamma*atht, alpha).T).T).T
    dfdlg = numpy.array([x_sum - gamma*numpy.dot(alpha.T, numpy.dot(htat, beta).T).sum()])
    ret = numpy.concatenate((dfdla, dfdlb, [dfdlg]))
    ret = numpy.ravel(ret)
    return ret

In [90]:
def log_llh_wrap(x):
    log_a = x[:18]
    log_b = x[18:36]
    log_g = x[36]
    
    return -log_llh(log_a, log_b, log_g)

In [91]:
def log_llh_der_wrap(x):
    log_a = x[:18]
    log_b = x[18:36]
    log_g = x[36]
    
    return -log_llh_der(log_a, log_b, log_g)

In [92]:
cons = ({'type': 'eq',
         'fun': lambda x: sum(numpy.exp(x[:18]))-18,
         'jac': lambda x: numpy.concatenate((numpy.exp(x[:18]), numpy.zeros(19)))})

In [93]:
count = 0
def progress(x):
    count += 1
    if((10*count) % trainingsteps == 0):
        print( count*10/trainingsteps + "% completed.")

In [94]:
from scipy.optimize import minimize

x0 = numpy.concatenate((la, lb, [lg]))
#print(log_llh_wrap(x0))
#print(numpy.ravel(numpy.eye(18)*alpha*numpy.dot(htat*gamma + atht, beta).T))
#print(numpy.ravel(alpha) * (numpy.ravel(gamma*numpy.dot(htat, beta) + numpy.dot(atht, beta))))
res = minimize(log_llh_wrap, x0, constraints=cons, method='SLSQP', jac=log_llh_der_wrap, 
               options={'maxiter': training_steps, 'disp':True}, callback=progress)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 237.731305505
            Iterations: 44
            Function evaluations: 98
            Gradient evaluations: 44


In [95]:
print(numpy.exp(res.x))
print(numpy.exp(res.x[:18]).sum())

[ 1.07308199  1.96534672  0.8779234   0.7908241   1.03335801  0.71014688
  0.96386838  1.85827331  0.95317461  0.76800715  0.89553253  1.45022693
  0.9386723   1.06999381  0.45746278  0.73602001  0.71523708  0.74285001
  1.13384741  1.30325953  1.36472794  1.2126714   1.0590939   1.32536978
  1.2231424   0.45516727  1.23846652  1.53581223  1.4875015   1.65167108
  1.95913626  0.95544926  0.92807589  1.69674625  1.36869587  1.11143103
  1.19941378]
18.000000001


In [96]:
la = res.x[:t]
lb = res.x[t:2*t]
lg = res.x[2*t]

In [97]:
print("Final model:")
attack = numpy.exp(la)
defense = numpy.exp(lb)
advantage = numpy.exp(lg)
#nice_out(teams, attack, defense, gamma.get_value())
nice_out(teams, attack, defense, advantage)

Final model:
Home advantage: 1.1994137761
('VfL Wolfsburg', 1.0730819908523481, 1.1338474069200499)
('Borussia Dortmund', 1.965346717445329, 1.3032595266010016)
('FC Augsburg', 0.87792340342390129, 1.3647279373135899)
('Hamburger SV', 0.79082409939010967, 1.2126713959497426)
('Bayer Leverkusen', 1.0333580135914284, 1.0590938972164452)
('1899 Hoffenheim', 0.71014688243102786, 1.3253697844705103)
('FC Schalke 04', 0.9638683839859461, 1.2231423971677988)
('Bayern M\xc3\xbcnchen', 1.8582733148348851, 0.45516727302323928)
('1. FSV Mainz 05', 0.95317460904257656, 1.2384665236983747)
('Hannover 96', 0.7680071495313836, 1.5358122325344505)
('Eintracht Frankfurt', 0.89553252962085528, 1.4875014977785943)
('Bor. M\xc3\xb6nchengladbach', 1.4502269286910567, 1.6516710767413938)
('VfB Stuttgart', 0.93867229973978239, 1.9591362568966033)
('Hertha BSC', 1.0699938066951578, 0.95544926045268763)
('FC Ingolstadt 04', 0.4574627771641025, 0.92807589238479071)
('Werder Bremen', 0.73602001186636978, 1.69674

In [98]:
with open("dc_model_params.csv", 'w') as model:
    model_writer = csv.writer(model, delimiter = ',')
    model_writer.writerow(attack)
    model_writer.writerow(defense)
    model_writer.writerow([advantage])
    
print("Writing model done.")

Writing model done.
