# Last Man Standing

In [187]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import footballdata as foo
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1234)
import pandas as pd

from matplotlib.pyplot import figure, show
import pprint as pp


In [2]:
print(foo.MatchHistory.__doc__)

Provides pandas.DataFrames from CSV files available at
    http://www.football-data.co.uk/data.php

    Column names are explained here: http://www.football-data.co.uk/notes.txt

    Data will be downloaded as necessary and cached locally in ./data

    Parameters
    ----------
    leagues : string or iterable of league-ids to include, None for all
    seasons : string, int or list of seasons. Examples:
              '16-17'; 2016; '2016-17'; [14, 15, 16]
    


In [3]:
foo.MatchHistory.available_leagues()

['BEL-Jupiler League',
 'ENG-Championship',
 'ENG-Conference',
 'ENG-League 1',
 'ENG-League 2',
 'ENG-Premier League',
 'ESP-La Liga',
 'ESP-La Liga 2',
 'FRA-Ligue 1',
 'FRA-Ligue 2',
 'GER-Bundesliga',
 'GER-Bundesliga 2',
 'GRE-Ethniki Katigoria',
 'ITA-Serie A',
 'ITA-Serie B',
 'NED-Eredivisie',
 'POR-Liga 1',
 'SCO-Division 1',
 'SCO-Division 2',
 'SCO-Division 3',
 'SCO-Premiership',
 'TUR-Ligi 1']

In [4]:
prem = foo.MatchHistory('ENG-Premier League', range(2016, 2017)).read_games()
prem.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,date,home_team,away_team,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
league,season,game_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
ENG-Premier League,1617,2017-05-08 Chelsea-Middlesbrough,2017-05-08,Chelsea,Middlesbrough,3,0,H,2,0,H,C Pawson,...,2.59,20,-2.0,1.8,1.77,2.2,2.19,1.14,10.0,22.0
ENG-Premier League,1617,2017-05-21 Watford-Manchester City,2017-05-21,Watford,Manchester City,0,5,A,0,4,A,J Moss,...,2.8,18,1.75,1.99,1.94,2.0,1.93,18.0,9.7,1.16
ENG-Premier League,1617,2017-01-14 Swansea City-Arsenal,2017-01-14,Swansea City,Arsenal,0,4,A,0,1,A,M Jones,...,2.41,28,1.0,2.11,2.06,1.87,1.81,6.95,4.92,1.49
ENG-Premier League,1617,2016-10-22 West Ham United-Sunderland,2016-10-22,West Ham United,Sunderland,1,0,H,0,0,D,R Madley,...,2.01,33,-1.0,2.3,2.17,1.76,1.71,1.69,4.01,5.61
ENG-Premier League,1617,2017-04-15 Southampton-Manchester City,2017-04-15,Southampton,Manchester City,0,3,A,0,0,D,N Swarbrick,...,2.25,20,0.75,1.95,1.87,2.06,2.0,4.65,4.06,1.79


In [5]:
list(prem) # columns of DataFrame

['date',
 'home_team',
 'away_team',
 'FTHG',
 'FTAG',
 'FTR',
 'HTHG',
 'HTAG',
 'HTR',
 'Referee',
 'HS',
 'AS',
 'HST',
 'AST',
 'HF',
 'AF',
 'HC',
 'AC',
 'HY',
 'AY',
 'HR',
 'AR',
 'B365H',
 'B365D',
 'B365A',
 'BWH',
 'BWD',
 'BWA',
 'IWH',
 'IWD',
 'IWA',
 'LBH',
 'LBD',
 'LBA',
 'PSH',
 'PSD',
 'PSA',
 'WHH',
 'WHD',
 'WHA',
 'VCH',
 'VCD',
 'VCA',
 'Bb1X2',
 'BbMxH',
 'BbAvH',
 'BbMxD',
 'BbAvD',
 'BbMxA',
 'BbAvA',
 'BbOU',
 'BbMx>2.5',
 'BbAv>2.5',
 'BbMx<2.5',
 'BbAv<2.5',
 'BbAH',
 'BbAHh',
 'BbMxAHH',
 'BbAvAHH',
 'BbMxAHA',
 'BbAvAHA',
 'PSCH',
 'PSCD',
 'PSCA']

#### TODO check correct formula and explain going from odds $\rightarrow$ prob

In [6]:
# odds are typically given for home team
def probs_from_odds(odds_win, odds_draw, odds_lose):
    prob_win, prob_draw, prob_lose = \
        map(lambda odds: odds / (1 + odds) , [odds_win, odds_draw, odds_lose])
    vig = prob_win + prob_draw + prob_lose - 1 # bookie's cut
    prob_win_normed, prob_draw_normed, prob_lose_normed = \
        map(lambda prob: prob / (1 + vig) , [prob_win, prob_draw, prob_lose])
    return prob_win_normed, prob_draw_normed, prob_lose_normed

In [7]:
probs_from_odds(3.10, 3.30, 2.50)

(0.33787160082557843, 0.342940942173299, 0.31918745700112244)

In [8]:
# Check seasons
prem_index = prem.index.get_values()
set([idx[1] for idx in prem_index])

{'1617'}

In [9]:
teams = list(prem.home_team.unique())
num_teams = len(teams)
print(teams)

['Burnley', 'Crystal Palace', 'Everton', 'Hull City', 'Manchester City', 'Middlesbrough', 'Southampton', 'AFC Bournemouth', 'Arsenal', 'Chelsea', 'Manchester United', 'Leicester City', 'Stoke City', 'Swansea City', 'Tottenham Hotspur', 'Watford', 'West Bromwich Albion', 'Sunderland', 'West Ham United', 'Liverpool']


## Winning probabilities matrix

What we want now is the matrix $X\in [0,1]^{38 \times 20}$, with elements defined by

$$X_{i,j} := p^{(i)}_j = \text{probability that team }j\text{ wins in week }i.$$

#### TODO Refactor to take odds from different bookies, maybe in `prob_from_odds`

In [85]:
num_teams = len(teams)
X = np.zeros( (2 * (num_teams - 1), num_teams))
X_index = np.empty((2 * (num_teams - 1), num_teams), dtype=object)
games_played_by_team = np.zeros(len(teams), dtype=int) # array keeps track of how many games each team has played

for index, row in prem.iterrows():
    j_home = teams.index(row["home_team"])
    j_away = teams.index(row["away_team"])
    i_home = games_played_by_team[j_home]
    i_away = games_played_by_team[j_away]
    home_prob, draw_prob, away_prob = probs_from_odds(row["B365H"], row["B365D"], row["B365A"])
    X[i_home, j_home] = home_prob
    X[i_away, j_away] = away_prob
    X_index[i_home, j_home] = index
    X_index[i_away, j_away] = index
    games_played_by_team[j_home] += 1
    games_played_by_team[j_away] += 1
    

In [86]:
print(np.where(X == 0))

(array([], dtype=int64), array([], dtype=int64))


In [87]:
print(X[24,5])
print(X_index[24,5])
game = prem.loc[X_index[24,5]]
home_prob, draw_prob, away_prob = probs_from_odds(game["B365H"], game["B365D"], game["B365A"])
print(home_prob, draw_prob, away_prob)

0.354387283101
('ENG-Premier League', '1617', '2017-02-11 Middlesbrough-Everton')
0.354387283101 0.341707952758 0.303904764141


## Greedy sampling

In [204]:
def greedy_sample(X, teams):
    """
        Given winning probability matrix X returns a "greedy" sample
        permutation of teams - given as a list of integers.
    """
    ## Check dims of X ok
    num_weeks = X.shape[0]
    num_teams = X.shape[1]
    if (num_teams != len(teams)):
        raise(ValueError("X needs to have %d rows, has %d." % (num_teams, X.shape[1])))
    # generate samples
    visited = []
    for t in list(range(min(num_teams, num_weeks))):
        allowed = [i for i in list(range(num_teams)) if i not in visited]
        probs = X[t, allowed] / np.sum(X[t, allowed])
        sample = np.random.choice(allowed, p=probs)
        visited.append(sample)
    return visited

In [205]:
greedy_sample(X, teams)

[15, 12, 16, 9, 19, 2, 0, 11, 3, 18, 6, 4, 8, 17, 14, 5, 7, 1, 13, 10]

### Note: we want max of `log(win_prob)`

In [164]:
def evaluate_perm(perm_list, X):
    """
      Gives log of winning probability of a permutation
    """
    winning_probs = X[np.arange(len(perm_list)), np.array(perm_list)]
    return np.sum(np.log(winning_probs))

In [66]:
def team_order_from_perm_list(perm_list, teams):
    """
        takes a list of integers and returns corresponding list of teams
    """
    ## check list 'full'
    if( not (sorted(perm_list) == list(range(len(perm_list))))):
        raise(ValueError("perm_list doesn't contain all integers from 0 to %d" % (len(perm_list) -1 )))
    team_choice = [teams[i] for i in perm_list]
    return team_choice

In [165]:
print(evaluate_perm(b, X))

-21.8015750552


### Baseline greedy strategy

In [200]:
def greedy_strategy(X, teams):
    """
        Given winning probability matrix X returns the "greedy" strategy
        permutation of teams - given as a list of integers.
    """
    ## Check dims of X ok
    num_weeks = X.shape[0]
    num_teams = X.shape[1]
    if (num_teams != len(teams)):
        raise(ValueError("X needs to have %d rows, has %d." % (num_teams, X.shape[1])))
    # generate sample
    visited = []
    for t in list(range(min(num_teams, num_weeks))):
        allowed = [i for i in list(range(num_teams)) if i not in visited]
        probs = X[t, allowed] / np.sum(X[t, allowed])
        sample = allowed[np.argmax(probs)]
        visited.append(sample)
    return visited

In [201]:
greedy_strategy_result = {"opt_perm": greedy_strategy(X, teams), \
                          "opt_value": evaluate_perm(greedy_perm, X), \
                          "opt_team_list": team_order_from_perm_list(greedy_perm, teams)\
                         }
pp.pprint(greedy_strategy_result, compact=True)

{'opt_perm': [17, 1, 0, 6, 7, 3, 12, 13, 5, 16, 15, 18, 14, 2, 11, 8, 4, 9, 19,
              10],
 'opt_team_list': ['Liverpool', 'West Ham United', 'Sunderland',
                   'West Bromwich Albion', 'Watford', 'Tottenham Hotspur',
                   'Swansea City', 'Stoke City', 'Leicester City',
                   'Manchester United', 'Chelsea', 'Arsenal', 'AFC Bournemouth',
                   'Southampton', 'Middlesbrough', 'Manchester City',
                   'Hull City', 'Everton', 'Crystal Palace', 'Burnley'],
 'opt_value': -22.195522033487677}


### Take many greedy samples and save the best

In [207]:
def greedy_sample_strategy(X, teams, num_iterations):
    optimal_sampled_perm = []
    optimal_value = -float("inf")
    optimal_t = 0
    for t in range(num_iterations - 1):
        if (t % 1000 == 0):
            print("Run %d of %d. Current optimal value is %.4f observed at run %d." \
                  % (t, num_iterations, optimal_value, optimal_t))
        sample = greedy_sample(X, teams)
        value = evaluate_perm(sample, X)
        if (value > optimal_value):
            optimal_value = value
            optimal_sampled_perm = sample
            optimal_t = t
    return optimal_sampled_perm, optimal_value

In [184]:
num_iterations = 1000000
optimal_sampled_perm, optimal_value = greedy_sample_strategy(X, teams, num_iterations)
greedy_sample_result = {"opt_perm": optimal_sampled_perm, \
                        "opt_value": optimal_value, \
                        "opt_team_list": team_order_from_perm_list(optimal_sampled_perm, teams)\
                       }

In [209]:
pp.pprint(greedy_sample_result, compact=True)

{'opt_perm': [15, 18, 19, 4, 7, 13, 0, 10, 6, 16, 2, 1, 14, 9, 5, 3, 8, 12, 17,
              11],
 'opt_team_list': ['Watford', 'West Ham United', 'Liverpool', 'Manchester City',
                   'AFC Bournemouth', 'Swansea City', 'Burnley',
                   'Manchester United', 'Southampton', 'West Bromwich Albion',
                   'Everton', 'Crystal Palace', 'Tottenham Hotspur', 'Chelsea',
                   'Middlesbrough', 'Hull City', 'Arsenal', 'Stoke City',
                   'Sunderland', 'Leicester City'],
 'opt_value': -20.226585488990679}


## Permutation matrices

In [65]:
from birkhoff import to_permutation_matrix as to_permutation_matrix
def perm_matrix_from_list(perm_list):
    """
        Takes an ordered list of integers and returns a permutation matrix.
    """
    ## check list 'full'
    n_range = list(range(len(perm_list)))
    if( not (sorted(perm_list) == n_range)):
        raise(ValueError("perm_list doesn't contain all integers from 0 to %d" % (len(perm_list) -1 )))
    perm_dict = dict(zip(n_range, perm_list))    
    return to_permutation_matrix(perm_dict) 

In [63]:
perm_matrix_from_list(a)

array([[ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.]])

In [67]:
team_order_from_perm_list(a, teams)

['Everton', 'Hull City', 'Crystal Palace', 'Manchester City', 'Burnley']