# Q-Learning on football forecasts

The aim is to apply Q-Learning methods on top of football match forecasts from https://kickoff.ai in order to understand when is best to bet.

The model is both trained and tested using the forecasts from past matches
Data are obtained scraping the aforementioned website and then joining with the historical odds available at http://www.football-data.co.uk of the major bookmakers. 

The model is only trained with (and therefore bets on) basic odds (i.e. 1, X, 2) from B365.
Progress can be achieved by betting on the best odds for a given match comparing different bookmakers, but also by considering other, more complicated, bets (i.e. 1X,12,X2,over and under,etc) bearing in mind that this will affect the action space of the model, making it more complicated.

Other resources linked to this script:
    -  Web Scraping script & joining the data: ...
    -  Remake of kickoff.ai model: ...

### Model

In [1]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def mlp(n_obs, n_action, n_hidden_layer=1, n_neuron_per_layer=32, activation='relu', loss='mse'):
    """ A multi-layer perceptron """

    model = Sequential()
    model.add(Dense(n_neuron_per_layer, input_dim=n_obs, activation=activation))
    for _ in range(n_hidden_layer):
        model.add(Dense(n_neuron_per_layer, activation=activation))
    model.add(Dense(n_action, activation='linear'))
    model.compile(loss=loss, optimizer=Adam())
    print(model.summary())
    return model

Using TensorFlow backend.


### Environment

In [38]:
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import itertools


class TradingEnv(gym.Env):
  
    def __init__(self, train_data):

        # TO DO:
        # round up to integer to reduce state space (with percentages in fractions)
        # consider betting only on 1
        
        self.game = train_data[train_data.columns.intersection(['B365H', 'B365D', 'B365A', 'Result'])]
        self.train_data = train_data[train_data.columns.intersection(['oddsHome', 'oddsDrawn', 'oddsAway'])]
        self.n_step, self.n_bet = self.train_data.shape

        # instance attributes
        self.cur_step = None
        self.profit = None
        self.won = None
        self.tot = None

        # action space
        self.action_space = spaces.Discrete(3)   

        # observation space: give estimates in order to sample and build scaler
    
        forecast_max = self.train_data.max(axis=1)
        forecast_range = [[0, mx] for mx in forecast_max]

        self.observation_space = spaces.MultiDiscrete(forecast_range)
        
        # seed and start
        self._seed()
        self._reset()
        
    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _reset(self):
        self.cur_step = 0
        self.profit = 0
        self.won = 0
        self.tot = 0
        self.forecast = self.train_data.iloc[self.cur_step, :]
        return self.forecast

    def _step(self, action):
        assert self.action_space.contains(action)
        cur_game = self.game.iloc[self.cur_step, :]
        self.cur_step += 1
        self.forecast = self.train_data.iloc[self.cur_step, :] # update forecast
        reward = self._check_bet(cur_game, action)
        self.profit += reward
        perc = round((self.won * 100)/self.tot, 2)
        done = self.cur_step == self.n_step - 1
        info = {'cur_val': ('$' + str(round(self.profit, 2)), str(perc) + '%')}
        return self.forecast, reward, done, info

    
    def _check_bet(self, cur_game, action):
        if action == int(cur_game[0]):
            reward = cur_game[action] -1
            self.won += 1
        else:
            reward = -1
        self.tot += 1
        return reward

### Agent

In [3]:
from collections import deque
import random
import numpy as np

class DQNAgent(object):
    """ A simple Deep Q agent """
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = mlp(state_size, action_size)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action


    def replay(self, batch_size=32):
        """ vectorized implementation; 30x speed up compared with for loop """
        minibatch = random.sample(self.memory, batch_size)

        states = np.array([tup[0][0] for tup in minibatch])
        actions = np.array([tup[1] for tup in minibatch])
        rewards = np.array([tup[2] for tup in minibatch])
        next_states = np.array([tup[3][0] for tup in minibatch])
        done = np.array([tup[4] for tup in minibatch])

        # Q(s', a)
        target = rewards + self.gamma * np.amax(self.model.predict(next_states), axis=1)
        # end state target is reward itself (no lookahead)
        target[done] = rewards[done]

        # Q(s, a)
        target_f = self.model.predict(states)
        # make the agent to approximately map the current state to future discounted reward
        target_f[range(batch_size), actions] = target

        self.model.fit(states, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

### Utils

In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def get_data():
    data = pd.read_csv('/Users/eliogruttadauria/Desktop/df_pastOdds.csv')
    return data

def get_scaler(env):
    low = [0] * env.n_bet
    high = list(env.train_data.max(axis=0))

    scaler = StandardScaler()
    scaler.fit([low, high])
    return scaler


def maybe_make_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
        
def split_data(data, test_size = 0.2):
    #check good split: test must be last data available
    cut_point = round(data.shape[0]*test_size)
    test_data = data[:cut_point]
    train_data = data[cut_point:]
    return test_data, train_data

### TRAIN (mode = 'train')

In [39]:
import pickle
import time
import numpy as np
import argparse
import re

if __name__ == '__main__':
    
    mode = 'train'
    episode = 100
    batch_size = 32
        
    maybe_make_dir('weights')
    maybe_make_dir('portfolio_val')

    timestamp = time.strftime('%Y%m%d%H%M')

    data = get_data()
    test_data, train_data = split_data(data)

    env = TradingEnv(train_data)
    state_size = 3
    action_size = 3 #4
    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)

    portfolio_value = []
    
    '''
    if mode == 'test':
        # remake the env with test data
        env = TradingEnv(test_data)
        # load trained weights
        agent.load(weights)
        # when test, the timestamp is same as time when weights was trained
        timestamp = re.findall(r'\d{12}', weights)[0]
    '''

    for e in range(episode):
        state = env._reset()
        state = scaler.transform([state])
        for time in range(env.n_step):
            action = agent.act(state)
            next_state, reward, done, info = env._step(action)
            next_state = scaler.transform([next_state])
            if mode == 'train':
                agent.remember(state, action, reward, next_state, done)
                state = next_state
            if done:
                print("episode: {}/{}, episode end value: {}".format(e + 1, episode, info['cur_val']))
                portfolio_value.append(info['cur_val']) # append episode end portfolio value
                break
            if mode == 'train' and len(agent.memory) > batch_size:
                agent.replay(batch_size)
        if mode == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
            agent.save('weights/{}-dqn.h5'.format(timestamp))

    # save portfolio value history to disk
    with open('portfolio_val/{}-{}.p'.format(timestamp, mode), 'wb') as fp:
        pickle.dump(portfolio_value, fp)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_40 (Dense)             (None, 32)                128       
_________________________________________________________________
dense_41 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_42 (Dense)             (None, 3)                 99        
Total params: 1,283
Trainable params: 1,283
Non-trainable params: 0
_________________________________________________________________
None
episode: 1/100, episode end value: ('$106.7', '32.49%')
episode: 2/100, episode end value: ('$222.18', '35.02%')
episode: 3/100, episode end value: ('$224.75', '32.81%')
episode: 4/100, episode end value: ('$243.75', '33.12%')
episode: 5/100, episode end value: ('$235.34', '32.49%')
episode: 6/100, episode end value: ('$245.72', '32.81%')
episode: 7/100, episode end value: ('$245.99', '31.86%')
e

### TEST (mode = 'test') -- finire

In [26]:
import pickle
import time
import numpy as np
import argparse
import re

if __name__ == '__main__':
    
    mode = 'test'
    episode = 200
    batch_size = 32
        
    maybe_make_dir('weights')
    maybe_make_dir('portfolio_val')

    timestamp = time.strftime('%Y%m%d%H%M')

    data = get_data()
    test_data, train_data = split_data(data)

    env = TradingEnv(train_data)
    state_size = 3
    action_size = 3
    agent = DQNAgent(state_size, action_size)
    scaler = get_scaler(env)

    portfolio_value = []
    
    if mode == 'test':
        # remake the env with test data
        env = TradingEnv(test_data)
        # load trained weights
        agent.load(weights)
        # when test, the timestamp is same as time when weights was trained
        timestamp = re.findall(r'\d{12}', weights)[0]

    for e in range(episode):
        state = env._reset()
        state = scaler.transform([state])
        for time in range(env.n_step):
            action = agent.act(state)
            next_state, reward, done, info = env._step(action)
            next_state = scaler.transform([next_state])
            if mode == 'train':
                agent.remember(state, action, reward, next_state, done)
                state = next_state
            if done:
                print("episode: {}/{}, episode end value: {}".format(e + 1, episode, info['cur_val']))
                portfolio_value.append(info['cur_val']) # append episode end portfolio value
                break
            if mode == 'train' and len(agent.memory) > batch_size:
                agent.replay(batch_size)
        if mode == 'train' and (e + 1) % 10 == 0:  # checkpoint weights
            agent.save('weights/{}-dqn.h5'.format(timestamp))

    # save portfolio value history to disk
    with open('portfolio_val/{}-{}.p'.format(timestamp, mode), 'wb') as fp:
        pickle.dump(portfolio_value, fp)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 32)                128       
_________________________________________________________________
dense_23 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_24 (Dense)             (None, 3)                 99        
Total params: 1,283
Trainable params: 1,283
Non-trainable params: 0
_________________________________________________________________
None


NameError: name 'weights' is not defined

TO DO LIST:

    - add 4 action: do not bet
    - reduce action with 1 or X2
    - repeat each match in opposite direction
    - problem in the name of the matches
    - different $ for different matches