In [1]:
import pandas as pd
import time
import re
import json
import numpy as np
from random import shuffle

import tensorflow as tf
from keras import backend as K
from keras.backend.tensorflow_backend import _to_tensor

from keras import initializers, regularizers, constraints
from keras.models import Model, model_from_json
from keras.callbacks import LearningRateScheduler, Callback
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.layers import (Layer, Input, Embedding, Dropout, Dense,
                          TimeDistributed, concatenate, BatchNormalization,
                          Reshape, Flatten, GlobalAveragePooling1D, add,
                          Lambda, subtract, Bidirectional, GRU)


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn

from collections import defaultdict
from tqdm import tqdm

# Enlarge seaborn's fonts (label size).
sn.set(font_scale=1.4)


# Read the full dataset and report how long the load took as MMmSSs.
t0 = time.time()
df = pd.read_csv('full_dataset.csv')
elapsed_min, elapsed_sec = divmod(int(time.time() - t0), 60)
print("Fully loaded in {:02d}m{:02d}s".format(elapsed_min, elapsed_sec))

Fully loaded in 00m06s


In [2]:
EMBEDDING_FILE = "./embeddings/_event_vectors_200dim_15epochs.txt"
EMBEDDING_DIM = 200

def get_embedding(path=None, dim=None, vocab=None):
    """Load pre-trained vectors from a whitespace-separated text file.

    Each usable line has the form ``<token> <v1> ... <v_dim>``. Lines whose
    field count is not ``dim + 1`` (blank lines, headers, vectors of another
    size) are skipped, as are tokens that are not in ``vocab``.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the module-level ``EMBEDDING_FILE``.
    dim : int, optional
        Expected vector size. Defaults to the module-level ``EMBEDDING_DIM``.
    vocab : container of str, optional
        Tokens to keep. Defaults to the module-level ``all_words``.

    Returns
    -------
    dict
        Maps each kept token to a ``float32`` numpy vector of length ``dim``.
    """
    # Defaults are resolved at call time so the function can be defined
    # before ``all_words`` exists, exactly like the zero-argument original.
    if path is None:
        path = EMBEDDING_FILE
    if dim is None:
        dim = EMBEDDING_DIM
    if vocab is None:
        vocab = all_words

    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            # Checking the length first also protects values[0] on blank lines.
            if len(values) != dim + 1:
                continue
            word = values[0]
            if word in vocab:
                embeddings_index[word] = np.asarray(values[1:], dtype="float32")

    return embeddings_index
    

# Token -> count table that drives the tokenizer below.
d = defaultdict(int)

# Target actions: every ' / '-separated element is counted at each occurrence.
x = df.action_SEQ.str.split(' / ')
for elt in x:
    for w in elt:
        d[w] += 1

# Context sequences: each ' / '-separated element is split on whitespace and a
# token is only registered (with count 1) if it has not been seen before.
for seq_column in ('before_SEQ', 'after_SEQ'):
    x = df[seq_column].str.split(' / ')
    for elt in x:
        for sents in elt:
            words = sents.split()
            for w in words:
                if w not in d:
                    d[w] += 1

d = dict(d)

all_words = set(d.keys())
print('All words : ', len(all_words))

# Only vectors for tokens in all_words are kept.
embeddings_index = get_embedding()

# One "document" per counted occurrence. A flat comprehension builds the list
# in linear time; sum(list_of_lists, []) re-copies the accumulator each step.
docs = [key for key, value in d.items() for _ in range(value)]
# NOTE(review): shuffle is unseeded; presumably the index given to equally
# frequent tokens can differ between runs - confirm, and persist word_index
# next to any saved model if so.
shuffle(docs)

tokenizer = Tokenizer(lower=False, filters="")
tokenizer.fit_on_texts(docs)

word_index = tokenizer.word_index
rv_word_index = {v: k for k, v in word_index.items()}

# One extra row: index 0 is not assigned by word_index and is used as the
# padding value by index_and_pad.
nb_words = len(word_index) + 1
# Rows without a pre-trained vector (including row 0) keep this random init.
embedding_matrix = np.random.rand(nb_words, EMBEDDING_DIM)

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        print('not_found: %s' % word)

print('Embedding mat : ', embedding_matrix.shape)

All words :  1186
not_found: <END_PERIOD_1>
not_found: <START_PERIOD_2>
not_found: <END_PERIOD_2>
not_found: <START_PERIOD_1>
Embedding mat :  (1187, 200)


In [3]:
# Load the player -> minutes map; its keys define which players are kept.
with open('./maps/player2mins.json', 'r', encoding='utf8') as fp:
    player2mins = json.load(fp)

# Keep only rows whose player appears in the map, then renumber the index.
known_player = df.player.isin(player2mins.keys())
df = df[known_player].reset_index(drop=True)

In [4]:
# Show the current (rows, columns) of df.
print(df.shape)

(37380, 897)


In [5]:
# Sentinel meaning "use the module-level word_index". Resolving it at call time
# keeps the default usable without binding a stale mapping at definition time.
_DEFAULT_WORD_INDEX = object()


def index_and_pad(seq, wmap=_DEFAULT_WORD_INDEX, value=0, maxlen=5, reverse=False):
    """Convert whitespace-separated strings to fixed-length lists of integers.

    Parameters
    ----------
    seq : iterable of str
        One string per step; each is split on whitespace.
    wmap : mapping or None, optional
        Token -> integer mapping. When omitted, the module-level ``word_index``
        is used. When ``None``, tokens are parsed with ``int()`` instead.
    value : int
        Padding value.
    maxlen : int
        Exact length of every returned inner list.
    reverse : bool
        If True pad on the left, otherwise pad on the right.

    Returns
    -------
    list of list of int
        One list of length ``maxlen`` per input string.

    Raises
    ------
    KeyError
        If a token is missing from ``wmap``.
    ValueError
        If ``wmap`` is None and a token is not an integer literal.
    """
    if wmap is _DEFAULT_WORD_INDEX:
        wmap = word_index

    if wmap is not None:
        # Honour the mapping that was passed in (the global word_index was
        # previously used here regardless of the argument).
        seq = [[wmap[w] for w in s.split()] for s in seq]
    else:
        seq = [[int(w) for w in s.split()] for s in seq]

    padded = []
    for s in seq:
        # NOTE(review): truncation keeps the first maxlen tokens even when
        # reverse=True - confirm that is intended for left-padded contexts.
        s = s[:maxlen]
        filler = [value] * (maxlen - len(s))
        padded.append(filler + s if reverse else s + filler)
    return padded

In [6]:
# One-hot encode the player column once; its column labels are the classes.
player_onehot = pd.get_dummies(df.player)

# class position -> player label, and the inverse lookup.
cmap = dict(enumerate(player_onehot.columns))
rv_cmap = dict(zip(cmap.values(), cmap.keys()))

Y = player_onehot.values


def _feature_block(suffix):
    """Return the values of every df column whose name ends with `suffix`."""
    selected = [c for c in df.columns if c.endswith(suffix)]
    return df[selected].values


X_HOME = _feature_block('_HOME')
X_AWAY = _feature_block('_AWAY')
X_PLAYER = _feature_block('_PLAYER')

X_HOME.shape, X_AWAY.shape, X_PLAYER.shape

((37380, 295), (37380, 295), (37380, 284))

In [8]:
# Widths of the dense inputs and of the player softmax, read off the arrays.
n_features_team, n_features_player, n_cats = (
    X_HOME.shape[1],
    X_PLAYER.shape[1],
    Y.shape[1],
)

In [9]:
# Hidden size of each GRU direction in the action encoder.
n_units = 50
# Rate passed to every Dropout layer in the models below.
drop_rate = 0.5

def dot_product(x, kernel):
    """Backend-aware dot product of `x` with `kernel`.

    On the TensorFlow backend the kernel gets a trailing axis before the
    product and that axis is squeezed out of the result; on any other backend
    `K.dot` is applied directly.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    kernel_expanded = K.expand_dims(kernel)
    return K.squeeze(K.dot(x, kernel_expanded), axis=-1)
    

class AttentionWithContext(Layer):
    """Attention pooling over axis 1 of a rank-3 input.

    For an input ``x`` the layer computes ``uit = tanh(x . W + b)``, scores
    ``ait = uit . u``, normalises ``exp(ait)`` over axis 1 and returns the sum
    over axis 1 of ``x`` weighted by those coefficients.

    Parameters
    ----------
    return_coefficients : bool
        If True, ``call`` returns ``[pooled, coefficients]`` instead of only
        the pooled tensor.
    W_regularizer, u_regularizer, b_regularizer
        Regularizer identifiers for ``W``, ``u`` and ``b``.
    W_constraint, u_constraint, b_constraint
        Constraint identifiers for ``W``, ``u`` and ``b``.
    bias : bool
        Whether the bias ``b`` is created and added before the tanh.
    """

    def __init__(self, return_coefficients=False,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        super(AttentionWithContext, self).__init__(**kwargs)
        # Set after the base constructor has run so that base-class
        # initialisation cannot reset the flag.
        self.supports_masking = True
        self.return_coefficients = return_coefficients
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create W (d x d), optional b (d,) and u (d,), d = last input dim."""
        assert len(input_shape) == 3

        # Keyword arguments only: no reliance on the positional order of
        # add_weight's parameters.
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[-1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(name='{}_u'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1.

        When ``return_coefficients`` is True the normalised coefficients
        (with a trailing axis of size 1) are returned as a second output.
        """
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # Zero the scores of masked positions before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every position is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a

        if self.return_coefficients:
            return [K.sum(weighted_input, axis=1), a]
        else:
            return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Shape(s) matching what `call` returns."""
        pooled_shape = (input_shape[0], input_shape[-1])
        if self.return_coefficients:
            # Coefficients hold one value per position along axis 1 of the
            # input, so their middle dimension is input_shape[1].
            return [pooled_shape, (input_shape[0], input_shape[1], 1)]
        else:
            return pooled_shape

    def get_config(self):
        """Return a config whose regularizers/constraints are serialized."""
        config = {"return_coefficients": self.return_coefficients,
                  "W_regularizer": regularizers.serialize(self.W_regularizer),
                  "u_regularizer": regularizers.serialize(self.u_regularizer),
                  "b_regularizer": regularizers.serialize(self.b_regularizer),
                  "W_constraint": constraints.serialize(self.W_constraint),
                  "u_constraint": constraints.serialize(self.u_constraint),
                  "b_constraint": constraints.serialize(self.b_constraint),
                  "bias": self.bias}
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

class SplitLayer(Layer):
    """Weight-free layer returning columns ``[start, end)`` of its input.

    ``range_`` is a ``(start, end)`` pair applied to the second axis; the
    reported output shape is ``(batch, end - start)``.
    """

    def __init__(self, range_, **kwargs):
        self.range_ = range_
        self.start, self.end = range_[0], range_[1]
        self.output_dim = self.end - self.start
        super(SplitLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # No weights to create; defer to the base implementation.
        super(SplitLayer, self).build(input_shape)

    def call(self, x):
        lo, hi = self.start, self.end
        return x[:, lo:hi]

    def compute_output_shape(self, input_shape):
        batch_dim = input_shape[0]
        return (batch_dim, self.output_dim)

    def get_config(self):
        # Base config first, then this layer's only constructor argument.
        merged = list(super(SplitLayer, self).get_config().items())
        merged.append(('range_', self.range_))
        return dict(merged)


In [10]:
# Number of action slots per sample; used as the time dimension of the
# sequence inputs in build_model and as the padding length of the action arrays.
MAX_I = 30

def get_action_encoder():
    """Build the sub-model that encodes one action with its two contexts.

    The model takes a single flat vector of 22 values laid out as
    ``[target (1), target team (1), before ctx (5), before teams (5),
    after ctx (5), after teams (5)]`` and returns a 256-unit ReLU encoding.

    Token ids are embedded with one shared Embedding layer initialised from
    ``embedding_matrix`` (created with ``trainable=False``); the matching team
    value is appended to every embedded token as an extra feature. A shared
    bidirectional GRU processes the before-context, after-context and target;
    the two contexts are pooled with AttentionWithContext and the target
    output is flattened. The three vectors are stacked into a length-3
    sequence, passed through a second bidirectional GRU, dropout and a Dense
    layer.

    NOTE(review): the training cell addresses the embedding by positional
    layer index, so the order in which layers are created here matters.

    Returns
    -------
    keras.models.Model
        Maps a ``(batch, 22)`` input to a ``(batch, 256)`` output.
    """
    # 2 single-value fields (target, target team) + 4 fields of width 5.
    raw_input = Input(shape=(2*1 +4*5,))
            
    # Slice the flat vector back into its six fields.
    target_input = SplitLayer((0, 1))(raw_input)
    target_teams_input = SplitLayer((1, 2))(raw_input)

    bf_ctx_input = SplitLayer((2, 7))(raw_input)
    bf_teams_input = SplitLayer((7, 12))(raw_input)

    af_ctx_input = SplitLayer((12, 17))(raw_input)
    af_teams_input = SplitLayer((17, 22))(raw_input)
    
    # Single embedding layer shared by the target and both contexts.
    embed_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            input_length=None,  # sentence size varies from batch to batch
                            trainable=False
                            )

    bf_ctx_wv = embed_layer(bf_ctx_input)
    af_ctx_wv = embed_layer(af_ctx_input)
    target_wv = embed_layer(target_input)

    # Append the team value of each token as one extra feature per step.
    bf_ctx_wv = concatenate([bf_ctx_wv, Reshape((5, 1))(bf_teams_input)], axis=-1)
    af_ctx_wv = concatenate([af_ctx_wv, Reshape((5, 1))(af_teams_input)], axis=-1)
    target_wv = concatenate([target_wv, Reshape((1, 1))(target_teams_input)], axis=-1)

    # One bidirectional GRU instance, reused for all three sequences below.
    bidir_gru = Bidirectional(GRU(units=n_units,
                                     activation='tanh', 
                                     dropout=0.0,
                                     recurrent_dropout=0.0,
                                     implementation=1,
                                     return_sequences=True,
                                     reset_after=True,
                                     recurrent_activation='sigmoid'),
                                 merge_mode='concat', weights=None)


    sent_bf_ctx = bidir_gru(bf_ctx_wv)
    sent_af_ctx = bidir_gru(af_ctx_wv)

    # Each context gets its own attention layer to pool over its 5 steps.
    sent_bf_vec = AttentionWithContext(return_coefficients=False)(
            sent_bf_ctx)
    sent_bf_vec_dr = Dropout(drop_rate)(sent_bf_vec)

    sent_af_vec = AttentionWithContext(return_coefficients=False)(
            sent_af_ctx)
    sent_af_vec_dr = Dropout(drop_rate)(sent_af_vec)   

    # The target is a length-1 sequence, so flattening yields 2 * n_units values.
    target_vec = bidir_gru(target_wv)
    target_vec = Flatten()(target_vec)
    target_vec_dr = Dropout(drop_rate)(target_vec)

    # Stack (target, before, after) as a 3-step sequence of 2 * n_units features.
    out = concatenate([target_vec_dr, sent_bf_vec_dr, sent_af_vec_dr], axis=-1)
    out = Reshape((3, 2 * n_units))(out)

    # Second bidirectional GRU; returns only its final output.
    bidir_gru_2 = Bidirectional(GRU(units=n_units,
                                     activation='tanh', 
                                     dropout=0.0,
                                     recurrent_dropout=0.0,
                                     implementation=1,
                                     return_sequences=False,
                                     reset_after=True,
                                     recurrent_activation='sigmoid'),
                                 merge_mode='concat', weights=None)

    out = bidir_gru_2(out)
    out = Dropout(drop_rate)(out)
    
    out = Dense(256, activation='relu')(out)

    model = Model(raw_input, out)
    
    return model

In [11]:
def build_model():
    """Build the full multi-output model.

    Inputs, in order: home features, away features, player features, target
    actions, target teams, before-context tokens, before-context teams,
    after-context tokens, after-context teams, and 12 timing features. The six
    sequence inputs have ``MAX_I`` steps each.

    Outputs, in order: ``home`` and ``away`` (20-way softmax each) and
    ``player`` (``n_cats``-way softmax).

    NOTE(review): the team heads are hard-coded to 20 units - confirm this
    matches the number of columns of the one-hot team targets.
    NOTE(review): the training cell addresses layers of this model by
    positional index, so the order in which layers are created here matters.

    Returns
    -------
    keras.models.Model
        The uncompiled model.
    """

    home_input = Input(shape=(n_features_team,))
    away_input = Input(shape=(n_features_team,))
    player_input = Input(shape=(n_features_player,))

    target_input = Input(shape=(MAX_I, 1,))
    target_teams_input = Input(shape=(MAX_I, 1,))

    bf_ctx_input = Input(shape=(MAX_I, 5,))
    bf_teams_input = Input(shape=(MAX_I, 5,))

    af_ctx_input = Input(shape=(MAX_I, 5,))
    af_teams_input = Input(shape=(MAX_I, 5,))
    
    timings_input = Input(shape = (12,))

    # Team towers: batch-norm -> Dense(64) -> dropout, one per side.
    home_dense = BatchNormalization()(home_input)
    home_dense = Dense(64, activation='relu')(home_dense)##
    home_dense = Dropout(drop_rate)(home_dense)

    away_dense = BatchNormalization()(away_input)
    away_dense = Dense(64, activation='relu')(away_dense)##
    away_dense = Dropout(drop_rate)(away_dense)

    # Interaction features: the home - away difference, combined with both
    # team encodings.
    inter = subtract([home_dense, away_dense])

    inter = Dense(64, activation='relu')(##
        concatenate([home_dense, away_dense, inter]))
    inter = Dropout(drop_rate)(inter)

    # Each team encoding is refined with the shared interaction vector.
    home_dense = Dense(128, activation='relu')(
        concatenate([home_dense, inter]))

    away_dense = Dense(128, activation='relu')(
        concatenate([away_dense, inter]))

    player_dense = BatchNormalization()(player_input)
    player_dense = Dense(128, activation='relu')(player_dense)
    player_dense = Dropout(drop_rate)(player_dense)

    # Concatenate the six sequence inputs on the last axis so every step
    # carries the 22-value layout expected by the action encoder.
    action_input = concatenate([target_input, target_teams_input,
                                bf_ctx_input, bf_teams_input, af_ctx_input, af_teams_input])

    # Apply the same encoder to every step, then average over the steps.
    action_encoder = get_action_encoder()
    actions_encoded = TimeDistributed(action_encoder)(action_input)
    actions_encoded = GlobalAveragePooling1D()(actions_encoded)

    # Team heads read only their own team encoding.
    home_pred = Dense(20, activation="softmax", name="home")(home_dense)

    away_pred = Dense(20, activation='softmax', name="away")(away_dense)

    # The player head sees every encoding plus the raw timing features.
    frag_encoded = concatenate([player_dense, home_dense, away_dense, actions_encoded, timings_input])

    frag_encoded = Dense(512, activation='relu')(frag_encoded)
    frag_encoded = Dropout(drop_rate)(frag_encoded)

    player_pred = Dense(n_cats, activation='softmax', name="player")(frag_encoded)

    model=Model([home_input, away_input, player_input, target_input, target_teams_input,
                   bf_ctx_input, bf_teams_input, af_ctx_input, af_teams_input, timings_input], [home_pred, away_pred, player_pred])
    
    return model

In [12]:
def _pad_steps(steps, fill, width, n_steps=MAX_I):
    """Return exactly `n_steps` rows: `steps` truncated, then padded.

    `steps` is a list of rows that each already hold `width` values; missing
    rows are appended as rows of `width` copies of `fill`.
    """
    kept = [list(row) for row in steps[:n_steps]]
    missing = n_steps - len(kept)  # <= 0 when nothing has to be added
    return kept + [[fill] * width for _ in range(missing)]


# Per-sample lists of fixed-width rows (one row per ' / '-separated element).
# Contexts are mapped through word_index; team sequences are parsed as ints
# and padded with -1. "Before" sequences are left-padded inside each row.
X_BEFORE_CTXS = df.before_SEQ.str.split(' / ').apply(index_and_pad, **{"reverse": True})
X_AFTER_CTXS = df.after_SEQ.str.split(' / ').apply(index_and_pad)

X_BEFORE_TEAMS = df.teams_before_SEQ.str.split(' / ').apply(index_and_pad, **{"value": -1, "wmap": None, "reverse": True})
X_AFTER_TEAMS = df.teams_after_SEQ.str.split(' / ').apply(index_and_pad, **{"value": -1, "wmap": None})

# Target actions: one token id per action, i.e. rows of width 1.
X_ACTIONS = df.action_SEQ.str.split(' / ').apply(index_and_pad, **{"maxlen": 1})

# Pad/truncate every sample to MAX_I rows using plain lists, and only then
# build the arrays: no ragged intermediate array is created, and only the row
# axis is padded.
X_ACTIONS = np.array([_pad_steps(s, 0, 1) for s in X_ACTIONS])  # (n, MAX_I, 1)
X_ACTIONS_TEAMS = np.ones(X_ACTIONS.shape).astype(int)

X_BEFORE_CTXS = np.array([_pad_steps(s, 0, 5) for s in X_BEFORE_CTXS])
X_AFTER_CTXS = np.array([_pad_steps(s, 0, 5) for s in X_AFTER_CTXS])

X_BEFORE_TEAMS = np.array([_pad_steps(s, -1, 5) for s in X_BEFORE_TEAMS])
X_AFTER_TEAMS = np.array([_pad_steps(s, -1, 5) for s in X_AFTER_TEAMS])

X_TIMINGS = df[[c for c in df.columns if c.startswith('TIMING')]].values

In [13]:
# One-hot targets for the two auxiliary team heads.
y_home, y_away = (
    pd.get_dummies(df[team_column]).values
    for team_column in ('team_home', 'team_away')
)

In [14]:
# 5-fold stratified split with shuffling.
# NOTE(review): no random_state is given, so the folds differ between runs.
skf = StratifiedKFold(n_splits=5, shuffle=True)
# 1-based fold counter, used in the log line and in the saved file names.
model_count = 1

def schedule(epoch, lr_):
    """Return the learning rate for `epoch`, given the current rate `lr_`.

    The rate is multiplied by 0.2 at epoch index 4 and left unchanged at
    every other epoch.
    """
    return lr_ * 0.2 if epoch == 4 else lr_
    
# Train one model per fold; folds are stratified on the player label.
for splitstrain, splitstest in skf.split(np.zeros(len(df)), df.player.tolist()):
    print("MODEL:", model_count)

    home_train, away_train, player_train = X_HOME[splitstrain], X_AWAY[splitstrain], X_PLAYER[splitstrain]
    actions_train, actions_teams_train = X_ACTIONS[splitstrain], X_ACTIONS_TEAMS[splitstrain]
    bf_ctxs_train, bf_teams_train = X_BEFORE_CTXS[splitstrain], X_BEFORE_TEAMS[splitstrain]
    af_ctxs_train, af_teams_train = X_AFTER_CTXS[splitstrain], X_AFTER_TEAMS[splitstrain]
    timings_train = X_TIMINGS[splitstrain]
    y_train = Y[splitstrain]
    y_home_train = y_home[splitstrain]
    y_away_train = y_away[splitstrain]


    home_test, away_test, player_test = X_HOME[splitstest], X_AWAY[splitstest], X_PLAYER[splitstest]
    actions_test, actions_teams_test = X_ACTIONS[splitstest], X_ACTIONS_TEAMS[splitstest]
    bf_ctxs_test, bf_teams_test = X_BEFORE_CTXS[splitstest], X_BEFORE_TEAMS[splitstest]
    af_ctxs_test, af_teams_test = X_AFTER_CTXS[splitstest], X_AFTER_TEAMS[splitstest]
    timings_test = X_TIMINGS[splitstest]
    y_test = Y[splitstest]
    y_home_test = y_home[splitstest]
    y_away_test = y_away[splitstest]

    # Start every fold from a fresh backend session.
    K.clear_session()
    callback_list = [LearningRateScheduler(schedule, verbose = 0)]

    model = build_model()

    # Unfreeze the embedding of the action encoder. The layers are located by
    # type rather than by positional index, so the lookup keeps working if
    # layers are added to or reordered in either model.
    encoder_wrapper = next(layer for layer in model.layers
                           if isinstance(layer, TimeDistributed))
    embedding_layer = next(layer for layer in encoder_wrapper.layer.layers
                           if isinstance(layer, Embedding))
    embedding_layer.trainable = True
    print(embedding_layer.name, embedding_layer.trainable)

    # Outputs are ordered (home, away, player); the player loss is weighted
    # 50x relative to each team loss.
    model.compile(loss=['categorical_crossentropy', 'categorical_crossentropy', 'categorical_crossentropy'],
                  loss_weights=[1, 1, 50],
                optimizer=Adam(lr = 0.0005),
                metrics=['accuracy'])

    # The held-out fold is only used as validation data during fitting.
    hist = model.fit([home_train, away_train, player_train,
                  actions_train, actions_teams_train,
                  bf_ctxs_train, bf_teams_train,
                  af_ctxs_train, af_teams_train, timings_train],
                 [y_home_train, y_away_train, y_train],
                 validation_data=([home_test, away_test, player_test,
                                   actions_test, actions_teams_test,
                                   bf_ctxs_test, bf_teams_test,
                                   af_ctxs_test, af_teams_test, timings_test],
                                  [y_home_test, y_away_test, y_test]),
                 epochs=6,
                 callbacks=callback_list,
                 verbose=1,
                 batch_size=128
                )


    # Architecture goes to <name>.json and weights to <name>.h5. Writing the
    # weights to the .json path would overwrite the architecture file.
    model_json = model.to_json()
    with open("next_event_model_%d.json" % model_count, "w") as json_file:
        json_file.write(model_json)
    model.save_weights("next_event_model_%d.h5" % model_count)
    print("Saved model to disk")

    model_count += 1


MODEL: 1
embedding_1 True
Train on 29809 samples, validate on 7571 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Saved model to disk
MODEL: 2
embedding_1 True
Train on 29857 samples, validate on 7523 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Saved model to disk
MODEL: 3
embedding_1 True
Train on 29899 samples, validate on 7481 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Saved model to disk
MODEL: 4
embedding_1 True
Train on 29952 samples, validate on 7428 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Saved model to disk
MODEL: 5
embedding_1 True
Train on 30003 samples, validate on 7377 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Saved model to disk
