In [1]:
import pandas as pd
import time
import re
import json
import numpy as np
from random import shuffle

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import Callback
from keras.layers import Bidirectional, GRU
from keras.layers import Layer
from keras.layers import Reshape, Flatten
from keras.layers import GlobalAveragePooling1D, add, Lambda, subtract
from keras import initializers, regularizers, constraints
from keras.models import Model
from keras.backend.tensorflow_backend import _to_tensor
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Input, Embedding, Dropout, Dense, TimeDistributed
from keras.layers import concatenate, BatchNormalization
from keras.models import model_from_json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn

from collections import Counter, defaultdict

# Enlarge seaborn's font scale so plot labels are easier to read.
sn.set(font_scale=1.4)

# Read the dataset and report the wall-clock time the load took.
t0 = time.time()
df = pd.read_csv('./next_event_dataset.csv')
elapsed_min, elapsed_sec = divmod(int(time.time() - t0), 60)
print("Fully loaded in {:02d}m{:02d}s".format(elapsed_min, elapsed_sec))

Using TensorFlow backend.


Fully loaded in 00m01s


In [2]:
# Split every event sequence on ' / ' and tally how often each token occurs.
# `d` ends up as a plain dict mapping token -> occurrence count.
x = df.events.str.split(' / ')
d = dict(Counter(w for elt in x for w in elt))

# Pre-trained event vectors: text file location and the vector size
# that get_embedding() requires each line to have.
EMBEDDING_FILE = "./embeddings/light_vectors_50dim_15epochs.txt"
EMBEDDING_DIM = 50

def get_embedding(path=None, dim=None, vocab=None):
    """Load pre-trained vectors for the known vocabulary from a text file.

    Each non-blank line is split on whitespace; the first field is the token
    and the remaining fields are its vector components. A line is kept only
    when it carries exactly ``dim`` components and its token is in ``vocab``.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the module-level ``EMBEDDING_FILE``.
    dim : int, optional
        Required number of vector components. Defaults to the module-level
        ``EMBEDDING_DIM``.
    vocab : container of str, optional
        Tokens to keep. Defaults to the module-level ``all_words``.

    Returns
    -------
    dict
        Maps each retained token to a 1-D ``float32`` numpy array.
    """
    # Resolve defaults at call time so the globals may be defined after this
    # function (as ``all_words`` is in this notebook).
    if path is None:
        path = EMBEDDING_FILE
    if dim is None:
        dim = EMBEDDING_DIM
    if vocab is None:
        vocab = all_words

    embeddings_index = {}
    # ``with`` guarantees the file is closed even if a line fails to parse.
    with open(path) as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to index (previously an IndexError).
                continue
            word = values[0]
            if len(values) == dim + 1 and word in vocab:
                embeddings_index[word] = np.asarray(values[1:], dtype="float32")
    return embeddings_index

# Vocabulary = every distinct event token counted in `d`.
all_words = set(d.keys())
print('All words : ', len(all_words))

# Pre-trained vectors restricted to that vocabulary.
embeddings_index = get_embedding()

# One single-token "document" per occurrence of each token, so the tokenizer
# sees the same frequencies as the dataset. Built with a flat comprehension:
# the previous sum(list_of_lists, []) re-copied the accumulated list on every
# addition (quadratic in the number of tokens).
docs = [key for key, value in d.items() for _ in range(value)]
# NOTE(review): the shuffle is unseeded. If the Tokenizer breaks frequency
# ties by order of first appearance, word_index may differ between runs --
# confirm before reusing saved weights in a different session.
shuffle(docs)

# Keep tokens verbatim: no lower-casing, no character filtering, split on "--".
tokenizer = Tokenizer(lower=False, filters="", split="--")
tokenizer.fit_on_texts(docs)

word_index = tokenizer.word_index
# Reverse lookup: integer id -> token.
rv_word_index = {v: k for k, v in word_index.items()}

# One extra row beyond the vocabulary size; presumably because Tokenizer ids
# start at 1, leaving row 0 unused -- TODO confirm.
nb_words = len(word_index) + 1
# Rows start as uniform random values and are overwritten below whenever a
# pre-trained vector exists for the token (unseeded, so not reproducible).
embedding_matrix = np.random.rand(nb_words, EMBEDDING_DIM)

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Token keeps its random row; report it so missing vectors are visible.
        print('not_found: %s' % word)

print('Embedding mat : ', embedding_matrix.shape)

All words :  40
Embedding mat :  (41, 50)


In [3]:
# Event tokens of each row mapped to their integer ids from word_index.
events = np.array(
    [[word_index[tok] for tok in seq.split(" / ")] for seq in df.events],
    dtype='int32',
)

# Per-step coordinates: the ' / '-separated xs and ys of a row are paired up
# element by element into [x, y] floats.
pos = np.array([
    [[float(px), float(py)]
     for px, py in zip(row_xs.split(' / '), row_ys.split(' / '))]
    for row_xs, row_ys in zip(df['xs'], df['ys'])
])

# Integer side features parsed from ' / '-separated strings.
refs = np.array([[int(tok) for tok in seq.split(' / ')] for seq in df.refs])
deltas = np.array([[int(tok) for tok in seq.split(' / ')] for seq in df.deltas])

# Regression target: [next_x, next_y] per row.
next_pos = np.array(
    [[float(nx), float(ny)] for nx, ny in zip(df.next_x, df.next_y)]
)

# Binary target: team_changed cast to int, with a trailing axis appended.
switch = df.team_changed.values.astype(int)[..., np.newaxis]

In [4]:
# Width of the GRU in each direction of the bidirectional encoder.
n_units = 50

def build_model():
    """Assemble the (uncompiled) two-headed next-event model.

    Inputs, in order: event ids (10,), positions (10, 2), refs (10,) and
    deltas (10,). Event ids go through a trainable Embedding initialised from
    the global ``embedding_matrix``; all four streams are concatenated per
    step and encoded by a bidirectional GRU of ``n_units`` per direction,
    followed by a 256-unit ReLU layer with dropout.

    Returns
    -------
    keras.models.Model
        Outputs ``[switch, pos]``: a 1-unit sigmoid head named "switch" and a
        2-unit linear head named "pos".
    """
    seq_len = 10  # number of past steps fed to the network

    events_in = Input(shape=(seq_len,))
    pos_in = Input(shape=(seq_len, 2,))
    refs_in = Input(shape=(seq_len,))
    deltas_in = Input(shape=(seq_len,))

    # Embedding table seeded with the pre-computed matrix and fine-tuned.
    event_vectors = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=None,
        trainable=True,
    )(events_in)

    # Scalar-per-step features get a trailing axis so they can be
    # concatenated with the embedded events and the (x, y) positions.
    refs_col = Reshape((seq_len, 1))(refs_in)
    # Reshape((10, 1))(at_home_input) was previously part of this list.
    deltas_col = Reshape((seq_len, 1))(deltas_in)
    step_features = concatenate([event_vectors, pos_in, refs_col, deltas_col])

    # Only the final state of each direction is kept (return_sequences=False).
    recurrent = GRU(
        units=n_units,
        activation='tanh',
        dropout=0.0,
        recurrent_dropout=0.0,
        implementation=1,
        return_sequences=False,
        reset_after=True,
        recurrent_activation='sigmoid',
    )
    encoder = Bidirectional(recurrent, merge_mode='concat', weights=None)
    encoded = encoder(step_features)

    hidden = Dense(256, activation='relu')(encoded)
    hidden = Dropout(0.2)(hidden)

    # Two heads share the hidden representation.
    switch_out = Dense(1, activation='sigmoid', name="switch")(hidden)
    pos_out = Dense(2, activation='linear', name="pos")(hidden)

    return Model(
        [events_in, pos_in, refs_in, deltas_in],
        [switch_out, pos_out],
    )

In [5]:
# 5-fold stratified cross-validation: one model is trained per fold, its best
# weights (lowest monitored validation value) are restored, and the
# architecture (JSON) and weights (HDF5) are written to disk.
# NOTE(review): shuffle=True without a random_state, so fold membership
# presumably differs from run to run -- confirm whether that is intended.
skf = StratifiedKFold(n_splits=5, shuffle=True)
# 1-based fold counter, used in the printed header and in output file names.
model_count = 1

# The zeros array only supplies the sample count; folds are stratified on the
# next_event labels.
for train, test in skf.split(np.zeros(len(df)), df.next_event.tolist()):
    print("MODEL:", model_count)

    # Index every input/target array with this fold's training rows.
    events_train = events[train]
    pos_train = pos[train]
    refs_train = refs[train]
    deltas_train = deltas[train]
    next_pos_train = next_pos[train]
    switch_train = switch[train]

    # Same for the held-out rows, which serve as validation data below.
    events_test = events[test]
    pos_test = pos[test]
    refs_test = refs[test]
    deltas_test = deltas[test]
    next_pos_test = next_pos[test]
    switch_test = switch[test]
    
    
    # Reset the Keras backend session before building this fold's model.
    K.clear_session()

    model = build_model()

    # One loss and one metric per named output head.
    model.compile(loss={"switch": 'binary_crossentropy',
                        'pos': 'mean_squared_error'},
                  optimizer="adam",
                  metrics={"switch": 'acc', 'pos': 'mse'})

    # NOTE(review): the monitored key must match the name Keras logs for the
    # 'pos' metric on validation data; that name may depend on the Keras
    # version -- confirm it appears in hist.history.
    early_stopping = EarlyStopping(monitor="val_pos_mean_squared_error", patience=2)
    best_model_path = "best_model" + str(model_count) + ".h5"
    # Keep only the weights of the epoch with the lowest monitored value.
    model_checkpoint = ModelCheckpoint(best_model_path, monitor = "val_pos_mean_squared_error",
                                       mode="min", save_best_only=True, save_weights_only=True)
    
    callback_list = [early_stopping, model_checkpoint]
    
    # Inputs and targets are passed in the same order as in build_model().
    hist = model.fit([events_train, pos_train, refs_train, deltas_train], [switch_train, next_pos_train],
                     validation_data=([events_test, pos_test, refs_test, deltas_test], [
                                      switch_test, next_pos_test]),
                     epochs=100,
                     callbacks=callback_list,
                     verbose=1,
                     batch_size=1048
                    )

    # Restore the checkpointed best weights before exporting the model.
    model.load_weights(best_model_path)
    
    # Serialize the architecture to JSON.
    model_json = model.to_json()
    with open("next_event_model_%d.json" % model_count, "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("next_event_model_%d.h5" % model_count)
    print("Saved model to disk")

    model_count += 1

MODEL: 1
Train on 256614 samples, validate on 64176 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Saved model to disk
MODEL: 2
Train on 256622 samples, validate on 64168 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Saved model to disk
MODEL: 3
Train on 256633 samples, validate on 64157 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
S