In [1]:
import pandas as pd
import numpy as np
import os

import json

# Event "type" id that getOutcome treats as a shot when it looks up statsbomb_xg.
SHOT_ID = 16


In [2]:
def getOutcome(event_id, dataset):
    """Return the xG of the next shot in the same possession after an event.

    Finds the event whose "id" equals ``event_id`` and walks forward through
    ``dataset``. The walk stops as soon as the possession team changes
    (result 0) or a shot event (type id ``SHOT_ID``) is reached (result is
    that shot's ``statsbomb_xg``). The starting event itself is never
    counted, even when it is a shot.

    Args:
        event_id: Value of the "id" field of the event to start from.
        dataset: Sequence of event dicts for one match, in match order. Every
            event is expected to carry ``possession_team.id`` and ``type.id``.

    Returns:
        The ``statsbomb_xg`` of the next shot in the same possession, or 0
        when there is no such shot, when the shot carries no xG value, or
        when ``event_id`` does not occur in ``dataset``. Never None.
    """
    start_index = None
    for index, event in enumerate(dataset):
        if event.get("id") == event_id:
            start_index = index
            break

    # Unknown id: answer explicitly instead of scanning from index 0 with a
    # made-up possession id.
    if start_index is None:
        return 0

    possession_team_id = dataset[start_index].get("possession_team").get("id")

    for index in range(start_index + 1, len(dataset)):
        event = dataset[index]
        if event.get("possession_team").get("id") != possession_team_id:
            return 0
        if event.get("type").get("id") == SHOT_ID:
            # A shot without a "shot" payload or without an xG value counts as 0.
            xg = (event.get("shot") or {}).get("statsbomb_xg")
            return 0 if xg is None else xg

    return 0
    

In [3]:
def encodeEvent(event):
    """Turn one event dict into a flat feature list, or None if it is skipped.

    Events whose type id is 18, 35 or 42, and events without a "location",
    yield None. Otherwise the result is
    [minute, second, possession, type id, play-pattern id, duration,
    location[0], location[1]]; absent scalar fields come through as None.
    """
    type_id = event.get("type").get("id")
    if type_id in (18, 35, 42):
        return None

    features = [
        event.get("minute"),
        event.get("second"),
        event.get("possession"),
        type_id,
        event.get("play_pattern").get("id"),
        event.get("duration"),
    ]

    coords = event.get("location")
    if coords is None:
        return None

    # Index explicitly so a too-short location raises instead of being truncated.
    features += [coords[0], coords[1]]
    return features
    
    

In [4]:
import numpy as np

def createDataset(events_dir="open-data/data/events/", output_path="processed/dataset.npz"):
    """Build the feature/target lists from every match file and save them.

    Each file in ``events_dir`` is read as a JSON list of events. Every event
    that ``encodeEvent`` accepts contributes one feature row to X and one
    target to Y, where the target is ``getOutcome`` for that event (None is
    stored as 0). Both lists are written with ``np.savez`` as positional
    arrays, so they are stored under NumPy's default keys "arr_0" (X) and
    "arr_1" (Y).

    NOTE(review): a later cell loads "dataset.npz" from the working
    directory, not the default ``output_path`` used here — confirm which
    location is intended.

    Args:
        events_dir: Directory containing one JSON events file per match.
        output_path: Destination of the .npz archive; its parent directory
            is created if missing.

    Returns:
        (X, Y) as plain Python lists, in the same order as they were saved.
    """
    X, Y = [], []

    # Sorted so the row order does not depend on the filesystem listing order.
    for filename in sorted(os.listdir(events_dir)):
        print("Processing file " + filename)
        filepath = os.path.join(events_dir, filename)
        with open(filepath, "r", encoding="UTF-8") as handle:
            match = json.load(handle)

        for event in match:
            x = encodeEvent(event)
            if x is None:
                # Skipped events never reach the dataset, so do not pay for
                # getOutcome's scan over the match.
                continue

            y = getOutcome(event.get("id"), match)
            X.append(x)
            Y.append(0 if y is None else y)

    # Create the target directory up front so the save cannot fail on a
    # missing folder after all files have been processed.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    np.savez(output_path, X, Y)
    return X, Y
        
    

In [5]:
# NOTE(review): createDataset saves to "processed/dataset.npz", while this cell expects the
# archive in the working directory — confirm which location is intended.
# allow_pickle=True unpickles stored objects, which can execute arbitrary code; only load
# archives produced by this project.
with np.load("dataset.npz", allow_pickle=True) as archive:
    # Materialise every array, then let the context manager close the underlying file.
    dataset = {key: archive[key] for key in archive.files}

In [6]:
# Display the array stored under "arr_0" (presumably the feature matrix X saved by
# createDataset — confirm); NumPy abbreviates the repr with "...".
dataset["arr_0"]

array([[0, 0, 2, ..., 1.909, 61.0, 41.0],
       [0, 2, 2, ..., 1.49, 36.0, 26.0],
       [0, 3, 2, ..., 3.288, 36.0, 29.0],
       ...,
       [92, 7, 171, ..., 0.071, 91.0, 43.0],
       [92, 8, 171, ..., 0.0, 92.0, 37.0],
       [92, 8, 171, ..., 0.0, 30.0, 38.0]], dtype=object)

In [7]:
import sklearn
from sklearn import preprocessing



# np.float was only an alias of the builtin float and has been removed from NumPy,
# so the dtype is spelled np.float64 here.
X = dataset["arr_0"].astype(np.float64)
Y = dataset["arr_1"].astype(np.float64)

# Missing feature values are expected to surface as NaN after the cast; replace them with 0.
X = np.nan_to_num(X)



In [8]:
import tensorflow as tf
from tensorflow import keras

In [9]:
# Small fully connected regressor: a linear Dense(8), then two Dense(32) stages with ReLU,
# 20% dropout after each stage, and a single sigmoid output unit.
model = keras.Sequential([
    keras.layers.Dense(8),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32),
    keras.layers.Activation('relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(32),
    keras.layers.Dropout(0.2),
    keras.layers.Activation('relu'),
    keras.layers.Dense(1, activation="sigmoid"),
])
# The target is a continuous value, so report mean absolute error. "accuracy" (an exact-match
# rate) and cosine proximity of a one-element output say nothing useful about a regression fit.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"])



In [10]:
# Fit on all of X / Y for 5 epochs in batches of 1024. No validation data is passed,
# so the log only reports metrics on the training rows.
model.fit(np.asarray(X), np.asarray(Y), epochs=5, batch_size=1024)

Train on 2251443 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f6e20225518>

In [11]:
# Save the trained network to "model.h5" in the working directory.
model.save("model.h5")

In [12]:
# Predict on the training rows themselves. NOTE(review): the recorded output is 0. for every
# displayed row; the features are fed in unscaled (sklearn.preprocessing is imported but never
# used), which may be the cause — confirm.
model.predict(X)

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [13]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosted regressor with 100 estimators; every other setting is left at its default.
reg = GradientBoostingRegressor(n_estimators=100)

# Fit on the same X / Y arrays that were given to the Keras model; no rows are held out.
reg.fit(X,Y)

GradientBoostingRegressor()

In [14]:
# Spot check: prediction for the first row of X only.
reg.predict(X[:1])

array([0.01785338])

In [15]:
from collections import defaultdict

# player name -> list of predicted values, one per scored event
players = defaultdict(list)

# Sorted so the order of each player's scores does not depend on the filesystem listing order.
files = sorted(os.listdir("open-data/data/events/"))

for filename in files:
    filepath = "open-data/data/events/" + filename
    with open(filepath, "r", encoding="UTF-8") as handle:
        match = json.load(handle)

    # Collect every scorable event of the match so the regressor is called once per file
    # instead of once per event.
    names, rows = [], []
    for event in match:
        player = event.get("player")
        if player is None:
            continue
        encoded = encodeEvent(event)
        if encoded is None:
            # encodeEvent rejected the event (excluded type or no location): nothing to score.
            continue
        names.append(player.get("name"))
        rows.append(encoded)

    if not rows:
        continue

    try:
        # Same preparation as the training matrix: cast to float (None -> NaN), then NaN -> 0.
        # Without the cast, nan_to_num leaves an object array untouched and events with a
        # missing field are rejected by predict instead of being scored.
        features = np.nan_to_num(np.array(rows, dtype=float))
        scores = reg.predict(features)
    except (TypeError, ValueError) as error:
        # Stay best-effort, but say which file was dropped rather than hiding the failure.
        print("Skipping " + filename + ": " + str(error))
        continue

    for name, score in zip(names, scores):
        players[name].append(score)
   

    
            

In [25]:
# One (player, (mean score, number of scored events)) entry per player.
rankings = [
    (name, (sum(scores) / len(scores), len(scores)))
    for name, scores in players.items()
]

In [39]:
# Keep only players with more than 1000 scored events, as [name, mean score, count] rows.
cutrankings = np.array([
    [name, stats[0], stats[1]]
    for name, stats in rankings
    if stats[1] > 1000
])

# Written with pandas' default integer column labels and index.
pd.DataFrame(cutrankings).to_csv("data.csv")

        

    

In [None]:
import matplotlib.pyplot as plt

