In [143]:
import numpy as np
import pandas as pd 
import pickle


In [112]:
Data = pd.read_csv("Data/epl.csv")

In [154]:
# Function to process the data of a game
# Results in a list containing tuple of teams, dictionary team 1 of events, dictionary team 2 of events
# dictionary team 1 of times, dictionary team 2 of times, player list team 1, player list team 2

def data_process_game(data, match_id):
    
    game_data = data.copy()
    game_data = game_data[game_data.match_id == match_id]
    
    # tuple of home and away team
    teams = (game_data.home_team_id.mode()[0], game_data.away_team_id.mode()[0])
    
    # Players
    home_players = game_data[game_data.team_id == teams[0]].player_id.unique()
    home_players = home_players[~np.isnan(home_players)]
    
    away_players = game_data[game_data.team_id == teams[1]].player_id.unique()
    away_players = away_players[~np.isnan(away_players)]
    
    # Dictionary on events
    
    home_passes = {}
    away_passes = {}
    
    # Dictionary on times
    
    home_time = {}
    away_time = {}
    max_time = game_data['min'].max()
    
    # Dictionary on 
    
    for i in home_players:
        for j in home_players:
            if i != j:
                home_passes[(i, j)] = 0
                home_time[(i, j)] = [0, max_time]
            if i == j:
                home_time[(i, 'Self')] = [0, max_time]
        home_passes[(i, 'Shot')] = 0
        home_passes[(i, 'Gain')] = 0
        home_passes[(i, 'Loss')] = 0
        home_passes[(i, 'Goal')] = 0
    
    for i in away_players:
        for j in away_players:
            if i != j:
                away_passes[(i, j)] = 0  
                away_time[(i, j)] = [0, max_time]
            if i == j:
                away_time[(i, 'Self')] = [0, max_time]
        away_passes[(i, 'Shot')] = 0
        away_passes[(i, 'Gain')] = 0
        away_passes[(i, 'Loss')] = 0
        away_passes[(i, 'Goal')] = 0
    
    for index, row in game_data.iterrows():
        
        # Get Pass Data
        
        if (row['type'] == 1) & (row['outcome'] == 1):
            
            if row['team_id'] == teams[0]:
                
                player1 = row['player_id']
                j = 1
                found_passee = False
                while (not found_passee) & (j < 4):
                    row2 = game_data.loc[index+j]
                    if (row2['team_id'] == teams[0]) & (row2['player_id'] != player1) & (not np.isnan(row2['player_id'])):
                        player2 = row2['player_id']
                        found_passee = True
                    j+=1
                 
                if found_passee: home_passes[(player1, player2)] += 1
            
            if row['team_id'] == teams[1]:

                player1 = row['player_id']
                j = 1
                found_passee = False
                while (not found_passee) & (j < 4):
                    row2 = game_data.loc[index+j]
                    if (row2['team_id'] == teams[1]) & (row2['player_id'] != player1) & (not np.isnan(row2['player_id'])):
                        player2 = row2['player_id']
                        found_passee = True
                    j+=1
                  
                if found_passee: away_passes[(player1, player2)] += 1
                
        # Get Gain Data
        # On ball recovery, out of bounds, keeper pick up, and corner awarded

        if (row['type'] == 49) | ((row['type'] == 5) & (row['outcome'] == 1)) \
        | (row['type'] == 52) | ((row['type'] == 6) & (row['outcome'] == 1)):

            #print('y')

            if row['team_id'] == teams[0]:

                player1 = row['player_id']
                if not np.isnan(player1): home_passes[(player1, 'Gain')] += 1

            if row['team_id'] == teams[1]:

                player1 = row['player_id']
                if not np.isnan(player1): away_passes[(player1, 'Gain')] += 1

        # Get Loss Data
        # On bad pass, out of bounds, bad take on, or dispossessed

        if ((row['type'] == 3) & (row['outcome'] == 0)) | ((row['type'] == 5) & (row['outcome'] == 0)) \
        | ((row['type'] == 50) & (row['outcome'] == 1)) | ((row['type'] == 1) & (row['outcome'] == 0)):

            if row['team_id'] == teams[0]:

                player1 = row['player_id']
                if not np.isnan(player1): home_passes[(player1, 'Loss')] += 1

            if row['team_id'] == teams[1]:

                player1 = row['player_id']
                if not np.isnan(player1): away_passes[(player1, 'Loss')] += 1

        # Get Shot Data
        # On miss, post, attempt saved, or goal

        if (row['type'] == 13) | (row['type'] == 14) | (row['type'] == 15):

            if row['team_id'] == teams[0]:

                player1 = row['player_id']
                home_passes[(player1, 'Shot')] += 1

            if row['team_id'] == teams[1]:

                player1 = row['player_id']
                away_passes[(player1, 'Shot')] += 1

        if (row['type'] == 16):

            if row['team_id'] == teams[0]:

                player1 = row['player_id']
                home_passes[(player1, 'Shot')] += 1
                home_passes[(player1, 'Goal')] += 1

            if row['team_id'] == teams[1]:

                player1 = row['player_id']
                away_passes[(player1, 'Shot')] += 1
                away_passes[(player1, 'Goal')] += 1

    # Get Time Data

    sub_data = game_data[(game_data.type == 19) | (game_data.type == 18)]


    for index, row in sub_data.iterrows():

        if row['type'] == 19:

            if row['team_id'] == teams[0]:

                player = row['player_id']
                home_time[(player, 'Self')][0] = row['min']

            if row['team_id'] == teams[1]:

                player = row['player_id']
                away_time[(player, 'Self')][0] = row['min']

        if row['type'] == 18:

            if row['team_id'] == teams[0]:

                player = row['player_id']
                home_time[(player, 'Self')][1] = row['min']

            if row['team_id'] == teams[1]:

                player = row['player_id']
                away_time[(player, 'Self')][1] = row['min']

    for i in home_players:
        for j in home_players:
            if i != j:
                home_time[(i, j)] = [max(home_time[(i, 'Self')][0], home_time[(j, 'Self')][0]), \
                                 min(home_time[(i, 'Self')][1], home_time[(j, 'Self')][1])]
                if home_time[(i, j)][0] >= home_time[(i, j)][1]:
                    home_time[(i, j)] = [0, float('inf')]

    for i in away_players:
        for j in away_players:
            if i != j:
                away_time[(i, j)] = [max(away_time[(i, 'Self')][0], away_time[(j, 'Self')][0]), \
                                 min(away_time[(i, 'Self')][1], away_time[(j, 'Self')][1])]
                if away_time[(i, j)][0] >= away_time[(i, j)][1]:
                    away_time[(i, j)] = [0, float('inf')]

    
    # Get Time Weighted Event Rates
    # This is number of events divided by time shared between players or total player time
    
    home_rates = {}
    away_rates = {}
    
    for i in home_players:
        for j in home_players:
            if i != j:
                home_rates[(i,j)] = float(home_passes[(i,j)])/(home_time[(i,j)][1] - home_time[(i,j)][0])
        home_rates[(i, 'Shot')] = float(home_passes[(i, 'Shot')])/(home_time[(i, 'Self')][1] - home_time[(i, 'Self')][0])
        home_rates[(i, 'Gain')] = float(home_passes[(i, 'Gain')])/(home_time[(i, 'Self')][1] - home_time[(i, 'Self')][0])
        home_rates[(i, 'Loss')] = float(home_passes[(i, 'Loss')])/(home_time[(i, 'Self')][1] - home_time[(i, 'Self')][0])
        home_rates[(i, 'Goal')] = float(home_passes[(i, 'Goal')])/(home_time[(i, 'Self')][1] - home_time[(i, 'Self')][0])

    for i in away_players:
        for j in away_players:
            if i != j:
                away_rates[(i,j)] = float(away_passes[(i,j)])/(away_time[(i,j)][1] - away_time[(i,j)][0])
        away_rates[(i, 'Shot')] = float(away_passes[(i, 'Shot')])/(away_time[(i, 'Self')][1] - away_time[(i, 'Self')][0])
        away_rates[(i, 'Gain')] = float(away_passes[(i, 'Gain')])/(away_time[(i, 'Self')][1] - away_time[(i, 'Self')][0])
        away_rates[(i, 'Loss')] = float(away_passes[(i, 'Loss')])/(away_time[(i, 'Self')][1] - away_time[(i, 'Self')][0])
        away_rates[(i, 'Goal')] = float(away_passes[(i, 'Goal')])/(away_time[(i, 'Self')][1] - away_time[(i, 'Self')][0])
    
    return [teams, home_passes, away_passes, home_time, away_time, home_rates, away_rates, home_players, away_players]
                    
                



                

In [155]:
# Iterate through every game to output list in dictionary

matches = Data.match_id.unique()
matches = matches[~np.isnan(matches)]
n = len(matches)
ind = 1

games = {} 

for match_id in matches: 
    print('match ' + str(ind) + '/' + str(n))
    L = data_process_game(Data, match_id)
    games[match_id] = L
    ind += 1

    
    
    


match 1/380
match 2/380
match 3/380
match 4/380
match 5/380
match 6/380
match 7/380
match 8/380
match 9/380
match 10/380
match 11/380
match 12/380
match 13/380
match 14/380
match 15/380
match 16/380
match 17/380
match 18/380
match 19/380
match 20/380
match 21/380
match 22/380
match 23/380
match 24/380
match 25/380
match 26/380
match 27/380
match 28/380
match 29/380
match 30/380
match 31/380
match 32/380
match 33/380
match 34/380
match 35/380
match 36/380
match 37/380
match 38/380
match 39/380
match 40/380
match 41/380
match 42/380
match 43/380
match 44/380
match 45/380
match 46/380
match 47/380
match 48/380
match 49/380
match 50/380
match 51/380
match 52/380
match 53/380
match 54/380
match 55/380
match 56/380
match 57/380
match 58/380
match 59/380
match 60/380
match 61/380
match 62/380
match 63/380
match 64/380
match 65/380
match 66/380
match 67/380
match 68/380
match 69/380
match 70/380
match 71/380
match 72/380
match 73/380
match 74/380
match 75/380
match 76/380
match 77/380
match 78

ZeroDivisionError: float division by zero

In [158]:
output = open('EPL_Games.pkl', 'wb')
pickle.dump(games, output)
output.close()

In [156]:
games[matches[5]][3]

{(1195.0, 1212.0): [77, 94],
 (1195.0, 1216.0): [0, inf],
 (1195.0, 1243.0): [77, 94],
 (1195.0, 1420.0): [85, 94],
 (1195.0, 6286.0): [0, inf],
 (1195.0, 12882.0): [77, 94],
 (1195.0, 13898.0): [77, 94],
 (1195.0, 19053.0): [77, 94],
 (1195.0, 19196.0): [77, 94],
 (1195.0, 39765.0): [77, 94],
 (1195.0, 49806.0): [77, 94],
 (1195.0, 54771.0): [77, 94],
 (1195.0, 80442.0): [77, 85],
 (1195.0, 'Self'): [77, 94],
 (1212.0, 1195.0): [77, 94],
 (1212.0, 1216.0): [0, inf],
 (1212.0, 1243.0): [63, 94],
 (1212.0, 1420.0): [85, 94],
 (1212.0, 6286.0): [63, 77],
 (1212.0, 12882.0): [63, 94],
 (1212.0, 13898.0): [63, 94],
 (1212.0, 19053.0): [63, 94],
 (1212.0, 19196.0): [63, 94],
 (1212.0, 39765.0): [63, 94],
 (1212.0, 49806.0): [63, 94],
 (1212.0, 54771.0): [63, 94],
 (1212.0, 80442.0): [63, 85],
 (1212.0, 'Self'): [63, 94],
 (1216.0, 1195.0): [0, inf],
 (1216.0, 1212.0): [0, inf],
 (1216.0, 1243.0): [0, 63],
 (1216.0, 1420.0): [0, inf],
 (1216.0, 6286.0): [0, 63],
 (1216.0, 12882.0): [0, 63],


In [138]:
Data[Data.player_id == 1195]

Unnamed: 0,season_id,match_id,home_team_id,home_team_name,away_team_id,away_team_name,id,event_id,date,time,period_id,min,sec,team_id,player_id,playerName,playerPosition,x,y,type,description,outcome
9884,2012,442004,52,Queens Park Rangers,80,Swansea City,1421014697,648,18Aug2012,4:36:14 PM,2,77,5,52,1195.0,"Johnson, Andrew",Substitute,0.0,0.0,19,Player on,1
9926,2012,442004,52,Queens Park Rangers,80,Swansea City,1873567996,676,18Aug2012,4:38:44 PM,2,79,35,52,1195.0,"Johnson, Andrew",Substitute,85.8,65.2,61,Ball touch,0
9931,2012,442004,52,Queens Park Rangers,80,Swansea City,673397903,677,18Aug2012,4:38:53 PM,2,79,44,52,1195.0,"Johnson, Andrew",Substitute,81.3,101.7,5,Out,1
9949,2012,442004,52,Queens Park Rangers,80,Swansea City,1811029341,684,18Aug2012,4:40:16 PM,2,81,6,52,1195.0,"Johnson, Andrew",Substitute,50.1,50.6,1,Pass,1
37446,2012,442021,43,Manchester City,52,Queens Park Rangers,782518796,6,01Sep2012,5:30:28 PM,1,0,15,52,1195.0,"Johnson, Andrew",Striker,36.9,61.7,49,Ball recovery,1
37447,2012,442021,43,Manchester City,52,Queens Park Rangers,1832840896,7,01Sep2012,5:30:28 PM,1,0,15,52,1195.0,"Johnson, Andrew",Striker,36.9,61.7,1,Pass,1
37507,2012,442021,43,Manchester City,52,Queens Park Rangers,1201040501,27,01Sep2012,5:33:28 PM,1,3,15,52,1195.0,"Johnson, Andrew",Striker,40.9,4.1,1,Pass,0
37533,2012,442021,43,Manchester City,52,Queens Park Rangers,1410692612,39,01Sep2012,5:34:54 PM,1,4,41,52,1195.0,"Johnson, Andrew",Striker,73.3,98.9,1,Pass,0
37760,2012,442021,43,Manchester City,52,Queens Park Rangers,1759904865,107,01Sep2012,5:46:26 PM,1,16,13,52,1195.0,"Johnson, Andrew",Striker,50.1,50.5,1,Pass,1
37812,2012,442021,43,Manchester City,52,Queens Park Rangers,1194800118,129,01Sep2012,5:49:11 PM,1,18,58,52,1195.0,"Johnson, Andrew",Striker,35.3,21.0,4,Foul,1


In [114]:
Data.match_id.unique()

array([441999, 442000, 442001, 442002, 442003, 442004, 442005, 442006,
       442007, 442008, 442009, 442010, 442011, 442012, 442013, 442014,
       442015, 442016, 442017, 442018, 442019, 442020, 442021, 442022,
       442023, 442024, 442025, 442026, 442027, 442028, 442029, 442030,
       442031, 442032, 442033, 442034, 442035, 442036, 442037, 442038,
       442039, 442040, 442041, 442042, 442043, 442044, 442045, 442046,
       442047, 442048, 442049, 442050, 442051, 442052, 442053, 442054,
       442055, 442056, 442057, 442058, 442059, 442060, 442061, 442062,
       442063, 442064, 442065, 442066, 442067, 442068, 442069, 442070,
       442071, 442072, 442073, 442074, 442075, 442076, 442077, 442078,
       442079, 442080, 442081, 442082, 442083, 442084, 442085, 442086,
       442087, 442088, 442089, 442090, 442091, 442092, 442093, 442094,
       442095, 442096, 442097, 442098, 442099, 442100, 442101, 442102,
       442103, 442104, 442105, 442106, 442107, 442108, 442109, 442110,
      

In [108]:
match_id = 442001

In [109]:
L = data_process_game(EPL, match_id)

In [110]:
L

[(54, 45),
 {(1256.0, 1827.0): 1,
  (1256.0, 1869.0): 0,
  (1256.0, 4098.0): 1,
  (1256.0, 4990.0): 3,
  (1256.0, 11735.0): 0,
  (1256.0, 11987.0): 1,
  (1256.0, 15284.0): 2,
  (1256.0, 17160.0): 2,
  (1256.0, 37084.0): 5,
  (1256.0, 37334.0): 2,
  (1256.0, 39104.0): 3,
  (1256.0, 42518.0): 0,
  (1256.0, 81025.0): 0,
  (1256.0, 'Gain'): 3,
  (1256.0, 'Goal'): 1,
  (1256.0, 'Loss'): 6,
  (1256.0, 'Shot'): 2,
  (1827.0, 1256.0): 0,
  (1827.0, 1869.0): 8,
  (1827.0, 4098.0): 1,
  (1827.0, 4990.0): 4,
  (1827.0, 11735.0): 0,
  (1827.0, 11987.0): 1,
  (1827.0, 15284.0): 10,
  (1827.0, 17160.0): 5,
  (1827.0, 37084.0): 2,
  (1827.0, 37334.0): 0,
  (1827.0, 39104.0): 2,
  (1827.0, 42518.0): 0,
  (1827.0, 81025.0): 0,
  (1827.0, 'Gain'): 14,
  (1827.0, 'Goal'): 0,
  (1827.0, 'Loss'): 20,
  (1827.0, 'Shot'): 0,
  (1869.0, 1256.0): 3,
  (1869.0, 1827.0): 7,
  (1869.0, 4098.0): 1,
  (1869.0, 4990.0): 14,
  (1869.0, 11735.0): 3,
  (1869.0, 11987.0): 2,
  (1869.0, 15284.0): 4,
  (1869.0, 17160.0): 

In [78]:
EPL.loc[0]

season_id                2012
match_id               441999
home_team_id                3
home_team_name        Arsenal
away_team_id               56
away_team_name     Sunderland
id                  688830980
event_id                    1
date                18Aug2012
time               2:31:54 PM
period_id                  16
min                         0
sec                         0
team_id                     3
player_id                 NaN
playerName                NaN
playerPosition            NaN
x                           0
y                           0
type                       34
description       Team set up
outcome                     1
Name: 0, dtype: object

In [40]:
game_data = EPL.copy()
game_data = game_data[game_data.match_id == match_id]

In [44]:
teams = (game_data.home_team_id.mode()[0], game_data.away_team_id.mode()[0])
teams

(54, 45)

In [46]:
home_players = game_data[game_data.team_id == teams[0]].player_id.unique()
home_players

array([   nan,  4098.,  1869., 37084.,  4990., 39104., 15284., 11987.,
        1827., 17160., 42518.,  1256., 37334., 81025., 11735.])

In [48]:
home_players = home_players[~np.isnan(home_players)]
home_players

array([ 4098.,  1869., 37084.,  4990., 39104., 15284., 11987.,  1827.,
       17160., 42518.,  1256., 37334., 81025., 11735.])

In [53]:
a = [1,2,3,4]
b = [2,4,6,8]

In [60]:
0/float('inf')

0.0

In [57]:
a = (2,4)

In [58]:
a[0]

2

In [59]:
a[0] = 4

TypeError: 'tuple' object does not support item assignment