In [145]:
# import required libraries
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mplsoccer import Pitch, VerticalPitch
import matplotlib_inline
from matplotlib import cm
import matplotlib_inline
import seaborn as sns
# ask the inline matplotlib backend to emit figures in 'svg' format
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [41]:
# names of all entries found in the raw JSON directory
# (os.listdir gives no ordering guarantee)
files = [entry for entry in os.listdir('matches_2021_json')]

In [132]:
# Convert each match JSON file into a CSV holding that match's events.
#
# The players file ('...players.json') sits in the same directory but is not a
# match. It is excluded by name rather than by position: os.listdir returns
# entries in arbitrary order, so slicing off the last entry could drop a real
# match and try to parse the players file instead.
match_files = [name for name in files
               if name.endswith('.json') and not name.endswith('players.json')]

# columns expected in every match file, in the order they are written out
base_cols = ['external_id', 'team', 'player', 'next_player', 'xpos', 'ypos', 'xdest', 'ydest', 'start_time',
             'end_time', 'action', 'xg', 'xt', 'xp', 'game_time', 'header', 'penalty', 'throw_in']
# optional columns; per the original note only one match
# ('397511822166547-Halmstad-DegerforsIF') has fewer than 23 columns and lacks them
extra_cols = ['attack_type', 'corner_outcome', 'foot_used', 'one_touch', 'goal_mouth']

# make sure the output directory exists before writing into it
os.makedirs('matches_2021_csv', exist_ok=True)

for i in match_files:
    # os.path.join keeps the path valid on every OS (a hard-coded '\\' only works on Windows)
    with open(os.path.join('matches_2021_json', i), 'r') as f:
        data = json.load(f)

    # convert list of dictionaries into data frame with a clean 0..n-1 index
    events = pd.DataFrame(data['events']).reset_index(drop=True)

    # rearrange columns; the optional columns are kept only when the file has all of them
    if set(extra_cols).issubset(events.columns):
        col_rearr = base_cols + extra_cols
    else:
        col_rearr = base_cols

    # replace all None values with NaN (missing value); done on the returned frame
    # instead of in place because the column selection may be a copy
    events = events[col_rearr].fillna(value=np.nan)

    # save dataframe under the match's name; splitext strips only the final extension,
    # so a name that itself contains a dot is not truncated
    file_name = os.path.splitext(i)[0]
    events.to_csv(os.path.join('matches_2021_csv', file_name + '.csv'), index=False)

In [13]:
# load info 'how much players played in each game'
# os.path.join keeps the path valid on every OS (a hard-coded '\\' only works on Windows)
with open(os.path.join('matches_2021_json', 'allsvenskan, 2021-players.json'), 'r') as f:
    data2 = json.load(f)

# aggregate results into data frame: one row per player with the summed minutes.
# Rows are collected in a list and the frame is built once; growing a DataFrame
# with pd.concat inside the loop copies all previous rows on every iteration.
# NOTE(review): '62163887d0736a3c7964ff4b' is a hard-coded key inside each player's
# 'minutes' mapping — presumably the id of the 2021 season/competition; confirm.
rows = []
for i in data2:
    single_col = {'player_name': i['name'], 'team_name': i['team'],
                  'minutes_played': sum(i['minutes']['62163887d0736a3c7964ff4b'].values())}
    rows.append(single_col)

# a frame built from a list already has a fresh 0..n-1 index
playing_time = pd.DataFrame(rows)

In [15]:
# display the per-player playing-time table (pandas truncates the middle rows)
playing_time

Unnamed: 0,player_name,team_name,minutes_played
0,Christopher Mc Vey,IF Elfsborg,2307
1,Tashreeq Matthews,Varbergs BoIS FC,1969
2,Ali Youssef,BK Häcken,389
3,Mikael Lustig,AIK,2491
4,Davor Blazevic,Hammarby,96
...,...,...,...
432,Emanuel Chabo,IFK Norrköping FK,212
433,Johannes Bjarnason,IFK Norrköping FK,198
434,Kalpi Wilfried Ouattara,Östersund,92
435,Sebastian Lagerlund,BK Häcken,200


In [None]:
# NOTE: the minutes data above is incorrect for some players. Examples:
# Justin Salmon, Degerfors IF
# Ali Youssef, BK Häcken

In [None]:
# quick links
# https://fbref.com/en/comps/29/11002/2021-Allsvenskan-Stats
# https://fbref.com/en/comps/29/11002/stats/2021-Allsvenskan-Stats#all_stats_standard

In [67]:
# load fbref match results and write a refined copy to 'match_results.csv'
#
# The refined table is saved under the same name as the download, and the
# columns are removed by position. Reading and overwriting the same file would
# therefore strip 11 more columns on every re-run of this cell. To keep the cell
# re-runnable, the untouched download is moved aside once to
# 'match_results_raw.csv' and that file is always used as the input.
if not os.path.exists('match_results_raw.csv'):
    os.replace('match_results.csv', 'match_results_raw.csv')
match_results = pd.read_csv('match_results_raw.csv')
# create column indices to be removed: the first five, index 6 and the last five
rm_col_ind = np.r_[0:5, 6, -5:0]
# remove columns
match_results = match_results.drop(columns=match_results.columns[rm_col_ind])
# make all column names lowercase
match_results = match_results.rename(str.lower, axis='columns')
# save refined .csv file
match_results.to_csv('match_results.csv', index=False)

In [49]:
# load fbref player statistics and write a refined copy to 'player_stats.csv'
#
# The refined table is saved under the same name as the download, and the
# columns are removed by position. Reading and overwriting the same file would
# therefore strip three more columns on every re-run of this cell. To keep the
# cell re-runnable, the untouched download is moved aside once to
# 'player_stats_raw.csv' and that file is always used as the input.
if not os.path.exists('player_stats_raw.csv'):
    os.replace('player_stats.csv', 'player_stats_raw.csv')
player_stats = pd.read_csv('player_stats_raw.csv')
# create column indices to be removed: the first, the third and the last column
rm_col_ind = np.r_[0, 2, -1]
# remove columns
player_stats = player_stats.drop(columns=player_stats.columns[rm_col_ind])
# make all column names lowercase
player_stats = player_stats.rename(str.lower, axis='columns')
# save refined .csv file
player_stats.to_csv('player_stats.csv', index=False)

In [116]:
# combine the per-match event CSVs into a single table and save it
csv_files = os.listdir('matches_2021_csv/')

# Read every file first and concatenate once: calling pd.concat inside the loop
# re-copies all rows accumulated so far on each iteration (quadratic in the
# number of matches).
frames = []
for i in csv_files:
    df = pd.read_csv('matches_2021_csv/' + i)
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no files; each match keeps its own 0..n-1 row index
all_match_events = pd.concat(frames) if frames else pd.DataFrame()
all_match_events.to_csv('all_match_events.csv', index=False)

In [117]:
# (rows, columns) of the combined events table
all_match_events.shape

(409713, 23)

In [118]:
# preview the combined events table (pandas truncates the middle rows and columns)
all_match_events

Unnamed: 0,external_id,team,player,next_player,xpos,ypos,xdest,ydest,start_time,end_time,...,xp,game_time,header,penalty,throw_in,attack_type,corner_outcome,foot_used,one_touch,goal_mouth
0,0,BK Häcken,Tobias Heintz,Alexander Faltsetas,51,50,37.0,56.0,0,3,...,0.882111,1,False,False,False,,,,,
1,1,BK Häcken,Alexander Faltsetas,Godswill Ekpolo,37,52,40.0,9.0,1,4,...,0.772046,1,False,False,False,,,,,
2,2,BK Häcken,Godswill Ekpolo,Joona Toivio,41,7,27.0,25.0,7,10,...,0.906022,1,False,False,False,,,,,
3,3,BK Häcken,Joona Toivio,Johan Hammar,25,29,25.0,68.0,10,13,...,0.895334,1,False,False,False,,,,,
4,4,BK Häcken,Johan Hammar,Joona Toivio,31,63,31.0,33.0,15,18,...,0.867001,1,False,False,False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1799,1799,AIK,Zachary Elbouzedi,,90,14,,,5597,5607,...,,94,False,False,False,,,,,
1800,1800,AIK,Zachary Elbouzedi,,92,13,,,5598,5603,...,,94,False,False,False,,,,,
1801,1801,AIK,Zachary Elbouzedi,,66,16,92.0,13.0,5592,5603,...,,94,False,False,False,,,,,
1802,1802,Varbergs BoIS FC,Philip Mårtensson,Joakim Lindner,6,50,24.0,22.0,5600,5610,...,,94,False,False,False,,,,,


In [None]:
# https://stackoverflow.com/questions/26139423/plot-different-color-for-different-categorical-levels-using-matplotlib

In [237]:
# names of all entries in the raw JSON directory (order is not guaranteed)
json_files = [entry for entry in os.listdir('matches_2021_json/')]

In [248]:
# Create data frame with results of all matches
#
# The players file ('...players.json') is excluded by name rather than by
# position: os.listdir returns entries in arbitrary order, so dropping the last
# entry could skip a real match and try to parse the players file instead.
result_rows = []
for i in [name for name in json_files
          if name.endswith('.json') and not name.endswith('players.json')]:
    with open('matches_2021_json/' + i, 'r') as f:
        data = json.load(f)

    # data['xG'] is itself a JSON-encoded string; decode it once.
    # Index 0 is stored as the home value and index 1 as the away value.
    match_xg = json.loads(data['xG'])
    single_col = {'home_team': data['homeTeam'], 'away_team': data['visitingTeam'],
                  'home_xg': match_xg[0], 'away_xg': match_xg[1]}
    result_rows.append(single_col)

# build the frame once from the collected rows (fresh 0..n-1 index) instead of
# concatenating a one-row frame per match
results = pd.DataFrame(result_rows)

In [249]:
# first five rows of the per-match xG table
results.head()

Unnamed: 0,home_team,away_team,home_xg,away_xg
0,BK Häcken,Örebro,3.136022,0.94633
1,Kalmar FF,Östersund,0.81102,0.156506
2,Degerfors IF,IFK Göteborg,0.652445,0.908759
3,IF Elfsborg,Varbergs BoIS FC,0.381496,1.120826
4,Örebro,AIK,1.274922,3.121992


In [272]:
# sanity check of xG results from .csv file: total xG per team over all
# 'Shot' and 'Goal' events of one match, to compare with the xG stored in the JSON
csv_files = os.listdir('matches_2021_csv/')
df = pd.read_csv('matches_2021_csv/' + csv_files[0])
# Only 'team' and 'xg' are passed to the groupby. If the string column 'action'
# were included, pandas >= 2.0 would concatenate its values in sum() instead of
# silently dropping the column as older versions did.
df.loc[df['action'].isin(['Shot', 'Goal']), ['team', 'xg']].groupby(by='team').sum()

Unnamed: 0_level_0,xg
team,Unnamed: 1_level_1
BK Häcken,3.136022
Örebro,0.94633
