In [1]:
import pandas as pd
import numpy as np

In [2]:
#team = "Croatia"
#team = "Belgium"
team = "France"
croatia_data = pd.read_csv(f"./{team}.csv")
games = list(croatia_data["caseID"].apply(lambda x: str(x)[:4]).drop_duplicates().sort_values())
games

['7530', '7546', '7563', '7580', '8649', '8655', '8658']

In [3]:
merged = pd.read_csv(f"./{games[0]}.csv")
merged["gameID"] = games[0]
for game in games[1:]:
  tmp = pd.read_csv(f"./{game}.csv")
  tmp["gameID"] = game
  merged = pd.concat([merged, tmp], ignore_index=True, sort=False)
merged

Unnamed: 0,caseID,action,type,technique,play_pattern,under_pressure,shot_assist,angle,length,height,...,end_X,end_Y,bad_behavior,result,home_team,away_team,action_type,action_count,action_type_count,gameID
0,753010,Pressure,,,Regular Play,,,,,,...,,,,,Australia,France,Pressure_None,Pressure_1,Pressure_None_1,7530
1,753010,Pass,,,Regular Play,True,,0.000000,51.000000,,...,58.0,78.0,,Incomplete,Australia,France,Pass_None,Pass_1,Pass_None_1,7530
2,753010,Pass,Recovery,,Regular Play,,,0.062419,16.031220,,...,79.0,4.0,,Incomplete,Australia,France,Pass_Recovery,Pass_2,Pass_Recovery_1,7530
3,753010,Ball Recovery,,,Regular Play,,,,,,...,,,,,Australia,France,Ball Recovery_None,Ball Recovery_1,Ball Recovery_None_1,7530
4,753010,Pass,,,Regular Play,,,0.000000,3.000000,,...,48.0,77.0,,Incomplete,Australia,France,Pass_None,Pass_3,Pass_None_2,7530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,865899,Pass,Corner,,From Corner,,,-1.707863,29.274563,,...,115.0,51.0,,Incomplete,Croatia,France,Pass_Corner,Pass_1,Pass_Corner_1,8658
11310,865899,Clearance,,,From Corner,,,,,,...,,,,,Croatia,France,Clearance_None,Clearance_1,Clearance_None_1,8658
11311,865899,Ball Recovery,,,From Corner,,,,,,...,,,,,Croatia,France,Ball Recovery_None,Ball Recovery_1,Ball Recovery_None_1,8658
11312,865899,Shot,Open Play,Volley,From Corner,,,,,,...,120.0,33.6,,Off T,Croatia,France,Shot_Open Play,Shot_1,Shot_Open Play_1,8658


In [4]:
remove = [
    "technique",
    "angle",
    "height",
    "player_position",
    "bad_behavior"
]
# drop unnecessary columns
df = merged.drop(columns=remove)

# set missing endTime to startTime
#print(df[:10][["startTime", "endTime"]])
df.loc[df["endTime"] == "None", "endTime"] = df["startTime"]
#print(df[:10][["startTime", "endTime"]])

df[["startTime", "endTime"]] = df[["startTime", "endTime"]].apply(pd.to_datetime)
#print(df[:10][["startTime", "endTime"]])

# add new columns
df["action_result"] = df["action"] + "_" + df["result"]
df["action_type_result"] = df["action_type"] + "_" + df["result"]

# remove rows where non possession team makes irrelevant action
df.loc[~(
    ((df["action"] != "Foul Committed") &
     (df["action"] != "Goal Keeper") &
     (df["action"] != "Interception") &
     (df["action"] != "Pressure")
    ) & (df["possession_team"] != team)),:]
print(df.head())

# Define the X and Y boundaries of each zone
X_END = 120
Y_END = 80

x_boundaries = [0, X_END * 0.15, X_END * 0.5, X_END * 0.85, X_END]
y_boundaries = [0, Y_END * 0.2, Y_END * 0.8, Y_END]

# Add zones to data
# Define the corresponding zone names
zone_names = ['Zone 1', 'Zone 2', 'Zone 3', 'Zone 4', 'Zone 5', 'Zone 6', 'Zone 7', 'Zone 8', 'Zone 9', 'Zone 10', 'Zone 11', 'Zone 12']

# Create a function to map X and Y coordinates to zones
def map_to_zone(x, y):
    for i, x_boundary in enumerate(x_boundaries[:-1]):
        for j, y_boundary in enumerate(y_boundaries[:-1]):
            if (x_boundary <= x < x_boundaries[i + 1] or i == len(x_boundaries) - 2) and \
               (y_boundary <= y < y_boundaries[j + 1] or j == len(y_boundaries) - 2):
                return zone_names[i + (j * (len(x_boundaries) - 1))]
    return zone_names[-1]

# Apply the mapping function to 'start_X' and 'start_Y' columns
df['zoneStart'] = df.apply(lambda row: map_to_zone(row['start_X'], row['start_Y']), axis=1)

# Apply the mapping function to 'end_X' and 'end_Y' columns
df['zoneEnd'] = df.apply(lambda row: map_to_zone(row['end_X'], row['end_Y']), axis=1)

   caseID         action      type  play_pattern under_pressure shot_assist  \
0  753010       Pressure      None  Regular Play           None        None   
1  753010           Pass      None  Regular Play           True        None   
2  753010           Pass  Recovery  Regular Play           None        None   
3  753010  Ball Recovery      None  Regular Play           None        None   
4  753010           Pass      None  Regular Play           None        None   

     length             recipient               startTime  period  ...  end_Y  \
0       NaN                   NaN 2023-12-01 00:04:44.891       1  ...    NaN   
1  51.00000  Kylian Mbappé Lottin 2023-12-01 00:04:45.196       1  ...   78.0   
2  16.03122   Aziz Eraltay Behich 2023-12-01 00:04:48.689       1  ...    4.0   
3       NaN                   NaN 2023-12-01 00:04:50.889       1  ...    NaN   
4   3.00000     Antoine Griezmann 2023-12-01 00:04:51.409       1  ...   77.0   

       result  home_team away_team    

In [5]:
df.to_csv(f"./{team}_AllGames.csv", sep=";", encoding="utf-8")

In [37]:
teams = ["Croatia", "Belgium", "France"]
player = {
    "Croatia": "Luka Modrić",
    "Belgium": "Eden Hazard",
    "France":  "Antoine Griezmann"
}

for team in teams:
  print(player[team])
  data = pd.read_csv(f"./{team}_AllGames.csv", sep=";")
  incomplete_passes = data.loc[(
      (data["player"] == player[team]) &
      (data["action"] == "Pass") &
      (data["result"] == "Incomplete")), :].shape[0]
  all_passes = data.loc[(
      (data["player"] == player[team]) &
      (data["action"] == "Pass")), :].shape[0]
  print("pass accuracy: ", 1 - incomplete_passes / all_passes)

  num_shots = data.loc[(
      (data["player"] == player[team]) &
      (data["action"] == "Shot") &
      (data["minute"] <= 120)), :].shape[0]
  print("shots: ", num_shots)

  num_onTarget = data.loc[(
      (data["player"] == player[team]) &
      (data["action"] == "Shot") &
      (data["result"] != "Off T") &
      (data["minute"] <= 120)), :].shape[0]
  print("on target: ", num_onTarget)

  num_goals = data.loc[(
      (data["player"] == player[team]) &
      (data["result"] == "Goal") &
      (data["minute"] <= 120)), :].shape[0]
  print("goals: ", num_goals)

  #https://www.worldfootball.net/assists/wm-2018-in-russland/
  print("assists: ", 2 if player[team] != "Luka Modrić" else 1)

  fouls = data.loc[(
      (data["player"] == player[team]) &
      (data["action"] == "Foul Committed")), :].shape[0]
  print("fouls committed: ", fouls)

Luka Modrić
pass accuracy:  0.8444022770398482
shots:  10
on target:  5
goals:  2
assists:  1
fouls committed:  13
Eden Hazard
pass accuracy:  0.8265682656826568
shots:  18
on target:  13
goals:  3
assists:  2
fouls committed:  7
Antoine Griezmann
pass accuracy:  0.7459677419354839
shots:  21
on target:  15
goals:  4
assists:  2
fouls committed:  6


In [58]:
# ć š Š
import matplotlib.pyplot as plt
dark  = "#024D80"
light = "#B3C4DB"
mid   = "#1779AA"

def plot_res(filename, names, values, colors):
  rf_poc = plt.bar(names, values, color=colors)
  plt.xticks(rotation=20)
  #plt.show()
  plt.savefig(filename)
  plt.close()

In [63]:
# relative frequencies player opp creation
names  = ["Modrić", "Rakitić", "Vrsaljko", "Brozović", "Lovren", "Perišić", "Vida"]
values = [13.28, 10.22, 8.55, 7.74, 7.04, 6.87, 6.81]
colors = [dark if p == "Modrić" else light for p in names]
plot_res("player_opp_creation.png", names, values, colors)

# relative frequencies luka actions opp creation
names  = ["Pass", "Inc. Pass", "Ball Rec.", "Off T.", "Dribble", "Goal", "Pressure"]
values = [70.43, 10.87, 6.52, 2.17, 2.17, 1.74, 1.3]
colors = [mid for _ in names]
plot_res("action_opp_creation.png", names, values, colors)


# relative frequencies player pressure success
names  = ["Modrić", "Rakitić", "Lovren", "Brozović", "Vrsaljko", "Vida", "Strinić"]
values = [13.12, 10.46, 9.62, 9.24, 8.44, 7.51, 6.24]
colors = [dark if p == "Modrić" else light for p in names]
plot_res("player_pressure_success.png", names, values, colors)

# relative frequencies luka actions pressure success
names  = ["Pass", "Dribble", "Pressure", "Off T.", "Goal", "Saved", "Blocked"]
values = [93.57, 3.23, 1.29, 0.65, 0.32, 0.32, 0.32]
colors = [mid for _ in names]
plot_res("action_pressure_success.png", names, values, colors)


# relative frequencies player pressure failure
names  = ["Rebić", "Perišić", "Modrić", "Vrsaljko", "Rakitić", "Strinić", "Mandžukić"]
values = [12.64, 12.07, 11.78, 10.92, 10.34, 8.05, 5.75]
colors = [dark if p == "Modrić" else light for p in names]
plot_res("player_pressure_failure.png", names, values, colors)

# relative frequencies luka actions pressure failure
names  = ["Inc. Pass", "Inc. Dribble", "Foul", "Miscontrol", "Out", "Unknown", ""]
values = [73.17, 7.32, 7.32, 7.32, 2.44, 2.44, 0]
colors = [mid for _ in names]
plot_res("action_pressure_failure.png", names, values, colors)

In [64]:
# relative frequencies player opportunity creation belgium
names  = ["De Bruyne", "Hazard", "Witsel", "Meunier", "Alderweireld"]
values = [12.12, 10.88, 8.5, 7.88, 6.75]
colors = [dark if p == "Hazard" else light for p in names]
plot_res("belgium_oc.png", names, values, colors)

# relative frequencies player pressure success belgium
names  = ["Alderweireld", "Witsel", "Vertonghen", "De Bruyne", "Hazard"]
values = [11.66, 9.92, 8.82, 8.71, 8.25]
colors = [dark if p == "Hazard" else light for p in names]
plot_res("belgium_ps.png", names, values, colors)

# relative frequencies player pressure failure belgium
names  = ["Hazard", "De Bruyne", "Lukaku", "Meunier", "Chadli"]
values = [18.86, 13.16, 7.89, 7.46, 7.46]
colors = [dark if p == "Hazard" else light for p in names]
plot_res("belgium_pf.png", names, values, colors)

# relative frequencies player opportunity creation france
names  = ["Griezmann", "Pogba", "Mbappé", "Kanté", "Pavard"]
values = [11.95, 9.75, 8.65, 8.49, 8.49]
colors = [dark if p == "Griezmann" else light for p in names]
plot_res("france_oc.png", names, values, colors)

# relative frequencies player pressure success france
names  = ["Kanté", "Varane", "Pavard", "Pogba", "Hernández"]
values = [11.99, 9.64, 9.54, 8.9, 7.88]
colors = [dark if p == "Griezmann" else light for p in names]
plot_res("france_ps.png", names, values, colors)

# relative frequencies player pressure failure france
names  = ["Mbappé", "Griezmann", "Hernández", "Pogba", "Grioud"]
values = [15.22, 11.3, 10.87, 9.57, 9.57]
colors = [dark if p == "Griezmann" else light for p in names]
plot_res("france_pf.png", names, values, colors)