# Imports

I import the necessary libraries and packages.

In [None]:
!pip install statsbombpy



In [None]:
!pip install mplsoccer



In [None]:
import pandas as pd
import numpy as np
from statsbombpy import sb
# to calculate distance with pitch coordinates
from mplsoccer import Pitch
pitch = Pitch()

# Building the dataset

I want to build a labelled dataset of counterpress actions to see if it is possible to predict the outcome of counterpress actions by classification, using event and tracking data about those actions.
The labels will be as follows:
- 1 = opponent not blocked.
- 2 = opponent blocked but ball not regained.
- 3 = opponent blocked and ball recovered.

I use the free StatsBomb dataset of the Women's Euro 2022 for this analysis.

In [None]:
# ids of the matches of the 2022 women euros
matches = sb.matches(competition_id=53, season_id=106)

match_ids = list(matches['match_id'])

# Download the events of every match and tag each row with the id of its match.
# The per-match frames are collected in a list and concatenated once at the end:
# calling pd.concat inside the loop copies all previously accumulated rows again
# on every iteration.
per_match_events = []
for m_id in match_ids:
  per_match_events.append(sb.events(match_id=m_id).assign(match_id=m_id))

# keep the original result (an empty frame) when there are no matches at all
match_events = pd.concat(per_match_events, ignore_index=True) if per_match_events else pd.DataFrame({})

match_events.head(5)

Unnamed: 0,50_50,ball_receipt_outcome,ball_recovery_recovery_failure,block_deflection,block_offensive,block_save_block,carry_end_location,clearance_aerial_won,clearance_body_part,clearance_head,...,goalkeeper_punched_out,goalkeeper_shot_saved_to_post,pass_miscommunication,shot_saved_to_post,shot_open_goal,foul_committed_penalty,foul_won_penalty,goalkeeper_success_in_play,half_start_late_video_start,shot_redirect
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# rows whose 'counterpress' flag is set
is_counterpress = match_events['counterpress'] == True

# event types that are left out of the labelled dataset (this filters rows, not columns)
excluded_types = ['Pass', 'Pressure', '50/50']
is_kept_type = ~match_events['type'].isin(excluded_types)

# columns needed to build the labels and to join with the frame data
kept_columns = ['match_id','id', 'minute', 'type', 'possession', 'possession_team', 'team', 'location', 'duel_outcome', 'interception_outcome']

counterpress_match_events = (
    match_events.loc[is_counterpress & is_kept_type, kept_columns]
    .reset_index(drop=True)
)

In [None]:
# build the labels list
#   1 = opponent not blocked
#   2 = opponent blocked but ball not regained
#   3 = opponent blocked and ball recovered
#   0 = event cannot be labelled (these rows are dropped at the end of the cell)

# label attached to each outcome string of 'duel_outcome' / 'interception_outcome'
OUTCOME_LABELS = {
    'Lost': 1, 'Lost In Play': 1,
    'Lost Out': 2,
    'Won': 3, 'Success': 3, 'Success In Play': 3, 'Success Out': 3,
}


def counterpress_label(event_type, outcome):
  """Return the class label of one counterpress event.

  Args:
    event_type: value of the 'type' column.
    outcome: value of 'interception_outcome' (for interceptions) or
      'duel_outcome' (for duels); ignored for the other event types.

  Returns:
    1, 2 or 3 as described above, or 0 when the outcome is missing/unrecognised
    or the event type is not one of the handled types.
  """
  if event_type == 'Dribbled Past':
    return 1
  if event_type in ('Block', 'Foul Committed'):
    return 2
  if event_type in ('Interception', 'Duel'):
    # NaN and unknown outcome strings are not keys of the table -> 0
    return OUTCOME_LABELS.get(outcome, 0)
  # Any other event type: without this fallback no label would be produced for
  # the row, the list would end up shorter than the frame and the column
  # assignment below would fail.
  return 0


labels = []
for event_type, duel_outcome, interception_outcome in zip(
    counterpress_match_events['type'],
    counterpress_match_events['duel_outcome'],
    counterpress_match_events['interception_outcome']):
  outcome = interception_outcome if event_type == 'Interception' else duel_outcome
  labels.append(counterpress_label(event_type, outcome))

counterpress_match_events['label'] = labels

# discard the events that could not be labelled
counterpress_match_events = counterpress_match_events[counterpress_match_events['label'] != 0]
counterpress_match_events = counterpress_match_events.reset_index(drop=True)

In [None]:
# number of labelled counterpress events, then a preview of the first rows
print(counterpress_match_events.shape[0])
counterpress_match_events.head(5)

1486


Unnamed: 0,match_id,id,minute,type,possession,possession_team,team,location,duel_outcome,interception_outcome,label
0,3835331,43ecc5a0-5c65-4145-8ba0-37d3b32d6aae,0,Block,3,Switzerland Women's,Switzerland Women's,"[34.7, 2.1]",,,2
1,3835331,8ee32089-269e-408f-850e-ebe70b2c6570,0,Block,4,Sweden Women's,Sweden Women's,"[96.9, 79.1]",,,2
2,3835331,23137461-1499-462e-99d9-e53d55cb45d1,12,Block,31,Sweden Women's,Sweden Women's,"[85.6, 16.2]",,,2
3,3835331,a4f05fa7-0567-47d1-b4de-90cb7da7a62c,17,Block,39,Switzerland Women's,Sweden Women's,"[91.7, 78.5]",,,2
4,3835331,5c003485-9ca5-4d6f-a000-afb7c80f9508,49,Block,130,Sweden Women's,Switzerland Women's,"[21.0, 9.2]",,,2


In [None]:
# i do the same for frame data
# Matches whose frames cannot be retrieved are still skipped (best effort), but
# they are recorded together with the error instead of being silently ignored.
# NOTE(review): fmt='dft' is not 'dataframe'; the following cells treat every
# element as a dict with 'event_uuid' and 'freeze_frame' keys, so presumably
# this returns the raw list of frame dicts - confirm against statsbombpy.
match_frames = []
matches_without_frames = {}
for m_id in match_ids:

  try:
    match_frames += sb.frames(match_id=m_id, fmt='dft')
  except Exception as error:  # a bare `except:` would also swallow KeyboardInterrupt / SystemExit
    matches_without_frames[m_id] = error

if matches_without_frames:
  print(f"No frame data for {len(matches_without_frames)} match(es): {list(matches_without_frames)}")

In [None]:
#keep just counterpressing frames
counterpresses_ids = list(counterpress_match_events['id'].unique())

# A set gives constant-time membership tests; the previous version rebuilt the
# id list and scanned it linearly for every single frame.
counterpress_id_set = set(counterpresses_ids)

counterpressing_frames = [
    frame for frame in match_frames
    if frame['event_uuid'] in counterpress_id_set
]

In [None]:
# number of frames whose event id belongs to a labelled counterpress event
len(counterpressing_frames)

1229

In [None]:
# Radius (in pitch coordinate units) around the counterpressing player within
# which other players are counted.
distance = 10
# Value stored as mean distance when no player of that side is within the radius.
# NOTE(review): 15 lies outside the radius, presumably to signal "nobody nearby" - confirm.
NO_PLAYER_DISTANCE = 15

ids = []
teammates = []
opponents = []

distance_teammates = []
distance_opponents = []

# event ids of frames that contain no actor entry (no features can be computed)
frames_without_actor = []

for dictionary in counterpressing_frames:

  freeze_frame = dictionary['freeze_frame']

  # location of the player performing the event (if several entries are
  # flagged as actor, the last one wins, as before)
  actor_location = None
  for freeze_dictionary in freeze_frame:
    if freeze_dictionary['actor'] == True:
      actor_location = freeze_dictionary['location']

  if actor_location is None:
    # Without an actor there is no reference point for the distances; indexing
    # None would raise a TypeError. The frame is skipped, so the inner join
    # with the events later simply leaves this event out.
    frames_without_actor.append(dictionary['event_uuid'])
    continue

  near_teammates = []  # distances of the teammates inside the radius
  near_opponents = []  # distances of the opponents inside the radius

  for freeze_dictionary in freeze_frame:

    if freeze_dictionary['actor'] == True:
      continue

    player_location = freeze_dictionary['location']
    # calculate_angle_and_distance returns (angle, distance); [1][0] is the distance value
    curr_dis = pitch.calculate_angle_and_distance(player_location[0], player_location[1], actor_location[0], actor_location[1])[1][0]
    within_radius = curr_dis <= distance

    if within_radius and freeze_dictionary['teammate'] == True:
      near_teammates.append(curr_dis)
    elif within_radius and freeze_dictionary['teammate'] == False:
      near_opponents.append(curr_dis)

  ids.append(dictionary['event_uuid'])
  teammates.append(len(near_teammates))
  opponents.append(len(near_opponents))
  distance_teammates.append(sum(near_teammates)/len(near_teammates) if near_teammates else NO_PLAYER_DISTANCE)
  distance_opponents.append(sum(near_opponents)/len(near_opponents) if near_opponents else NO_PLAYER_DISTANCE)

In [None]:
# i now build a df with one row per frame and five columns: the id of the event,
# the number of teammates and of opponents counted around the actor, and the
# mean distance of each of the two groups from the actor
frame_feature_columns = {
    'id': ids,
    'n_teammates': teammates,
    'n_opponents': opponents,
    'distance_teammates': distance_teammates,
    'distance_opponents': distance_opponents,
}
counterpress_frame_events = pd.DataFrame(frame_feature_columns)

In [None]:
# inner join on the event id: only the events that also have frame features are kept
counterpresses = counterpress_match_events.merge(
    counterpress_frame_events,
    how='inner',
    on='id',
)

counterpresses.tail(5)

Unnamed: 0,match_id,id,minute,type,possession,possession_team,team,location,duel_outcome,interception_outcome,label,n_teammates,n_opponents,distance_teammates,distance_opponents
1224,3835319,133101c3-3377-4d09-bfa7-86e59bd38c3d,91,Interception,177,Austria Women's,Austria Women's,"[13.7, 72.2]",,Won,3,2,2,4.28244,3.553482
1225,3835319,4603dde0-d584-418d-a5cc-7a67e9c27cd4,4,Block,9,England Women's,England Women's,"[15.3, 36.3]",,,2,3,4,3.156672,5.219857
1226,3835319,f0e18123-5010-4532-a635-cb946b636f0f,20,Block,37,England Women's,Austria Women's,"[58.2, 28.9]",,,2,2,2,4.332532,4.147252
1227,3835319,6c3b38ea-1217-43e1-93ad-95019b9247e7,49,Block,95,Austria Women's,Austria Women's,"[62.8, 54.7]",,,2,1,1,8.894135,0.699029
1228,3835319,c8317ca9-2e71-4a04-ac74-a65c06ed840c,84,Block,164,England Women's,Austria Women's,"[18.7, 31.3]",,,2,2,2,8.103583,6.220967


In [None]:
# these columns were only needed to build the labels
label_only_columns = ['possession_team', 'duel_outcome', 'interception_outcome']
counterpresses = counterpresses.drop(label_only_columns, axis='columns')

In [None]:
# code to obtain passes rate and duel rate
def opponent_passes(match_events, team):
  """Return the share of `team`'s passes that have no recorded 'pass_outcome'.

  NOTE(review): a missing 'pass_outcome' presumably marks a completed pass, which
  would make this the pass completion rate - confirm against the data spec.

  Args:
    match_events: DataFrame of events with at least the columns 'type', 'team'
      and 'pass_outcome'. All rows passed in are used, whatever match they
      belong to.
    team: value to select in the 'team' column.

  Returns:
    A float between 0 and 1, or NaN when `team` has no 'Pass' events (this case
    used to raise ZeroDivisionError).
  """
  team_passes = match_events[(match_events['type'] == 'Pass') & (match_events['team'] == team)]

  n_passes = len(team_passes)
  if n_passes == 0:
    return float('nan')

  n_without_outcome = int(team_passes['pass_outcome'].isna().sum())
  return n_without_outcome / n_passes

def opponent_duels(match_events, team):
  """Return the share of `team`'s duels and dribbles that were won.

  A 'Duel' event counts as won when its 'duel_outcome' is 'Won', 'Success',
  'Success In Play' or 'Success Out'; a 'Dribble' event counts as won when its
  'dribble_outcome' is 'Complete'.

  Args:
    match_events: DataFrame of events with at least the columns 'type', 'team',
      'duel_outcome' and 'dribble_outcome'. All rows passed in are used,
      whatever match they belong to.
    team: value to select in the 'team' column.

  Returns:
    A float between 0 and 1, or NaN when `team` has neither 'Duel' nor
    'Dribble' events (this case used to raise ZeroDivisionError).
  """
  is_team = match_events['team'] == team
  team_duels = match_events[is_team & (match_events['type'] == 'Duel')]
  team_dribbles = match_events[is_team & (match_events['type'] == 'Dribble')]

  n_attempts = len(team_duels) + len(team_dribbles)
  if n_attempts == 0:
    return float('nan')

  won_outcomes = ['Won', 'Success', 'Success In Play', 'Success Out']
  n_duels_won = int(team_duels['duel_outcome'].isin(won_outcomes).sum())
  n_dribbles_won = int((team_dribbles['dribble_outcome'] == 'Complete').sum())

  return (n_duels_won + n_dribbles_won) / n_attempts

In [None]:
# i build the columns to add to the dataset

# Teams of every match, taken from the full event data: reading them from the
# counterpress rows fails (IndexError) for a match in which only one of the two
# teams has counterpress rows.
teams_by_match = match_events.dropna(subset=['team']).groupby('match_id')['team'].unique()

# The rates are computed from all the events of a team, so they only depend on
# the team: each one is computed once and reused, instead of re-scanning
# match_events for every counterpress row.
rates_by_team = {}

opponent_passes_rate = []
opponent_duels_rate = []

for current_match, team in zip(counterpresses['match_id'], counterpresses['team']):

  # the opponent is the other team that appears in the events of the same match
  opponent = next((t for t in teams_by_match.loc[current_match] if t != team), None)
  if opponent is None:
    raise ValueError(f"no opponent found for team {team!r} in match {current_match}")

  if opponent not in rates_by_team:
    rates_by_team[opponent] = (
        opponent_passes(match_events, opponent),
        opponent_duels(match_events, opponent),
    )

  passes_rate, duels_rate = rates_by_team[opponent]
  opponent_passes_rate.append(passes_rate)
  opponent_duels_rate.append(duels_rate)

counterpresses['opponent_passes_rate'] = opponent_passes_rate
counterpresses['opponent_duels_rate'] = opponent_duels_rate

counterpresses.head(10)

Unnamed: 0,match_id,id,minute,type,possession,team,location,label,n_teammates,n_opponents,distance_teammates,distance_opponents,opponent_passes_rate,opponent_duels_rate
0,3835331,43ecc5a0-5c65-4145-8ba0-37d3b32d6aae,0,Block,3,Switzerland Women's,"[34.7, 2.1]",2,2,3,6.435611,2.917697,0.736264,0.441065
1,3835331,8ee32089-269e-408f-850e-ebe70b2c6570,0,Block,4,Sweden Women's,"[96.9, 79.1]",2,2,3,8.416836,6.372921,0.733746,0.405063
2,3835331,23137461-1499-462e-99d9-e53d55cb45d1,12,Block,31,Sweden Women's,"[85.6, 16.2]",2,3,2,7.11586,5.647225,0.733746,0.405063
3,3835331,a4f05fa7-0567-47d1-b4de-90cb7da7a62c,17,Block,39,Sweden Women's,"[91.7, 78.5]",2,0,1,15.0,0.970704,0.733746,0.405063
4,3835331,b2121e7b-0520-46ce-84cd-b20d85fcda35,68,Block,167,Switzerland Women's,"[5.3, 40.0]",2,3,2,5.114777,3.338236,0.736264,0.441065
5,3835331,d9de1589-aff4-456f-8717-b3dbe8bee28e,68,Block,167,Switzerland Women's,"[6.6, 42.9]",2,3,3,5.011389,3.138844,0.736264,0.441065
6,3835331,783f2bcf-741e-40cc-86ce-0e70c0f1ea74,72,Block,175,Switzerland Women's,"[74.9, 8.5]",2,1,3,7.405799,4.791692,0.736264,0.441065
7,3835331,33db3374-f5dc-4a46-9ea8-01770b84b65e,0,Duel,3,Switzerland Women's,"[35.3, 7.3]",3,4,4,5.282664,5.810635,0.736264,0.441065
8,3835331,e6bda229-0eb0-4825-8f86-560879b0af6c,1,Duel,6,Switzerland Women's,"[50.8, 1.2]",2,1,3,2.806449,4.278783,0.736264,0.441065
9,3835331,75f69ef9-090a-4fd5-b7dc-e9b8623fea39,5,Duel,14,Sweden Women's,"[74.3, 24.8]",3,2,2,6.438498,4.367994,0.733746,0.405063


In [None]:
# number of rows of the joined dataset, then a preview of the last rows
print(counterpresses.shape[0])
counterpresses.tail(5)

1229


Unnamed: 0,match_id,id,minute,type,possession,team,location,label,n_teammates,n_opponents,distance_teammates,distance_opponents,opponent_passes_rate,opponent_duels_rate
1224,3835319,133101c3-3377-4d09-bfa7-86e59bd38c3d,91,Interception,177,Austria Women's,"[13.7, 72.2]",3,2,2,4.28244,3.553482,0.805365,0.36036
1225,3835319,4603dde0-d584-418d-a5cc-7a67e9c27cd4,4,Block,9,England Women's,"[15.3, 36.3]",2,3,4,3.156672,5.219857,0.691047,0.342975
1226,3835319,f0e18123-5010-4532-a635-cb946b636f0f,20,Block,37,Austria Women's,"[58.2, 28.9]",2,2,2,4.332532,4.147252,0.805365,0.36036
1227,3835319,6c3b38ea-1217-43e1-93ad-95019b9247e7,49,Block,95,Austria Women's,"[62.8, 54.7]",2,1,1,8.894135,0.699029,0.805365,0.36036
1228,3835319,c8317ca9-2e71-4a04-ac74-a65c06ed840c,84,Block,164,Austria Women's,"[18.7, 31.3]",2,2,2,8.103583,6.220967,0.805365,0.36036


In [None]:
# expand the [x, y] 'location' lists into two separate coordinate columns
location_xy = counterpresses['location'].apply(pd.Series)
counterpresses[['loc_x', 'loc_y']] = location_xy
counterpresses = counterpresses.drop(columns='location')

# from here on only the model inputs and the target are kept
feature_columns = [
    'loc_x', 'loc_y', 'minute',
    'n_teammates', 'n_opponents',
    'distance_teammates', 'distance_opponents',
    'opponent_passes_rate', 'opponent_duels_rate',
]
counterpresses = counterpresses[feature_columns + ['label']]

In [None]:
# preview of the final dataset: model inputs plus the 'label' column
counterpresses.tail(5)

Unnamed: 0,loc_x,loc_y,minute,n_teammates,n_opponents,distance_teammates,distance_opponents,opponent_passes_rate,opponent_duels_rate,label
1224,13.7,72.2,91,2,2,4.28244,3.553482,0.805365,0.36036,3
1225,15.3,36.3,4,3,4,3.156672,5.219857,0.691047,0.342975,2
1226,58.2,28.9,20,2,2,4.332532,4.147252,0.805365,0.36036,2
1227,62.8,54.7,49,1,1,8.894135,0.699029,0.805365,0.36036,2
1228,18.7,31.3,84,2,2,8.103583,6.220967,0.805365,0.36036,2


In [None]:
# number of samples of each class, printed in the order 1, 2, 3
for class_label in (1, 2, 3):
  class_rows = counterpresses[counterpresses['label'] == class_label]
  print(len(class_rows))

202
377
650


# Classification model

I apply the XGBoost algorithm to the created dataset. Since I observe that class 3 has a significantly higher cardinality, I apply the algorithm first with the defined 3 classes and then with 2 classes combining classes 1 and 2 (ball not recovered).

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

## Three classes

In [None]:
counterpresses_copy = counterpresses.copy()
X = counterpresses.drop('label', axis=1)
# remap the labels 1/2/3 to the consecutive class ids 0/1/2
map_dict = {1:0, 2:1, 3:2}
y = np.array([map_dict[x] for x in counterpresses['label']])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an XGBoost classifier
# `num_classes=3` was removed: it is not a documented XGBoost parameter (the
# native name is `num_class`), and the scikit-learn wrapper derives the number
# of classes from the labels passed to fit().
model = XGBClassifier(objective='multi:softmax',
                      learning_rate =0.05,
                      max_depth=4,
                      min_child_weight=0.8,
                      gamma=0.4,
                      reg_lambda=0.2,
                      reg_alpha=0.1,
                      subsample=0.8,
                      nthread=4,
                      random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

In [None]:
# Share of the test samples whose class was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Precision, recall and F1 of each of the three classes
class_names = ['Class 1', 'Class 2', 'Class 3']
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

Accuracy: 58.54%
Classification Report:
              precision    recall  f1-score   support

     Class 1       0.50      0.02      0.04        52
     Class 2       0.46      0.50      0.48        64
     Class 3       0.63      0.85      0.73       130

    accuracy                           0.59       246
   macro avg       0.53      0.46      0.42       246
weighted avg       0.56      0.59      0.52       246



In [None]:
# Importance score of every input feature, taken from the fitted model
feature_importance = model.feature_importances_
feature_names = X.columns

# One row per feature, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)

print("Feature Importance:")
print(feature_importance_df)

Feature Importance:
                Feature  Importance
0                 loc_x    0.143313
4           n_opponents    0.140572
5    distance_teammates    0.112149
1                 loc_y    0.107397
7  opponent_passes_rate    0.106868
8   opponent_duels_rate    0.103953
2                minute    0.099498
6    distance_opponents    0.096505
3           n_teammates    0.089744


## Two classes


In [None]:
# parameter reference: https://xgboost.readthedocs.io/en/stable/parameter.html
counterpresses_copy = counterpresses.copy()

# binary target: labels 1 and 2 are merged into class 0, label 3 becomes class 1
map_dict = {1:0, 2:0, 3:1}
X = counterpresses.drop(columns='label')
y = np.array([map_dict[label] for label in counterpresses['label']])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# binary XGBoost classifier
binary_params = dict(
    objective='binary:logistic',
    learning_rate=0.05,
    max_depth=4,
    gamma=0.4,
    reg_lambda=0.2,
    reg_alpha=0.1,
    subsample=0.8,
    nthread=4,
    random_state=42,
)
model = XGBClassifier(**binary_params)

# fit on the training split, then predict the test split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print the classification report
# In the binary setting class 0 merges the original labels 1 and 2 (ball not
# recovered) and class 1 is the original label 3 (ball recovered). The report
# rows are named after that meaning: the previous 'Class 1' / 'Class 2' names
# clashed with the label numbering used in the rest of the notebook.
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Ball not recovered', 'Ball recovered']))

Accuracy: 70.73%
Classification Report:
              precision    recall  f1-score   support

     Class 1       0.70      0.66      0.68       116
     Class 2       0.71      0.75      0.73       130

    accuracy                           0.71       246
   macro avg       0.71      0.70      0.71       246
weighted avg       0.71      0.71      0.71       246



In [None]:
# Importance score of every input feature, taken from the fitted binary model
feature_names = X.columns
feature_importance = model.feature_importances_

# Table of features ranked from the most to the least important
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)

Feature Importance:
                Feature  Importance
4           n_opponents    0.158729
0                 loc_x    0.141203
5    distance_teammates    0.113803
1                 loc_y    0.109876
8   opponent_duels_rate    0.102244
6    distance_opponents    0.098902
3           n_teammates    0.096547
2                minute    0.091884
7  opponent_passes_rate    0.086812


## Results analysis
Some thoughts on the results:
- The performance of both models is not exceptional, but it is good (for reference, always predicting the most frequent class, "ball recovered", would score about 53% on this test split: 130 of 246 samples). This suggests that it is possible to predict the outcome of a counterpress action using event and tracking data, so it is worth adding new features and studying the problem in more depth.
- As expected, the classification works better with 2 classes. In the 3-class setting the classes are very unbalanced, but this imbalance should not be intrinsic to the problem: counterpresses where the opponent is not blocked should not, by nature, be much rarer than counterpresses where the ball is recovered. It could instead be a characteristic of this dataset, so training the model on a different dataset could improve its performance with 3 classes.


## How to use?
Let's say we're preparing a match for a team and we have to suggest to them "when to counter-press". The problem here is that we don’t know which counterpress actions will happen during the next match, so we can’t predict their outcome. In addition, the answer to this big question has to be short, simple and manageable for the players during the match, in the form of simple insights, for example "we should counter-press around the midfield on the left and close to the opponents' goal on the right, with at least 3 players".
I can see two ways of using this model:
- We already have the model trained. We want to build a dataset to which we can apply the model. To build it, we proceed as follows:
    - For the features relating to the opponent, we insert real data about the opponent.
    - For the features relating to the counterpress itself, we generate synthetic data reproducing the distribution of the same features in the counterpresses of the last tournament matches (or Italy matches, ...).
    
  The idea is to generate a dataset that allows us to study the counterpresses likely to happen during the match and, by applying the model to it, the predicted outcome of each one. In fact, we then apply the model to this dataset and explore the predictions to extract valuable, easy-to-understand insights such as: Where should we counter-press? When? With how many players?
- We build a dataset more focused on the opponent, for example considering just their matches. We train the model on that dataset. We then explore the feature importance and focus on the top 3-4 features. For example: location, number of opponents, distance from team-mates; in which areas should we counter-press more? What is the critical number of opponents that makes a counter-press less likely to have a positive outcome?

As a final note, after this analysis of the possible applications of the model, I think it would be helpful to increase the number of opponent-specific features, as these are the ones that make the model's insights more match-specific and, in the first case, are also the ones that we don't need to generate synthetically.
