# Eda PySpark Statbombs : Events

### Imports

In [26]:
from eda_functions import create_football_field_plotly, add_scaled_size_crosses
import plotly.express as px
import plotly.graph_objects as go
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, count, when, ceil, input_file_name, regexp_replace, struct

In [None]:
root_dir = "../../../open-data/data/events"

## Part 1 : Data Loading and Preprocessing

In [27]:
# Create a Spark session
spark = SparkSession.builder.appName("Football Data Analysis : Events").getOrCreate()

In [28]:
# Original 
events_df = spark.read.json(f"{root_dir}/*.json", multiLine=True)
events_df.printSchema()

root
 |-- 50_50: struct (nullable = true)
 |    |-- outcome: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- bad_behaviour: struct (nullable = true)
 |    |-- card: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- ball_receipt: struct (nullable = true)
 |    |-- outcome: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- ball_recovery: struct (nullable = true)
 |    |-- offensive: boolean (nullable = true)
 |    |-- recovery_failure: boolean (nullable = true)
 |-- block: struct (nullable = true)
 |    |-- deflection: boolean (nullable = true)
 |    |-- offensive: boolean (nullable = true)
 |    |-- save_block: boolean (nullable = true)
 |-- carry: struct (nullable = true)
 |    |-- end_location: array (nullable = true)
 |    |    |-- element: double (containsNull = true)
 |-- clearan

In [29]:
# Original Schema miss match_id so we add it manually
# Read the JSON files and include the filename to extract match id later
events_df = spark.read.json(f"{root_dir}/*.json", multiLine=True) \
    .withColumn("file_name", input_file_name())

# Extract match_id from the filename by removing the .json extension
events_df = events_df.withColumn("match_id", regexp_replace("file_name", r"(.*/)?([^/]+)\.json$", "$2"))

# Nest the existing structure inside the match_id struct
events_df = events_df.select(struct("*").alias("events"), "match_id")
events_df.printSchema()

root
 |-- events: struct (nullable = false)
 |    |-- 50_50: struct (nullable = true)
 |    |    |-- outcome: struct (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- bad_behaviour: struct (nullable = true)
 |    |    |-- card: struct (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- ball_receipt: struct (nullable = true)
 |    |    |-- outcome: struct (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |-- ball_recovery: struct (nullable = true)
 |    |    |-- offensive: boolean (nullable = true)
 |    |    |-- recovery_failure: boolean (nullable = true)
 |    |-- block: struct (nullable = true)
 |    |    |-- deflection: boolean (nullable = true)
 |    |    |-- offensive: boolean (nullable = true)
 |    |    |-- save_block: boolean (nullable = true)
 |    |-- carry:

## Part 2 : Data Analysis

### Events distributions

In [30]:
distinct_events_df =  events_df.select(col("events.type.name")).distinct()
events_count = distinct_events_df.count()

event_names_list = [row["name"] for row in distinct_events_df.select("name").collect()]

In [31]:
print("Number of different events possible:", events_count, "\n")
print("Different events possible: \n", event_names_list)

Number of different events possible: 35 

Different events possible: 
 ['Camera off', 'Tactical Shift', 'Shot', 'Referee Ball-Drop', 'Dispossessed', 'Own Goal For', 'Injury Stoppage', 'Duel', 'Bad Behaviour', 'Foul Won', 'Player Off', 'Shield', 'Half End', 'Camera On', 'Starting XI', 'Carry', 'Clearance', 'Ball Recovery', 'Dribbled Past', 'Substitution', 'Goal Keeper', 'Half Start', 'Ball Receipt*', 'Offside', 'Player On', 'Dribble', 'Foul Committed', 'Interception', 'Error', 'Own Goal Against', 'Pressure', '50/50', 'Block', 'Pass', 'Miscontrol']


In [32]:
distribution_events_df = events_df.groupby(col("events.type.name")).count().orderBy("count",ascending = False)

In [33]:
distribution_events_df.show()

+--------------+-------+
|          name|  count|
+--------------+-------+
|          Pass|3276889|
| Ball Receipt*|3065871|
|         Carry|2543840|
|      Pressure|1078359|
| Ball Recovery| 356058|
|          Duel| 250553|
|     Clearance| 154668|
|         Block| 127618|
|       Dribble| 118883|
|   Goal Keeper| 102794|
|Foul Committed|  97498|
|    Miscontrol|  96521|
|      Foul Won|  92722|
|  Dispossessed|  86260|
|          Shot|  84981|
|  Interception|  77768|
| Dribbled Past|  74583|
|  Substitution|  20452|
|      Half End|  13608|
|    Half Start|  13608|
+--------------+-------+


In [34]:
# Plot top 10 distrubition of events 
top_10_events_df = distribution_events_df.limit(10)

# Collect the DataFrame to a list of dictionaries
collect_top_10_events = [row.asDict() for row in top_10_events_df.collect()]

# Extract x and y values for plotting
event_types = [item["name"] for item in collect_top_10_events]
event_counts = [item["count"] for item in collect_top_10_events]

# Create the bar plot
fig = px.bar(x=event_types, y=event_counts, title="Top 10 Events Distribution", labels={"x": "Event Type", "y": "Count"})


fig.show()

### Goals Analysis : Location of the shooter

In [35]:
# Get all shots that resulted in a goal
all_goals_df = events_df.filter(col("events.shot.outcome.name") == "Goal").select('events.shot', 'events.player','events.position','events.location')

In [36]:
all_goals_df.show()

+--------------------+--------------------+--------------------+-------------+
|                shot|              player|            position|     location|
+--------------------+--------------------+--------------------+-------------+
|{NULL, {40, Right...|{4320, Neymar da ...|{19, Center Attac...|[116.5, 47.2]|
|{NULL, {38, Left ...|{7693, Bruno Petk...|{22, Right Center...|[103.2, 38.6]|
|{NULL, {40, Right...|{3441, Nikola Vla...|{12, Right Midfield}|[108.0, 40.0]|
|{NULL, {38, Left ...|{28914, Lovro Majer}|{9, Right Defensi...|[108.0, 40.0]|
|{NULL, {40, Right...|{5539, Carlos Hen...|{11, Left Defensi...|[108.0, 40.0]|
|{NULL, {40, Right...| {5463, Luka Modrić}|{11, Left Defensi...|[108.0, 40.0]|
|{NULL, {40, Right...|{25305, Pedro Gui...|{23, Center Forward}|[108.0, 40.0]|
|{NULL, {40, Right...|{16527, Mislav Or...| {16, Left Midfield}|[108.0, 40.0]|
|{NULL, {40, Right...|{3533, Xherdan Sh...|{19, Center Attac...|[109.2, 44.9]|
|{NULL, {40, Right...|{5834, Mario Gavr...|{23, Cent

The top most occurent locations are practicaly the same because of the penalties. To avoid this bias we will round the locations to the nearest integer.
Note that the goals shooted from the penalty spot ~[108.0, 40.0] are not necessary penalties shot but can be shots in open play.

In [37]:
all_goals_df = all_goals_df.withColumn("rounded_location", expr("transform(location, x -> round(x))"))
all_goals_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [38]:
top_10_goals_locations_df = all_goals_df.groupby("rounded_location").count().orderBy('count',ascending = False).limit(10)
top_locations_list = [row['rounded_location'] for row  in top_10_goals_locations_df.select("rounded_location").collect()]
top_counts_list = [row['count'] for row  in top_10_goals_locations_df.select("count").collect()]

In [39]:
print("Top 10 Goal Locations : \n")

# Create the football field plot
fig = create_football_field_plotly(ratio=0.75)

fig = add_scaled_size_crosses(fig,top_locations_list,top_counts_list)

fig.show()

Top 10 Goal Locations : 


### Goals Analysis : Type of Shots and Techniques efficiency

In [40]:
all_shots_df = events_df.select("events.shot", "events.player","events.position","events.location")

type_of_shots_counts_df = all_shots_df.groupBy("shot.type.name").agg(
    count(when(col("shot.outcome.name") == "Goal", True)).alias("goal_count"),
    count(when(col("shot.outcome.name") != "Goal", True)).alias("other_outcome_count"),
    count("*").alias("total_count")
)

# Compute the success rate
type_of_shots_counts_df = type_of_shots_counts_df.withColumn("success_rate", col("goal_count") / col("total_count")).orderBy('success_rate', ascending = True)

type_of_shots_counts_df.show(truncate=False)

+---------+----------+-------------------+-----------+-------------------+
|name     |goal_count|other_outcome_count|total_count|success_rate       |
+---------+----------+-------------------+-----------+-------------------+
|NULL     |0         |0                  |11710194   |0.0                |
|Kick Off |0         |1                  |1          |0.0                |
|Free Kick|275       |3871               |4146       |0.06632899179932465|
|Open Play|8251      |71334              |79585      |0.10367531570019475|
|Corner   |10        |14                 |24         |0.4166666666666667 |
|Penalty  |910       |315                |1225       |0.7428571428571429 |
+---------+----------+-------------------+-----------+-------------------+


In [41]:
technique_of_shots_counts_df = all_shots_df.groupBy("shot.technique.name").agg(
    count(when(col("shot.outcome.name") == "Goal", True)).alias("goal_count"),
    count(when(col("shot.outcome.name") != "Goal", True)).alias("other_outcome_count"),
    count("*").alias("total_count")
)

# Compute the success rate
technique_of_shots_counts_df = technique_of_shots_counts_df.withColumn("success_rate", col("goal_count") / col("total_count")).orderBy('success_rate', ascending = True)

technique_of_shots_counts_df.show(truncate=False)

+-------------+----------+-------------------+-----------+-------------------+
|name         |goal_count|other_outcome_count|total_count|success_rate       |
+-------------+----------+-------------------+-----------+-------------------+
|NULL         |0         |0                  |11710194   |0.0                |
|Overhead Kick|37        |404                |441        |0.08390022675736962|
|Half Volley  |1147      |10170              |11317      |0.10135194839621808|
|Backheel     |34        |298                |332        |0.10240963855421686|
|Normal       |7258      |59004              |66262      |0.10953487670157858|
|Volley       |673       |4800               |5473       |0.12296729398867166|
|Diving Header|67        |253                |320        |0.209375           |
|Lob          |230       |606                |836        |0.2751196172248804 |
+-------------+----------+-------------------+-----------+-------------------+


In [42]:
type_data = type_of_shots_counts_df.collect()
type_data_dict = [row.asDict() for row in type_data]

technique_data = technique_of_shots_counts_df.collect()
technique_data_dict = [row.asDict() for row in technique_data]

# Extract x and y values for plotting shot types
type_names = [item["name"] for item in type_data_dict]
type_success_rates = [item["success_rate"] * 100 for item in type_data_dict]  # Convert to percentage

# Extract x and y values for plotting shot techniques
technique_names = [item["name"] for item in technique_data_dict]
technique_success_rates = [item["success_rate"] * 100 for item in technique_data_dict]  # Convert to percentage

# Create horizontal bar plot for shot types
fig1 = px.bar(
    x=type_success_rates,
    y=type_names,
    orientation='h',
    title="Success Rate by Shot Type",
    labels={"x": "Success Rate (%)", "y": "Shot Type"}
)

# Create horizontal bar plot for shot techniques
fig2 = px.bar(
    x=technique_success_rates,
    y=technique_names,
    orientation='h',
    title="Success Rate by Shot Technique",
    labels={"x": "Success Rate (%)", "y": "Shot Technique"}
)

fig1.show()
fig2.show()

### Intervals with the most goals and fouls

In [43]:
# Create a helper column to bin the time into 10-minute intervals
match_moments_df = events_df.withColumn("time_bin", ceil(col("events.minute") / 10))

# Distribution of fouls during a match in 10-minute bins
fouls_distribution = match_moments_df.filter(col("foul_committed").isNotNull()) \
    .groupBy("time_bin").count() \
    .withColumnRenamed("count", "fouls_count") \
    .orderBy("time_bin", ascending=True)
    

# Distribution of goals during a match in 10-minute bins
goals_distribution = match_moments_df.filter(col("shot.outcome.name") == "Goal") \
    .groupBy("time_bin").count() \
    .withColumnRenamed("count", "goals_count") \
    .orderBy("time_bin", ascending=True)

In [44]:
# Show the results
fouls_distribution.show()
goals_distribution.show()

+--------+-----------+
|time_bin|fouls_count|
+--------+-----------+
|       0|        146|
|       1|       2091|
|       2|       2588|
|       3|       2830|
|       4|       3053|
|       5|       3355|
|       6|       3248|
|       7|       3249|
|       8|       3206|
|       9|       3372|
|      10|       1037|
|      11|         26|
|      12|         37|
|      13|          3|
+--------+-----------+
+--------+-----------+
|time_bin|goals_count|
+--------+-----------+
|       0|         32|
|       1|        803|
|       2|        860|
|       3|        912|
|       4|        952|
|       5|       1112|
|       6|       1089|
|       7|       1047|
|       8|        995|
|       9|       1094|
|      10|        351|
|      11|         13|
|      12|         28|
|      13|        148|
|      14|         10|
+--------+-----------+


In [45]:
# Collect data from DataFrames
fouls_data = fouls_distribution.collect()
goals_data = goals_distribution.collect()

# Convert to dictionary lists
fouls_dict = {row["time_bin"]: row["fouls_count"] for row in fouls_data}
goals_dict = {row["time_bin"]: row["goals_count"] for row in goals_data}

# Get all unique time bins
time_bins = sorted(set(fouls_dict.keys()).union(set(goals_dict.keys())))

# Prepare data for plotting
fouls_counts = [fouls_dict.get(bin, 0) for bin in time_bins]
goals_counts = [goals_dict.get(bin, 0) for bin in time_bins]

# Create custom x-axis labels for moments of the match
x_labels = [f"{i*10}-{(i+1)*10}" for i in time_bins]


fig = go.Figure()

fig.add_trace(go.Bar(
    x=x_labels,
    y=fouls_counts,
    name='Fouls Count',
    marker_color='indianred'
))

fig.add_trace(go.Bar(
    x=x_labels,
    y=goals_counts,
    name='Goals Count',
    marker_color='lightsalmon'
))

# Add vertical dotted lines for prolongation and penalty time
fig.add_vline(x=len(x_labels) - 6.5, line=dict(color="blue", width=2, dash="dot"), annotation_text="Prolongation", annotation_position="top right")
fig.add_vline(x=len(x_labels) - 3.5, line=dict(color="green", width=2, dash="dot"), annotation_text="Penalty Time", annotation_position="top right")

# Update layout to add some legends
fig.update_layout(
    title='Fouls and Goals Distribution Over Time Bins<br><sup>This plot does not take into account overruns in additional time.</sup>',
    xaxis=dict(title='Match Time (minutes)'),
    yaxis=dict(title='Count'),
    barmode='group' 
)

fig.show()