# EDA PySpark StatsBomb: Three-Sixty

### Imports

In [1]:
import random 
import plotly.graph_objects as go
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from eda_functions import create_football_field_plotly

In [None]:
# Location of the "three-sixty" JSON data (relative to this notebook) that is loaded into Spark below
three_sixty_file_path = "../../../open-data/data/three-sixty"

## Part 1 : Data Loading and Preprocessing

In [2]:
# Start the Spark session used by this notebook (getOrCreate reuses an already running one)
spark = (
    SparkSession.builder
    .appName("Football Data Analysis : Three-Sixty")
    .getOrCreate()
)

In [3]:
# Load the JSON data found under three_sixty_file_path into a single DataFrame.
# The multiLine option allows a JSON record to span several lines of a file.
three_sixty_df = (
    spark.read
    .option("multiLine", True)
    .json(three_sixty_file_path)
)

# Show the schema Spark inferred for the loaded data
three_sixty_df.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- event_uuid: string (nullable = true)
 |-- freeze_frame: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- actor: boolean (nullable = true)
 |    |    |-- keeper: boolean (nullable = true)
 |    |    |-- location: array (nullable = true)
 |    |    |    |-- element: double (containsNull = true)
 |    |    |-- teammate: boolean (nullable = true)
 |-- visible_area: array (nullable = true)
 |    |-- element: double (containsNull = true)


In [4]:
# Preview the first 5 untruncated values of each of the two main fields
for field_name in ("visible_area", "freeze_frame"):
    three_sixty_df.select(field_name).show(5, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------+
|visible_area                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------+
|[120.0, 80.0, 0.0, 80.0, 0.0, 77.9359903100536, 38.4846328988436, 0.0, 82.8377301424974, 0.0, 120.0, 77.6086776060678, 120.0, 80.0]|
|[0.0, 80.0, 0.0, 68.3780973203674, 35.4551602560713, 0.0, 81.4593944743986, 0.0, 118.065295451212, 80.0, 0.0, 80.0]                |
|[0.0, 80.0, 0.0, 68.3780973203674, 35.4551602560713, 0.0, 81.4593944743986, 0.0, 118.065295451212, 80.0, 0.0, 80.0]                |
|[2.18856831879949, 80.0, 37.9809130991844, 0.0, 80.364694539173, 0.0, 114.450327068229, 80.0, 2.18856831879949, 80.0]              |
|[13.952798163308, 80.0, 42.7269914891536, 0.0, 79.77721834624

## Part 2 : Data Analysis

### Analysis of frames

In [32]:
# Display one example frame on the pitch drawn by create_football_field_plotly (eda_functions.py).
# A small random sample is taken first so that picking one frame stays cheap.

# A new seed is drawn on every run so a different frame is shown each time;
# replace it with a fixed value to reproduce a specific plot.
sample_seed = random.randint(0, 100)

random_sample_df = (
    three_sixty_df
    # Rows lacking a freeze frame or a visible area cannot be plotted
    # (presumably the rows Spark flagged through _corrupt_record -- TODO confirm)
    .filter(col("freeze_frame").isNotNull() & col("visible_area").isNotNull())
    .sample(withReplacement=False, fraction=0.01, seed=sample_seed)
    .limit(1)
)

# Collect the sampled row ONCE. Running two separate actions on a sampled,
# un-ordered limit(1) DataFrame recomputes it each time, which is wasteful and
# gives no guarantee that players and visible area come from the same frame.
sampled_rows = random_sample_df.select("freeze_frame", "visible_area").collect()
if not sampled_rows:
    raise ValueError(
        "The random sample contained no plottable frame; "
        "re-run the cell or increase the sampling fraction."
    )
sampled_frame = sampled_rows[0]

# Players of the sampled frame that carry a usable location
positions_list = [
    player for player in sampled_frame.freeze_frame
    if player is not None and player.location
]

# Separate the coordinates by teammate status (a missing flag counts as opponent)
x_teammates = [pos.location[0] for pos in positions_list if pos.teammate]
y_teammates = [pos.location[1] for pos in positions_list if pos.teammate]
x_opponents = [pos.location[0] for pos in positions_list if not pos.teammate]
y_opponents = [pos.location[1] for pos in positions_list if not pos.teammate]

# visible_area is a flat list [x0, y0, x1, y1, ...]; keep it in a list of areas
visible_areas = [sampled_frame.visible_area]

# Create the football field plot
fig = create_football_field_plotly()

# Plot each visible area as a filled polygon
for area in visible_areas:
    x_coords = list(area[0::2])  # even indices hold the x coordinates
    y_coords = list(area[1::2])  # odd indices hold the y coordinates
    if not x_coords or not y_coords:
        continue  # nothing to draw for an empty area

    # Add the first point again to close the polygon
    x_coords.append(x_coords[0])
    y_coords.append(y_coords[0])

    # The trace is added inside the loop so every area is drawn and
    # x_coords / y_coords are never read before being assigned
    fig.add_trace(go.Scatter(
        x=x_coords,
        y=y_coords,
        fill='toself',
        mode='lines',
        line=dict(color='blue'),
        name='Visible area',
    ))

# Plot player positions
fig.add_trace(go.Scatter(x=x_teammates, y=y_teammates, mode='markers', marker=dict(color='turquoise', size=10), name='Teammates'))
fig.add_trace(go.Scatter(x=x_opponents, y=y_opponents, mode='markers', marker=dict(color='red', size=10), name='Opponents'))

# Update layout and show plot
fig.update_layout(title='Football Field with Visible Areas and Player Positions')
fig.show()