# EDA PySpark StatsBomb : Matches

### Imports

In [48]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg,col, to_date, dayofmonth, month, count, year, sum as _sum, min as _min, max as _max 
import pandas as pd
from eda_functions import count_features
import plotly.express as px
import plotly.graph_objects as go
import os
import sys

In [49]:
# Point both PySpark interpreter environment variables at the interpreter running this notebook
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})
# Root folder that holds the match JSON files
root_dir = "../../open-data/data/matches"

## Part 1 : Data Loading and Preprocessing

In [50]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Football Data Analysis : Matches")
    .getOrCreate()
)

In [51]:
# Load every JSON file matched by the glob below root_dir into a single DataFrame.
# multiLine=True lets one JSON document span several lines of a file.
matches_df = spark.read.json(
    path=f"{root_dir}/**/*.json",
    multiLine=True,
)
matches_df.printSchema()

root
 |-- away_score: long (nullable = true)
 |-- away_team: struct (nullable = true)
 |    |-- away_team_gender: string (nullable = true)
 |    |-- away_team_group: string (nullable = true)
 |    |-- away_team_id: long (nullable = true)
 |    |-- away_team_name: string (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- managers: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- country: struct (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- dob: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- nickname: string (nullable = true)
 |-- competition: struct (nullable = true)
 |    |-- competition_id: long (nullable = true)
 |    |-- competition_nam

In [52]:
# Count features at each level of the schema.
# count_features is imported from eda_functions and receives the DataFrame's schema;
# judging by the printed output it reports one feature count per nesting level
# (NOTE(review): exact counting rules live in eda_functions — confirm there).
count_features(matches_df.schema)

18 features at level 0
28 features at level 1
18 features at level 2
4 features at level 3


In [55]:
"""
# Get the list of JSON files
json_files = spark.sparkContext.wholeTextFiles(root_dir + "/**/*.json").keys().collect()
# Count the number of JSON files
num_files = len(json_files)
print(f"Number of JSON files read: {num_files}")

--> This way worked for the other folders but not for this one because the json files are nested in subdirectories and not at the same level as the root directory.
"""
# Function to count JSON files
def count_json_files(directory):
    """Return how many ``.json`` files exist under *directory*.

    The tree is traversed with ``os.walk``, so files located in nested
    sub-folders are counted too. A file is counted when its name ends with
    the exact, case-sensitive suffix ``.json``.
    """
    return sum(
        1
        for _, _, names in os.walk(directory)
        for name in names
        if name.endswith(".json")
    )

# Count the JSON files in root_dir and all of its subdirectories with the
# os.walk-based helper, then report the total
num_files = count_json_files(root_dir)
print(f"Number of JSON files read: {num_files}")

Number of JSON files read: 72


In [None]:
# Get the list of JSON files: wholeTextFiles produces (path, content) pairs and
# only the paths (the keys) are collected to the driver.
# NOTE(review): the note in the previous cell says this approach did not work for
# this folder, and running this cell overwrites the num_files value computed with
# count_json_files — confirm whether this cell is still needed.
json_files = spark.sparkContext.wholeTextFiles(root_dir + "/**/*.json").keys().collect()
# Count the number of JSON files
num_files = len(json_files)
print(f"Number of JSON files read: {num_files}")

In [None]:
# Convert match_date to a date column, parsing it with the yyyy-MM-dd pattern
matches_df = matches_df.withColumn(
    "match_date",
    to_date(col("match_date"), "yyyy-MM-dd"),
)

# Derive the day-of-month, year and month columns from match_date
matches_df = (
    matches_df
    .withColumn("day", dayofmonth(col("match_date")))
    .withColumn("year", year(col("match_date")))
    .withColumn("month", month(col("match_date")))
)

## Part 2 : Data Analysis

In [None]:
# Distinct season names present in the data
seasons_df = matches_df.select(col("season.season_name")).distinct()

# Number of unique seasons; also used as the row limit so every season is displayed
num_seasons = seasons_df.count()

seasons_df.orderBy(col("season_name")).show(num_seasons, truncate=False)

In [None]:
# Total number of rows (matches) in the dataset
num_matches = matches_df.count()
date_df = matches_df.select("match_date")

# Number of distinct competition names
num_competitions = (
    matches_df
    .select("competition.competition_name")
    .distinct()
    .count()
)

# Earliest and latest match dates, computed in a single aggregation
date_range = date_df.agg(
    _min(col("match_date")).alias("begin_date"),
    _max(col("match_date")).alias("end_date"),
).collect()

# The aggregation yields one row that carries both bounds
begin_date = date_range[0]["begin_date"]
end_date = date_range[0]["end_date"]

# Print the summary (num_seasons comes from the seasons cell)
print(f"Dates of matches goes from {begin_date} to {end_date}")
print(f"Number of matches: {num_matches}")
print(f"Number of seasons: {num_seasons}")
print(f"Number of competitions: {num_competitions}")

### Number of Matches per Year in our Data

In [None]:
# Number of matches per year, counted on the match_id column
matches_per_year_df = (
    matches_df
    .groupBy("year")
    .agg(count(col("match_id")).alias("number_of_matches"))
)

# Use the number of groups as the row limit so every year is displayed
matches_per_year_df.show(n=matches_per_year_df.count(), truncate=False)

In [None]:
# Bring the per-year counts to the driver in chronological order
matches_per_year_list = matches_per_year_df.orderBy("year").collect()

# Split the collected rows into two parallel lists
years = [row.year for row in matches_per_year_list]
number_of_matches = [row.number_of_matches for row in matches_per_year_list]

# Column-oriented mapping consumed by plotly express
data = dict(year=years, number_of_matches=number_of_matches)

# Line chart with one marker per year
fig = px.line(
    data,
    x='year',
    y='number_of_matches',
    title='Number of Matches in our Data per Year',
    markers=True,
)

# Axis titles and theme
fig.update_layout(dict(
    xaxis_title='Year',
    yaxis_title='Number of Matches',
    template='plotly_white',
))

fig.show()

### Some stats about the begin / end date of the matches

In [None]:
# Keep only the season name (flattened out of the season struct) and the match date
season_dates_df = matches_df.select(
    col("season.season_name").alias("season_name"),
    "match_date",
)

# First and last match date of every season
season_date_range = (
    season_dates_df
    .groupBy("season_name")
    .agg(
        _min(col("match_date")).alias("begin_date"),
        _max(col("match_date")).alias("end_date"),
    )
)

# Display the result without truncating cell contents
season_date_range.show(truncate=False)

In [None]:
# Number of matches for every (day, month) combination, all years pooled together
matches_per_day_month_df = (
    matches_df
    .groupBy(col("day"), col("month"))
    .agg(count("*").alias("number_of_matches"))
)

matches_per_day_month_df.show()

### Number of matches for each day of the year

In [None]:
# Heatmap of the number of matches per calendar day (day of month on x, month on y).
# Group by day and month and count the number of matches
# (same aggregation as in the previous cell)
matches_per_day_month_df = matches_df.groupBy("day", "month").agg(count("*").alias("number_of_matches"))

# Collect the results to the driver as a list of rows
matches_per_day_month_list = matches_per_day_month_df.collect()

# Convert to a dictionary keyed by (month, day) with the match count as value
matches_dict = {(row['month'], row['day']): row['number_of_matches'] for row in matches_per_day_month_list}

# Create a pandas DataFrame from the dictionary: one row per (month, day) key
matches_data = pd.DataFrame(list(matches_dict.items()), columns=['month_day', 'number_of_matches'])

# Split the month_day tuples into separate month and day columns
matches_data[['month', 'day']] = pd.DataFrame(matches_data['month_day'].tolist(), index=matches_data.index)

# Build a date in a reference year from month and day (here 2024; it is a leap
# year, so a 29 February entry is still a valid date)
matches_data['date'] = pd.to_datetime('2024-' + matches_data['month'].astype(str) + '-' + matches_data['day'].astype(str))

# Set date as the index; the heatmap below reads day and month back from it
matches_data.set_index('date', inplace=True)

# Create a heatmap: x and y are passed as arrays taken from the index, and the
# 'x' / 'y' / 'z' keys in labels provide their display names.
# nbinsx=31 and nbinsy=12 request one bin per day of month and per month.
fig = px.density_heatmap(
    matches_data,
    x=matches_data.index.day,
    y=matches_data.index.month,
    z='number_of_matches',
    color_continuous_scale='Blues',
    nbinsx=31,
    nbinsy=12,
    labels={'x': 'Day', 'y': 'Month', 'z': 'Number of Matches'}
)

# Update layout: titles, month names in place of month numbers on the y axis,
# and the colour bar title
fig.update_layout(
    title='Number of Matches for Each Day of the Year (Summed Across Multiple Years)',
    xaxis_title='Day',
    yaxis_title='Month',
    yaxis=dict(tickmode='array', tickvals=list(range(1, 13)), ticktext=[
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December'
    ]),
    coloraxis_colorbar=dict(title='Number of Matches')
)

fig.show()

### Analysis about home / away scores

In [None]:
# Select only the home_score and away_score columns
scores_matches_df = matches_df.select("home_score", "away_score")

# Compute the overall means with explicit aliases.
# The aliases name the output columns directly instead of renaming Spark's
# auto-generated "avg(...)" names afterwards: withColumnRenamed does nothing when
# the expected name is absent, so a mismatch would go unnoticed. This also matches
# the avg(...).alias(...) form used for the per-year averages.
mean_scores = scores_matches_df.agg(
    avg("home_score").alias("avg_home_score"),
    avg("away_score").alias("avg_away_score"),
)

mean_scores.show()

In [None]:
# Mean home and away scores for each year
average_scores_per_year_df = (
    matches_df
    .groupBy("year")
    .agg(
        avg(col("home_score")).alias("average_home_score"),
        avg(col("away_score")).alias("average_away_score"),
    )
)

# Bring the yearly averages to the driver in chronological order
average_scores_per_year_list = average_scores_per_year_df.orderBy("year").collect()

# Unpack the rows into parallel lists for plotting
years = [row.year for row in average_scores_per_year_list]
average_home_scores = [row.average_home_score for row in average_scores_per_year_list]
average_away_scores = [row.average_away_score for row in average_scores_per_year_list]

# One lines+markers trace per score type, both plotted against the same years
fig = go.Figure(data=[
    go.Scatter(
        x=years,
        y=average_home_scores,
        mode='lines+markers',
        name='Average Home Score',
    ),
    go.Scatter(
        x=years,
        y=average_away_scores,
        mode='lines+markers',
        name='Average Away Score',
    ),
])

# Titles, legend title and theme
fig.update_layout(dict(
    title='Evolution of Average Home and Away Scores Over the Years',
    xaxis_title='Year',
    yaxis_title='Average Score',
    legend_title='Score Type',
    template='plotly_white',
))

fig.show()

### Check which competition and season are complete

In [None]:
# Extract necessary fields.
# competition_gender is taken from the home team's gender field.
selected_matches_df = matches_df.select(
    col("competition.competition_name").alias("competition_name"),
    col("season.season_name").alias("season_name"),
    col("home_team.home_team_gender").alias("competition_gender"),
    col("home_score"),
    col("away_score")
)

# Group by competition name, season name and gender; for each group compute the
# number of matches and the home, away and total goals
matches_per_league_season = selected_matches_df.groupBy(
    "competition_name", "season_name", "competition_gender"
).agg(
    count("*").alias("number_of_matches"),
    _sum("home_score").alias("total_home_goals"),
    _sum("away_score").alias("total_away_goals"),
    (_sum("home_score") + _sum("away_score")).alias("total_goals")
)

# Sort by number of matches in descending order
matches_per_league_season_sorted = matches_per_league_season.orderBy("number_of_matches", ascending=False)

# Show every group. The row limit is this table's own row count: the number of
# distinct season names is not the number of (competition, season, gender)
# groups, so using it as the limit would cut the listing short.
matches_per_league_season_sorted.show(matches_per_league_season_sorted.count(), truncate=False)

According to the previous results, we can see that we only have the complete data for the following competitions:
- FIFA World Cup 2018
- Women's World Cup 2023
- FIFA World Cup 2022
- Serie A 2015/16
- Premier League 2015/16
- La Liga 2015/16
- Bundesliga 2015/16
- Ligue 1 2015/16

These are the competitions and seasons that appeared to be complete and for which we verified that all the matches are present.
However, there is too much data to verify this for every competition and season, and the dataset contains no variable that indicates whether the data is complete.

### Analysis for competitions and seasons

In [None]:
# Average goals per match for every (competition, season, gender) group, highest first.
# The table is derived from matches_per_league_season rather than from the variable
# being reassigned, so this cell can be re-run: the projection below drops
# total_goals and number_of_matches, which the division needs.
matches_per_league_season_sorted = (
    matches_per_league_season
    .withColumn(
        "avg_goals_per_match",
        col("total_goals") / col("number_of_matches")
    )
    .select("competition_name", "season_name", "competition_gender", "avg_goals_per_match")
    .orderBy(col("avg_goals_per_match").desc())
)

# Show the sorted DataFrame in full. The row limit is the table's own row count,
# because the number of distinct seasons is smaller than or equal to the number
# of groups and would truncate the listing.
matches_per_league_season_sorted.show(matches_per_league_season_sorted.count(), truncate=False)

In [None]:
# Collect the first ten rows of the table ordered by avg_goals_per_match
matches_per_league_list = (
    matches_per_league_season_sorted
    .select("competition_name", "season_name", "avg_goals_per_match")
    .limit(10)
    .collect()
)

# Split the collected rows into parallel lists for plotting
competition_names = [row.competition_name for row in matches_per_league_list]
season_names = [row.season_name for row in matches_per_league_list]
avg_goals_per_match = [row.avg_goals_per_match for row in matches_per_league_list]

In [None]:
# Bar chart: one bar per "competition season" label, bar height = average goals per match
fig = go.Figure(data=[
    go.Bar(
        x=[f"{comp} {season}" for comp, season in zip(competition_names, season_names)],
        y=avg_goals_per_match,
        name='Average Goals per Match',
    ),
])

# Titles, tilted tick labels (the x labels are long) and theme
fig.update_layout(dict(
    title='Average Goals per Match by Competition and Season',
    xaxis_title='Competition and Season',
    yaxis_title='Average Goals per Match',
    xaxis_tickangle=-45,
    template='plotly_white',
))

fig.show()

### Analysis for complete competitions and seasons

In [None]:
# Competitions and seasons considered complete, as (competition_name, season_name) pairs.
# This list is the single source of truth for the filter below.
competitions_and_seasons = [
    ("FIFA World Cup", "2018"),
    ("Women's World Cup", "2023"),
    ("FIFA World Cup", "2022"),
    ("Serie A", "2015/2016"),
    ("Premier League", "2015/2016"),
    ("La Liga", "2015/2016"),
    ("1. Bundesliga", "2015/2016"),
    ("Ligue 1", "2015/2016")
]

# Build one predicate from the list: a row is kept when its competition name and
# season name both equal one of the listed pairs. The pairs are OR-ed together in
# list order.
complete_condition = None
for competition, season in competitions_and_seasons:
    pair_matches = (col("competition_name") == competition) & (col("season_name") == season)
    complete_condition = (
        pair_matches if complete_condition is None else complete_condition | pair_matches
    )

# Filter the DataFrame to include only the specified competitions and seasons
filtered_df = matches_per_league_season_sorted.filter(complete_condition)

# Collect the results
filtered_list = filtered_df.select("competition_name", "season_name", "avg_goals_per_match").collect()

# Extract data into lists
competition_names = [row['competition_name'] for row in filtered_list]
season_names = [row['season_name'] for row in filtered_list]
avg_goals_per_match = [row['avg_goals_per_match'] for row in filtered_list]


In [None]:
# Bar chart restricted to the filtered competitions and seasons:
# one bar per "competition season" label, bar height = average goals per match
fig = go.Figure(data=[
    go.Bar(
        x=[f"{comp} {season}" for comp, season in zip(competition_names, season_names)],
        y=avg_goals_per_match,
        name='Average Goals per Match',
    ),
])

# Titles, tilted tick labels and theme
fig.update_layout(dict(
    title='Average Goals per Match by Competition and Season (Filtered)',
    xaxis_title='Competition and Season',
    yaxis_title='Average Goals per Match',
    xaxis_tickangle=-45,
    template='plotly_white',
))

fig.show()