# Eda PySpark Statbombs : Competitions

### Imports

In [13]:
import plotly.express as px
from pyspark.sql import SparkSession

## Part 1 : Data Loading and Preprocessing

In [14]:
# Create a Spark session
spark = SparkSession.builder.appName("Football Data Analysis : Competitions").getOrCreate()

In [15]:
competitions_file_path = "../../../open-data/data/competitions.json"

# Read JSON files into DataFrames
competitions_df = spark.read.json(competitions_file_path, multiLine=True)
competitions_df.printSchema()

root
 |-- competition_gender: string (nullable = true)
 |-- competition_id: long (nullable = true)
 |-- competition_international: boolean (nullable = true)
 |-- competition_name: string (nullable = true)
 |-- competition_youth: boolean (nullable = true)
 |-- country_name: string (nullable = true)
 |-- match_available: string (nullable = true)
 |-- match_available_360: string (nullable = true)
 |-- match_updated: string (nullable = true)
 |-- match_updated_360: string (nullable = true)
 |-- season_id: long (nullable = true)
 |-- season_name: string (nullable = true)


In [16]:
# Show available data
competitions_df.show(5, truncate=False)

+------------------+--------------+-------------------------+----------------------+-----------------+------------+--------------------------+-------------------+--------------------------+-----------------------+---------+-----------+
|competition_gender|competition_id|competition_international|competition_name      |competition_youth|country_name|match_available           |match_available_360|match_updated             |match_updated_360      |season_id|season_name|
+------------------+--------------+-------------------------+----------------------+-----------------+------------+--------------------------+-------------------+--------------------------+-----------------------+---------+-----------+
|male              |9             |false                    |1. Bundesliga         |false            |Germany     |2023-12-12T07:43:33.436182|NULL               |2023-12-12T07:43:33.436182|NULL                   |27       |2015/2016  |
|male              |1267          |true                 

## Part 2 : Data Analysis

### Competitions by Genres and Youth 

In [17]:
gender_grouped = competitions_df.groupBy("competition_gender").count()

youth_grouped = competitions_df.groupBy("competition_youth").count()

print(f"Total number of competitions : {competitions_df.select('competition_name').count()}")

gender_grouped.show()
youth_grouped.show()

Total number of competitions : 71
+------------------+-----+
|competition_gender|count|
+------------------+-----+
|            female|    7|
|              male|   64|
+------------------+-----+

+-----------------+-----+
|competition_youth|count|
+-----------------+-----+
|            false|   71|
+-----------------+-----+


In [18]:
gender_data = gender_grouped.collect()
youth_data = youth_grouped.collect()

# Transform youth data, replace boolean values
youth_data_transformed = [
    {'competition_youth': 'youths competitions' if row['competition_youth'] else 'professionals competitions', 'count': row['count']}
    for row in youth_data
]

# Ensure visibility of 0 counts for youth competition
if not any(row['competition_youth'] == 'youth competition' for row in youth_data_transformed):
    youth_data_transformed.append({'competition_youth': 'youth competition', 'count': 0})

# Plotting Gender Data
fig = px.bar(
    x=[row['competition_gender'] for row in gender_data],
    y=[row['count'] for row in gender_data],
    labels={'x': 'Competition Gender', 'y': 'Count'},
    title='Competition Gender Counts',
    color=[row['competition_gender'] for row in gender_data],
    color_discrete_map={'male': 'blue', 'female': 'turquoise'}
)
fig.show()

# Plotting Youth Data
fig = px.bar(
    x=[row['competition_youth'] for row in youth_data_transformed],
    y=[row['count'] for row in youth_data_transformed],
    labels={'x': 'Competition Youth', 'y': 'Count'},
    title='Competition Youth Counts',
    color=[row['competition_youth'] for row in youth_data_transformed],
    color_discrete_map={'opposite of youth competition': 'blue', 'youth competition': 'turquoise'}
)
fig.show()

### Competitions by Country

In [19]:
# Group by competition country and count the number of competitions
countries_grouped = competitions_df.groupby('country_name').count().orderBy('count', ascending=False)
countries_grouped.show()

+--------------------+-----+
|        country_name|count|
+--------------------+-----+
|              Europe|   21|
|               Spain|   21|
|       International|   11|
|             England|    5|
|              France|    3|
|           Argentina|    2|
|               Italy|    2|
|United States of ...|    2|
|             Germany|    1|
|              Africa|    1|
|               India|    1|
|North and Central...|    1|
+--------------------+-----+


In [20]:
# Plot the distribution of competitions by country using Plotly Express
fig = px.bar(countries_grouped, x='country_name', y='count', title='Number of Competitions by Country / Continent',
             labels={'country_name': 'Country', 'count': 'Count'},
             # add one color for all bars
                color_discrete_sequence=['lightblue']
             )

fig.show()