# Initiating Spark Session

In [1]:
# The SparkContext exposes the hardware-level and software-level configuration.
# For Spark 2.x and later:
# SparkSession provides a unified interface for interacting with the
# different Spark APIs and allows applications to run on a Spark cluster.

import os
from getpass import getpass

import pyspark
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

appName = "FIFA_project"
# Single source of truth for the cluster to use. "local" runs Spark on this
# machine with one worker thread; change it to "local[*]" to use every core.
master = "local"

### Create Configuration object for Spark.
# setAppName: set the name of the application
# setMaster: set the Spark cluster to use, here "local" indicating the local machine
# set("configuration key", "value"): any other setting, e.g. "spark.driver.host"
# or "spark.executor.memory", followed by the value to apply
conf = (
    pyspark.SparkConf()
    .set('spark.driver.host', '127.0.0.1')
    .setAppName(appName)
    .setMaster(master)
)

# Spark Session
# Build the session directly from `conf`. Requesting a different master on the
# builder after a context is already running has no effect (Spark reuses the
# running context and only logs a warning), so the master is configured in
# exactly one place.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# SparkContext
# Reuse the context owned by the session instead of creating a separate one.
sc = spark.sparkContext

# SQLContext
# Legacy entry point to the SQL API, kept for code that still expects it;
# it is bound to the session created above.
sqlContext = SQLContext(sc, sparkSession=spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/11 15:52:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/11 15:52:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Ingesting Data into Spark DF

In [2]:
# Store data into one spark df
from pyspark.sql.functions import lit
from functools import reduce

# Two-digit suffixes of the input files: players_15.csv ... players_22.csv.
# The file names and the year labels below are both derived from this range
# so they cannot drift apart.
editions = range(15, 23)

file_paths = ["./fifa_dataset/players_%d.csv" % n for n in editions]

# Read each CSV file (first line is the header, column types are inferred)
dataframes = [spark.read.csv(file_path, header=True, inferSchema=True) for file_path in file_paths]

# Add a "year" column to each DataFrame. The label is kept as a string
# ("2015" ... "2022") because later cells compare it against string literals.
year = [str(2000 + n) for n in editions]
dataframes_with_column = [
    df.withColumn("year", lit(label)) for df, label in zip(dataframes, year)
]

# Union all DataFrames. unionByName matches columns by name, so a file whose
# columns are ordered differently is aligned correctly (or fails loudly if a
# column is missing) instead of being silently merged by position.
fifa = reduce(lambda df1, df2: df1.unionByName(df2), dataframes_with_column)

# Show the final DataFrame
fifa.show()



23/11/11 15:52:41 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+---------+--------------------+-----------------+--------------------+----------------+-------+---------+---------+--------+---+----------+---------+---------+------------+-------------------+--------------------+------------+-------------+------------------+----------------+-----------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-

In [3]:
# Check if all records are unique.
# Every count() launches a full Spark job over the data, so the total is
# computed once and reused instead of being recomputed and discarded.
total_rows = fifa.count()
print(fifa.distinct().count() == total_rows)


# Add a unique primary key id column to dataframe.
# monotonically_increasing_id() produces unique, increasing 64-bit values;
# they are not guaranteed to be consecutive.
fifa = fifa.withColumn("id", monotonically_increasing_id())
fifa.show()



                                                                                

True
+---------+--------------------+-----------------+--------------------+----------------+-------+---------+---------+--------+---+----------+---------+---------+------------+-------------------+--------------------+------------+-------------+------------------+----------------+-----------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+-----------------

In [4]:
# Schema: print every column of the combined DataFrame with its data type
# and nullability
fifa.printSchema()

root
 |-- sofifa_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- long_name: string (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: double (nullable = true)
 |-- wage_eur: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- club_team_id: double (nullable = true)
 |-- club_name: string (nullable = true)
 |-- league_name: string (nullable = true)
 |-- league_level: integer (nullable = true)
 |-- club_position: string (nullable = true)
 |-- club_jersey_number: integer (nullable = true)
 |-- club_loaned_from: string (nullable = true)
 |-- club_joined: date (nullable = true)
 |-- club_contract_valid_until: integer (nullable = true)
 |-- nationality_id: integer (nullable = true)
 

# Write Spark DF to Postgres

In [5]:
# Write Spark df to postgres database table

# Specify the primary key columns
primary_key_columns = "sofifa_id, year"  # Replace with your actual column names

db_properties = {}
# Credentials are never stored in the notebook: they are read from the
# PGUSER / PGPASSWORD environment variables, with an interactive prompt as a
# fallback when a variable is not set.
db_properties['username'] = os.environ.get("PGUSER") or input("Postgres username: ")
db_properties['password'] = os.environ.get("PGPASSWORD") or getpass("Postgres password: ")
# make sure you got the right port number here
db_properties['url'] = "jdbc:postgresql://localhost:5432/postgres"
# make sure you had the Postgres JAR file in the right location
db_properties['driver'] = "org.postgresql.Driver"
db_properties['table'] = "fifa"
# NOTE(review): the key name below is misspelled ("cloumns"); it is left
# unchanged in case other cells look it up under this exact name.
db_properties["primary_key_cloumns"] = primary_key_columns

# Write the DataFrame to PostgreSQL, replacing the table if it already exists.
# None of the options below declares a primary key, so the columns recorded
# above are informational only; add the constraint in the database itself
# if it is required.
fifa.write.format("jdbc") \
    .mode("overwrite").option("url", db_properties["url"]) \
    .option("dbtable", db_properties["table"]) \
    .option("user", db_properties["username"]) \
    .option("password", db_properties["password"]) \
    .option("driver", db_properties["driver"]).save()
    

23/11/11 15:52:48 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [6]:
# Write Spark df to postgres database table (repeat of the previous write)

# The connection settings used here are the ones already stored in
# `db_properties` by the previous cell, so they are reused rather than
# redefined; this keeps the credentials out of a second place in the notebook.

# Specify the primary key columns
primary_key_columns = "sofifa_id, year"  # Replace with your actual column names
# NOTE(review): the key name below is misspelled ("cloumns"); it is left
# unchanged in case other cells look it up under this exact name.
db_properties["primary_key_cloumns"] = primary_key_columns

# Write the DataFrame to PostgreSQL, replacing the table if it already exists.
# None of the options below declares a primary key on the table.
fifa.write.format("jdbc") \
    .mode("overwrite").option("url", db_properties["url"]) \
    .option("dbtable", db_properties["table"]) \
    .option("user", db_properties["username"]) \
    .option("password", db_properties["password"]) \
    .option("driver", db_properties["driver"]).save()
    

                                                                                

# Clubs with the Most Players Whose Contracts End in 2023 (2022 Data)

In [7]:
# Define a function to extract top N team with players contract ending in 2023
from pyspark.sql.functions import *
def getXClubWithMostPlayerContractEnd(X:int):
    """Show and return the top X clubs by number of contracts ending in 2023.

    Works on the rows of the module-level ``fifa`` DataFrame whose ``year``
    is "2022". Clubs tied with the X-th ranked club are included as well, so
    the result can contain more than X rows. The share of 2022 players whose
    contract ends in 2023 is printed as a side effect.

    Args:
        X: number of ranked clubs to return; must be at least 1.

    Returns:
        DataFrame with the columns ``club_name`` and ``count``, sorted by
        ``count`` in descending order. It is empty when no 2022 player has a
        contract ending in 2023.

    Raises:
        ValueError: if X is smaller than 1.
    """
    if X < 1:
        raise ValueError("X must be a positive integer, got %r" % (X,))

    # Players listed in the 2022 edition
    fifa_2022 = fifa.filter(col("year") == "2022")

    # ... whose contract ends in 2023
    fifa_2022_p2023 = fifa_2022.filter(col("club_contract_valid_until") == 2023)

    # Guard the percentage against an empty 2022 subset (division by zero)
    total_2022 = fifa_2022.count()
    if total_2022:
        print("Percentage of players in year 2022 with contract ending in 2023 is %f %%" % (fifa_2022_p2023.count()/ total_2022 * 100))
    else:
        print("No players found for year 2022")

    # Count of players with a contract ending in 2023, per club
    playersByClub = fifa_2022_p2023.groupby("club_name").count()

    # The count of the X-th ranked club is the cut-off; every club at or above
    # it belongs to the result, which also covers clubs tied on the last rank.
    top_rows = playersByClub.orderBy(col("count").desc()).limit(X).collect()
    if top_rows:
        threshold = top_rows[-1]["count"]
        output = playersByClub.filter(col("count") >= threshold).orderBy(col("count").desc())
    else:
        # Nothing to rank: return the (empty) per-club table unchanged
        output = playersByClub

    output.show()
    return output

    


In [8]:
# Customize X for user input (input() returns a string, so convert it)
# X = int(input("Insert number of desired teams to extract"))

X = 5
# Pass the variable rather than a repeated literal so changing X takes effect
teams = getXClubWithMostPlayerContractEnd(X)

Percentage of players in year 2022 with contract ending in 2023 is 23.623889 %
+--------------------+-----+
|           club_name|count|
+--------------------+-----+
|En Avant de Guingamp|   19|
| Club Atlético Lanús|   17|
|       Lechia Gdańsk|   17|
|            Barnsley|   16|
|        Kasimpaşa SK|   16|
|        Bengaluru FC|   16|
+--------------------+-----+



# List the Y clubs with the highest average number of players older than 27 across all years

In [9]:
def highestAVGPlayerAge(Y:int):
    """Show and return the top Y clubs by average yearly count of players over 27.

    For every club and year the players with ``age`` greater than 27 are
    counted, and the yearly counts are then averaged per club. Only years in
    which a club has at least one such player contribute to its average.
    Clubs tied with the Y-th ranked club are included as well, so the result
    can contain more than Y rows.

    Args:
        Y: number of ranked clubs to return; must be at least 1.

    Returns:
        DataFrame with the columns ``club_name`` and ``avg(count)``, sorted
        by ``avg(count)`` in descending order. It is empty when no player
        matches.

    Raises:
        ValueError: if Y is smaller than 1.
    """
    if Y < 1:
        raise ValueError("Y must be a positive integer, got %r" % (Y,))

    oldPlayer = fifa.filter(col("age") > 27)

    # Remove players not affiliated to a club
    oldPlayerFiltered = oldPlayer.na.drop(subset = ["club_name", "age"])

    # Players older than 27 per club and year, then the per-club average of
    # those yearly counts (the resulting column is named "avg(count)")
    count_oldPlayer = oldPlayerFiltered.groupBy(col("club_name"), col("year")).count()
    avg_by_club = count_oldPlayer.groupby(col("club_name")).mean("count")

    # The average of the Y-th ranked club is the cut-off; every club at or
    # above it belongs to the result, which also covers ties on the last rank.
    top_rows = avg_by_club.orderBy(col("avg(count)").desc()).limit(Y).collect()
    if top_rows:
        threshold = top_rows[-1]["avg(count)"]
        output = avg_by_club.filter(col("avg(count)") >= threshold).orderBy(col("avg(count)").desc())
    else:
        # Nothing to rank: return the (empty) per-club table unchanged
        output = avg_by_club

    output.show()

    return output



In [10]:
# Customize Y for user input
# (input() returns a string; convert it with int() before passing it on)
# Y = input("Insert number of desired teams to extract")
# Number of top-ranked clubs to request
Y = 6
# Prints the ranking and keeps the resulting DataFrame for later use
old_teams = highestAVGPlayerAge(Y)



+--------------------+----------+
|           club_name|avg(count)|
+--------------------+----------+
|  Dorados de Sinaloa|      19.0|
| Matsumoto Yamaga FC|      19.0|
| Shanghai Shenhua FC|      18.5|
|          Qingdao FC|      18.0|
|Club Deportivo Jo...|      17.5|
|            Altay SK|      17.0|
|         Guaireña FC|      17.0|
+--------------------+----------+



                                                                                

#  Find the most frequent nation_position in the dataset for each year

In [11]:
# Players who are not part of a national team have a NULL nation_position.
# Remove them so that only national-team players are counted.
nationPlayers = fifa.na.drop(subset = ["nation_position"])
position_count = nationPlayers.groupBy(["year", "nation_position"]).count()

# Highest position count within each year.
# `max` is the Spark aggregate brought in by the star import of
# pyspark.sql.functions, not the Python built-in.
yearly_max = position_count.groupBy("year").agg(max("count").alias("count"))

# Max by year: keep, for every year, the position(s) whose count equals that
# year's maximum. The join has to match on the year as well as the count;
# matching on the count alone would also keep a position whose count merely
# happens to equal the maximum of a different year.
position_count.join(yearly_max, on = ["year", "count"], how="leftsemi").orderBy("year").show()



+-----+----+---------------+
|count|year|nation_position|
+-----+----+---------------+
|  564|2015|            SUB|
|  511|2016|            SUB|
|  564|2017|            SUB|
|  600|2018|            SUB|
|  576|2019|            SUB|
|  588|2020|            SUB|
|  588|2021|            SUB|
|  396|2022|            SUB|
+-----+----+---------------+



#  Find the most frequent nation_position in the dataset for each year

In [12]:
# Players who are not part of a national team have a NULL nation_position.
# Remove them so that only national-team players are counted.
nationPlayers = fifa.na.drop(subset = ["nation_position"])
position_count = nationPlayers.groupBy(["year", "nation_position"]).count()

# Highest position count within each year.
# `max` is the Spark aggregate brought in by the star import of
# pyspark.sql.functions, not the Python built-in.
yearly_max = position_count.groupBy("year").agg(max("count").alias("count"))

# Max by year: keep, for every year, the position(s) whose count equals that
# year's maximum. The join has to match on the year as well as the count;
# matching on the count alone would also keep a position whose count merely
# happens to equal the maximum of a different year.
position_count.join(yearly_max, on = ["year", "count"], how="leftsemi").orderBy("year").show()



+-----+----+---------------+
|count|year|nation_position|
+-----+----+---------------+
|  564|2015|            SUB|
|  511|2016|            SUB|
|  564|2017|            SUB|
|  600|2018|            SUB|
|  576|2019|            SUB|
|  588|2020|            SUB|
|  588|2021|            SUB|
|  396|2022|            SUB|
+-----+----+---------------+



In [19]:
# Persist the DataFrame to the configured Postgres table over JDBC,
# replacing the table if it already exists
(
    fifa.write
    .format("jdbc")
    .mode("overwrite")
    .options(
        url=db_properties["url"],
        dbtable=db_properties["table"],
        user=db_properties["username"],
        password=db_properties["password"],
        driver=db_properties["driver"],
    )
    .save()
)

                                                                                

In [20]:
# Preview the first rows of the combined DataFrame
fifa.show()

+---------+--------------------+-----------------+--------------------+----------------+-------+---------+---------+--------+---+----------+---------+---------+------------+-------------------+--------------------+------------+-------------+------------------+----------------+-----------+-------------------------+--------------+----------------+--------------+---------------+--------------------+--------------+---------+-----------+------------------------+-------------+----------------+---------+------------------+--------------------+--------------------+----+--------+-------+---------+---------+------+------------------+-------------------+--------------------------+-----------------------+-----------------+---------------+-----------+-----------------+------------------+------------------+---------------------+---------------------+----------------+------------------+----------------+----------------+-------------+-------------+--------------+----------------+--------------------+-