In [None]:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F


In [None]:
spark = SparkSession.builder.appName("Football DataFrame Operations").getOrCreate()


In [None]:
# url = 'https://raw.githubusercontent.com/footballcsv/world/main/WorldCup.csv' # Original URL
# football_df = spark.read.csv(url, header=True, inferSchema=True) # Original code

# Reading from a sample CSV file available in the Colab environment
football_df = spark.read.csv("/content/nl.1.csv", header=True, inferSchema=True)
football_df.show(10)

+-----+--------------+-------------------+---+-----------------+
|Round|          Date|             Team 1| FT|           Team 2|
+-----+--------------+-------------------+---+-----------------+
|    1|Fri Aug 2 2019|         PEC Zwolle|1-3|Willem II Tilburg|
|    1|Sat Aug 3 2019|     Vitesse Arnhem|2-2|   Ajax Amsterdam|
|    1|Sat Aug 3 2019|           FC Emmen|0-1|     FC Groningen|
|    1|Sat Aug 3 2019|          VVV Venlo|3-1|     RKC Waalwijk|
|    1|Sat Aug 3 2019|          FC Twente|1-1|    PSV Eindhoven|
|    1|Sun Aug 4 2019|    Heracles Almelo|0-4|    SC Heerenveen|
|    1|Sun Aug 4 2019|Feyenoord Rotterdam|2-2| Sparta Rotterdam|
|    1|Sun Aug 4 2019|       ADO Den Haag|2-4|       FC Utrecht|
|    1|Sun Aug 4 2019|         AZ Alkmaar|4-0|  Fortuna Sittard|
|    2|Fri Aug 9 2019|   Sparta Rotterdam|4-1|        VVV Venlo|
+-----+--------------+-------------------+---+-----------------+
only showing top 10 rows



In [11]:
football_df.createOrReplaceTempView('football_table')


In [12]:
# Reading from a sample CSV file available in the Colab environment
football_df = spark.read.csv("/content/nl.1.csv", header=True, inferSchema=True)
football_df.show(10)

# Create a temporary view
football_df.createOrReplaceTempView('football_table')

+-----+---------------+----------------+---+-------------------+
|Round|           Date|          Team 1| FT|             Team 2|
+-----+---------------+----------------+---+-------------------+
|    1|Sat Sep 12 2020|   SC Heerenveen|2-0|  Willem II Tilburg|
|    1|Sat Sep 12 2020|      PEC Zwolle|0-2|Feyenoord Rotterdam|
|    1|Sat Sep 12 2020|       FC Twente|2-0|    Fortuna Sittard|
|    1|Sun Sep 13 2020|        FC Emmen|3-5|          VVV Venlo|
|    1|Sun Sep 13 2020| Heracles Almelo|2-0|       ADO Den Haag|
|    1|Sun Sep 13 2020|Sparta Rotterdam|0-1|     Ajax Amsterdam|
|    1|Sun Sep 13 2020|    RKC Waalwijk|0-1|     Vitesse Arnhem|
|    1|Sun Sep 13 2020|    FC Groningen|1-3|      PSV Eindhoven|
|    2|Fri Sep 18 2020|       VVV Venlo|1-1|         FC Utrecht|
|    2|Sat Sep 19 2020|      AZ Alkmaar|1-1|         PEC Zwolle|
+-----+---------------+----------------+---+-------------------+
only showing top 10 rows



In [5]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder.appName("Football DataFrame Operations").getOrCreate()

# Reading from a sample CSV file available in the Colab environment
football_df = spark.read.csv("nl.1.csv", header=True, inferSchema=True)
football_df.show(10)

# Create a temporary view
football_df.createOrReplaceTempView('football_table')

+-----+---------------+----------------+---+-------------------+
|Round|           Date|          Team 1| FT|             Team 2|
+-----+---------------+----------------+---+-------------------+
|    1|Sat Sep 12 2020|   SC Heerenveen|2-0|  Willem II Tilburg|
|    1|Sat Sep 12 2020|      PEC Zwolle|0-2|Feyenoord Rotterdam|
|    1|Sat Sep 12 2020|       FC Twente|2-0|    Fortuna Sittard|
|    1|Sun Sep 13 2020|        FC Emmen|3-5|          VVV Venlo|
|    1|Sun Sep 13 2020| Heracles Almelo|2-0|       ADO Den Haag|
|    1|Sun Sep 13 2020|Sparta Rotterdam|0-1|     Ajax Amsterdam|
|    1|Sun Sep 13 2020|    RKC Waalwijk|0-1|     Vitesse Arnhem|
|    1|Sun Sep 13 2020|    FC Groningen|1-3|      PSV Eindhoven|
|    2|Fri Sep 18 2020|       VVV Venlo|1-1|         FC Utrecht|
|    2|Sat Sep 19 2020|      AZ Alkmaar|1-1|         PEC Zwolle|
+-----+---------------+----------------+---+-------------------+
only showing top 10 rows



In [16]:
import pyspark.sql.functions as F

# Split the 'FT' column into home and away goals and convert them to integers
football_df = football_df.withColumn('HomeGoals', F.split(football_df['FT'], '-').getItem(0).cast('int'))
football_df = football_df.withColumn('AwayGoals', F.split(football_df['FT'], '-').getItem(1).cast('int'))

# Calculate the total goals
football_df = football_df.withColumn('TotalGoals', football_df['HomeGoals'] + football_df['AwayGoals'])

# Create or replace the temporary view with the updated DataFrame
football_df.createOrReplaceTempView('football_table')

# Now, filter the table where the total goals are less than 5
high_goals_df = spark.sql('SELECT * FROM football_table WHERE TotalGoals < 5')
high_goals_df.show(10)

+-----+---------------+----------------+---+-------------------+---------+---------+----------+
|Round|           Date|          Team 1| FT|             Team 2|HomeGoals|AwayGoals|TotalGoals|
+-----+---------------+----------------+---+-------------------+---------+---------+----------+
|    1|Sat Sep 12 2020|   SC Heerenveen|2-0|  Willem II Tilburg|        2|        0|         2|
|    1|Sat Sep 12 2020|      PEC Zwolle|0-2|Feyenoord Rotterdam|        0|        2|         2|
|    1|Sat Sep 12 2020|       FC Twente|2-0|    Fortuna Sittard|        2|        0|         2|
|    1|Sun Sep 13 2020| Heracles Almelo|2-0|       ADO Den Haag|        2|        0|         2|
|    1|Sun Sep 13 2020|Sparta Rotterdam|0-1|     Ajax Amsterdam|        0|        1|         1|
|    1|Sun Sep 13 2020|    RKC Waalwijk|0-1|     Vitesse Arnhem|        0|        1|         1|
|    1|Sun Sep 13 2020|    FC Groningen|1-3|      PSV Eindhoven|        1|        3|         4|
|    2|Fri Sep 18 2020|       VVV Venlo|

In [9]:
import pyspark.sql.functions as F

# Split the 'FT' column into home and away goals and convert them to integers
football_df = football_df.withColumn('HomeGoals', F.split(football_df['FT'], '-').getItem(0).cast('int'))
football_df = football_df.withColumn('AwayGoals', F.split(football_df['FT'], '-').getItem(1).cast('int'))

# Calculate the total goals
football_df = football_df.withColumn('TotalGoals', football_df['HomeGoals'] + football_df['AwayGoals'])

# Create or replace the temporary view with the updated DataFrame
football_df.createOrReplaceTempView('football_table')

# Now, filter the table where the total goals are less than 5
high_goals_df = spark.sql('SELECT * FROM football_table WHERE TotalGoals < 5')
high_goals_df.show(10)

+-----+---------------+----------------+---+-------------------+---------+---------+----------+
|Round|           Date|          Team 1| FT|             Team 2|HomeGoals|AwayGoals|TotalGoals|
+-----+---------------+----------------+---+-------------------+---------+---------+----------+
|    1|Sat Sep 12 2020|   SC Heerenveen|2-0|  Willem II Tilburg|        2|        0|         2|
|    1|Sat Sep 12 2020|      PEC Zwolle|0-2|Feyenoord Rotterdam|        0|        2|         2|
|    1|Sat Sep 12 2020|       FC Twente|2-0|    Fortuna Sittard|        2|        0|         2|
|    1|Sun Sep 13 2020| Heracles Almelo|2-0|       ADO Den Haag|        2|        0|         2|
|    1|Sun Sep 13 2020|Sparta Rotterdam|0-1|     Ajax Amsterdam|        0|        1|         1|
|    1|Sun Sep 13 2020|    RKC Waalwijk|0-1|     Vitesse Arnhem|        0|        1|         1|
|    1|Sun Sep 13 2020|    FC Groningen|1-3|      PSV Eindhoven|        1|        3|         4|
|    2|Fri Sep 18 2020|       VVV Venlo|

In [None]:
football_df = football_df.select('Round','Team 1','FT','Team 2')
football_df = football_df.withColumnRenamed('Round','Year').withColumnRenamed('Team 1','Team').withColumnRenamed('FT','Goals').withColumnRenamed('Team 2','Opponent')
football_df = football_df.withColumnRenamed('Goals','GoalsScored')
football_df.sort(F.desc('GoalsScored')).show(10)

+----+-------------------+-----------+----------------+
|Year|               Team|GoalsScored|        Opponent|
+----+-------------------+-----------+----------------+
|   6|         PEC Zwolle|        6-2|    RKC Waalwijk|
|  13|    Heracles Almelo|        6-1|       VVV Venlo|
|  18|     Ajax Amsterdam|        6-1|    ADO Den Haag|
|  12|         FC Utrecht|        6-0| Fortuna Sittard|
|   8|Feyenoord Rotterdam|        5-1|       FC Twente|
|   6|         AZ Alkmaar|        5-1|Sparta Rotterdam|
|  25|   Sparta Rotterdam|        5-1|        FC Emmen|
|  26|         FC Utrecht|        5-1|Sparta Rotterdam|
|  16|      PSV Eindhoven|        5-0| Fortuna Sittard|
|   6|      PSV Eindhoven|        5-0|  Vitesse Arnhem|
+----+-------------------+-----------+----------------+
only showing top 10 rows



In [None]:
football_df.groupBy('Team').agg(F.sum('GoalsScored').alias('TotalGoals'), F.max('GoalsScored').alias('MaxGoals')).show(10)


+-------------------+----------+--------+
|               Team|TotalGoals|MaxGoals|
+-------------------+----------+--------+
|       ADO Den Haag|      NULL|     3-3|
|         AZ Alkmaar|      NULL|     5-1|
|     Ajax Amsterdam|      NULL|     6-1|
|           FC Emmen|      NULL|     4-2|
|       FC Groningen|      NULL|     3-0|
|          FC Twente|      NULL|     4-1|
|         FC Utrecht|      NULL|     6-0|
|Feyenoord Rotterdam|      NULL|     5-1|
|    Fortuna Sittard|      NULL|     4-2|
|    Heracles Almelo|      NULL|     6-1|
+-------------------+----------+--------+
only showing top 10 rows

