In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, avg, when, count

# Obtain the SparkSession for this notebook under the "EPL Analysis" app name.
spark = (
    SparkSession.builder
    .appName("EPL Analysis")
    .getOrCreate()
)

# Read the match results: the first CSV row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("EPL.csv")
)

# Register the DataFrame for Spark SQL under the view name "epl".
df.createOrReplaceTempView("epl")

# Row-level RDD view of the same data.
rdd = df.rdd

In [2]:
# Fetch five rows from the RDD and print them beneath a banner line.
print("=== Sample RDD Data ===", rdd.take(5), sep="\n")

=== Sample RDD Data ===
[Row(home_team='Sheffield United', away_team='Liverpool', home_goals=1.0, away_goals=1.0, result='D', season='2006-2007'), Row(home_team='Arsenal', away_team='Aston Villa', home_goals=1.0, away_goals=1.0, result='D', season='2006-2007'), Row(home_team='Everton', away_team='Watford', home_goals=2.0, away_goals=1.0, result='H', season='2006-2007'), Row(home_team='Newcastle United', away_team='Wigan Athletic', home_goals=2.0, away_goals=1.0, result='H', season='2006-2007'), Row(home_team='Portsmouth', away_team='Blackburn Rovers', home_goals=3.0, away_goals=0.0, result='H', season='2006-2007')]


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, expr

# Total goals per season, computed on the `df` loaded by the first cell.
# The session is not rebuilt and the CSV is not read again here: doing so
# only repeated the first cell's work and rebound `df` to a second DataFrame
# of the same file, leaving `rdd` and the "epl" view tied to the first one.
# NOTE: the grouping key is spelled "Season" while the data column is
# `season`; this relies on Spark's default case-insensitive column
# resolution and is what gives the "Season" header in the displayed table.
season_goals = df.groupBy("Season").agg(
    _sum(col("home_goals") + col("away_goals")).alias("TotalGoals")
)

# Highest-scoring season: sort by total goals descending, keep the top row.
season_goals_high = season_goals.orderBy(col("TotalGoals").desc()).limit(1)

# Show result
season_goals_high.show()

+---------+----------+
|   Season|TotalGoals|
+---------+----------+
|2011-2012|    1066.0|
+---------+----------+



In [5]:
# Lowest-scoring season: ascending sort on the goal total, first row only.
season_goals_low = (
    season_goals
    .sort("TotalGoals", ascending=True)
    .limit(1)
)

In [8]:
# 4. Team with Highest Average Goals per Season

# Goals scored at home and goals scored away, each keyed by a "Team" column
# so the two tables can be joined.
home_goals = (
    df.groupBy("home_team")
    .agg(_sum("home_goals").alias("HomeGoals"))
    .withColumnRenamed("home_team", "Team")
)
away_goals = (
    df.groupBy("away_team")
    .agg(_sum("away_goals").alias("AwayGoals"))
    .withColumnRenamed("away_team", "Team")
)

# Outer join keeps teams present on only one side; their missing numeric
# total is filled with 0 before the two totals are added.
team_goals = (
    home_goals.join(away_goals, on="Team", how="outer")
    .na.fill(0)
    .withColumn("TotalGoals", col("HomeGoals") + col("AwayGoals"))
)

# Seasons played per team. union() matches columns by position, so the
# away teams land in the column still named "home_team"; after removing
# duplicate (season, team) pairs, each remaining row is one season played.
team_seasons = (
    df.select("season", "home_team")
    .union(df.select("season", "away_team"))
    .distinct()
    .groupBy("home_team")
    .agg(count("*").alias("Seasons"))
    .withColumnRenamed("home_team", "Team")
)

# Average goals per season = total goals / seasons played.
team_avg_goals = team_goals.join(team_seasons, "Team").withColumn(
    "AvgGoalsPerSeason", col("TotalGoals") / col("Seasons")
)

# Keep only the team with the highest average.
team_avg_top = team_avg_goals.sort("AvgGoalsPerSeason", ascending=False).limit(1)

# >>> Show the output <<<
team_avg_top.show(truncate=False)


+-----------------+---------+---------+----------+-------+-----------------+
|Team             |HomeGoals|AwayGoals|TotalGoals|Seasons|AvgGoalsPerSeason|
+-----------------+---------+---------+----------+-------+-----------------+
|Manchester United|495.0    |372.0    |867.0     |12     |72.25            |
+-----------------+---------+---------+----------+-------+-----------------+



In [7]:
# 5. Manchester United Probabilities

# Column predicates: Manchester United as the home side / as the away side.
mu_is_home = col("home_team") == "Manchester United"
mu_is_away = col("away_team") == "Manchester United"

# Every fixture that involves Manchester United.
mu_matches = df.filter(mu_is_home | mu_is_away)

# Re-express the match result from Manchester United's point of view:
# "H" while at home or "A" while away is a win, "D" is a draw, and
# everything else is labelled "Lose".
# NOTE(review): the new "Result" column appears to replace the source
# `result` column (Spark's default name matching ignores case) - confirm.
mu_results = mu_matches.withColumn(
    "Result",
    when(
        (mu_is_home & (col("result") == "H"))
        | (mu_is_away & (col("result") == "A")),
        "Win",
    )
    .when(col("result") == "D", "Draw")
    .otherwise("Lose"),
)

# Share of each outcome over all Manchester United matches.
total_games = mu_results.count()
result_counts = mu_results.groupBy("Result").agg(count("*").alias("Count"))
result_probs = result_counts.withColumn(
    "Probability", col("Count") / total_games
)

# Display All
# `season_goals_low` is built in an earlier cell and only displayed here.
print("\n=== Season with Least Goals ===")
season_goals_low.show()

print("\n=== Manchester United Match Result Probabilities ===")
result_probs.show()


=== Season with Least Goals ===
+---------+----------+
|   Season|TotalGoals|
+---------+----------+
|2006-2007|     931.0|
+---------+----------+


=== Manchester United Match Result Probabilities ===
+------+-----+-------------------+
|Result|Count|        Probability|
+------+-----+-------------------+
|   Win|  290| 0.6359649122807017|
|  Draw|   89|0.19517543859649122|
|  Lose|   77|0.16885964912280702|
+------+-----+-------------------+

