In [2]:
'''
You are given two datasets:

players_df: Contains information about cricket players, their total runs, and the number of 50s and 100s they have scored.
countries_df: Maps country short codes (SRT) to full country names.
Your task is to:

Extract the player's first name from the player column.
Extract the country code from the player column and map it to the full country name using countries_df.
Compute the sum of 50s and 100s from the 50s/100s column.
Filter the results to include only players where the sum of 50s and 100s is greater than 95.
Display the output sorted by runs in descending order.
Input Schema
players_df

Column	Type	Description
player	string	Player name with country code (e.g., "Sachin-IND")
runs	int	Total runs scored
50s/100s	string	Number of 50s and 100s scored, separated by /
countries_df

Column	Type	Description
SRT	string	Country code
country	string	Full country name
Expected Output Schema
Column	Type	Description
playername	string	First name of the player
country	string	Full country name
runs	int	Total runs scored
sum	int	Sum of 50s and 100s
Example
Input
players_df

player	runs	50s/100s
Sachin-IND	18694	93/49
Ricky-AUS	11274	66/31
Lara-WI	10222	45/21
Rahul-IND	10355	95/11
Jhonnty-SA	7051	43/5
Hayden-AUS	8722	67/19
countries_df

SRT	country
IND	India
AUS	Australia
WI	WestIndies
SA	SouthAfrica
Output
playername	country	runs	sum
Sachin	India	18694	142
Ricky	Australia	11274	97
Rahul	India	10355	106
Starter Code (PySpark)
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col, expr

# Initialize Spark session
spark = SparkSession.builder.appName("PlayerStats").getOrCreate()

# Create players_df
players_data = [
    ("Sachin-IND", 18694, "93/49"),
    ("Ricky-AUS", 11274, "66/31"),
    ("Lara-WI", 10222, "45/21"),
    ("Rahul-IND", 10355, "95/11"),
    ("Jhonnty-SA", 7051, "43/5"),
    ("Hayden-AUS", 8722, "67/19")
]

players_df = spark.createDataFrame(players_data, ["player", "runs", "50s/100s"])

# Create countries_df
countries_data = [
    ("IND", "India"),
    ("AUS", "Australia"),
    ("WI", "WestIndies"),
    ("SA", "SouthAfrica")
]

countries_df = spark.createDataFrame(countries_data, ["SRT", "country"])

#Your solution starts here

Use display(df) to show the final DataFrame.
'''

# Get (or reuse) the SparkSession for this notebook; `F` is the alias used
# below for the column-function helpers.
from pyspark.sql import SparkSession, functions as F

spark = (
    SparkSession.builder
    .appName('Spark Playground')
    .getOrCreate()
)

# Sample player records. Each row is
# (player as "<first name>-<country code>", total runs, "<50s>/<100s>").
players_data = [
    ("Sachin-IND", 18694, "93/49"),
    ("Ricky-AUS", 11274, "66/31"),
    ("Lara-WI", 10222, "45/21"),
    ("Rahul-IND", 10355, "95/11"),
    ("Jhonnty-SA", 7051, "43/5"),
    ("Hayden-AUS", 8722, "67/19"),
]
players_df = spark.createDataFrame(
    data=players_data,
    schema=["player", "runs", "50s/100s"],
)

# Lookup table: country short code (SRT) -> full country name.
countries_data = [
    ("IND", "India"),
    ("AUS", "Australia"),
    ("WI", "WestIndies"),
    ("SA", "SouthAfrica"),
]
countries_df = spark.createDataFrame(
    data=countries_data,
    schema=["SRT", "country"],
)

# Build each split expression once and reuse it for both of its pieces.
player_parts = F.split(F.col("player"), "-")         # [first name, country code]
milestone_parts = F.split(F.col("50s/100s"), "/")    # [50s, 100s] as strings

# Sum of 50s and 100s, each cast to int before adding.
milestones_total = (
    milestone_parts.getItem(0).cast("int")
    + milestone_parts.getItem(1).cast("int")
)

# Project only the columns needed downstream: the parsed name, the join key,
# the runs, and the computed sum.
players_parsed = players_df.select(
    player_parts.getItem(0).alias("playername"),
    player_parts.getItem(1).alias("SRT"),
    F.col("runs"),
    milestones_total.alias("sum"),
)

df_result = (
    players_parsed
    .where(F.col("sum") > 95)                        # keep only sum of 50s and 100s > 95
    .join(countries_df, on="SRT", how="left")        # left join: unmatched codes keep the row with a null country
    .select("playername", "country", "runs", "sum")  # required output columns
    .sort(F.desc("runs"))                            # highest runs first
)

# Print the final table.
df_result.show()

+----------+---------+-----+---+
|playername|  country| runs|sum|
+----------+---------+-----+---+
|    Sachin|    India|18694|142|
|     Ricky|Australia|11274| 97|
|     Rahul|    India|10355|106|
+----------+---------+-----+---+

