<a href="https://colab.research.google.com/github/codeprakash309/PySparkCodeHub/blob/PySparkCodeHub(Prakash)/All_Advanced_Transformations2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, to_date, datediff, lit, rank
from pyspark.sql.window import Window


In [3]:
# ✅ Obtain the notebook's Spark session: getOrCreate() reuses an already
# running session if there is one, otherwise it starts a new one under the
# application name below.
spark = (
    SparkSession.builder
    .appName("AdvancedTransformations")
    .getOrCreate()
)

In [5]:
# ✅ Employee records used by every transformation in this notebook.
# Each tuple is laid out as (Name, Age, Department, Salary, City, Joining_Date).
# Joining_Date is kept as a "yyyy-MM-dd" string here and converted to a real
# date once the DataFrame has been built.
data = [
    # Name     Age  Department   Salary  City             Joining_Date
    ("Alice",  30, "HR",        40000, "New York",      "2021-01-10"),
    ("Bob",    35, "Finance",   60000, "Chicago",       "2019-07-15"),
    ("Cathy",  28, "IT",        75000, "San Francisco", "2020-03-01"),
    ("David",  45, "Finance",   80000, "New York",      "2018-09-10"),
    ("Eva",    32, "IT",        72000, "Boston",        "2022-05-12"),
    ("Frank",  29, "HR",        42000, "Chicago",       "2020-11-20"),
    ("Grace",  41, "Marketing", 52000, "Boston",        "2023-01-01"),
    ("Henry",  36, "Finance",   70000, "San Francisco", "2017-02-15"),
    ("Ivy",    27, "IT",        68000, "New York",      "2020-10-25"),
    ("Jake",   39, "Marketing", 55000, "Chicago",       "2020-01-30"),
    ("Karen",  31, "HR",        45000, "Boston",        "2019-08-05"),
    ("Leo",    26, "Finance",   58000, "New York",      "2021-06-18"),
    ("Mona",   33, "IT",        76000, "Chicago",       "2018-12-25"),
    ("Nick",   40, "Marketing", 60000, "San Francisco", "2019-03-15"),
]


In [6]:
# Column names for the employee DataFrame, listed in the same order as the
# fields of each tuple in `data`.
columns = [
    "Name",
    "Age",
    "Department",
    "Salary",
    "City",
    "Joining_Date",
]

In [7]:
# Build the employee DataFrame from the raw tuples and, in the same chain,
# replace the string Joining_Date column with a proper date parsed using the
# "yyyy-MM-dd" pattern.
df = spark.createDataFrame(data, schema=columns).withColumn(
    "Joining_Date",
    to_date(col("Joining_Date"), "yyyy-MM-dd"),
)

In [8]:
# 💰 1. Add a Yearly_Bonus column equal to 15% of each employee's Salary.
# The result is assigned back to `df`, so later cells see the extra column.
df = df.withColumn(
    "Yearly_Bonus",
    col("Salary") * 0.15,
)


In [9]:
# 📊 2. Top 2 highest-paid employees per department.
# Rows are ranked inside each Department by Salary, highest first, and only
# ranks 1 and 2 are kept.
# NOTE: rank() gives tied salaries the same rank, so a department can
# contribute more than two rows if salaries tie.
windowSpec = (
    Window
    .partitionBy("Department")
    .orderBy(col("Salary").desc())
)
df_top2 = (
    df
    .withColumn("Rank", rank().over(windowSpec))
    .where(col("Rank") <= 2)
)

In [10]:
# 🧹 3. Keep only employees whose Joining_Date is strictly earlier than
# 1 January 2021 (so anyone who joined on 2021-01-01 itself is excluded).
df_before_2021 = df.where(
    col("Joining_Date") < to_date(lit("2021-01-01"))
)

In [11]:
# 🔀 4. Performance scores to be joined onto the employee data.
# Written as a name -> score mapping and flattened into a list of
# (Name, Performance_Score) tuples, in the order the entries appear below.
performance_data = list(
    {
        "Alice": 88,
        "Bob": 92,
        "Cathy": 95,
        "David": 90,
        "Eva": 80,
        "Frank": 76,
        "Grace": 85,
        "Henry": 94,
        "Ivy": 87,
        "Jake": 82,
        "Karen": 79,
        "Leo": 84,
        "Mona": 96,
        "Nick": 89,
    }.items()
)

In [13]:
# Turn the performance scores into their own DataFrame, then left-join it onto
# the employee data by Name. A left join keeps every employee row; an employee
# without a matching score would get a null Performance_Score.
df_perf = spark.createDataFrame(
    performance_data,
    schema=["Name", "Performance_Score"],
)
df_joined = df.join(df_perf, "Name", "left")

In [15]:
# ✅ Display results, starting with each employee's salary and computed bonus.
print("🎯 Full Data with Bonus:")
(
    df
    .select(["Name", "Salary", "Yearly_Bonus"])
    .show()
)

🎯 Full Data with Bonus:
+-----+------+------------+
| Name|Salary|Yearly_Bonus|
+-----+------+------------+
|Alice| 40000|      6000.0|
|  Bob| 60000|      9000.0|
|Cathy| 75000|     11250.0|
|David| 80000|     12000.0|
|  Eva| 72000|     10800.0|
|Frank| 42000|      6300.0|
|Grace| 52000|      7800.0|
|Henry| 70000|     10500.0|
|  Ivy| 68000|     10200.0|
| Jake| 55000|      8250.0|
|Karen| 45000|      6750.0|
|  Leo| 58000|      8700.0|
| Mona| 76000|     11400.0|
| Nick| 60000|      9000.0|
+-----+------+------------+



In [16]:
# Display the per-department salary ranking, limited to ranks 1 and 2.
print("📊 Top 2 Highest-Paid per Department:")
(
    df_top2
    .select(["Name", "Department", "Salary", "Rank"])
    .show()
)

📊 Top 2 Highest-Paid per Department:
+-----+----------+------+----+
| Name|Department|Salary|Rank|
+-----+----------+------+----+
|David|   Finance| 80000|   1|
|Henry|   Finance| 70000|   2|
|Karen|        HR| 45000|   1|
|Frank|        HR| 42000|   2|
| Mona|        IT| 76000|   1|
|Cathy|        IT| 75000|   2|
| Nick| Marketing| 60000|   1|
| Jake| Marketing| 55000|   2|
+-----+----------+------+----+



In [17]:
# Display the employees whose joining date falls before 2021.
print("🧹 Employees Joined Before 2021:")
(
    df_before_2021
    .select(["Name", "Joining_Date"])
    .show()
)

🧹 Employees Joined Before 2021:
+-----+------------+
| Name|Joining_Date|
+-----+------------+
|  Bob|  2019-07-15|
|Cathy|  2020-03-01|
|David|  2018-09-10|
|Frank|  2020-11-20|
|Henry|  2017-02-15|
|  Ivy|  2020-10-25|
| Jake|  2020-01-30|
|Karen|  2019-08-05|
| Mona|  2018-12-25|
| Nick|  2019-03-15|
+-----+------------+



In [18]:
# Display each employee's salary next to the performance score joined on Name.
print("🔀 Data After Joining Performance Score:")
(
    df_joined
    .select(["Name", "Salary", "Performance_Score"])
    .show()
)

🔀 Data After Joining Performance Score:
+-----+------+-----------------+
| Name|Salary|Performance_Score|
+-----+------+-----------------+
|Grace| 52000|               85|
|  Eva| 72000|               80|
|  Bob| 60000|               92|
|Alice| 40000|               88|
|David| 80000|               90|
|Cathy| 75000|               95|
|Frank| 42000|               76|
|  Ivy| 68000|               87|
| Jake| 55000|               82|
| Nick| 60000|               89|
|  Leo| 58000|               84|
| Mona| 76000|               96|
|Karen| 45000|               79|
|Henry| 70000|               94|
+-----+------+-----------------+

