<a href="https://colab.research.google.com/github/codeprakash309/PySparkCodeHub/blob/PySparkCodeHub(Prakash)/DataFrameSQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg,col

# Start (or reuse) the Spark session that every cell below runs against.
spark = (
    SparkSession.builder
    .appName("PracticeTransformations")
    .getOrCreate()
)

In [2]:
# Column names, in the same order as the fields of each record in `data`.
columns = ["Name", "Age", "Department", "Salary", "City"]

# Sample employee records: one tuple per employee, fields ordered as in `columns`.
data = [
    ("Alice", 30, "HR", 40000, "New York"),
    ("Bob", 35, "Finance", 60000, "Chicago"),
    ("Cathy", 28, "IT", 75000, "San Francisco"),
    ("David", 45, "Finance", 80000, "New York"),
    ("Eva", 32, "IT", 72000, "Boston"),
    ("Frank", 29, "HR", 42000, "Chicago"),
    ("Grace", 41, "Marketing", 52000, "Boston"),
    ("Henry", 36, "Finance", 70000, "San Francisco"),
    ("Ivy", 27, "IT", 68000, "New York"),
    ("Jake", 39, "Marketing", 55000, "Chicago"),
    ("Karen", 31, "HR", 45000, "Boston"),
    ("Leo", 26, "Finance", 58000, "New York"),
    ("Mona", 33, "IT", 76000, "Chicago"),
    ("Nick", 40, "Marketing", 60000, "San Francisco"),
]

In [3]:
# Build the employee DataFrame from the in-memory records, naming the
# columns from `columns`, then print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+---+----------+------+-------------+
| Name|Age|Department|Salary|         City|
+-----+---+----------+------+-------------+
|Alice| 30|        HR| 40000|     New York|
|  Bob| 35|   Finance| 60000|      Chicago|
|Cathy| 28|        IT| 75000|San Francisco|
|David| 45|   Finance| 80000|     New York|
|  Eva| 32|        IT| 72000|       Boston|
|Frank| 29|        HR| 42000|      Chicago|
|Grace| 41| Marketing| 52000|       Boston|
|Henry| 36|   Finance| 70000|San Francisco|
|  Ivy| 27|        IT| 68000|     New York|
| Jake| 39| Marketing| 55000|      Chicago|
|Karen| 31|        HR| 45000|       Boston|
|  Leo| 26|   Finance| 58000|     New York|
| Mona| 33|        IT| 76000|      Chicago|
| Nick| 40| Marketing| 60000|San Francisco|
+-----+---+----------+------+-------------+



In [5]:
# Keep only the employees whose Age is strictly greater than 35.
(
    df
    .where(col("Age") > 35)
    .show()
)

+-----+---+----------+------+-------------+
| Name|Age|Department|Salary|         City|
+-----+---+----------+------+-------------+
|David| 45|   Finance| 80000|     New York|
|Grace| 41| Marketing| 52000|       Boston|
|Henry| 36|   Finance| 70000|San Francisco|
| Jake| 39| Marketing| 55000|      Chicago|
| Nick| 40| Marketing| 60000|San Francisco|
+-----+---+----------+------+-------------+



In [9]:
# Average Salary per Department, reported under the column name "Avg_Salary".
avg_salary = avg("Salary").alias("Avg_Salary")
df.groupBy("Department").agg(avg_salary).show()

+----------+------------------+
|Department|        Avg_Salary|
+----------+------------------+
|        HR|42333.333333333336|
|   Finance|           67000.0|
| Marketing|55666.666666666664|
|        IT|           72750.0|
+----------+------------------+



In [11]:
# Add a "Tax" column computed as 10% of Salary.
# `df` is rebound here, so the cells that follow see the extra column.
TAX_RATE = 0.10
df = df.withColumn("Tax", col("Salary") * TAX_RATE)
df.show()

+-----+---+----------+------+-------------+------+
| Name|Age|Department|Salary|         City|   Tax|
+-----+---+----------+------+-------------+------+
|Alice| 30|        HR| 40000|     New York|4000.0|
|  Bob| 35|   Finance| 60000|      Chicago|6000.0|
|Cathy| 28|        IT| 75000|San Francisco|7500.0|
|David| 45|   Finance| 80000|     New York|8000.0|
|  Eva| 32|        IT| 72000|       Boston|7200.0|
|Frank| 29|        HR| 42000|      Chicago|4200.0|
|Grace| 41| Marketing| 52000|       Boston|5200.0|
|Henry| 36|   Finance| 70000|San Francisco|7000.0|
|  Ivy| 27|        IT| 68000|     New York|6800.0|
| Jake| 39| Marketing| 55000|      Chicago|5500.0|
|Karen| 31|        HR| 45000|       Boston|4500.0|
|  Leo| 26|   Finance| 58000|     New York|5800.0|
| Mona| 33|        IT| 76000|      Chicago|7600.0|
| Nick| 40| Marketing| 60000|San Francisco|6000.0|
+-----+---+----------+------+-------------+------+



In [13]:
# Number of employees in each City.
employees_per_city = df.groupBy("City").count()
employees_per_city.show()

+-------------+-----+
|         City|count|
+-------------+-----+
|San Francisco|    3|
|      Chicago|    4|
|     New York|    4|
|       Boston|    3|
+-------------+-----+



In [15]:
# List employees from highest to lowest Salary.
salary_descending = col("Salary").desc()
df.sort(salary_descending).show()

+-----+---+----------+------+-------------+------+
| Name|Age|Department|Salary|         City|   Tax|
+-----+---+----------+------+-------------+------+
|David| 45|   Finance| 80000|     New York|8000.0|
| Mona| 33|        IT| 76000|      Chicago|7600.0|
|Cathy| 28|        IT| 75000|San Francisco|7500.0|
|  Eva| 32|        IT| 72000|       Boston|7200.0|
|Henry| 36|   Finance| 70000|San Francisco|7000.0|
|  Ivy| 27|        IT| 68000|     New York|6800.0|
|  Bob| 35|   Finance| 60000|      Chicago|6000.0|
| Nick| 40| Marketing| 60000|San Francisco|6000.0|
|  Leo| 26|   Finance| 58000|     New York|5800.0|
| Jake| 39| Marketing| 55000|      Chicago|5500.0|
|Grace| 41| Marketing| 52000|       Boston|5200.0|
|Karen| 31|        HR| 45000|       Boston|4500.0|
|Frank| 29|        HR| 42000|      Chicago|4200.0|
|Alice| 30|        HR| 40000|     New York|4000.0|
+-----+---+----------+------+-------------+------+



In [16]:
# Attach each department's budget to the employee rows.
# The budget table has one row per department: (Department, Budget).
budget_columns = ["Department", "Budget"]
budget_data = [
    ("HR", 100000),
    ("Finance", 200000),
    ("IT", 250000),
    ("Marketing", 150000),
]

df_budget = spark.createDataFrame(budget_data, schema=budget_columns)

# Left join on Department: every employee row is kept, and Budget would be
# null for a department that is absent from df_budget.
df_joined = df.join(
    other=df_budget,
    on="Department",
    how="left",
)
df_joined.show()

+----------+-----+---+------+-------------+------+------+
|Department| Name|Age|Salary|         City|   Tax|Budget|
+----------+-----+---+------+-------------+------+------+
|        HR|Alice| 30| 40000|     New York|4000.0|100000|
|        HR|Frank| 29| 42000|      Chicago|4200.0|100000|
|   Finance|  Bob| 35| 60000|      Chicago|6000.0|200000|
|   Finance|David| 45| 80000|     New York|8000.0|200000|
| Marketing|Grace| 41| 52000|       Boston|5200.0|150000|
|        IT|Cathy| 28| 75000|San Francisco|7500.0|250000|
|        IT|  Eva| 32| 72000|       Boston|7200.0|250000|
|        HR|Karen| 31| 45000|       Boston|4500.0|100000|
|   Finance|Henry| 36| 70000|San Francisco|7000.0|200000|
|   Finance|  Leo| 26| 58000|     New York|5800.0|200000|
| Marketing| Jake| 39| 55000|      Chicago|5500.0|150000|
| Marketing| Nick| 40| 60000|San Francisco|6000.0|150000|
|        IT|  Ivy| 27| 68000|     New York|6800.0|250000|
|        IT| Mona| 33| 76000|      Chicago|7600.0|250000|
+----------+--