## **_Activity 1 - Selecting Specific Columns_**

In [0]:
from pyspark.sql.functions import col

# =============================
# Load the employee CSV file
# =============================
# Only the "header" option is set: no schema is supplied and inferSchema is
# not enabled, so the CSV reader leaves every column typed as string.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/Volumes/workspace/default/my_data/employee_data.csv")  # File path
)

# Show what was loaded
df.show()


# =============================
# select(): pick columns by name
# =============================
df1 = df.select("name", "address")
df1.show()


# =============================
# selectExpr(): select() that takes SQL expression strings
# =============================
# Handy when a column should be computed while selecting; here a derived
# "bonus" column is produced next to the two plain columns.
df2 = df.selectExpr("name", "address", "salary * 0.5 as bonus")
df2.show()


# =============================
# col(): build a Column object
# =============================
# A Column object can have functions and transformations applied to it, or be
# renamed with alias(). Here 'name' is exposed as 'first_name'.
df3 = df.select(col("name").alias("first_name"))

# Show the renamed column
df3.show()


+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+

+------+---------+
|  name|  address|
+------+---------+
|  John|   Mumbai|
|  Emma|    Delhi|
|   Raj|     Pune|
| Priya|Bangalore|
|  Alex|Hyderabad|
|  Sara|  Chenn

## **_Activity 2 - Adding New Columns_**
### - **_Using withColumn_**

In [0]:
# NOTE(review): this wildcard import also pulls in Spark functions that share
# names with Python builtins (e.g. sum, min, max, round). It is kept unchanged
# because other cells may rely on the names it provides; prefer explicit
# imports such as `from pyspark.sql.functions import col, lit, upper` in new code.
from pyspark.sql.functions import *

# withColumn(name, expression) returns a new DataFrame in which the column
# `name` is added, or replaced if it already exists.

# 1) New computed column: bonus = 5% of salary
df.withColumn(
    "bonus",
    col("salary") * 0.05,
).show()


# 2) New constant column: lit("India") yields the same value on every row
df.withColumn(
    "country",
    lit("India"),
).show()


# 3) Overwrite an existing column: "address" already exists, so it is replaced
#    by its upper-cased version
df.withColumn(
    "address",
    upper(col("address")),
).show()


# 4) Change a column's data type with cast(): "id" becomes an integer
df.withColumn(
    "id",
    col("id").cast("int"),
).show()


# 5) Several columns in one go by chaining withColumn():
#    first the computed bonus, then the constant country
(
    df
    .withColumn("bonus", col("salary") * 0.05)
    .withColumn("country", lit("India"))
    .show()
)


+---+------+------+---------+----------+----------+------+
| id|  name|salary|  address|department|joineddate| bonus|
+---+------+------+---------+----------+----------+------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|2500.0|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|3100.0|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|2250.0|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|3500.0|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|2750.0|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|2400.0|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|3600.0|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|2650.0|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|3000.0|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|2900.0|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|3250.0|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|2450.0|
+---+------+------+---------+----------+----------+------+

+---+------+------+---------+----------+----------+----

## **_Activity 3 - Dropping Columns_**
### - **Using Drop**

In [0]:
# ===================================================================
# drop(): remove columns from a DataFrame
# It accepts one column name or several, and returns a new DataFrame;
# `df` itself is left unchanged.
# ===================================================================

# Remove "department" and "address" in a single call.
# Note: this rebinds the name df2, which Activity 1 used for a different result.
df2 = df.drop(
    "department",
    "address",
)
df2.show()

# ===================================================================
# Single-column variant, for reference:
#   df.drop("department").show()
# ===================================================================


+---+------+------+----------+
| id|  name|salary|joineddate|
+---+------+------+----------+
|  1|  John| 50000|2021-01-15|
|  2|  Emma| 62000|2020-03-10|
|  3|   Raj| 45000|2022-07-20|
|  4| Priya| 70000|2019-11-05|
|  5|  Alex| 55000|2021-05-12|
|  6|  Sara| 48000|2020-08-25|
|  7|Nikhil| 72000|2022-02-14|
|  8| Aisha| 53000|2021-09-30|
|  9|   Tom| 60000|2023-01-18|
| 10|  Riya| 58000|2020-12-02|
| 11|Vikram| 65000|2019-04-22|
| 12|  Zara| 49000|2022-10-11|
+---+------+------+----------+



## **_Activity 4 - Renaming Columns_**
### - **Using withColumnRenamed**
### - **Using toDF**

In [0]:
# -------------------------------------------------------------------
# Renaming specific columns using withColumnRenamed()
# Chain one call per column that needs a new name
# -------------------------------------------------------------------
(
    df
    .withColumnRenamed("id", "emp_id")
    .withColumnRenamed("name", "emp_name")
    .show()
)

# -------------------------------------------------------------------
# Renaming ALL columns at once using toDF()
# toDF() requires the same number of new column names as the existing ones
# -------------------------------------------------------------------
df.toDF("emp_id", "emp_name", "emp_sal", "emp_add", "emp_dept", "emp_joindate").show()

# -------------------------------------------------------------------
# Getting all column names from the DataFrame
# df.columns returns a Python list of column names.
# It is printed explicitly: a bare expression is only displayed when it is
# the LAST statement of a cell, so a bare `df.columns` here shows nothing.
# -------------------------------------------------------------------
print(df.columns)

# -------------------------------------------------------------------
# Renaming all columns programmatically
# Adding prefix "emp_" to every column name
# Using a list comprehension to build the new names dynamically
# -------------------------------------------------------------------
data = ["emp_" + name for name in df.columns]  # New column name list
df.toDF(*data).show()                          # Unpack the list with *

# ------------------------------------------------------------
# Without * (WRONG)
# df.toDF(data) is the same as df.toDF(["emp_id", "emp_name", "emp_sal"]):
# the whole list is passed as ONE argument, so toDF() will fail.
# ------------------------------------------------------------
# With *data (CORRECT)
# The * operator UNPACKS the list into separate arguments:
#   ["emp_id", "emp_name", "emp_sal"]
# becomes
#   "emp_id", "emp_name", "emp_sal"
# so df.toDF(*data) is the same as df.toDF("emp_id", "emp_name", "emp_sal")
# ------------------------------------------------------------


+------+--------+------+---------+----------+----------+
|emp_id|emp_name|salary|  address|department|joineddate|
+------+--------+------+---------+----------+----------+
|     1|    John| 50000|   Mumbai|        IT|2021-01-15|
|     2|    Emma| 62000|    Delhi|        HR|2020-03-10|
|     3|     Raj| 45000|     Pune|   Finance|2022-07-20|
|     4|   Priya| 70000|Bangalore|        IT|2019-11-05|
|     5|    Alex| 55000|Hyderabad| Marketing|2021-05-12|
|     6|    Sara| 48000|  Chennai|   Finance|2020-08-25|
|     7|  Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|     8|   Aisha| 53000|  Kolkata|        IT|2021-09-30|
|     9|     Tom| 60000|    Surat| Marketing|2023-01-18|
|    10|    Riya| 58000|   Jaipur|   Finance|2020-12-02|
|    11|  Vikram| 65000|   Nagpur|        HR|2019-04-22|
|    12|    Zara| 49000|   Indore|        IT|2022-10-11|
+------+--------+------+---------+----------+----------+

+------+--------+-------+---------+---------+------------+
|emp_id|emp_name|emp_sal|  e

## **_Activity 5 - Removing Duplicates_**
### - **Using distinct**
### - **Using dropDuplicates**

In [0]:
# Creating sample data.
# Only rows that match an earlier row in EVERY column (including id) are true
# duplicates; rows that differ only in id still count as distinct rows.
data = [
    (1, "Amit", 50000, "Sales"),
    (2, "Neha", 60000, "HR"),
    (1, "Amit", 50000, "Sales"),   # exact duplicate of the first row
    (4, "Raj", 70000, "IT"),
    (5, "Neha", 60000, "HR"),      # same name/salary/dept as id 2, different id -> not a full-row duplicate
    (6, "Sneha", 55000, "Sales"),
    (4, "Raj", 70000, "IT"),       # exact duplicate of the (4, "Raj") row above
    (8, "Amit", 52000, "Marketing"),
    (9, "Neha", 60000, "HR")       # same name/salary/dept as id 2, different id -> not a full-row duplicate
]

columns = ["id", "name", "salary", "dept"]

emp_df = spark.createDataFrame(data, columns)

In [0]:
# -------------------------------------------------------------------
# distinct() - removes rows that are identical across ALL columns
# It takes no arguments, so it cannot be limited to specific columns
# -------------------------------------------------------------------
emp_df.distinct().show()


# -------------------------------------------------------------------
# dropDuplicates() with no arguments - also compares ALL columns,
# so it gives the same result as distinct()
# -------------------------------------------------------------------
emp_df.dropDuplicates().show()


# -------------------------------------------------------------------
# dropDuplicates(subset=["name"])
# Compares ONLY the "name" column and keeps one row per unique name.
# Which of the matching rows is kept is NOT guaranteed (it depends on how
# the data is partitioned); sort or use a window function if a specific
# row must survive.
# -------------------------------------------------------------------
emp_df.dropDuplicates(subset=["name"]).show()


# -------------------------------------------------------------------
# dropDuplicates(subset=["name", "salary"])
# Rows are duplicates when BOTH "name" AND "salary" match;
# one row per (name, salary) combination is kept
# -------------------------------------------------------------------
emp_df.dropDuplicates(subset=["name", "salary"]).show()


# -------------------------------------------------------------------
# distinct("name") - NOT allowed
# distinct() does NOT accept column names; it always compares whole rows.
# Use dropDuplicates(subset=[...]) to deduplicate on specific columns.
# -------------------------------------------------------------------
# emp_df.distinct("name").show()  # ❌ This will throw an error


+---+-----+------+---------+
| id| name|salary|     dept|
+---+-----+------+---------+
|  1| Amit| 50000|    Sales|
|  2| Neha| 60000|       HR|
|  4|  Raj| 70000|       IT|
|  5| Neha| 60000|       HR|
|  6|Sneha| 55000|    Sales|
|  8| Amit| 52000|Marketing|
|  9| Neha| 60000|       HR|
+---+-----+------+---------+

+---+-----+------+---------+
| id| name|salary|     dept|
+---+-----+------+---------+
|  1| Amit| 50000|    Sales|
|  2| Neha| 60000|       HR|
|  4|  Raj| 70000|       IT|
|  5| Neha| 60000|       HR|
|  6|Sneha| 55000|    Sales|
|  8| Amit| 52000|Marketing|
|  9| Neha| 60000|       HR|
+---+-----+------+---------+

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  1| Amit| 50000|Sales|
|  2| Neha| 60000|   HR|
|  4|  Raj| 70000|   IT|
|  6|Sneha| 55000|Sales|
+---+-----+------+-----+

+---+-----+------+---------+
| id| name|salary|     dept|
+---+-----+------+---------+
|  1| Amit| 50000|    Sales|
|  2| Neha| 60000|       HR|
|  4|  Raj| 70

%md
## **_Activity 6 - Removing Null Values_**
### - **Using dropna**

In [0]:
# Sample rows with NULLs (None) spread over different columns:
# one complete row, rows missing a single value, and rows missing most or all values.
data = [
    (1,    "Amit",  50000, "Sales"),
    (2,    "Neha",  None,  "HR"),
    (3,    None,    70000, "IT"),
    (4,    "Sneha", 55000, None),
    (5,    None,    None,  None),
    (None, None,    None,  None),
]
columns = ["id", "name", "salary", "dept"]

emp_df = spark.createDataFrame(data, schema=columns)
emp_df.show()

+----+-----+------+-----+
|  id| name|salary| dept|
+----+-----+------+-----+
|   1| Amit| 50000|Sales|
|   2| Neha|  NULL|   HR|
|   3| NULL| 70000|   IT|
|   4|Sneha| 55000| NULL|
|   5| NULL|  NULL| NULL|
|NULL| NULL|  NULL| NULL|
+----+-----+------+-----+



In [0]:
# -------------------------------------------------------------------
# dropna() removes rows that contain NULL values
# how = "any"  → remove the row if *at least one* column is NULL
# This is the default behavior
# -------------------------------------------------------------------
emp_df.dropna().show()                 # Same as emp_df.dropna(how="any").show()


# -------------------------------------------------------------------
# how = "all"
# Remove the row ONLY if *all* columns contain NULL values
# If even one column has a non-null value, the row is kept
# -------------------------------------------------------------------
emp_df.dropna(how="all").show()


# -------------------------------------------------------------------
# subset = ["salary"]
# Remove the row if SALARY is NULL
# Only checks the specified column(s)
# -------------------------------------------------------------------
emp_df.dropna(subset=["salary"]).show()


# -------------------------------------------------------------------
# subset = ["salary", "dept"]
# how defaults to "any" (spelled out here), so the row is removed if
# salary OR dept is NULL; it is kept only when BOTH are non-null.
# To remove a row only when both are NULL, use:
#   emp_df.dropna(how="all", subset=["salary", "dept"])
# -------------------------------------------------------------------
emp_df.dropna(how="any", subset=["salary", "dept"]).show()


+---+----+------+-----+
| id|name|salary| dept|
+---+----+------+-----+
|  1|Amit| 50000|Sales|
+---+----+------+-----+

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  1| Amit| 50000|Sales|
|  2| Neha|  NULL|   HR|
|  3| NULL| 70000|   IT|
|  4|Sneha| 55000| NULL|
|  5| NULL|  NULL| NULL|
+---+-----+------+-----+

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  1| Amit| 50000|Sales|
|  3| NULL| 70000|   IT|
|  4|Sneha| 55000| NULL|
+---+-----+------+-----+

+---+----+------+-----+
| id|name|salary| dept|
+---+----+------+-----+
|  1|Amit| 50000|Sales|
|  3|NULL| 70000|   IT|
+---+----+------+-----+



## **_Activity 7 - Filter Data_**
### - **Using filter**
### - **Using where (where is an alias for filter)**

In [0]:
# Sample employees used by the filter examples.
# NOTE(review): id 4 appears twice (Sneha and Karan) - confirm whether Karan
# was meant to be id 5. The filters in this activity do not use id.
data = [
    (1, "Amit",  50000,  "Sales"),
    (2, "Neha",  100000, "HR"),
    (3, "ravi",  70000,  "IT"),
    (4, "Sneha", 55000,  "IT"),
    (4, "Karan", 60000,  "Sales"),
]
columns = ["id", "name", "salary", "dept"]

emp_df = spark.createDataFrame(data, schema=columns)
emp_df.show()

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  1| Amit| 50000|Sales|
|  2| Neha|100000|   HR|
|  3| ravi| 70000|   IT|
|  4|Sneha| 55000|   IT|
|  4|Karan| 60000|Sales|
+---+-----+------+-----+



In [0]:
# ===================================================================
# Condition given as a SQL expression string
# filter() and where() are interchangeable: where() is an alias
# ===================================================================
emp_df.filter("salary > 50000").show()
emp_df.where("salary > 50000").show()

# ===================================================================
# Column expressions combined with AND (&)
# Wrap every individual condition in its own parentheses
# ===================================================================
emp_df.filter((col("salary") > 50000) & (col("dept") == "HR")).show()

emp_df.where((col("salary") > 50000) & (col("dept") == "HR")).show()

# ===================================================================
# Column expressions combined with OR (|)
# Employees whose department is HR or Sales
# ===================================================================
emp_df.filter((col("dept") == "HR") | (col("dept") == "Sales")).show()

# ===================================================================
# isin(): tidier than chaining several OR conditions
# Keeps rows whose dept equals any value in the list
# (this example matches Sales and IT, not the HR/Sales pair used above)
# ===================================================================
emp_df.filter(col("dept").isin(["Sales", "IT"])).show()

# ===================================================================
# between(): both the lower and the upper bound are included
# Salaries from 40,000 up to and including 60,000
# ===================================================================
emp_df.filter(col("salary").between(40000, 60000)).show()

# ===================================================================
# like(): SQL pattern matching (case-sensitive)
#   "S%"   -> value starts with 'S'
#   "%a%"  -> value contains a lowercase 'a'
# ===================================================================
emp_df.filter(col("dept").like("S%")).show()    # department starts with S
emp_df.filter(col("name").like("%a%")).show()   # name contains 'a'


+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  2| Neha|100000|   HR|
|  3| ravi| 70000|   IT|
|  4|Sneha| 55000|   IT|
|  4|Karan| 60000|Sales|
+---+-----+------+-----+

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  2| Neha|100000|   HR|
|  3| ravi| 70000|   IT|
|  4|Sneha| 55000|   IT|
|  4|Karan| 60000|Sales|
+---+-----+------+-----+

+---+----+------+----+
| id|name|salary|dept|
+---+----+------+----+
|  2|Neha|100000|  HR|
+---+----+------+----+

+---+----+------+----+
| id|name|salary|dept|
+---+----+------+----+
|  2|Neha|100000|  HR|
+---+----+------+----+

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  1| Amit| 50000|Sales|
|  2| Neha|100000|   HR|
|  4|Karan| 60000|Sales|
+---+-----+------+-----+

+---+-----+------+-----+
| id| name|salary| dept|
+---+-----+------+-----+
|  1| Amit| 50000|Sales|
|  3| ravi| 70000|   IT|
|  4|Sneha| 55000|   IT|
|  4|Karan| 60000|Sales|
+---+-----+----

## **_Activity 8 - Sorting Rows_**
### - **Using sort**
### - **Using orderBy**

In [0]:
# Display the employee DataFrame (`df`, loaded from CSV in Activity 1) before sorting it
df.show()

+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
+---+------+------+---------+----------+----------+



In [0]:
# -------------------------------------------------------------------
# `df` was read from CSV with only the header option (no schema and no
# inferSchema), so "salary" is a STRING column. Sorting a string column is
# lexicographic ("100000" would sort before "45000"), so the sort key is
# cast to int to get a true numeric ordering. The displayed columns are
# unchanged; only the ordering key is cast.
# -------------------------------------------------------------------
salary_numeric = col("salary").cast("int")

# -------------------------------------------------------------------
# Sorting DataFrame by a single column (ascending by default)
# sort() and orderBy() are interchangeable: orderBy() is an alias of sort()
# -------------------------------------------------------------------
df.sort(salary_numeric).show()

# -------------------------------------------------------------------
# Sorting by a column in descending order using desc()
# -------------------------------------------------------------------
df.sort(salary_numeric.desc()).show()

# -------------------------------------------------------------------
# Sorting by multiple columns
# First by 'department', then by 'salary' (both ascending)
# -------------------------------------------------------------------
df.sort("department", salary_numeric).show()

# -------------------------------------------------------------------
# Sorting by multiple columns with mixed order
# 1) department → ascending
# 2) salary     → descending
# -------------------------------------------------------------------
df.sort("department", salary_numeric.desc()).show()


+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  3|   Raj| 45000|     Pune|   Finance|2022-07-20|
|  6|  Sara| 48000|  Chennai|   Finance|2020-08-25|
| 12|  Zara| 49000|   Indore|        IT|2022-10-11|
|  1|  John| 50000|   Mumbai|        IT|2021-01-15|
|  8| Aisha| 53000|  Kolkata|        IT|2021-09-30|
|  5|  Alex| 55000|Hyderabad| Marketing|2021-05-12|
| 10|  Riya| 58000|   Jaipur|   Finance|2020-12-02|
|  9|   Tom| 60000|    Surat| Marketing|2023-01-18|
|  2|  Emma| 62000|    Delhi|        HR|2020-03-10|
| 11|Vikram| 65000|   Nagpur|        HR|2019-04-22|
|  4| Priya| 70000|Bangalore|        IT|2019-11-05|
|  7|Nikhil| 72000|Ahmedabad|        HR|2022-02-14|
+---+------+------+---------+----------+----------+

+---+------+------+---------+----------+----------+
| id|  name|salary|  address|department|joineddate|
+---+------+------+---------+----------+----------+
|  7|Nikhil