In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat, lit, floor, rand

# Obtain the notebook's Spark session: getOrCreate() reuses an active session
# if one exists, otherwise it starts a new one named "ComplexETL".
spark = (
    SparkSession.builder
    .appName("ComplexETL")
    .getOrCreate()
)


In [None]:
import pandas as pd

# Location of the demo CSV that later cells read back with Spark.
sample_csv_path = "sample_customers.csv"

# Five demo customers, stored column-wise; the key order below becomes the
# column order of the DataFrame and therefore of the CSV header.
data = dict(
    cust_id=[1, 2, 3, 4, 5],
    first_name=["Arun", "Priya", "Kumar", "Divya", "Siva"],
    last_name=["Kumar", "Sharma", "Rao", "Raj", "Moorthy"],
    cust_order=[10, 12, 7, 15, 9],
    cust_status=["Active", "Inactive", "Active", "Active", "Inactive"],
)
pdf = pd.DataFrame.from_dict(data)

# index=False keeps the pandas row index out of the file.
pdf.to_csv(sample_csv_path, index=False)


In [None]:
# ETL endpoints:
#   source_path - the demo CSV written earlier in the notebook
#   target_path - folder that receives the PySpark CSV output
source_path, target_path = sample_csv_path, "output_customers"


In [None]:
# Declare the column types up front (DDL string) instead of letting Spark infer them.
customer_schema = "cust_id int, first_name string, last_name string, cust_order int, cust_status string"

# The first line of the file is a header row, not data.
df = (
    spark.read
    .schema(customer_schema)
    .option("header", True)
    .csv(source_path)
)
df.show()


+-------+----------+---------+----------+-----------+
|cust_id|first_name|last_name|cust_order|cust_status|
+-------+----------+---------+----------+-----------+
|      1|      Arun|    Kumar|        10|     Active|
|      2|     Priya|   Sharma|        12|   Inactive|
|      3|     Kumar|      Rao|         7|     Active|
|      4|     Divya|      Raj|        15|     Active|
|      5|      Siva|  Moorthy|         9|   Inactive|
+-------+----------+---------+----------+-----------+



In [None]:
# Build "<first_name> <last_name>" as a single display column.
full_name_expr = concat(col("first_name"), lit(" "), col("last_name"))
df = df.withColumn("full_name", full_name_expr)


In [None]:
# Simulate a salary near 10,000: floor(10000 + U * 50) with U drawn from rand(),
# giving whole numbers in the range 10000-10049.
#
# rand() is given an explicit seed so that a fresh "Restart & Run All" reproduces
# the same salaries; unseeded, every run of the notebook picks a new random seed.
# NOTE: seeded values are still tied to how the data is partitioned, so they are
# stable for the same input file and Spark configuration.
# Keep this seed different from any other rand() column, otherwise the two
# columns would be built from identical draws.
df = df.withColumn(
    "net_salary",
    floor(lit(10000) + rand(seed=42) * lit(50))
)


In [None]:
# Simulate an age: floor(20 + U * 31) with U drawn from rand(), giving whole
# numbers in the range 20-50.
#
# rand() is given an explicit seed so that a fresh "Restart & Run All" reproduces
# the same ages (and therefore the same rows after any age-based filtering);
# unseeded, every run of the notebook picks a new random seed.
# NOTE: seeded values are still tied to how the data is partitioned, so they are
# stable for the same input file and Spark configuration.
# Keep this seed different from any other rand() column, otherwise the two
# columns would be built from identical draws.
df = df.withColumn(
    "age",
    floor(lit(20) + rand(seed=7) * lit(31))
)


In [None]:
# Keep only customers aged 30 or older.
df = df.where(df["age"] >= 30)


In [None]:
# Mean net salary per age; Spark names the aggregate column "avg(net_salary)",
# so rename it to something friendlier before displaying.
salary_per_age = df.groupBy("age").avg("net_salary")
avg_salary_by_age = salary_per_age.withColumnRenamed("avg(net_salary)", "avg_salary")
avg_salary_by_age.show()


+---+----------+
|age|avg_salary|
+---+----------+
| 33|   10037.0|
| 42|   10042.0|
+---+----------+



In [None]:
# Sort the remaining customers from youngest to oldest and display them.
df = df.sort(col("age").asc())
df.show()


+-------+----------+---------+----------+-----------+------------+----------+---+
|cust_id|first_name|last_name|cust_order|cust_status|   full_name|net_salary|age|
+-------+----------+---------+----------+-----------+------------+----------+---+
|      2|     Priya|   Sharma|        12|   Inactive|Priya Sharma|     10037| 33|
|      1|      Arun|    Kumar|        10|     Active|  Arun Kumar|     10042| 42|
+-------+----------+---------+----------+-----------+------------+----------+---+



In [None]:
# Write the result as CSV (with a header row) into target_path, replacing any
# output left there by a previous run.
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv(target_path)
)


In [None]:
import shutil

# Zip the whole Spark output folder; make_archive appends ".zip" to the base
# name and returns the path of the archive it created.
archive_path = shutil.make_archive("output_customers_zip", "zip", target_path)

# google.colab only exists inside Colab. Guard the import so the notebook still
# runs to completion elsewhere (the archive is already on disk at this point).
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; archive written to {archive_path}")
else:
    # Trigger a browser download of the archive that was just created.
    files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>