<a href="https://colab.research.google.com/github/git791/FDE_LAB_3/blob/main/Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ELT using Spark
# Extract and load


In [1]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("ELT Demo")
    .getOrCreate()
)

# Extract raw data from the source CSV

In [3]:
# Read the customer CSV as-is: the first row supplies the column names and
# Spark infers each column's type from the data.
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# no mount step is visible in this notebook - confirm before Run All.
raw_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/FDE/customers.csv")
)

# Load raw data (unchanged) into Parquet storage

In [4]:
# Persist the untouched raw frame as Parquet, replacing any previous copy.
(
    raw_df.write
    .mode("overwrite")
    .parquet("/content/drive/MyDrive/FDE/spark_output/raw_customers")
)

# Transform inside Spark after loading

In [5]:
from pyspark.sql.functions import col, upper, when, avg, count

In [7]:
# Transform stage: rebuild the cleaned customer frame from the persisted raw
# Parquet copy, then aggregate it per (city, spend_category).

# Spend-band boundaries applied to annual_spend.
LOW_SPEND_BELOW = 80000       # annual_spend < 80000            -> "Low"
MEDIUM_SPEND_UP_TO = 120000   # 80000 <= annual_spend <= 120000 -> "Medium"

df = (
    spark.read.parquet("/content/drive/MyDrive/FDE/spark_output/raw_customers")
    .dropDuplicates()                 # drop rows that are identical in every column
    .fillna({"city": "Unknown"})      # missing city -> "Unknown"
    .withColumn("customer_name", upper(col("customer_name")))
    .withColumn(
        "spend_category",
        # Each band is an explicit condition and there is deliberately no
        # otherwise(): a NULL annual_spend satisfies none of the comparisons,
        # so it gets a NULL category instead of being swept into "High".
        when(col("annual_spend") < LOW_SPEND_BELOW, "Low")
        .when(col("annual_spend") <= MEDIUM_SPEND_UP_TO, "Medium")
        .when(col("annual_spend") > MEDIUM_SPEND_UP_TO, "High"),
    )
)

# One row per (city, spend_category): number of non-null customer_id values
# and the mean annual_spend in that group.
elt_output = df.groupBy("city", "spend_category").agg(
    count("customer_id").alias("total_customers"),
    avg("annual_spend").alias("avg_spend"),
)

# NOTE(review): this is a relative path, unlike the absolute Drive path used
# for the raw data above - presumably it lands in the session's working
# directory rather than on Drive; confirm that is intended.
elt_output.write.mode("overwrite").parquet("elt_analytics_ready")
elt_output.show()


+---------+--------------+---------------+-----------------+
|     city|spend_category|total_customers|        avg_spend|
+---------+--------------+---------------+-----------------+
|  Unknown|          High|              1|         132000.0|
|  Unknown|           Low|              3|71333.33333333333|
|     Pune|          High|              1|         145000.0|
|    Noida|           Low|              1|          78000.0|
|  Kolkata|           Low|              1|          68000.0|
|Bengaluru|          High|              1|         150000.0|
|   Mumbai|          High|              1|         140000.0|
|  Kolkata|        Medium|              1|          90000.0|
|  Gurgaon|          High|              1|         155000.0|
|Ahmedabad|        Medium|              1|         110000.0|
|  Chennai|           Low|              1|          72000.0|
|Bengaluru|        Medium|              1|          82000.0|
|    Delhi|          High|              1|         170000.0|
|  Unknown|        Mediu