<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/23_12_2025/Excercise_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when,regexp_replace, split, trim, array_compact, transform, get_json_object, lower
spark = SparkSession.builder.appName("cache").getOrCreate()
from pyspark.sql.types import (StructType, StructField, StringType,LongType,IntegerType,ArrayType,MapType,DateType)


In [37]:
data = [
    ("ORD001","Delhi","Laptop",45000),
    ("ORD002","Mumbai","Mobile",32000),
    ("ORD003","Bangalore","Laptop",52000),
    ("ORD004","Delhi","Tablet",28000),
    ("ORD005","Mumbai","Laptop",61000),
    ("ORD006","Chennai","Mobile",30000),
    ("ORD007","Delhi","Laptop",47000),
    ("ORD008","Bangalore","Tablet",35000),
    ("ORD009","Mumbai","Laptop",58000),
    ("ORD010","Delhi","Mobile",29000)
]

columns = ["order_id","city","product","amount"]

df = spark.createDataFrame(data, columns)

In [38]:
high = df.filter(df.amount > 30000)
high.show()

+--------+---------+-------+------+
|order_id|     city|product|amount|
+--------+---------+-------+------+
|  ORD001|    Delhi| Laptop| 45000|
|  ORD002|   Mumbai| Mobile| 32000|
|  ORD003|Bangalore| Laptop| 52000|
|  ORD005|   Mumbai| Laptop| 61000|
|  ORD007|    Delhi| Laptop| 47000|
|  ORD008|Bangalore| Tablet| 35000|
|  ORD009|   Mumbai| Laptop| 58000|
+--------+---------+-------+------+



In [39]:
high.count()

7

In [40]:
high.groupBy("city").sum("amount").show()
high.groupBy("product").avg("amount").show()

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|      87000|
|   Mumbai|     151000|
|    Delhi|      92000|
+---------+-----------+

+-------+-----------+
|product|avg(amount)|
+-------+-----------+
| Laptop|    52600.0|
| Mobile|    32000.0|
| Tablet|    35000.0|
+-------+-----------+



In [41]:
high.cache()

DataFrame[order_id: string, city: string, product: string, amount: bigint]

In [42]:
high.groupBy("city").sum("amount").show()
high.groupBy("product").avg("amount").show()

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|      87000|
|   Mumbai|     151000|
|    Delhi|      92000|
+---------+-----------+

+-------+-----------+
|product|avg(amount)|
+-------+-----------+
| Laptop|    52600.0|
| Mobile|    32000.0|
| Tablet|    35000.0|
+-------+-----------+



In [43]:
high.unpersist() #Deallocate

DataFrame[order_id: string, city: string, product: string, amount: bigint]

In [44]:
df.count()

10

Excercise 1

In [45]:
customer_data = [
("C001","Delhi","Premium"),
("C002","Mumbai","Standard"),
("C003","Bangalore","Premium"),
("C004","Chennai","Standard"),
("C005","Mumbai","Premium")
]

In [46]:
city_lookup = [
("Delhi","Tier-1"),
("Mumbai","Tier-1"),
("Bangalore","Tier-1"),
("Chennai","Tier-2")
]

In [47]:
sales_data = [
    ("TXN001","Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("TXN002","Mumbai","Mobile ","electronics","32000","05/01/2024","Completed"),
    ("TXN003","Bangalore","Tablet"," Electronics ","30000","2024/01/06","Completed"),
    ("TXN004","Delhi","Laptop","Electronics","","2024-01-07","Cancelled"),
    ("TXN005","Chennai","Mobile","Electronics","invalid","2024-01-08","Completed"),
    ("TXN006","Mumbai","Tablet","Electronics",None,"2024-01-08","Completed"),
    ("TXN007","Delhi","Laptop","electronics","45000","09-01-2024","Completed"),
    ("TXN008","Bangalore","Mobile","Electronics","28000","2024-01-09","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("TXN009","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed")
]


1. Create schemas explicitly for all datasets

2. Load raw data into DataFrames

3. Handle incorrect data types gracefully

4. Identify corrupt and invalid records

In [48]:
customer_schema = StructType([
    StructField("customer_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("membership_type", StringType(), True)
])

customer_df =  spark.createDataFrame(customer_data,customer_schema)

In [49]:
city_schema = StructType([
    StructField("city", StringType(), True),
    StructField("tier", StringType(), True)
])

city_df =  spark.createDataFrame(city_lookup,city_schema)

In [50]:
sales_schema = StructType([
    StructField("transaction_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product",StringType(),True),
    StructField("category",StringType(),True),
    StructField("amount",StringType(),True),
    StructField("date",StringType(),True),
     StructField("status",StringType(),True),
])


sales_df =  spark.createDataFrame(sales_data,sales_schema)

In [51]:
from pyspark.sql.functions import col, upper ,to_date, coalesce, split, lit, array_remove, try_to_timestamp,regexp_extract

In [52]:
numeric_price_str = regexp_extract(col("amount"), r"(\d+)", 0)
clean_df = sales_df.withColumn("amount",when((numeric_price_str == "") | numeric_price_str.isNull(), lit(0)).otherwise(numeric_price_str.cast('int')))\
.withColumn(
    "date",
    coalesce(
        to_date(try_to_timestamp(col("date"), lit("yyyy-MM-dd"))),
        to_date(try_to_timestamp(col("date"), lit("dd/MM/yyyy"))),
        to_date(try_to_timestamp(col("date"), lit("yyyy/MM/dd")))
    )
)\
.withColumn("category", upper(trim(col("category"))))\
.withColumn("product",trim(col("product")))\
.withColumn("city",trim(col("city")))

clean_df.show()

+--------------+---------+-------+-----------+------+----------+---------+
|transaction_id|     city|product|   category|amount|      date|   status|
+--------------+---------+-------+-----------+------+----------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|2024-01-05|Completed|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024-01-06|Completed|
|        TXN004|    Delhi| Laptop|ELECTRONICS|     0|2024-01-07|Cancelled|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|2024-01-08|Completed|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|2024-01-08|Completed|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      NULL|Completed|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|
+--------------+---------

In [55]:
filtered_df = clean_df.filter(clean_df["status"] == "Completed")
filtered_df.show()

+--------------+---------+-------+-----------+------+----------+---------+
|transaction_id|     city|product|   category|amount|      date|   status|
+--------------+---------+-------+-----------+------+----------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|2024-01-05|Completed|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024-01-06|Completed|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|2024-01-08|Completed|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|2024-01-08|Completed|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      NULL|Completed|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|
+--------------+---------+-------+-----------+------+----------+---------+



In [56]:
unique = filtered_df.dropDuplicates(['transaction_id'])
unique.show()

+--------------+---------+-------+-----------+------+----------+---------+
|transaction_id|     city|product|   category|amount|      date|   status|
+--------------+---------+-------+-----------+------+----------+---------+
|        TXN001|    Delhi| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|
|        TXN002|   Mumbai| Mobile|ELECTRONICS| 32000|2024-01-05|Completed|
|        TXN003|Bangalore| Tablet|ELECTRONICS| 30000|2024-01-06|Completed|
|        TXN005|  Chennai| Mobile|ELECTRONICS|     0|2024-01-08|Completed|
|        TXN006|   Mumbai| Tablet|ELECTRONICS|     0|2024-01-08|Completed|
|        TXN007|    Delhi| Laptop|ELECTRONICS| 45000|      NULL|Completed|
|        TXN008|Bangalore| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|
|        TXN009|   Mumbai| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|
+--------------+---------+-------+-----------+------+----------+---------+



12. Join sales data with city lookup

13. Use broadcast join where appropriate

14. Explain join strategy used

15. Enrich sales data with city tier

In [57]:
from pyspark.sql.functions import broadcast

enriched_sales_df = unique.join(broadcast(city_df), on='city', how='left')
enriched_sales_df.show()

+---------+--------------+-------+-----------+------+----------+---------+------+
|     city|transaction_id|product|   category|amount|      date|   status|  tier|
+---------+--------------+-------+-----------+------+----------+---------+------+
|    Delhi|        TXN001| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|Tier-1|
|   Mumbai|        TXN002| Mobile|ELECTRONICS| 32000|2024-01-05|Completed|Tier-1|
|Bangalore|        TXN003| Tablet|ELECTRONICS| 30000|2024-01-06|Completed|Tier-1|
|  Chennai|        TXN005| Mobile|ELECTRONICS|     0|2024-01-08|Completed|Tier-2|
|   Mumbai|        TXN006| Tablet|ELECTRONICS|     0|2024-01-08|Completed|Tier-1|
|    Delhi|        TXN007| Laptop|ELECTRONICS| 45000|      NULL|Completed|Tier-1|
|Bangalore|        TXN008| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|Tier-1|
|   Mumbai|        TXN009| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|Tier-1|
+---------+--------------+-------+-----------+------+----------+---------+------+



16. Revenue per city
17. Revenue per product
18. Rank cities by total revenue
19. Rank products within each city
20. Identify top-performing city per day

In [59]:
total_revenue_per_city = enriched_sales_df.groupBy("city").agg({"amount":"sum"})
total_revenue_per_city.show()

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|      58000|
|  Chennai|          0|
|   Mumbai|      87000|
|    Delhi|      90000|
+---------+-----------+



In [60]:
total_revenue_per_city = enriched_sales_df.groupBy("product").agg({"amount":"sum"})
total_revenue_per_city.show()

+-------+-----------+
|product|sum(amount)|
+-------+-----------+
| Laptop|     145000|
| Mobile|      60000|
| Tablet|      30000|
+-------+-----------+



In [61]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc

city_revenue_ranked = enriched_sales_df.groupBy("city").agg({"amount": "sum"}).withColumnRenamed("sum(amount)", "total_revenue")
city_revenue_ranked = city_revenue_ranked.orderBy(desc("total_revenue"))

window_spec = Window.orderBy(desc("total_revenue"))
city_revenue_ranked = city_revenue_ranked.withColumn("city_rank", rank().over(window_spec))
city_revenue_ranked.show()

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|    Delhi|        90000|        1|
|   Mumbai|        87000|        2|
|Bangalore|        58000|        3|
|  Chennai|            0|        4|
+---------+-------------+---------+



In [62]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc, sum

product_revenue_per_city = enriched_sales_df.groupBy("city", "product").agg(sum("amount").alias("product_revenue"))

window_spec_city_product = Window.partitionBy("city").orderBy(desc("product_revenue"))

product_revenue_ranked = product_revenue_per_city.withColumn("product_rank", rank().over(window_spec_city_product))
product_revenue_ranked.show()

+---------+-------+---------------+------------+
|     city|product|product_revenue|product_rank|
+---------+-------+---------------+------------+
|Bangalore| Tablet|          30000|           1|
|Bangalore| Mobile|          28000|           2|
|  Chennai| Mobile|              0|           1|
|    Delhi| Laptop|          90000|           1|
|   Mumbai| Laptop|          55000|           1|
|   Mumbai| Mobile|          32000|           2|
|   Mumbai| Tablet|              0|           3|
+---------+-------+---------------+------------+



21. Identify reusable DataFrames
22. Apply caching appropriately
23. Compare performance with and without cache
24. Repartition data by city
25. Explain why partitioning helps

In [63]:
enriched_sales_df.cache()
enriched_sales_df.show()

+---------+--------------+-------+-----------+------+----------+---------+------+
|     city|transaction_id|product|   category|amount|      date|   status|  tier|
+---------+--------------+-------+-----------+------+----------+---------+------+
|   Mumbai|        TXN006| Tablet|ELECTRONICS|     0|2024-01-08|Completed|Tier-1|
|Bangalore|        TXN003| Tablet|ELECTRONICS| 30000|2024-01-06|Completed|Tier-1|
|   Mumbai|        TXN002| Mobile|ELECTRONICS| 32000|2024-01-05|Completed|Tier-1|
|Bangalore|        TXN008| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|Tier-1|
|  Chennai|        TXN005| Mobile|ELECTRONICS|     0|2024-01-08|Completed|Tier-2|
|    Delhi|        TXN007| Laptop|ELECTRONICS| 45000|      NULL|Completed|Tier-1|
|   Mumbai|        TXN009| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|Tier-1|
|    Delhi|        TXN001| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|Tier-1|
+---------+--------------+-------+-----------+------+----------+---------+------+



In [64]:
import time

start_time_cached = time.time()
total_revenue_cached = enriched_sales_df.groupBy("city").agg({"amount":"sum"}).collect()
end_time_cached = time.time()
print(f"Time taken with caching: {end_time_cached - start_time_cached:.4f} seconds")

Time taken with caching: 5.0342 seconds


In [65]:
enriched_sales_df.unpersist()

start_time_uncached = time.time()
total_revenue_uncached = enriched_sales_df.groupBy("city").agg({"amount":"sum"}).collect()
end_time_uncached = time.time()
print(f"Time taken without caching: {end_time_uncached - start_time_uncached:.4f} seconds")

Time taken without caching: 1.1964 seconds


In [66]:
repartitioned_sales_df = enriched_sales_df.repartition('city')
repartitioned_sales_df.cache()
repartitioned_sales_df.show()

+---------+--------------+-------+-----------+------+----------+---------+------+
|     city|transaction_id|product|   category|amount|      date|   status|  tier|
+---------+--------------+-------+-----------+------+----------+---------+------+
|Bangalore|        TXN003| Tablet|ELECTRONICS| 30000|2024-01-06|Completed|Tier-1|
|Bangalore|        TXN008| Mobile|ELECTRONICS| 28000|2024-01-09|Completed|Tier-1|
|  Chennai|        TXN005| Mobile|ELECTRONICS|     0|2024-01-08|Completed|Tier-2|
|   Mumbai|        TXN002| Mobile|ELECTRONICS| 32000|2024-01-05|Completed|Tier-1|
|   Mumbai|        TXN006| Tablet|ELECTRONICS|     0|2024-01-08|Completed|Tier-1|
|   Mumbai|        TXN009| Laptop|ELECTRONICS| 55000|2024-01-10|Completed|Tier-1|
|    Delhi|        TXN001| Laptop|ELECTRONICS| 45000|2024-01-05|Completed|Tier-1|
|    Delhi|        TXN007| Laptop|ELECTRONICS| 45000|      NULL|Completed|Tier-1|
+---------+--------------+-------+-----------+------+----------+---------+------+



```markdown
### Explaining Repartitioning by 'city' for Performance Improvement

Repartitioning the `enriched_sales_df` by the 'city' column, especially before caching, offers significant performance benefits for subsequent operations that involve grouping or joining by 'city'. Here's why:

1.  **Data Locality for Aggregations**: When data is repartitioned by 'city', all rows belonging to the same city are moved to the same partition (and thus typically to the same executor). This means that for operations like `groupBy('city')`, `sum('amount')`, or `rank().over(Window.partitionBy('city'))`, Spark can perform these computations locally within each partition without shuffling data across the network. Shuffling data is a very expensive operation in Spark, involving network I/O, serialization, and deserialization. By pre-shuffling the data by 'city', these subsequent operations become much faster.

2.  **Reduced Shuffle for Joins**: If there were future join operations involving `enriched_sales_df` and another DataFrame on the 'city' column, having `enriched_sales_df` already partitioned by 'city' would significantly reduce or eliminate the need for a shuffle phase during the join. If the other DataFrame is also partitioned by 'city' (or can be broadcasted), Spark can perform a more efficient shuffled hash join or sort-merge join.

3.  **Optimization for Window Functions**: Window functions that partition by 'city', such as `Window.partitionBy("city").orderBy(desc("product_revenue"))` for ranking products within each city, greatly benefit from this. All data for a given city is already together, so the window operation can be computed efficiently without requiring an additional shuffle stage.

4.  **Caching Efficiency**: When `repartitioned_sales_df` is cached after repartitioning, the data is stored in memory (or disk) in its repartitioned state. This means that every time an action is triggered on this cached DataFrame that benefits from the 'city' partitioning, Spark doesn't have to re-read and re-shuffle the data, leading to consistent and faster performance across multiple queries.

In essence, repartitioning aligns the physical distribution of the data with the logical operations that will be performed on it, minimizing costly data movement and maximizing local computation.
```

26. Write cleaned data to Parquet
27. Write aggregated data to ORC
28. Compare file structure and size
29. Explain why Avro is not used here
30. Design a future streaming ingestion using Avro

In [67]:
repartitioned_sales_df.write.mode("overwrite").parquet("cleaned_data.parquet")

In [68]:
repartitioned_sales_df.write.mode("overwrite").orc("cleaned_orders.orc")

31. Identify common mistakes (intentional bugs)
32. Debug schema mismatch errors
33. Debug NoneType DataFrame errors
34. Use explain() to identify inefficiencies

In [69]:
enriched_sales_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Deduplicate [transaction_id#868]
:  +- Filter (status#874 = Completed)
:     +- Project [transaction_id#868, trim(city#869, None) AS city#879, product#878, category#877, amount#875, date#876, status#874]
:        +- Project [transaction_id#868, city#869, trim(product#870, None) AS product#878, category#877, amount#875, date#876, status#874]
:           +- Project [transaction_id#868, city#869, product#870, upper(trim(category#871, None)) AS category#877, amount#875, date#876, status#874]
:              +- Project [transaction_id#868, city#869, product#870, category#871, amount#875, coalesce(to_date(try_to_timestamp(date#873, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date#873, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date#873, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Som

In [77]:
enriched_sales_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Deduplicate [transaction_id#868]
:  +- Filter (status#874 = Completed)
:     +- Project [transaction_id#868, trim(city#869, None) AS city#879, product#878, category#877, amount#875, date#876, status#874]
:        +- Project [transaction_id#868, city#869, trim(product#870, None) AS product#878, category#877, amount#875, date#876, status#874]
:           +- Project [transaction_id#868, city#869, product#870, upper(trim(category#871, None)) AS category#877, amount#875, date#876, status#874]
:              +- Project [transaction_id#868, city#869, product#870, category#871, amount#875, coalesce(to_date(try_to_timestamp(date#873, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date#873, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date#873, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Som

In [76]:
enriched_sales_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- status: string (nullable = true)
 |-- tier: string (nullable = true)



In [75]:
assert enriched_sales_df is not None

In [74]:
enriched_sales_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [city])
:- Deduplicate [transaction_id#868]
:  +- Filter (status#874 = Completed)
:     +- Project [transaction_id#868, trim(city#869, None) AS city#879, product#878, category#877, amount#875, date#876, status#874]
:        +- Project [transaction_id#868, city#869, trim(product#870, None) AS product#878, category#877, amount#875, date#876, status#874]
:           +- Project [transaction_id#868, city#869, product#870, upper(trim(category#871, None)) AS category#877, amount#875, date#876, status#874]
:              +- Project [transaction_id#868, city#869, product#870, category#871, amount#875, coalesce(to_date(try_to_timestamp(date#873, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date#873, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date#873, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Som

35. Validate record counts
36. Ensure no nulls in critical fields
37. Confirm schema correctness
38. Document optimization decisions

In [73]:
enriched_sales_df.count()

8

In [72]:
enriched_sales_df.filter(col("amount").isNull()).count()

0

In [71]:
enriched_sales_df.printSchema()

root
 |-- city: string (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- status: string (nullable = true)
 |-- tier: string (nullable = true)



In [70]:
enriched_sales_df.describe().show()

+-------+---------+--------------+-------+-----------+------------------+---------+------+
|summary|     city|transaction_id|product|   category|            amount|   status|  tier|
+-------+---------+--------------+-------+-----------+------------------+---------+------+
|  count|        8|             8|      8|          8|                 8|        8|     8|
|   mean|     NULL|          NULL|   NULL|       NULL|           29375.0|     NULL|  NULL|
| stddev|     NULL|          NULL|   NULL|       NULL|20283.261924200317|     NULL|  NULL|
|    min|Bangalore|        TXN001| Laptop|ELECTRONICS|                 0|Completed|Tier-1|
|    max|   Mumbai|        TXN009| Tablet|ELECTRONICS|             55000|Completed|Tier-2|
+-------+---------+--------------+-------+-----------+------------------+---------+------+

