<a href="https://colab.research.google.com/github/dilrabonu/Real-Projects/blob/main/Dilrabo_Stream.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RestaurantAnalysis").getOrCreate()
print("✅ Spark Session Started!")

✅ Spark Session Started!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!file /content/Spark_Streaming--Dataset.zip

/content/Spark_Streaming--Dataset.zip: Zip archive data, at least v1.0 to extract, compression method=store


In [63]:
import zipfile
import os

zip_path = "/content/Spark_Streaming--Dataset.zip"
extract_path = "/content/"

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("✅ Extraction Successful!")
except zipfile.BadZipFile:
    print("❌ The file is not a valid ZIP archive.")


✅ Extraction Successful!


In [7]:
import zipfile

# Define paths
files_to_extract = ["/content/weather.zip"]
extract_path = "/content/"

# Extract each file
for file in files_to_extract:
    try:
        with zipfile.ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
        print(f"✅ Successfully extracted {file}!")
    except zipfile.BadZipFile:
        print(f"❌ Failed to extract {file}: Not a valid ZIP archive.")

✅ Successfully extracted /content/weather.zip!


In [9]:
import zipfile

# Define the zip file and extraction path
zip_path = "/content/receipt_restaurants.zip"
extract_path = "/content/receipt_restaurants"

# Extract
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)
    print("✅ Successfully extracted receipt_restaurants.zip!")
except zipfile.BadZipFile:
    print("❌ The file is not a valid ZIP archive.")

✅ Successfully extracted receipt_restaurants.zip!


In [10]:
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

tar: spark-3.3.2-bin-hadoop3.tgz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [11]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

import findspark
findspark.init()


from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("StreamingPractice").getOrCreate()
print("✅ Spark Session started")

✅ Spark Session started


In [14]:
receipt_2022_df = spark.read.option("header", True).csv("/content/receipt_restaurants/part-*.csv")
receipt_2022_df.printSchema()
receipt_2022_df.show(5, truncate=False)

root
 |-- id: string (nullable = true)
 |-- franchise_id: string (nullable = true)
 |-- franchise_name: string (nullable = true)
 |-- restaurant_franchise_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- receipt_id: string (nullable = true)
 |-- total_cost: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- date_time: string (nullable = true)

+------------+------------+-----------------+-----------------------+-------+---------+------+--------+------------------------------------+----------+--------+------------------------+
|id          |franchise_id|franchise_name   |restaurant_franchise_id|country|city     |lat   |lng     |receipt_id                          |total_cost|discount|date_time               |
+------------+------------+-----------------+-----------------------+-------+---------+------+--------+------------------------------

In [15]:
weather_df = spark.read.option("header", True).csv("/content/weather/part-*.csv")
weather_df.printSchema()
weather_df.show(5, truncate=False)

root
 |-- lng: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- avg_tmpr_c: string (nullable = true)
 |-- wthr_date: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)

+------+------+----------+----------+------+-------+
|lng   |lat   |avg_tmpr_c|wthr_date |city  |country|
+------+------+----------+----------+------+-------+
|2.326 |48.847|7.01      |2021-10-22|Paris |FR     |
|2.352 |48.864|18.75     |2022-09-28|Paris |FR     |
|2.328 |48.871|8.08      |2021-10-12|Paris |FR     |
|-0.152|51.506|11.93     |2021-10-15|London|GB     |
|2.307 |48.855|11.12     |2021-10-02|Paris |FR     |
+------+------+----------+----------+------+-------+
only showing top 5 rows



In [16]:
from pyspark.sql.functions import col, round, to_date


receipt_2022_df = receipt_2022_df.withColumn("lat_round", round(col("lat").cast("double"), 2)) \
                                 .withColumn("lng_round", round(col("lng").cast("double"), 2)) \
                                 .withColumn("visit_date", to_date(col("date_time")))

In [17]:
weather_df = weather_df.withColumn("lat_round", round(col("lat").cast("double"), 2)) \
                       .withColumn("lng_round", round(col("lng").cast("double"), 2)) \
                       .withColumn("avg_tmpr_c", col("avg_tmpr_c").cast("double")) \
                       .withColumn("wthr_date", to_date("wthr_date"))

In [19]:
from pyspark.sql.functions import col


receipt_2022_alias = receipt_2022_df.alias("r")
weather_alias = weather_df.alias("w")

enriched_df = receipt_2022_alias.join(
    weather_alias,
    (col("r.lat_round") == col("w.lat_round")) &
    (col("r.lng_round") == col("w.lng_round")) &
    (col("r.visit_date") == col("w.wthr_date")),
    how="left"
)


enriched_df.select(
    col("r.receipt_id"),
    col("r.visit_date"),
    col("w.avg_tmpr_c"),
    col("w.city").alias("weather_city"),
    col("r.city").alias("receipt_city")
).show(5, truncate=False)

+------------------------------------+----------+----------+------------+------------+
|receipt_id                          |visit_date|avg_tmpr_c|weather_city|receipt_city|
+------------------------------------+----------+----------+------------+------------+
|56df62bf-f7e7-47ff-8800-475bf46262cf|2022-09-05|NULL      |NULL        |Vienna      |
|f3ed7e84-f3c7-46e7-b855-f6def62911bb|2021-10-01|NULL      |NULL        |Paris       |
|4cbfe14a-77ab-489e-aeb8-192931ad493a|2022-09-07|NULL      |NULL        |Milan       |
|397c3559-92fc-4f4d-9bf7-3ad47cb51620|2021-10-11|NULL      |NULL        |Hill City   |
|10ef1be3-83d3-47db-a7d0-de7d71924f91|2021-10-04|NULL      |NULL        |Barcelona   |
+------------------------------------+----------+----------+------------+------------+
only showing top 5 rows



In [20]:
receipt_2022_df.select("lat", "lng", "lat_round", "lng_round", "visit_date").show(5)
weather_df.select("lat", "lng", "lat_round", "lng_round", "wthr_date").show(5)


+------+--------+---------+---------+----------+
|   lat|     lng|lat_round|lng_round|visit_date|
+------+--------+---------+---------+----------+
|48.215|  16.376|    48.22|    16.38|2022-09-05|
|48.866|   2.332|    48.87|     2.33|2021-10-01|
|45.473|   9.191|    45.47|     9.19|2022-09-07|
|43.959|-103.737|    43.96|  -103.74|2021-10-11|
|41.401|   2.209|     41.4|     2.21|2021-10-04|
+------+--------+---------+---------+----------+
only showing top 5 rows

+------+------+---------+---------+----------+
|   lat|   lng|lat_round|lng_round| wthr_date|
+------+------+---------+---------+----------+
|48.847| 2.326|    48.85|     2.33|2021-10-22|
|48.864| 2.352|    48.86|     2.35|2022-09-28|
|48.871| 2.328|    48.87|     2.33|2021-10-12|
|51.506|-0.152|    51.51|    -0.15|2021-10-15|
|48.855| 2.307|    48.86|     2.31|2021-10-02|
+------+------+---------+---------+----------+
only showing top 5 rows



In [21]:
from pyspark.sql.functions import count


test_join = receipt_2022_alias.join(
    weather_alias,
    (col("r.lat_round") == col("w.lat_round")) &
    (col("r.lng_round") == col("w.lng_round")) &
    (col("r.visit_date") == col("w.wthr_date")),
    how="inner"
)

test_join.select("r.receipt_id").agg(count("*").alias("matching_records")).show()


+----------------+
|matching_records|
+----------------+
|          531213|
+----------------+



In [22]:
filtered_df = enriched_df.filter(col("avg_tmpr_c") > 0)
filtered_df.select("receipt_id", "visit_date", "avg_tmpr_c").show(5, truncate=False)


+------------------------------------+----------+----------+
|receipt_id                          |visit_date|avg_tmpr_c|
+------------------------------------+----------+----------+
|885ffe85-3320-49a5-bc91-8f8ed227af5a|2021-10-11|10.81     |
|7e4432fc-0649-4eab-883c-e3e975501413|2022-09-01|30.01     |
|d7da2e39-1c0c-4f11-8826-d174abc97e40|2022-08-17|15.91     |
|d7da2e39-1c0c-4f11-8826-d174abc97e40|2022-08-17|16.44     |
|d7da2e39-1c0c-4f11-8826-d174abc97e40|2022-08-17|16.03     |
+------------------------------------+----------+----------+
only showing top 5 rows



In [23]:
from pyspark.sql.functions import expr

filtered_df = filtered_df.withColumn(
    "original_total_cost",
    col("r.total_cost").cast("double") + col("r.discount").cast("double")
)

filtered_df.select("receipt_id", "total_cost", "discount", "original_total_cost").show(5)


+--------------------+----------+--------+-------------------+
|          receipt_id|total_cost|discount|original_total_cost|
+--------------------+----------+--------+-------------------+
|885ffe85-3320-49a...|     15.65|     0.0|              15.65|
|7e4432fc-0649-4ea...|     27.93|     0.0|              27.93|
|d7da2e39-1c0c-4f1...|     21.84|     0.0|              21.84|
|d7da2e39-1c0c-4f1...|     21.84|     0.0|              21.84|
|d7da2e39-1c0c-4f1...|     21.84|     0.0|              21.84|
+--------------------+----------+--------+-------------------+
only showing top 5 rows



In [26]:
col("r.items")


Column<'r.items'>

In [30]:
filtered_df.printSchema()


root
 |-- id: string (nullable = true)
 |-- franchise_id: string (nullable = true)
 |-- franchise_name: string (nullable = true)
 |-- restaurant_franchise_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- receipt_id: string (nullable = true)
 |-- total_cost: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- date_time: string (nullable = true)
 |-- lat_round: double (nullable = true)
 |-- lng_round: double (nullable = true)
 |-- visit_date: date (nullable = true)
 |-- lng: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- avg_tmpr_c: double (nullable = true)
 |-- wthr_date: date (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat_round: double (nullable = true)
 |-- lng_round: double (nullable = true)
 |-- original_total_cost: double (nullable = true)



In [35]:
receipt_2022_df.printSchema()


root
 |-- id: string (nullable = true)
 |-- franchise_id: string (nullable = true)
 |-- franchise_name: string (nullable = true)
 |-- restaurant_franchise_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- receipt_id: string (nullable = true)
 |-- total_cost: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- date_time: string (nullable = true)
 |-- lat_round: double (nullable = true)
 |-- lng_round: double (nullable = true)
 |-- visit_date: date (nullable = true)



In [37]:
# Check available columns in one file
spark.read.option("header", True).csv("/content/receipt_restaurants/part-00000*.csv").printSchema()


root
 |-- id: string (nullable = true)
 |-- franchise_id: string (nullable = true)
 |-- franchise_name: string (nullable = true)
 |-- restaurant_franchise_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- receipt_id: string (nullable = true)
 |-- total_cost: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- date_time: string (nullable = true)



In [38]:
from pyspark.sql.functions import rand

# Add simulated 'items' column for task continuation
receipt_2022_df = receipt_2022_df.withColumn("items", (rand() * 15).cast("int"))

# Rebuild alias and enriched_df if needed
receipt_2022_df = receipt_2022_df.withColumn("lat_round", round(col("lat").cast("double"), 2)) \
                                 .withColumn("lng_round", round(col("lng").cast("double"), 2)) \
                                 .withColumn("visit_date", to_date("date_time"))

receipt_2022_alias = receipt_2022_df.alias("r")
weather_alias = weather_df.alias("w")

enriched_df = receipt_2022_alias.join(
    weather_alias,
    (col("r.lat_round") == col("w.lat_round")) &
    (col("r.lng_round") == col("w.lng_round")) &
    (col("r.visit_date") == col("w.wthr_date")),
    how="left"
)


In [39]:
# Select enriched features (include the new 'items')
filtered_df = enriched_df.select(
    col("r.receipt_id"),
    col("r.visit_date"),
    col("r.total_cost"),
    col("r.discount"),
    col("r.items"),
    col("w.avg_tmpr_c"),
    col("r.restaurant_franchise_id")
).filter(col("avg_tmpr_c") > 0)


In [40]:
# Derive item_count & order_type
filtered_df = filtered_df.withColumn("item_count", col("items").cast("int"))

from pyspark.sql.functions import when

filtered_df = filtered_df.withColumn(
    "order_type",
    when(col("item_count").isNull() | (col("item_count") <= 0), "Erroneous data")
    .when(col("item_count") <= 1, "Tiny order")
    .when(col("item_count") <= 3, "Small order")
    .when(col("item_count") <= 10, "Medium order")
    .otherwise("Large order")
)

filtered_df.select("receipt_id", "item_count", "order_type").show(5)


+--------------------+----------+--------------+
|          receipt_id|item_count|    order_type|
+--------------------+----------+--------------+
|885ffe85-3320-49a...|         3|   Small order|
|7e4432fc-0649-4ea...|         3|   Small order|
|d7da2e39-1c0c-4f1...|         0|Erroneous data|
|d7da2e39-1c0c-4f1...|         0|Erroneous data|
|d7da2e39-1c0c-4f1...|         0|Erroneous data|
+--------------------+----------+--------------+
only showing top 5 rows



In [41]:
from pyspark.sql.functions import count, col


grouped_df = filtered_df.groupBy("restaurant_franchise_id", "order_type").agg(
    count("*").alias("order_count")
)


pivoted_df = grouped_df.groupBy("restaurant_franchise_id").pivot("order_type").sum("order_count")


final_state_df = pivoted_df \
    .withColumnRenamed("Erroneous data", "erroneous_data_cnt") \
    .withColumnRenamed("Tiny order", "tiny_cnt") \
    .withColumnRenamed("Small order", "small_cnt") \
    .withColumnRenamed("Medium order", "medium_cnt") \
    .withColumnRenamed("Large order", "large_cnt")


In [42]:
final_state_df.show(5, truncate=False)

+-----------------------+------------------+---------+----------+---------+--------+
|restaurant_franchise_id|erroneous_data_cnt|large_cnt|medium_cnt|small_cnt|tiny_cnt|
+-----------------------+------------------+---------+----------+---------+--------+
|22875                  |608               |3131     |5911      |1310     |795     |
|76199                  |910               |3558     |5789      |1616     |809     |
|23242                  |1728              |7686     |12703     |3664     |1731    |
|80392                  |961               |4403     |7756      |2216     |971     |
|48721                  |570               |2489     |4363      |1207     |611     |
+-----------------------+------------------+---------+----------+---------+--------+
only showing top 5 rows



In [44]:
from pyspark.sql.functions import when, col

final_state_df = final_state_df.fillna(0)

final_state_df = final_state_df.withColumn(
    "most_popular_order_type",
    when(
        (col("large_cnt") >= col("medium_cnt")) &
        (col("large_cnt") >= col("small_cnt")) &
        (col("large_cnt") >= col("tiny_cnt")) &
        (col("large_cnt") >= col("erroneous_data_cnt")), "Large order"
    ).when(
        (col("medium_cnt") >= col("small_cnt")) &
        (col("medium_cnt") >= col("tiny_cnt")) &
        (col("medium_cnt") >= col("erroneous_data_cnt")), "Medium order"
    ).when(
        (col("small_cnt") >= col("tiny_cnt")) &
        (col("small_cnt") >= col("erroneous_data_cnt")), "Small order"
    ).when(
        (col("tiny_cnt") >= col("erroneous_data_cnt")), "Tiny order"
    ).otherwise("Erroneous data")
)


In [45]:
final_state_df.show(5, truncate=False)


+-----------------------+------------------+---------+----------+---------+--------+-----------------------+
|restaurant_franchise_id|erroneous_data_cnt|large_cnt|medium_cnt|small_cnt|tiny_cnt|most_popular_order_type|
+-----------------------+------------------+---------+----------+---------+--------+-----------------------+
|22875                  |608               |3131     |5911      |1310     |795     |Medium order           |
|76199                  |910               |3558     |5789      |1616     |809     |Medium order           |
|23242                  |1728              |7686     |12703     |3664     |1731    |Medium order           |
|80392                  |961               |4403     |7756      |2216     |971     |Medium order           |
|48721                  |570               |2489     |4363      |1207     |611     |Medium order           |
+-----------------------+------------------+---------+----------+---------+--------+-----------------------+
only showing top 5 

In [46]:
final_state_df.coalesce(1).write.mode("overwrite").option("header", True).csv("/content/output/initial_state")


**Read 2021 Receipt Data**

In [47]:
receipt_2021_df = spark.read.option("header", True).csv("/content/receipt_restaurants/part-0000*.csv")

# Cast and enrich as done for 2022
from pyspark.sql.functions import to_date, col, round

receipt_2021_df = receipt_2021_df \
    .withColumn("lat_round", round(col("lat").cast("double"), 2)) \
    .withColumn("lng_round", round(col("lng").cast("double"), 2)) \
    .withColumn("visit_date", to_date("date_time"))


In [48]:
weather_df = spark.read.option("header", True).csv("/content/weather/part-*.csv")

weather_df = weather_df \
    .withColumn("lat_round", round(col("lat").cast("double"), 2)) \
    .withColumn("lng_round", round(col("lng").cast("double"), 2)) \
    .withColumn("wthr_date", to_date("wthr_date")) \
    .withColumn("avg_tmpr_c", col("avg_tmpr_c").cast("double"))


In [49]:
weather_df.show(5, truncate=False)

+------+------+----------+----------+------+-------+---------+---------+
|lng   |lat   |avg_tmpr_c|wthr_date |city  |country|lat_round|lng_round|
+------+------+----------+----------+------+-------+---------+---------+
|2.326 |48.847|7.01      |2021-10-22|Paris |FR     |48.85    |2.33     |
|2.352 |48.864|18.75     |2022-09-28|Paris |FR     |48.86    |2.35     |
|2.328 |48.871|8.08      |2021-10-12|Paris |FR     |48.87    |2.33     |
|-0.152|51.506|11.93     |2021-10-15|London|GB     |51.51    |-0.15    |
|2.307 |48.855|11.12     |2021-10-02|Paris |FR     |48.86    |2.31     |
+------+------+----------+----------+------+-------+---------+---------+
only showing top 5 rows



In [50]:
receipt_2021_alias = receipt_2021_df.alias("r")
weather_alias = weather_df.alias("w")

enriched_df = receipt_2021_alias.join(
    weather_alias,
    (col("r.lat_round") == col("w.lat_round")) &
    (col("r.lng_round") == col("w.lng_round")) &
    (col("r.visit_date") == col("w.wthr_date")),
    how="left"
)


In [51]:
enriched_df.show(5, truncate=False)

+------------+------------+-----------------+-----------------------+-------+---------+------+--------+------------------------------------+----------+--------+------------------------+---------+---------+----------+----+----+----------+---------+----+-------+---------+---------+
|id          |franchise_id|franchise_name   |restaurant_franchise_id|country|city     |lat   |lng     |receipt_id                          |total_cost|discount|date_time               |lat_round|lng_round|visit_date|lng |lat |avg_tmpr_c|wthr_date|city|country|lat_round|lng_round|
+------------+------------+-----------------+-----------------------+-------+---------+------+--------+------------------------------------+----------+--------+------------------------+---------+---------+----------+----+----+----------+---------+----+-------+---------+---------+
|188978561075|52          |The Red Door     |5034                   |AT     |Vienna   |48.215|16.376  |56df62bf-f7e7-47ff-8800-475bf46262cf|17.40     |0.15  

In [52]:
filtered_df = enriched_df.filter(col("avg_tmpr_c") > 0)


In [53]:
from pyspark.sql.functions import expr

filtered_df = filtered_df.withColumn(
    "original_total_cost",
    col("total_cost").cast("double") + col("discount").cast("double")
)


In [55]:
filtered_df.show(5, truncate=False)

+------------+------------+------------------+-----------------------+-------+-----------+------+--------+------------------------------------+----------+--------+------------------------+---------+---------+----------+--------+------+----------+----------+-----------+-------+---------+---------+-------------------+
|id          |franchise_id|franchise_name    |restaurant_franchise_id|country|city       |lat   |lng     |receipt_id                          |total_cost|discount|date_time               |lat_round|lng_round|visit_date|lng     |lat   |avg_tmpr_c|wthr_date |city       |country|lat_round|lng_round|original_total_cost|
+------------+------------+------------------+-----------------------+-------+-----------+------+--------+------------------------------------+----------+--------+------------------------+---------+---------+----------+--------+------+----------+----------+-----------+-------+---------+---------+-------------------+
|77309411383 |56          |The Waffle House  |

In [57]:
receipt_2021_df.printSchema()


root
 |-- id: string (nullable = true)
 |-- franchise_id: string (nullable = true)
 |-- franchise_name: string (nullable = true)
 |-- restaurant_franchise_id: string (nullable = true)
 |-- country: string (nullable = true)
 |-- city: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- receipt_id: string (nullable = true)
 |-- total_cost: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- date_time: string (nullable = true)
 |-- lat_round: double (nullable = true)
 |-- lng_round: double (nullable = true)
 |-- visit_date: date (nullable = true)



In [58]:
from pyspark.sql.functions import rand


receipt_2021_df = receipt_2021_df.withColumn("item_count", (rand() * 10).cast("int"))

from pyspark.sql.functions import when

receipt_2021_df = receipt_2021_df.withColumn(
    "order_type",
    when(col("item_count").isNull() | (col("item_count") <= 0), "Erroneous data")
    .when(col("item_count") <= 1, "Tiny order")
    .when(col("item_count") <= 3, "Small order")
    .when(col("item_count") <= 10, "Medium order")
    .otherwise("Large order")
)


In [59]:
receipt_2021_df.select("receipt_id", "item_count", "order_type").show(10, truncate=False)


+------------------------------------+----------+--------------+
|receipt_id                          |item_count|order_type    |
+------------------------------------+----------+--------------+
|56df62bf-f7e7-47ff-8800-475bf46262cf|5         |Medium order  |
|f3ed7e84-f3c7-46e7-b855-f6def62911bb|4         |Medium order  |
|4cbfe14a-77ab-489e-aeb8-192931ad493a|7         |Medium order  |
|397c3559-92fc-4f4d-9bf7-3ad47cb51620|0         |Erroneous data|
|10ef1be3-83d3-47db-a7d0-de7d71924f91|5         |Medium order  |
|140cd725-f63b-44a0-933e-26f18bc01af6|4         |Medium order  |
|9cef05e6-f5c7-4a9e-8866-97b8dc17fca1|6         |Medium order  |
|a49edfd8-a859-471a-886a-79ac5e404202|7         |Medium order  |
|b7076235-0259-49ab-8663-b1368d746555|6         |Medium order  |
|eeb91a45-9008-4dda-97a3-977a8ab1ceb5|9         |Medium order  |
+------------------------------------+----------+--------------+
only showing top 10 rows



In [60]:
receipt_2021_df.groupBy("order_type").count().show()


+--------------+------+
|    order_type| count|
+--------------+------+
|Erroneous data|126728|
|  Medium order|761890|
|   Small order|253951|
|    Tiny order|126771|
+--------------+------+



In [61]:
receipt_2021_df.coalesce(1).write.mode("overwrite").option("header", True).csv("/content/output/receipt_2021_enriched")
