In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [2]:
# Start Spark session
spark = SparkSession.builder \
    .appName("AmazonReviewsIngestion") \
    .getOrCreate()

25/09/25 01:43:34 WARN Utils: Your hostname, dsbda-vm resolves to a loopback address: 127.0.1.1; using 192.168.64.3 instead (on interface enp0s1)
25/09/25 01:43:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/25 01:43:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Schema for the dataset, specifying the data types for cols
schema = StructType([
    StructField("product_id", StringType(), True),
    StructField("product_title", StringType(), True),
    StructField("star_rating", StringType(), True),
    StructField("helpful_votes", StringType(), True),
    StructField("total_votes", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", StringType(), True),
])

In [4]:
# Local path to the dataset
local_file_path = "clean_books_10k.csv"

reviews_df = spark.read.csv(
    local_file_path,
    sep=",",
    header=True,       
    schema=schema,     # Fixed schema
    multiLine=True,
    quote='"',
    escape='"'
)

print("Data Sample:")
reviews_df.show(5)

print("Schema:")
reviews_df.printSchema()

Data Sample:
+----------+--------------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|product_id|       product_title|star_rating|helpful_votes|total_votes|     review_headline|         review_body|review_date|
+----------+--------------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|0312977379|    Beware the Night|        4.0|         61.0|       79.0|A book that actua...|Unlike many books...| 2005-10-13|
|1420832158|JEET KUNE DO: THE...|        5.0|          1.0|        4.0|Something  For Ev...|This book is the ...| 2005-10-13|
|0312977379|    Beware the Night|        5.0|         12.0|       18.0|    Beware the Night|When I started th...| 2005-10-13|
|0312336853|Shooter: The Auto...|        5.0|          1.0|        4.0|Hard to put this ...|This book has som...| 2005-10-13|
|0756607574|             Panties|        4.0|          5.0|       12.0|         A Nice Read|This book is 

In [5]:
# HDFS output directory
hdfs_path = "hdfs://localhost:54310/user/ubuntu/books_dataset/"

reviews_df.coalesce(1).write \ # Uses coalesce to ensure the data is written as a single file
    .option("header", True) \
    .mode("overwrite") \
    .csv(hdfs_path)

print(f"Data written to HDFS at {hdfs_path}")

Data written to HDFS at hdfs://localhost:54310/user/ubuntu/books_dataset/


In [6]:
# Verify the write action by reading back
reloaded_df = spark.read.csv(
    hdfs_path,
    sep=",",
    header=True,
    schema=schema,
    multiLine=True,
    quote='"',
    escape="\\"
)

print("Reloaded from HDFS:")
reloaded_df.show(5)

Reloaded from HDFS:
+----------+--------------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|product_id|       product_title|star_rating|helpful_votes|total_votes|     review_headline|         review_body|review_date|
+----------+--------------------+-----------+-------------+-----------+--------------------+--------------------+-----------+
|0312977379|    Beware the Night|        4.0|         61.0|       79.0|A book that actua...|Unlike many books...| 2005-10-13|
|1420832158|JEET KUNE DO: THE...|        5.0|          1.0|        4.0|Something  For Ev...|This book is the ...| 2005-10-13|
|0312977379|    Beware the Night|        5.0|         12.0|       18.0|    Beware the Night|When I started th...| 2005-10-13|
|0312336853|Shooter: The Auto...|        5.0|          1.0|        4.0|Hard to put this ...|This book has som...| 2005-10-13|
|0756607574|             Panties|        4.0|          5.0|       12.0|         A Nice Read|This b

In [8]:
streaming_input_path = "hdfs://localhost:54310/user/ubuntu/streaming_input/"

# Create streaming DF to read CSV file
streaming_df = spark.readStream \
    .option("sep", ",") \
    .option("header", True) \
    .option("multiLine", True) \ 
    .option("quote", "\"") \
    .option("escape", "\"") \
    .schema(schema) \
    .csv(streaming_input_path)

# Example query: count reviews by rating as files arrive
query = streaming_df.groupBy("star_rating").count()

query_writer = query.writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

print("Directory-based streaming started. Copy CSV chunks into 'streaming_folder/' to see updates.")

Directory-based streaming started. Copy CSV chunks into 'streaming_folder/' to see updates.


25/09/25 01:49:19 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-12d058e3-2152-4b52-9275-db500c828b20. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/09/25 01:49:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  117|
|        5.0|  560|
|        4.0|  184|
|        2.0|   50|
|        3.0|   89|
+-----------+-----+



25/09/25 01:49:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 5.0
 Schema: star_rating
Expected: star_rating but found: 5.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_1.csv
                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  219|
|        5.0| 1115|
|        4.0|  380|
|        2.0|  110|
|        3.0|  175|
+-----------+-----+



25/09/25 01:50:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 5.0
 Schema: star_rating
Expected: star_rating but found: 5.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_2.csv
                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  318|
|        5.0| 1700|
|        4.0|  559|
|        2.0|  175|
|        3.0|  246|
+-----------+-----+



25/09/25 01:50:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 4.0
 Schema: star_rating
Expected: star_rating but found: 4.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_3.csv
                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  421|
|        5.0| 2284|
|        4.0|  729|
|        2.0|  227|
|        3.0|  336|
+-----------+-----+



25/09/25 01:50:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 4.0
 Schema: star_rating
Expected: star_rating but found: 4.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_4.csv
                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  510|
|        5.0| 2837|
|        4.0|  914|
|        2.0|  305|
|        3.0|  430|
+-----------+-----+



25/09/25 01:50:32 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 5.0
 Schema: star_rating
Expected: star_rating but found: 5.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_5.csv
                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  569|
|        5.0| 3443|
|        4.0| 1121|
|        2.0|  357|
|        3.0|  505|
+-----------+-----+



25/09/25 01:50:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 4.0
 Schema: star_rating
Expected: star_rating but found: 4.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_6.csv
                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  655|
|        5.0| 3990|
|        4.0| 1321|
|        2.0|  429|
|        3.0|  599|
+-----------+-----+



25/09/25 01:50:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 5.0
 Schema: star_rating
Expected: star_rating but found: 5.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_7.csv


-------------------------------------------
Batch: 7
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  728|
|        5.0| 4574|
|        4.0| 1532|
|        2.0|  478|
|        3.0|  681|
+-----------+-----+



25/09/25 01:51:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 2.0
 Schema: star_rating
Expected: star_rating but found: 2.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_8.csv


-------------------------------------------
Batch: 8
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  801|
|        5.0| 5153|
|        4.0| 1738|
|        2.0|  535|
|        3.0|  765|
+-----------+-----+



25/09/25 01:51:15 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 5.0
 Schema: star_rating
Expected: star_rating but found: 5.0
CSV file: hdfs://localhost:54310/user/ubuntu/streaming_input/part_9.csv
                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+-----------+-----+
|star_rating|count|
+-----------+-----+
|        1.0|  884|
|        5.0| 5813|
|        4.0| 1859|
|        2.0|  589|
|        3.0|  846|
+-----------+-----+

