In [2]:
import pyspark
from pyspark.sql import SparkSession

pyspark_version = pyspark.__version__
kafka_jar_package = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark_version}"

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("GreenTripsConsumer") \
    .config("spark.jars.packages", kafka_jar_package) \
    .getOrCreate()

24/03/21 10:05:27 WARN Utils: Your hostname, codespaces-45ac75 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/03/21 10:05:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/workspaces/GabrielZoomcamp2024/week_5_batch_processing/spark/spark-3.3.2-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/codespace/.ivy2/cache
The jars for the packages stored in: /home/codespace/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b0b3ef19-7e2d-4db2-82de-424e085d597e;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.3.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.3.2 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.32 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.2 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.2 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in central
:

24/03/21 10:05:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/03/21 10:05:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
green_stream = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "green-trips") \
    .option("startingOffsets", "earliest") \
    .load()

In [4]:
from pyspark.sql import types

schema = types.StructType() \
    .add("lpep_pickup_datetime", types.StringType()) \
    .add("lpep_dropoff_datetime", types.StringType()) \
    .add("PULocationID", types.IntegerType()) \
    .add("DOLocationID", types.IntegerType()) \
    .add("passenger_count", types.DoubleType()) \
    .add("trip_distance", types.DoubleType()) \
    .add("tip_amount", types.DoubleType())

In [5]:
from pyspark.sql import functions as F

green_stream = green_stream \
  .select(F.from_json(F.col("value").cast('STRING'), schema).alias("data")) \
  .select("data.*")

In [6]:
popular_destinations = green_stream \
    .withColumn("current_timestamp", F.current_timestamp()) \
    .groupBy(F.window(F.col("current_timestamp"), "5 minutes"), "DOLocationID") \
    .agg(F.count("*").alias("destination_count")) \
    .orderBy(F.desc("destination_count"))

In [7]:
query = popular_destinations \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()

24/03/21 10:06:19 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-60b97171-85b4-47b9-829d-eb4886b9bb18. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/21 10:06:19 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |17942            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16064            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14189            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13029            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12016            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11593            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11012            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8002             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7585      

                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18015            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16116            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14254            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13084            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12047            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11619            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11061            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8035             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7597      

                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18058            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16147            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14285            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13121            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12066            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11642            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11096            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8060             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7607      

                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18100            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16185            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14315            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13141            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12087            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11660            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11111            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8082             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7616      

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623      

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623      

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623      

                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623      

                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623      



-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623      

ERROR:root:KeyboardInterrupt while sending command.             (134 + 2) / 200]
Traceback (most recent call last):
  File "/workspaces/GabrielZoomcamp2024/week_5_batch_processing/spark/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/workspaces/GabrielZoomcamp2024/week_5_batch_processing/spark/spark-3.3.2-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/python/3.10.13/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt: 

                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------+------------+-----------------+
|window                                    |DOLocationID|destination_count|
+------------------------------------------+------------+-----------------+
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|74          |18133            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|42          |16215            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|41          |14350            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|75          |13158            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|129         |12104            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|7           |11671            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|166         |11129            |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|236         |8100             |
|{2024-03-21 10:05:00, 2024-03-21 10:10:00}|223         |7623     