In [2]:
import json
import time 

from kafka import KafkaProducer

def json_serializer(data):
    """Serialize ``data`` to a JSON document and return it as UTF-8 bytes.

    Used as the producer's ``value_serializer`` so message values can be
    passed around as plain Python objects.
    """
    json_text = json.dumps(data)
    return json_text.encode('utf-8')

# Address (host:port) of the Kafka broker used for bootstrapping.
server = 'localhost:9092'

# Producer whose message values are JSON-encoded by json_serializer before sending.
producer = KafkaProducer(bootstrap_servers=[server], value_serializer=json_serializer)

# Left as the cell's last expression so the notebook displays the connection status.
producer.bootstrap_connected()

True

In [3]:
# Smoke test: publish ten tiny messages, timing the send loop and the final
# flush separately.
topic_name = 'test-topic'
t0 = time.time()

for i in range(10):
    message = dict(number=i)
    producer.send(topic_name, value=message)
    print(f"Sent: {message}")
    time.sleep(0.05)  # small pause between consecutive messages

# Time spent in the send loop (includes the sleeps above).
t1 = time.time()
print(f'took {(t1 - t0):.2f} seconds')
print("data sent")

# Flush the producer, then report how long the flush itself took.
producer.flush()
t2 = time.time()
print(f'took {(t2 - t1):.2f} seconds')

Sent: {'number': 0}
Sent: {'number': 1}
Sent: {'number': 2}
Sent: {'number': 3}
Sent: {'number': 4}
Sent: {'number': 5}
Sent: {'number': 6}
Sent: {'number': 7}
Sent: {'number': 8}
Sent: {'number': 9}
took 0.56 seconds
data sent
took 0.00 seconds


In [12]:
# Create the directory (replace "data" with your desired directory name)
!mkdir data

# Download the file with the directory included in the path
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz -O "data/green_tripdata_2019-10.csv.gz"


--2024-04-07 11:45:50--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/green/green_tripdata_2019-10.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/ea580e9e-555c-4bd0-ae73-43051d8e7c0b?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240407%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240407T104549Z&X-Amz-Expires=300&X-Amz-Signature=429d84358e8c00b2d5c7b61321386bb526b8710e071860e7b57231bd5b85e69f&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dgreen_tripdata_2019-10.csv.gz&response-content-type=application%2Foctet-stream [following]
--2024-04-07 11:45:51--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/ea580e9e

In [4]:
import os

# Directory that contains the Spark distribution, relative to the notebook's
# working directory. Replace "../../../../../" with the path that applies on
# your machine.
home_directory = "../../../../../"

# Resolve to an absolute path so SPARK_HOME stays valid even if the working
# directory changes later in the session.
os.environ["SPARK_HOME"] = os.path.abspath(
    os.path.join(home_directory, "spark-3.3.2-bin-hadoop3")
)

In [5]:
import findspark
# Keep this call ahead of the pyspark imports below.
# NOTE(review): presumably findspark.init() relies on the SPARK_HOME value set in
# the previous cell to make the pyspark package importable — confirm against the
# findspark documentation.
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [6]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

# Obtain the notebook's Spark session: reuse an existing one or create it,
# registered under the application name "Spark-Notebook".
spark = (
    SparkSession.builder
    .appName("Spark-Notebook")
    .getOrCreate()
)

24/04/08 06:10:14 WARN Utils: Your hostname, Endiesworld resolves to a loopback address: 127.0.1.1; using 172.22.195.180 instead (on interface eth0)
24/04/08 06:10:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/04/08 06:10:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Gzipped CSV downloaded earlier into the data/ directory (note the .gz extension).
file_path = "data/green_tripdata_2019-10.csv.gz"

# Load the CSV as a DataFrame: the first line is the header, the file is
# gzip-compressed, and column types are inferred from the data.
df = (
    spark.read
    .option("header", "true")
    .option("compression", "gzip")
    .option("inferSchema", "true")
    .csv(file_path)
)

# Quick sanity check: inferred schema plus the first three rows.
df.printSchema()
df.show(3)

                                                                                

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

+--------+--------------------+---------------------+------------------+----------+-

In [8]:
# Subset of trip columns that is kept for the Kafka messages.
columns = [
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'PULocationID',
    'DOLocationID',
    'passenger_count',
    'trip_distance',
    'tip_amount',
]

In [9]:
# Project the DataFrame down to the columns listed in `columns`.
selected_columns = df.select(*columns)

# Show a three-row sample of the projection.
print("Selected Columns:")
selected_columns.show(3)

Selected Columns:
+--------------------+---------------------+------------+------------+---------------+-------------+----------+
|lpep_pickup_datetime|lpep_dropoff_datetime|PULocationID|DOLocationID|passenger_count|trip_distance|tip_amount|
+--------------------+---------------------+------------+------------+---------------+-------------+----------+
| 2019-10-01 00:26:02|  2019-10-01 00:39:58|         112|         196|              1|         5.88|       0.0|
| 2019-10-01 00:18:11|  2019-10-01 00:22:38|          43|         263|              1|          0.8|       0.0|
| 2019-10-01 00:09:31|  2019-10-01 00:24:47|         255|         228|              2|          7.5|       0.0|
+--------------------+---------------------+------------+------------+---------------+-------------+----------+
only showing top 3 rows



In [49]:
# Preview what a single row looks like once prepared for Kafka.
# take(1) fetches only the first row rather than collecting the entire
# DataFrame onto the driver just to look at one record.
for row in selected_columns.take(1):
    row_dict = row.asDict()
    # Render every value as a string so the payload is JSON-serializable
    # (the timestamp columns would otherwise not be).
    data = {
        column_name: "{}".format(column_value)
        for column_name, column_value in row_dict.items()
    }
    print(data)

                                                                                

{'lpep_pickup_datetime': '2019-10-01 00:26:02', 'lpep_dropoff_datetime': '2019-10-01 00:39:58', 'PULocationID': '112', 'DOLocationID': '196', 'passenger_count': '1', 'trip_distance': '5.88', 'tip_amount': '0.0'}


In [10]:
# Publish every selected row to the 'green-trips' topic and time the whole run.
topic_name = 'green-trips'
t0 = time.time()

for row in selected_columns.collect():
    row_dict = row.asDict()
    # Every value is rendered as a string before being handed to the producer.
    data = {name: "{}".format(value) for name, value in row_dict.items()}
    producer.send(topic_name, value=data)

# Flush the producer before stopping the clock so the measurement includes it.
producer.flush()

t1 = time.time()
print(f'took {(t1 - t0):.2f} seconds to send data')

                                                                                

took 185.60 seconds to send data


In [None]:
# NOTE(review): relies on `row` being left over from a row loop in an earlier
# cell. Those loops read Spark rows with asDict(); the namedtuple-style
# `_fields` attribute used here before belongs to the pandas itertuples snippet.
row_dict = row.asDict()
#     print(row_dict)

In [36]:
# # Assuming 'df_spark' is your PySpark DataFrame
# from pyspark.sql.functions import col

# # Convert timezone-aware datetime column to timezone-naive
# df_spark = selected_columns.withColumn('lpep_pickup_datetime', col('lpep_pickup_datetime').cast('timestamp'))
# df_spark = df_spark.withColumn('lpep_pickup_datetime', col('lpep_pickup_datetime').cast('timestamp'))

# # Convert PySpark DataFrame to Pandas DataFrame
# pandas_df = df_spark.toPandas()
# pandas_df.show(3)

In [37]:
# pandas_df = selected_columns.toPandas().astype({'lpep_pickup_datetime': 'datetime64[ns]', 'lpep_dropoff_datetime': 'datetime64[ns]'})


# for row in pandas_df.itertuples(index=False):
#     row_dict = {col: getattr(row, col) for col in row._fields}
#     print(row_dict)
#     break

#     # TODO implement sending the data here

In [None]:
# t0 = time.time()

# topic_name = 'green-trips'

# for i in range(10):
#     message = {'number': i}
#     producer.send(topic_name, value=message)
#     print(f"Sent: {message}")
#     time.sleep(0.05)
    
# t1 = time.time()
# print(f'took {(t1 - t0):.2f} seconds')
# print("data sent")
# producer.flush()

# t2 = time.time()
# print(f'took {(t2 - t1):.2f} seconds')

# Define a function to process each row
def process_row(row):
    """Print ``row`` as a ``{field_name: value}`` dictionary.

    Rows that provide an ``asDict()`` method (as the Spark rows elsewhere in
    this notebook do) are converted through it. Anything else is treated as a
    namedtuple-style record and read through its ``_fields`` attribute.

    Returns None; the dictionary is only printed.
    """
    if hasattr(row, "asDict"):
        row_dict = row.asDict()
    else:
        # namedtuple-style record, e.g. from pandas' itertuples()
        row_dict = {col: getattr(row, col) for col in row._fields}
    print(row_dict)

# Apply the function to each row
# NOTE(review): this runs over the full `df` (not the `selected_columns`
# projection) and process_row prints every row — confirm that dumping the whole
# dataset is intended. Presumably the prints happen inside Spark worker
# processes, so the output may not show up in the notebook; verify.
df.foreach(process_row)