In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, when, to_timestamp

# Start (or reuse) the Spark session used by this job
spark = (
    SparkSession.builder
    .appName("UserClicksHashing")
    .getOrCreate()
)

# Load the click log from the gs:// bucket; the first row holds column names.
# NOTE(review): inferSchema=True lets Spark choose the column types -- confirm
# that "timestamp" is still read as plain "HH:mm" text for the parse below.
clicks_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gs://ibd_bucket-as/input.csv")
)

# Replace the "HH:mm" text in the timestamp column with a parsed timestamp
clicks_df = clicks_df.withColumn(
    "timestamp", to_timestamp(col("timestamp"), "HH:mm")
)

def hash_timestamp(hour_value):
    """Map an hour-of-day column to its six-hour interval label.

    Args:
        hour_value: Spark ``Column`` holding the hour of day (0-23).

    Returns:
        A Spark ``Column`` with one of the labels ``"0-6"``, ``"6-12"``,
        ``"12-18"`` or ``"18-24"``. Hours outside 0-23, including NULL
        (for example from a timestamp that failed to parse), yield NULL.
    """
    # Every range is spelled out and there is no catch-all branch: a NULL
    # hour makes all comparisons NULL, so an ``otherwise("18-24")`` fallback
    # would silently count unparseable rows as evening clicks.
    return (
        when((hour_value >= 0) & (hour_value < 6), "0-6")
        .when((hour_value >= 6) & (hour_value < 12), "6-12")
        .when((hour_value >= 12) & (hour_value < 18), "12-18")
        .when((hour_value >= 18) & (hour_value < 24), "18-24")
    )

# Bucket every click by hour of day, count the clicks per bucket, and print
# the counts in chronological bucket order.
try:
    # Explicit sort key for the interval labels: sorting the labels as plain
    # strings would place "12-18" and "18-24" ahead of "6-12".
    interval = col("time_interval")
    bucket_position = (
        when(interval == "0-6", 0)
        .when(interval == "6-12", 1)
        .when(interval == "12-18", 2)
        .when(interval == "18-24", 3)
    )

    clicks_count_df = (
        clicks_df.select(hour(col("timestamp")).alias("hour"))
        .groupBy(hash_timestamp(col("hour")).alias("time_interval"))
        .count()
        # Any label without a known position (including NULL) sorts last
        .orderBy(bucket_position.asc_nulls_last())
    )

    # Show the resulting counts
    clicks_count_df.show()
finally:
    # Stop the SparkSession even if the aggregation or the output step fails
    spark.stop()
