In [1]:
import hopsworks

# Log in to Hopsworks; the returned project handle is the entry point for the
# feature store lookup below.
project = hopsworks.login()

# Feature store handle, used later in the notebook to fetch the storage
# connector and to get or create the feature groups.
fs = project.get_feature_store()

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
3,application_1689280007503_0010,pyspark,idle,Link,Link


SparkSession available as 'spark'.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://35.171.129.215/p/119
Connected. Call `.close()` to terminate connection gracefully.

In [2]:
# Fetch the storage connector registered as "moneylion_kafka"; it is used
# below to read the Kafka stream.
connector = fs.get_storage_connector("moneylion_kafka")

In [3]:
from pyspark.sql.functions import from_json, window, col, sum, udf, when, collect_set, approx_count_distinct
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, TimestampType, LongType, IntegerType, BooleanType

In [4]:
# Layout of the JSON document expected in each message value. Every field is
# declared as a string here; created_at is cast to a timestamp after parsing.
parse_schema = StructType([
    StructField('user_id', StringType(), True),
    StructField('device_id', StringType(), True),
    StructField('created_at', StringType(), True),
])

# Read the data stream from the Kafka topic through the storage connector.
df = connector.read_stream(topic='user-transaction-device')

# Deserialise the message value from JSON, flatten the parsed struct into
# top-level columns, and cast each column to its final type.
df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json("value", parse_schema).alias("value"))
    .select("value.user_id", "value.device_id", "value.created_at")
    .selectExpr(
        "CAST(user_id as string)",
        "CAST(device_id as string)",
        "CAST(created_at as timestamp)",
    )
)


In [5]:
# For each user, collect the set of distinct device ids seen in every 3-day
# window that slides forward by 1 day. The window end is exposed as
# created_at alongside the full window struct.
# NOTE(review): no watermark is set on df in the visible cells, so Spark may
# retain aggregation state for every window indefinitely -- confirm whether
# withWatermark() is needed before running this long-term.
user_df = (
    df.groupBy("user_id", window("created_at", "3 days", "1 days"))
    .agg(collect_set("device_id").alias("user_devices"))
    .select(
        "user_id",
        "user_devices",
        "window",
        col("window.end").alias("created_at"),
    )
)

user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_devices: array (nullable = false)
 |    |-- element: string (containsNull = false)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- created_at: timestamp (nullable = true)

In [6]:
# Get version 2 of the "user_device_history" feature group, creating it if it
# does not exist yet. Rows are keyed by user_id with created_at as event time.
# NOTE(review): user_df is built on a sliding window, so one event contributes
# to several overlapping windows and several rows per user_id are produced --
# confirm that keying on user_id alone gives the intended row online.
user_device_history = fs.get_or_create_feature_group(
    name="user_device_history",
    version=2,
    primary_key=['user_id'],
    event_time='created_at',
    stream=True,
    online_enabled=True,
    description="History of distinct devices used by a user in past 3 days",
)

# Start the streaming write of the per-user aggregates into the feature group.
# Kept as the cell's last expression so the StreamingQuery handle is displayed.
user_device_history.insert_stream(user_df, output_mode="update")

Feature Group created successfully, explore it at 
https://35.171.129.215/p/119/fs/67/fg/21
<pyspark.sql.streaming.StreamingQuery object at 0x7fe62cd27be0>

In [7]:
# For each device, compute the approximate number of distinct user ids seen in
# every 3-day window that slides forward by 1 day. The window end is exposed
# as created_at alongside the full window struct.
# NOTE(review): no watermark is set on df in the visible cells, so Spark may
# retain aggregation state for every window indefinitely -- confirm whether
# withWatermark() is needed before running this long-term.
device_df = (
    df.groupBy("device_id", window("created_at", "3 days", "1 days"))
    .agg(approx_count_distinct("user_id").alias("device_users"))
    .select(
        "device_id",
        "device_users",
        "window",
        col("window.end").alias("created_at"),
    )
)

device_df.printSchema()

root
 |-- device_id: string (nullable = true)
 |-- device_users: long (nullable = false)
 |-- window: struct (nullable = true)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- created_at: timestamp (nullable = true)

In [8]:
# Get version 2 of the "device_user_history" feature group, creating it if it
# does not exist yet. Rows are keyed by device_id with created_at as event time.
# NOTE(review): device_df is built on a sliding window, so one event
# contributes to several overlapping windows and several rows per device_id
# are produced -- confirm that keying on device_id alone gives the intended
# row online.
device_user_history = fs.get_or_create_feature_group(
    name="device_user_history",
    version=2,
    primary_key=['device_id'],
    event_time='created_at',
    stream=True,
    online_enabled=True,
    description="History of distinct users who used a device in past 3 days",
)

# Start the streaming write of the per-device aggregates into the feature group.
# Kept as the cell's last expression so the StreamingQuery handle is displayed.
device_user_history.insert_stream(device_df, output_mode="update")

Feature Group created successfully, explore it at 
https://35.171.129.215/p/119/fs/67/fg/22
<pyspark.sql.streaming.StreamingQuery object at 0x7fe62cd3afa0>

In [9]:
# Uncomment to stop the first active streaming query:
# spark.streams.get(spark.streams.active[0].id).stop()

In [10]:
# # Draft (commented out): to be used in a transformation function
# import hopsworks

# project = hopsworks.login()
# fs = project.get_feature_store()

# def max_num_users_share_device_id(value):
#     device_fv = fs.get_feature_view(
#         name='device_fv',
#         version=1
#     )

#     entries = [{"device_id": device_id} for device_id in value]

#     data = device_fv.get_feature_vectors(
#         entry = entries
#     )

#     return max(list(zip(*data))[1])