In [None]:
import hopsworks

# Log in to Hopsworks; `project` is the handle returned by the login call and
# `fs` is the feature store obtained from it. Both are reused by later cells.
project = hopsworks.login()
fs = project.get_feature_store()

In [None]:
# Look up the storage connector registered under the name "moneylion_kafka";
# it is used further down to open the Kafka stream.
connector = fs.get_storage_connector(name="moneylion_kafka")

In [None]:
from pyspark.sql.functions import from_json, window, col, sum, udf, when, collect_set, approx_count_distinct, concat_ws
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, TimestampType, LongType, IntegerType, BooleanType

In [None]:
# Smoke-test the parsing pipeline on one hand-written (key, value) record
# before attaching the same transformation to the live stream.

# Schema of the JSON document carried in the `value` column. It is defined in
# this cell so the cell runs on a fresh kernel (Restart & Run All) rather than
# depending on a definition that only appears in a later cell.
parse_schema = StructType([
    StructField('user_id', StringType(), True),
    StructField('device_id', StringType(), True),
    StructField('created_at', StringType(), True),
])

# A single row: null key, JSON string as the value.
data = [(None, '{"user_id": "user-1", "device_id": "device-1", "created_at": "2023-07-02 15:40:32"}')]

testdf = spark.createDataFrame(
    data=data,
    schema=StructType([
        StructField('key', StringType(), True),
        StructField('value', StringType(), True),
    ]),
)

(
    testdf.selectExpr("CAST(value AS STRING)")
    # Parse the JSON string into a struct, then flatten its fields.
    .select(from_json("value", parse_schema).alias("value"))
    .select("value.user_id", "value.device_id", "value.created_at")
    # unique_id is "<user_id>_<created_at>".
    .withColumn("unique_id", concat_ws("_", col("user_id"), col("created_at")))
    # created_at is parsed as a string above and cast to a timestamp here.
    .selectExpr(
        "CAST(unique_id as string)",
        "CAST(user_id as string)",
        "CAST(device_id as string)",
        "CAST(created_at as timestamp)",
    )
    .show(truncate=False)
)

In [None]:
# Open a streaming read on the Kafka topic through the storage connector.
kafka_messages = connector.read_stream(topic='user-transaction-device')

# Layout of the JSON document in each message value; every field is declared
# as a nullable string and `created_at` is cast to a timestamp further down.
parse_schema = StructType([
    StructField('user_id', StringType(), True),
    StructField('device_id', StringType(), True),
    StructField('created_at', StringType(), True),
])

# Step 1: take the message value as a string and parse it into a struct.
payload = (
    kafka_messages
    .selectExpr("CAST(value AS STRING)")
    .select(from_json("value", parse_schema).alias("value"))
)

# Step 2: flatten the struct and derive unique_id as "<user_id>_<created_at>".
flattened = (
    payload
    .select("value.user_id", "value.device_id", "value.created_at")
    .withColumn("unique_id", concat_ws("_", col("user_id"), col("created_at")))
)

# Step 3: fix the final column order and types; this is the streaming
# DataFrame that later cells consume as `df`.
df = flattened.selectExpr(
    "CAST(unique_id as string)",
    "CAST(user_id as string)",
    "CAST(device_id as string)",
    "CAST(created_at as timestamp)",
)

df.printSchema()

In [None]:
# Settings for the raw transaction feature group: keyed on `unique_id`, with
# `created_at` as the event time, online storage enabled and stream=True.
feature_group_spec = dict(
    name="user_transaction_raw",
    version=1,
    description="Raw user transaction",
    primary_key=['unique_id'],
    event_time='created_at',
    online_enabled=True,
    stream=True,
)

# Reuse the feature group if it already exists, otherwise create it.
user_transaction = fs.get_or_create_feature_group(**feature_group_spec)

# Start writing the parsed stream into the feature group in "update" mode.
user_transaction.insert_stream(df, output_mode="update")