In [1]:
import hopsworks

# Log in to Hopsworks; the returned project handle is the entry point for everything below.
project = hopsworks.login()

# Feature-store handle, used later to fetch the storage connector and the feature groups.
fs = project.get_feature_store()

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
2,application_1689280007503_0007,pyspark,idle,Link,Link


SparkSession available as 'spark'.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://35.171.129.215/p/119
Connected. Call `.close()` to terminate connection gracefully.

In [2]:
# Fetch the storage connector registered as "moneylion_kafka"; the stream is read through it below.
connector = fs.get_storage_connector("moneylion_kafka")

In [3]:
from pyspark.sql.functions import from_json, window, col, sum, udf, when
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, TimestampType, LongType, IntegerType, BooleanType

In [4]:
import requests
# Smoke-test the randomuser.me endpoint that the UDFs below call, and display one payload field.
api_url = "https://randomuser.me/api/"
# requests has no default timeout, so without one this cell could block forever on a stalled connection.
response = requests.get(api_url, timeout=10)
# Fail with a clear HTTPError on 4xx/5xx instead of an obscure JSON/KeyError further down.
response.raise_for_status()
response.json()['results'][0]['gender']

'male'

In [5]:
# Open a streaming read on the Kafka topic through the storage connector.
df = connector.read_stream(topic='user-transaction-api')

# Schema used to parse the JSON message value: two nullable string fields.
# (created_at is cast to a timestamp later in the pipeline.)
full_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ('user_id', 'created_at')
])

@udf(returnType=DoubleType())
def get_longitude(user_id):
    """Return the longitude reported by one call to the randomuser.me API.

    Args:
        user_id: Column value the UDF is applied to. It is not sent to the
            API or otherwise used; each call simply fetches a fresh record.

    Returns:
        float: ``results[0].location.coordinates.longitude`` from the JSON
        response, converted with ``float()``.

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or a non-2xx HTTP status.

    NOTE(review): the result differs between calls for the same input, so the
    UDF is not deterministic; consider ``asNondeterministic()`` - confirm.
    """
    api_url = "https://randomuser.me/api/"
    # requests has no default timeout; without one a stalled connection would
    # block the calling Spark task indefinitely.
    response = requests.get(api_url, timeout=10)
    # Surface HTTP errors (rate limiting, outages) explicitly rather than as a
    # JSON decode error or KeyError on the lines below.
    response.raise_for_status()
    return float(response.json()['results'][0]['location']['coordinates']['longitude'])

# Deserialise the Kafka message value and define the windowed aggregation as a streaming query.
# NOTE(review): no watermark is set before the windowed groupBy, so old window state is
# presumably never evicted - confirm whether withWatermark("created_at", ...) is wanted.
df = (
    df.selectExpr("CAST(value AS STRING)")
    # Parse the JSON text into a struct column using full_schema.
    .select(from_json("value", full_schema).alias("value"))
    .select("value.user_id", "value.created_at")
    .selectExpr("CAST(user_id as string)", "CAST(created_at as timestamp)")
    # get_longitude issues an HTTP request every time it is evaluated.
    .withColumn("longitude", get_longitude(col("user_id")))
    # Sliding window: 2 days long, advancing by 1 day.
    .groupBy("user_id", window("created_at", "2 days", "1 days"))
    .agg(sum("longitude").alias("longitude_sum"))
    # Expose the end of the window as the row's created_at.
    .select("user_id", "longitude_sum", col("window.end").alias("created_at"))
)

df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- longitude_sum: double (nullable = true)
 |-- created_at: timestamp (nullable = true)

In [6]:
# Fetch version 1 of the "user_transaction_longitude" feature group, creating it if needed.
# NOTE(review): the 2-day window sliding by 1 day puts each event in two overlapping windows,
# so several rows per user_id (with different created_at) are produced - confirm that
# primary_key=['user_id'] alone is the intended key.
user_transaction_longitude = fs.get_or_create_feature_group(
    name="user_transaction_longitude",
    version=1,
    description="User transaction longitude",
    primary_key=['user_id'],
    event_time='created_at',
    online_enabled=True,
    stream=True
)

# Keep the StreamingQuery handle so this specific query can be inspected or stopped later
# (longitude_query.stop()) instead of guessing its position in spark.streams.active.
longitude_query = user_transaction_longitude.insert_stream(df, output_mode="update")
longitude_query

Feature Group created successfully, explore it at 
https://35.171.129.215/p/119/fs/67/fg/19
<pyspark.sql.streaming.StreamingQuery object at 0x7f5d9c8bdd00>

In [7]:
@udf(returnType=StringType())
def get_gender(user_id):
    """Return a one-letter gender code from one call to the randomuser.me API.

    Args:
        user_id: Column value the UDF is applied to. It is not sent to the
            API or otherwise used; each call simply fetches a fresh record.

    Returns:
        str: ``'M'`` when ``results[0].gender`` equals ``'male'``, otherwise
        ``'F'`` (any other value maps to ``'F'``).

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
            or a non-2xx HTTP status.
    """
    api_url = "https://randomuser.me/api/"
    # requests has no default timeout; without one a stalled connection would
    # block the calling Spark task indefinitely.
    response = requests.get(api_url, timeout=10)
    # Surface HTTP errors explicitly rather than as a JSON/KeyError below.
    response.raise_for_status()
    gender = response.json()['results'][0]['gender']
    return 'M' if gender == 'male' else 'F'


# Extend the aggregated stream with a gender code looked up per row, keeping the
# column order user_id, longitude_sum, gender, created_at.
df = df.select(
    "user_id",
    "longitude_sum",
    get_gender(col("user_id")).alias("gender"),
    "created_at",
)

df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- longitude_sum: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- created_at: timestamp (nullable = true)

In [8]:
# Fetch version 1 of the "user_transaction_gender" feature group, creating it if needed.
# NOTE(review): df carries several window rows per user_id (different created_at values) -
# confirm that primary_key=['user_id'] alone is the intended key.
user_transaction_gender = fs.get_or_create_feature_group(
    name="user_transaction_gender",
    version=1,
    description="User transaction gender",
    primary_key=['user_id'],
    event_time='created_at',
    online_enabled=True,
    stream=True
)

# Keep the StreamingQuery handle so this specific query can be inspected or stopped later
# (gender_query.stop()) instead of guessing its position in spark.streams.active.
gender_query = user_transaction_gender.insert_stream(df, output_mode="update")
gender_query

Feature Group created successfully, explore it at 
https://35.171.129.215/p/119/fs/67/fg/20
<pyspark.sql.streaming.StreamingQuery object at 0x7f5d9c8d5520>

In [9]:
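# Manual clean-up: each insert_stream() call above returned a StreamingQuery.
# Uncommenting the line below stops only the first query listed in spark.streams.active;
# run it once per query (or loop over spark.streams.active) to stop them all.
# spark.streams.get(spark.streams.active[0].id).stop()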