In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

Starting Spark application


ID,Application ID,Kind,State,Spark UI,Driver log
16,application_1710157917162_1233,pyspark,idle,Link,Link


SparkSession available as 'spark'.


In [2]:
import hopsworks
import random
import pandas as pd

In [3]:
# Log in to Hopsworks first, then ask the resulting project handle for its
# feature store; `fs` is the handle every later cell works through.
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://staging.cloud.hopsworks.ai/p/124
Connected. Call `.close()` to terminate connection gracefully.

# Setup data 

In [4]:
# Source CSV for the card-fraud transactions tutorial data set.
TRANSACTIONS_CSV_URL = "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv"

# Load into pandas, parsing the `datetime` column into timestamps on read.
transactions_df = pd.read_csv(
    TRANSACTIONS_CSV_URL,
    parse_dates=["datetime"],
)

In [5]:
# Definition of the transactions feature group: keyed by `tid` and enabled
# for online serving. Kept as a mapping so the settings read as one unit.
transactions_fg_settings = {
    "name": "transactions_fg",
    "version": 1,
    "description": "Transactions data",
    "primary_key": ["tid"],
    "online_enabled": True,
}

# Fetch the feature group handle if it already exists, otherwise create it.
transactions_fg = fs.get_or_create_feature_group(**transactions_fg_settings)

In [6]:
# Ask the client to wait for the ingestion job before returning, so the
# read in the following cells sees the inserted rows.
insert_options = {"wait_for_job": True}

# Left as the last expression so the cell still displays the call's result.
transactions_fg.insert(transactions_df, write_options=insert_options)

Feature Group created successfully, explore it at 
https://staging.cloud.hopsworks.ai/p/124/fs/72/fg/71
(None, None)

In [12]:
# Read the feature group contents back from the feature store. The cells below
# call Spark DataFrame methods (withColumn / select / show) on the result, so
# this presumably returns a Spark DataFrame under this PySpark kernel — TODO confirm.
spark_dataframe = transactions_fg.read()

## Cleaning categories so that they do not contain "/" or whitespace

In [13]:
# Normalise `category` into a value that is safe to use as a column name:
#   1. trim surrounding spaces FIRST -- trimming after the replacement is a
#      no-op, because by then every space has already become "_", which would
#      leave padded values with leading/trailing underscores;
#   2. replace each remaining whitespace character or "/" with "_".
# The pattern is a raw string so that `\s` reaches the regex engine verbatim
# instead of being an invalid Python string escape.
cleaned_data = spark_dataframe.withColumn(
    "category_parsed",
    regexp_replace(trim("category"), r"[\s/]", "_"),
)



## Creating the schema for the pandas UDF

In [14]:
# Collect every distinct cleaned category to the driver as a list of rows.
categories = cleaned_data.select("category_parsed").distinct().collect()

# Build the schema string in one pass: one "<category> int" field per
# distinct category, separated by ", ".
schema = ", ".join(f"{row.category_parsed} int" for row in categories)

## Pandas UDF for one-hot encoding

In [15]:
# Names of every column the UDF must emit, in the same order as the fields
# of `schema` (both are derived from the collected `categories` rows).
one_hot_columns = [row.category_parsed for row in categories]


@pandas_udf(schema)
def one_hot(category: pd.Series) -> pd.DataFrame:
    """One-hot encode a batch of cleaned category values.

    Args:
        category: One batch of `category_parsed` values.

    Returns:
        A DataFrame with one integer column per entry in `one_hot_columns`,
        holding 1 where the row's category matches the column and 0 otherwise.
    """
    # get_dummies only creates columns for categories present in THIS batch
    # (sorted alphabetically), so on its own the result may not match the
    # declared schema.
    batch_dummies = pd.get_dummies(category)

    # Align to the full category list: absent categories become all-zero
    # columns and the order follows the schema. The cast turns the boolean
    # dummies produced by newer pandas versions into the declared `int` type.
    aligned = batch_dummies.reindex(columns=one_hot_columns, fill_value=0)
    return aligned.astype("int32")



In [16]:
# Attach the UDF result as a single struct column named `one_hot` ...
encoded = cleaned_data.select("*", one_hot("category_parsed").alias("one_hot"))

# ... then expand that struct into one column per category and preview it
# next to the raw and cleaned category values.
encoded.select("category", "category_parsed", "one_hot.*").show()

+--------------------+--------------------+---------------+-------+-----------+--------+-------------+---------------+--------+--------------------+---------------+------------------+
|            category|     category_parsed|Sports_Outdoors|Grocery|Electronics|Clothing|Health_Beauty|Holliday_Travel|Jewelery|Restaurant_Cafeteria|Cash_Withdrawal|Domestic_Transport|
+--------------------+--------------------+---------------+-------+-----------+--------+-------------+---------------+--------+--------------------+---------------+------------------+
|             Grocery|             Grocery|              0|      1|          0|       0|            0|              0|       0|                   0|              0|                 0|
|Restaurant/Cafeteria|Restaurant_Cafeteria|              0|      0|          0|       0|            0|              0|       0|                   1|              0|                 0|
|Restaurant/Cafeteria|Restaurant_Cafeteria|              0|      0|          0| 