# Step 0. Data Preparation

In [None]:
!pip install faker --quiet
!pip install hsfs==3.7.6 --quiet
!pip install hopsworks --quiet
!pip install httpimport --quiet
!pip install ipython-secrets --quiet

In [None]:
import httpimport

# Location of the synthetic-data helper code in the hopsworks-tutorials repository.
# NOTE(review): this points at the mutable `master` branch, so the code that gets
# loaded can change between runs — consider pinning to a tag or commit.
url = "https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/integrations/pyspark_streaming/synthetic_data"

# Load the remote "synthetic_data" module over HTTP instead of installing it locally.
synthetic_data = httpimport.load("synthetic_data", url)

In [None]:
import pandas as pd

from confluent_kafka import Producer

## Creating Simulated Data

In [None]:
# Build the simulator object exposed by the remotely loaded module.
data_simulator = synthetic_data.synthetic_data()

# The simulator returns two objects: the card-holder profiles and the
# transactions (presumably pandas DataFrames — confirm against the helper).
simulated_frames = data_simulator.create_simulated_transactions()
profiles_df, trans_df = simulated_frames

## Connecting to Hopsworks Feature Store

In [None]:
import hopsworks
# Import only the helper that is used; a wildcard import hides where names come
# from and can silently shadow other notebook variables.
from ipython_secrets import get_secret

# Read the API key from the secret store so it never appears in the notebook.
KEY = get_secret('HOPSWORKS_API_KEY')

# Authenticate against the hosted Hopsworks instance; `project` is the handle
# used by the following cells to reach the Kafka API and the feature store.
project = hopsworks.login(host="c.app.hopsworks.ai", api_key_value=KEY)

In [None]:
# Handle to the project's feature store (used to create feature groups).
fs = project.get_feature_store()

# Handle to the project's Kafka API (used to manage topics and schemas).
kafka_api = project.get_kafka_api()

## Creating Feature Groups

In [None]:
# Fetch version 1 of the "profile" feature group, creating it if it does not
# exist yet. Rows are keyed by card number and partitioned by card provider;
# online storage is enabled.
profile_fg = fs.get_or_create_feature_group(
    name="profile",
    version=1,
    primary_key=["cc_num"],
    partition_key=["cc_provider"],
    online_enabled=True,
)

# Write the simulated profiles into the feature group.
# NOTE(review): overwrite=True presumably replaces previously stored rows —
# confirm against the hsfs documentation.
profile_fg.insert(profiles_df, overwrite=True)

In [None]:
# Preview the first rows of the profile data that was inserted above.
profiles_df.head()

## Kafka Topic and Schema Creation

In [None]:
# Names for the Kafka topic and its schema; the project id is appended so the
# names are distinct per project. Nothing is created on the cluster here.
# NOTE(review): SCHEMA_NAME is not referenced anywhere else in the visible
# cells — confirm whether it is still needed.
KAFKA_INPUT_TOPIC = f"transactions_topic_{project.id}"
SCHEMA_NAME = f"transactions_schema_{project.id}"

In [None]:
# (field name, Avro type) pairs for the transaction record, in schema order.
# NOTE(review): "timestamp-micros" is attached to a string here, whereas the
# Avro specification defines that logical type on long — confirm the consumer
# expects this form.
nullable_field_types = [
    ("tid", "string"),
    ("date_time", {"type": "string", "logicalType": "timestamp-micros"}),
    ("cc_num", "string"),
    ("category", "string"),
    ("amount", "double"),
    ("latitude", "double"),
    ("longitude", "double"),
    ("city", "string"),
    ("country", "string"),
    ("fraud_label", "int"),
]

# Avro record schema for the transactions topic. Every field is declared as a
# union with "null", i.e. each value is optional.
schema = {
    "type": "record",
    "name": KAFKA_INPUT_TOPIC,
    "namespace": "io.hops.examples.feldera.example",
    "fields": [
        {"name": field_name, "type": ["null", avro_type]}
        for field_name, avro_type in nullable_field_types
    ],
}

In [None]:
# Register the schema and create the topic only when the topic does not exist
# yet, so the cell can be re-run safely. The topic name doubles as the schema
# subject name; schema version 1 is attached to the topic.
existing_topic_names = [topic.name for topic in kafka_api.get_topics()]
topic_missing = KAFKA_INPUT_TOPIC not in existing_topic_names

if topic_missing:
    kafka_api.create_schema(KAFKA_INPUT_TOPIC, schema)
    kafka_api.create_topic(KAFKA_INPUT_TOPIC, KAFKA_INPUT_TOPIC, 1, replicas=1, partitions=1)

In [None]:
from hsfs import engine

# Ask the active hsfs engine for the Kafka client configuration of this
# feature store (no extra options are passed).
# NOTE(review): `_get_kafka_config` has a leading underscore, i.e. it is a
# private hsfs method and may change between releases.
hsfs_engine = engine.get_instance()
kafka_config = hsfs_engine._get_kafka_config(fs.id, {})

## Sending Data Using the Created Kafka Topic

In [None]:
# Align the transactions frame with the Kafka schema in one chained expression:
#   1. rename "datetime" to the schema's "date_time" field,
#   2. truncate timestamps to second resolution and render them as strings,
#   3. cast every remaining column to the dtype matching its schema type.
trans_df = (
    trans_df
    .rename(columns={"datetime": "date_time"})
    .assign(
        date_time=lambda frame: frame["date_time"]
        .astype("datetime64[s]")
        .astype("string")
    )
    .astype(
        {
            "tid": "string",
            "cc_num": "string",
            "category": "string",
            "amount": "double",
            "latitude": "double",
            "longitude": "double",
            "city": "string",
            "country": "string",
            "fraud_label": "int",
        }
    )
)

In [None]:
# Stream every transaction row to the Kafka input topic as a JSON document.
producer = Producer(kafka_config)

# Number of messages sent between explicit flushes / progress reports.
FLUSH_EVERY = 5000

# Delivery errors reported by the client; inspected after the final flush so
# that failed messages are not lost silently.
failed_deliveries = []


def _on_delivery(err, msg):
    """Delivery-report callback: remember the error of every failed message."""
    if err is not None:
        failed_deliveries.append(err)


# Count messages positionally so the flush cadence does not depend on the
# DataFrame's index labels (which need not be 0..n-1, or even integers).
for sent_count, (index, transaction) in enumerate(trans_df.iterrows(), start=1):
    payload = transaction.to_json()
    try:
        producer.produce(KAFKA_INPUT_TOPIC, payload, on_delivery=_on_delivery)
    except BufferError:
        # The local producer queue is full: drain it, then retry this message.
        producer.flush()
        producer.produce(KAFKA_INPUT_TOPIC, payload, on_delivery=_on_delivery)

    # Serve pending delivery callbacks without blocking.
    producer.poll(0)

    if sent_count % FLUSH_EVERY == 0:
        producer.flush()
        print(f'Finished sending index {index}')

# Block until every outstanding message has been delivered or has failed.
producer.flush()

if failed_deliveries:
    print(f'{len(failed_deliveries)} message(s) failed delivery; first error: {failed_deliveries[0]}')