# Step 0. Data Preparation

In [2]:
!pip install Faker --quiet
!pip install httpimport --quiet

In [3]:
import httpimport

url = "https://raw.githubusercontent.com/logicalclocks/hopsworks-tutorials/master/integrations/pyspark_streaming/synthetic_data"

synthetic_data = httpimport.load("synthetic_data", url)

In [4]:
from confluent_kafka import Producer

## Creating Simulated Data

In [5]:
data_simulator = synthetic_data.synthetic_data()

profiles_df, trans_df = data_simulator.create_simulated_transactions()

## Connecting to Hopsworks Feature Store

In [6]:
import hopsworks
project = hopsworks.login()

2025-10-02 22:55:58,053 INFO: Initializing external client
2025-10-02 22:55:58,054 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-10-02 22:55:59,816 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/398


In [7]:
kafka_api = project.get_kafka_api()

fs = project.get_feature_store()

In [9]:
profiles_df

Unnamed: 0,cc_num,cc_provider,cc_type,cc_expiration_date,name,mail,birthdate,age,city,country_of_residence
0,4561945063212434,mastercard,credit,02/27,Andrea Watson,Andrea Watson,1950-04-02,75,Collinwood,US
1,4270311811493341,mastercard,debit,07/26,Barbara Mendez,Barbara Mendez,1976-05-18,49,Palestine,US
2,4798484817726015,visa,debit,04/29,James Lewis,James Lewis,1955-05-18,70,Woodbridge,US
3,4100614711627579,mastercard,credit,04/29,John Garcia,John Garcia,1977-07-28,48,Independence,US
4,4809183942269373,mastercard,credit,01/27,Katelyn Cochran,Katelyn Cochran,2002-11-21,22,Thomasville,US
...,...,...,...,...,...,...,...,...,...,...
95,4999241535506310,visa,credit,05/25,Maurice Simpson,Maurice Simpson,1984-09-05,41,Searcy,US
96,4295351106565222,visa,debit,04/29,Jason Jones,Jason Jones,2005-03-30,20,Cypress Hills,US
97,4473353116781546,visa,credit,07/29,Amanda Sanders,Amanda Sanders,1927-07-06,98,Venice,US
98,4065232444199817,mastercard,debit,04/25,Andrew Franco,Andrew Franco,1987-06-05,38,Copperas Cove,US


## Creating Feature Groups

In [11]:
profile_fg = fs.get_or_create_feature_group(
    name="profile",
    primary_key=["cc_num"],
    partition_key=["cc_provider"],
    online_enabled=True,
    version=1,
)

%6|1759438772.650|FAIL|rdkafka#producer-1| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 48246ms in state UP, 1 identical error(s) suppressed)


In [12]:
profile_fg.insert(profiles_df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1534427


Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████████████████████████| Rows 100/100 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: profile_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/profile_1_offline_fg_materialization/executions


(Job('profile_1_offline_fg_materialization', 'SPARK'), None)

In [13]:
profiles_df.head()

Unnamed: 0,cc_num,cc_provider,cc_type,cc_expiration_date,name,mail,birthdate,age,city,country_of_residence
0,4561945063212434,mastercard,credit,02/27,Andrea Watson,Andrea Watson,1950-04-02,75,Collinwood,US
1,4270311811493341,mastercard,debit,07/26,Barbara Mendez,Barbara Mendez,1976-05-18,49,Palestine,US
2,4798484817726015,visa,debit,04/29,James Lewis,James Lewis,1955-05-18,70,Woodbridge,US
3,4100614711627579,mastercard,credit,04/29,John Garcia,John Garcia,1977-07-28,48,Independence,US
4,4809183942269373,mastercard,credit,01/27,Katelyn Cochran,Katelyn Cochran,2002-11-21,22,Thomasville,US


## Kafka Topic and Schema Creation

In [14]:
# create kafka topic
KAFKA_INPUT_TOPIC = "transactions_topic_" + str(project.id)
SCHEMA_NAME = "transactions_schema_" + str(project.id)

In [15]:
schema = {
    "type": "record",
    "name": KAFKA_INPUT_TOPIC,
    "namespace": "io.hops.examples.feldera.example",
    "fields": [
        {"name": "tid", "type": ["null", "string"]},
        {
            "name": "date_time",
            "type": ["null", {"type": "string", "logicalType": "timestamp-micros"}],
        },
        {"name": "cc_num", "type": ["null", "string"]},
        {"name": "category", "type": ["null", "string"]},
        {"name": "amount", "type": ["null", "double"]},
        {"name": "latitude", "type": ["null", "double"]},
        {"name": "longitude", "type": ["null", "double"]},
        {"name": "city", "type": ["null", "string"]},
        {"name": "country", "type": ["null", "string"]},
        {"name": "fraud_label", "type": ["null", "int"]},
    ],
}

In [16]:
if KAFKA_INPUT_TOPIC not in [topic.name for topic in kafka_api.get_topics()]:
    kafka_api.create_schema(KAFKA_INPUT_TOPIC, schema)
    kafka_api.create_topic(
        KAFKA_INPUT_TOPIC, KAFKA_INPUT_TOPIC, 1, replicas=1, partitions=1
    )

%6|1759438822.733|FAIL|rdkafka#producer-1| [thrd:ssl://51.161.81.188:9093/bootstrap]: ssl://51.161.81.188:9093/1: Disconnected (after 47786ms in state UP, 1 identical error(s) suppressed)


In [18]:
# from hsfs import engine
# kafka_config = engine.get_instance()._get_kafka_config(fs.id, {})

kafka_config = kafka_api.get_default_config()
kafka_config

{'security.protocol': 'SSL',
 'ssl.ca.location': '/tmp/c.app.hopsworks.ai/dowlingj/ca_chain.pem',
 'ssl.certificate.location': '/tmp/c.app.hopsworks.ai/dowlingj/client_cert.pem',
 'ssl.key.location': '/tmp/c.app.hopsworks.ai/dowlingj/client_key.pem',
 'client.id': 'LAPTOP-7SJ0Q078',
 'group.id': 'my-group-id',
 'ssl.endpoint.identification.algorithm': 'none',
 'bootstrap.servers': '51.161.81.188:9093,51.161.81.208:9093,51.161.80.189:9093'}

## Sending Data using created Kafka Topic

In [19]:
trans_df = trans_df.rename(columns={"datetime": "date_time"})

trans_df["tid"] = trans_df["tid"].astype("string")
trans_df["date_time"] = trans_df["date_time"].astype("datetime64[s]").astype("string")
trans_df["cc_num"] = trans_df["cc_num"].astype("string")
trans_df["category"] = trans_df["category"].astype("string")
trans_df["amount"] = trans_df["amount"].astype("double")
trans_df["latitude"] = trans_df["latitude"].astype("double")
trans_df["longitude"] = trans_df["longitude"].astype("double")
trans_df["city"] = trans_df["city"].astype("string")
trans_df["country"] = trans_df["country"].astype("string")
trans_df["fraud_label"] = trans_df["fraud_label"].astype("int")

In [20]:
producer = Producer(kafka_config)

for index, transaction in trans_df.iterrows():
    producer.produce(KAFKA_INPUT_TOPIC, transaction.to_json())

    if index % 5000 == 0:
        producer.flush()
        print(f"Finished sending index {index}")

producer.flush()

%4|1759438903.226|CONFWARN|LAPTOP-7SJ0Q078#producer-4| [thrd:app]: Configuration property group.id is a consumer property and will be ignored by this producer instance


Finished sending index 0
Finished sending index 5000
Finished sending index 10000
Finished sending index 15000
Finished sending index 20000
Finished sending index 25000
Finished sending index 30000
Finished sending index 35000
Finished sending index 40000
Finished sending index 45000
Finished sending index 50000
Finished sending index 55000


0