In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False

def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book

def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('ccfraud',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

print(f"Root dir: {root_dir}")

# Add the root directory to the `PYTHONPATH` 
if root_dir not in sys.path:
    sys.path.append(root_dir)
    print(f"Added the following directory to the PYTHONPATH: {root_dir}")

# Set the environment variables from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Local environment
Root dir: /home/jdowling/Projects/mlfs-book
Added the following directory to the PYTHONPATH: /home/jdowling/Projects/mlfs-book
HopsworksSettings initialized!


In [2]:
import hopsworks
from datetime import datetime, timedelta
import synth_transactions as st
from hsfs.feature import Feature
from mlfs.ccfraud.features import cc_trans_fg

last_processed_date = datetime(2025, 1, 1)
current_date = datetime(2025, 10, 5)

project = hopsworks.login(engine="python")
fs = project.get_feature_store()

name = "cc_trans_fg"

trans_fg = fs.get_feature_group("credit_card_transactions")
cc_fraud_fg = fs.get_feature_group("cc_fraud")
card_details_fg = fs.get_feature_group("card_details")

cc_trans_fg = fs.get_or_create_feature_group(
    name=name,
    primary_key=["t_id"],
    online_enabled=True,
    version=1,
    event_time="event_time",
    features=[        
        Feature("t_id", type="bigint"),
        Feature("cc_num", type="string"),
        Feature("merchant_id", type="string"),
        Feature("amount", type="double"),
        Feature("ip_address", type="string"),
        Feature("card_present", type="boolean"),
        Feature("haversine_distance", type="boolean"),
        Feature("time_since_last_trans", type="bigint"),
        Feature("days_to_card_expiry", type="bigint"),
        Feature("is_fraud", type="boolean"),
        Feature("event_time", type="timestamp"),
    ],
    # transformation_functions=[cc_trans_fg.time_since_last_trans, cc_trans_fg.haversine_distance]
)

2025-11-10 22:06:02,317 INFO: Initializing external client
2025-11-10 22:06:02,318 INFO: Base URL: https://stagingmain.devnet.hops.works:443
2025-11-10 22:06:03,238 INFO: Python Engine initialized.

Logged in to project, explore it here https://stagingmain.devnet.hops.works:443/p/122





In [3]:
try:
    cc_trans_fg.save()
except Exception as e:
    print(e)

Feature Group created successfully, explore it at 
https://stagingmain.devnet.hops.works:443/p/122/fs/70/fg/79


In [25]:
from hsfs.feature import Feature
trans_df = trans_fg.filter(Feature("ts") > last_processed_date).read()
trans_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (7.64s) 


Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts
0,235393,7739-7037-4842-5846,ACC_007712,MERCH_00097,51.28,85.7.109.198,True,2025-09-24 14:03:26+00:00
1,327170,7630-2329-8390-2340,ACC_001494,MERCH_00201,3.01,186.74.252.222,False,2025-10-02 18:32:10+00:00
2,399165,1713-9423-3243-2732,ACC_006061,MERCH_00309,24.00,191.209.210.46,False,2025-09-16 06:57:30+00:00
3,487672,8511-3913-8072-8323,ACC_006464,MERCH_00094,25.75,196.6.221.19,True,2025-10-03 11:41:21+00:00
4,328344,3742-1345-5351-9722,ACC_001400,MERCH_00365,5.58,61.12.139.45,True,2025-10-03 01:19:11+00:00
...,...,...,...,...,...,...,...,...
500044,15323,4447-8847-6698-7103,ACC_000733,MERCH_00385,146.04,108.187.4.114,False,2025-09-21 04:55:52+00:00
500045,288280,5009-4671-2294-5255,ACC_000091,MERCH_00052,24.95,79.33.230.69,True,2025-09-12 15:43:24+00:00
500046,61807,8390-3300-7982-8312,ACC_006113,MERCH_00123,7.70,81.112.234.185,True,2025-09-10 15:25:14+00:00
500047,268022,4032-2742-9068-7218,ACC_007790,MERCH_00361,147.77,79.90.191.34,True,2025-09-23 13:57:33+00:00


In [5]:
fraud_df = cc_fraud_fg.read()
fraud_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.33s) 


Unnamed: 0,t_id,cc_num,explanation,ts
0,500028,6693-6874-8669-2799,Chain attack: Multiple small transactions ($36...,2025-09-16 07:22:21+00:00
1,500042,2577-3855-7044-7124,Chain attack: Multiple small transactions ($46...,2025-09-16 21:43:42+00:00
2,500037,2577-3855-7044-7124,Chain attack: Multiple small transactions ($23...,2025-09-16 21:24:17+00:00
3,500000,8458-1227-1197-5619,Chain attack: Multiple small transactions ($16...,2025-09-10 09:47:17+00:00
4,500005,8458-1227-1197-5619,Chain attack: Multiple small transactions ($14...,2025-09-10 09:44:11+00:00
5,500030,6693-6874-8669-2799,Chain attack: Multiple small transactions ($33...,2025-09-16 07:34:02+00:00
6,500025,6693-6874-8669-2799,Chain attack: Multiple small transactions ($25...,2025-09-16 07:22:44+00:00
7,500018,6106-9951-9916-8954,Chain attack: Multiple small transactions ($10...,2025-09-09 23:23:18+00:00
8,500048,3150-3733-1696-9876,Geographic fraud: Card present transaction in ...,2025-09-11 17:59:52+00:00
9,500017,6106-9951-9916-8954,Chain attack: Multiple small transactions ($37...,2025-09-09 23:06:28+00:00


In [26]:
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500049 entries, 0 to 500048
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype                  
---  ------        --------------   -----                  
 0   t_id          500049 non-null  int64                  
 1   cc_num        500049 non-null  object                 
 2   account_id    500049 non-null  object                 
 3   merchant_id   500049 non-null  object                 
 4   amount        500049 non-null  float64                
 5   ip_address    500049 non-null  object                 
 6   card_present  500049 non-null  bool                   
 7   ts            500049 non-null  datetime64[us, Etc/UTC]
dtypes: bool(1), datetime64[us, Etc/UTC](1), float64(1), int64(1), object(4)
memory usage: 27.2+ MB


In [27]:
# Sort by cc_num and ts
trans_df = trans_df.sort_values(["cc_num", "ts"])

# Compute previous timestamp per cc_num
# trans_df["prev_ts"] = trans_df.groupby("cc_num")["ts"].shift(1)

trans_df["prev_ts"] = trans_df["ts"].shift(1)

trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_ts
431397,148071,1001-8827-6069-2529,ACC_001690,MERCH_00052,45.04,82.36.250.114,True,2025-09-09 07:07:48+00:00,NaT
348640,15773,1001-8827-6069-2529,ACC_001690,MERCH_00204,13.37,60.37.254.81,False,2025-09-13 11:26:54+00:00,2025-09-09 07:07:48+00:00
274501,411740,1001-8827-6069-2529,ACC_001690,MERCH_00047,15.28,110.23.134.5,True,2025-09-15 09:37:19+00:00,2025-09-13 11:26:54+00:00
113926,367726,1001-8827-6069-2529,ACC_001690,MERCH_00377,16.78,196.93.236.27,False,2025-09-15 13:32:12+00:00,2025-09-15 09:37:19+00:00
301383,457518,1001-8827-6069-2529,ACC_001690,MERCH_00420,17.28,61.98.190.145,False,2025-09-16 05:19:12+00:00,2025-09-15 13:32:12+00:00
...,...,...,...,...,...,...,...,...,...
113097,183403,9999-7987-9453-1349,ACC_003356,MERCH_00163,113.17,27.23.225.196,False,2025-09-29 10:10:32+00:00,2025-09-24 09:45:03+00:00
248798,21248,9999-7987-9453-1349,ACC_003356,MERCH_00013,52.75,196.10.142.246,True,2025-09-29 13:04:21+00:00,2025-09-29 10:10:32+00:00
408596,401223,9999-7987-9453-1349,ACC_003356,MERCH_00038,66.70,186.64.30.156,True,2025-09-30 00:07:29+00:00,2025-09-29 13:04:21+00:00
150865,326217,9999-7987-9453-1349,ACC_003356,MERCH_00342,31.81,119.183.74.116,False,2025-10-02 03:55:00+00:00,2025-09-30 00:07:29+00:00


In [28]:
trans_df["prev_card_present"] = trans_df["card_present"].shift(1)


In [29]:
trans_df["prev_ip_address"] = trans_df["ip_address"].shift(1)


In [30]:
# Check if any of the new transactions are marked as fraudulent
# Note. There is another 'batch fraud pipeline' to also update later arriving fraud updates
trans_df["is_fraud"] = trans_df["t_id"].isin(fraud_df["t_id"])
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_ts,prev_card_present,prev_ip_address,is_fraud
431397,148071,1001-8827-6069-2529,ACC_001690,MERCH_00052,45.04,82.36.250.114,True,2025-09-09 07:07:48+00:00,NaT,,,False
348640,15773,1001-8827-6069-2529,ACC_001690,MERCH_00204,13.37,60.37.254.81,False,2025-09-13 11:26:54+00:00,2025-09-09 07:07:48+00:00,True,82.36.250.114,False
274501,411740,1001-8827-6069-2529,ACC_001690,MERCH_00047,15.28,110.23.134.5,True,2025-09-15 09:37:19+00:00,2025-09-13 11:26:54+00:00,False,60.37.254.81,False
113926,367726,1001-8827-6069-2529,ACC_001690,MERCH_00377,16.78,196.93.236.27,False,2025-09-15 13:32:12+00:00,2025-09-15 09:37:19+00:00,True,110.23.134.5,False
301383,457518,1001-8827-6069-2529,ACC_001690,MERCH_00420,17.28,61.98.190.145,False,2025-09-16 05:19:12+00:00,2025-09-15 13:32:12+00:00,False,196.93.236.27,False
...,...,...,...,...,...,...,...,...,...,...,...,...
113097,183403,9999-7987-9453-1349,ACC_003356,MERCH_00163,113.17,27.23.225.196,False,2025-09-29 10:10:32+00:00,2025-09-24 09:45:03+00:00,True,1.168.80.210,False
248798,21248,9999-7987-9453-1349,ACC_003356,MERCH_00013,52.75,196.10.142.246,True,2025-09-29 13:04:21+00:00,2025-09-29 10:10:32+00:00,False,27.23.225.196,False
408596,401223,9999-7987-9453-1349,ACC_003356,MERCH_00038,66.70,186.64.30.156,True,2025-09-30 00:07:29+00:00,2025-09-29 13:04:21+00:00,True,196.10.142.246,False
150865,326217,9999-7987-9453-1349,ACC_003356,MERCH_00342,31.81,119.183.74.116,False,2025-10-02 03:55:00+00:00,2025-09-30 00:07:29+00:00,True,186.64.30.156,False


In [31]:
trans_df['is_fraud'].value_counts()

is_fraud
False    500000
True         49
Name: count, dtype: int64

In [32]:
from mlfs.ccfraud import features

trans_df['time_since_last_trans'] = features.cc_trans_fg.time_since_last_trans(trans_df['ts'], trans_df['prev_ts'])
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_ts,prev_card_present,prev_ip_address,is_fraud,time_since_last_trans
431397,148071,1001-8827-6069-2529,ACC_001690,MERCH_00052,45.04,82.36.250.114,True,2025-09-09 07:07:48+00:00,NaT,,,False,0
348640,15773,1001-8827-6069-2529,ACC_001690,MERCH_00204,13.37,60.37.254.81,False,2025-09-13 11:26:54+00:00,2025-09-09 07:07:48+00:00,True,82.36.250.114,False,361146
274501,411740,1001-8827-6069-2529,ACC_001690,MERCH_00047,15.28,110.23.134.5,True,2025-09-15 09:37:19+00:00,2025-09-13 11:26:54+00:00,False,60.37.254.81,False,166225
113926,367726,1001-8827-6069-2529,ACC_001690,MERCH_00377,16.78,196.93.236.27,False,2025-09-15 13:32:12+00:00,2025-09-15 09:37:19+00:00,True,110.23.134.5,False,14093
301383,457518,1001-8827-6069-2529,ACC_001690,MERCH_00420,17.28,61.98.190.145,False,2025-09-16 05:19:12+00:00,2025-09-15 13:32:12+00:00,False,196.93.236.27,False,56820
...,...,...,...,...,...,...,...,...,...,...,...,...,...
113097,183403,9999-7987-9453-1349,ACC_003356,MERCH_00163,113.17,27.23.225.196,False,2025-09-29 10:10:32+00:00,2025-09-24 09:45:03+00:00,True,1.168.80.210,False,433529
248798,21248,9999-7987-9453-1349,ACC_003356,MERCH_00013,52.75,196.10.142.246,True,2025-09-29 13:04:21+00:00,2025-09-29 10:10:32+00:00,False,27.23.225.196,False,10429
408596,401223,9999-7987-9453-1349,ACC_003356,MERCH_00038,66.70,186.64.30.156,True,2025-09-30 00:07:29+00:00,2025-09-29 13:04:21+00:00,True,196.10.142.246,False,39788
150865,326217,9999-7987-9453-1349,ACC_003356,MERCH_00342,31.81,119.183.74.116,False,2025-10-02 03:55:00+00:00,2025-09-30 00:07:29+00:00,True,186.64.30.156,False,186451


In [33]:
# trans_df['time_since_last_trans'] = features.cc_trans_fg.time_since_last_trans(trans_df['ts'], trans_df['prev_ts'])
# trans_df

In [34]:
# from collections import Counter
# features.cc_trans_fg.test_ip_resolution(trans_df['ip_address'], root_dir)

In [35]:
trans_df['haversine_distance'] = features.cc_trans_fg.haversine_distance(trans_df['card_present'], trans_df['prev_card_present'], 
                                                                         trans_df['ip_address'], trans_df['prev_ip_address'], 
                                                                         trans_df['time_since_last_trans'], root_dir)

trans_df['days_to_card_expiry'] = 0

In [36]:
trans_df = trans_df.drop(
    columns=[
        'prev_card_present',
        'ip_addr',
        'prev_ip_addr',
        'account_id',
        'prev_ip_address',
        'prev_ts'
    ],
    errors='ignore'  # avoids errors if some columns are missing
)
trans_df

Unnamed: 0,t_id,cc_num,merchant_id,amount,ip_address,card_present,ts,is_fraud,time_since_last_trans,haversine_distance,days_to_card_expiry
431397,148071,1001-8827-6069-2529,MERCH_00052,45.04,82.36.250.114,True,2025-09-09 07:07:48+00:00,False,0,False,0
348640,15773,1001-8827-6069-2529,MERCH_00204,13.37,60.37.254.81,False,2025-09-13 11:26:54+00:00,False,361146,True,0
274501,411740,1001-8827-6069-2529,MERCH_00047,15.28,110.23.134.5,True,2025-09-15 09:37:19+00:00,False,166225,True,0
113926,367726,1001-8827-6069-2529,MERCH_00377,16.78,196.93.236.27,False,2025-09-15 13:32:12+00:00,False,14093,False,0
301383,457518,1001-8827-6069-2529,MERCH_00420,17.28,61.98.190.145,False,2025-09-16 05:19:12+00:00,False,56820,True,0
...,...,...,...,...,...,...,...,...,...,...,...
113097,183403,9999-7987-9453-1349,MERCH_00163,113.17,27.23.225.196,False,2025-09-29 10:10:32+00:00,False,433529,True,0
248798,21248,9999-7987-9453-1349,MERCH_00013,52.75,196.10.142.246,True,2025-09-29 13:04:21+00:00,False,10429,False,0
408596,401223,9999-7987-9453-1349,MERCH_00038,66.70,186.64.30.156,True,2025-09-30 00:07:29+00:00,False,39788,False,0
150865,326217,9999-7987-9453-1349,MERCH_00342,31.81,119.183.74.116,False,2025-10-02 03:55:00+00:00,False,186451,True,0


In [37]:
trans_df.rename(columns={'ts': 'event_time'}, inplace=True)

In [38]:
trans_df['haversine_distance'].value_counts()

haversine_distance
True     425080
False     74969
Name: count, dtype: int64

In [39]:
# This will also apply any on-demand transformations
cc_trans_fg.insert(trans_df)

Uploading Dataframe: 100.00% |███████████████████████████████████████████████████████████████████████| Rows 500049/500049 | Elapsed Time: 02:22 | Remaining Time: 00:00


Launching job: cc_trans_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://stagingmain.devnet.hops.works:443/p/122/jobs/named/cc_trans_fg_1_offline_fg_materialization/executions


(Job('cc_trans_fg_1_offline_fg_materialization', 'PYSPARK'), None)