In [1]:
import sys
from pathlib import Path
import warnings
warnings.filterwarnings("ignore", module="IPython")

# Start from the directory the kernel was launched in. If that directory is a
# trailing `notebooks` and/or `ccfraud` folder, step up one level for each
# (checked in that order) and append the resulting directory to sys.path.
root_dir = Path().absolute()
for nested_dir in ("notebooks", "ccfraud"):
    if root_dir.name == nested_dir:
        root_dir = root_dir.parent
        sys.path.append(str(root_dir))
# From here on root_dir is a plain string (it is interpolated into paths below).
root_dir = str(root_dir)

print(f"Root dir: {root_dir}")

# Build the settings object from the file <root_dir>/.env
from mlfs import config
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

Root dir: /home/jdowling/Projects/mlfs-book
HopsworksSettings initialized!


In [2]:
# Import `cc_trans_fg` from ccfraud.features and set its `root_dir` attribute to
# the root directory string computed in the setup cell.
# NOTE(review): the name `cc_trans_fg` is rebound to a feature group handle in a
# later cell, so after that cell runs it no longer refers to this import.
from ccfraud.features import cc_trans_fg
cc_trans_fg.root_dir = root_dir

In [3]:
import hopsworks
from datetime import datetime, timedelta
from ccfraud import synth_transactions as st
# Import the feature functions under their own alias. This cell rebinds the name
# `cc_trans_fg` to the feature group handle below; without the alias, re-running
# the cell would look up `haversine_distance` on the feature group handle
# instead of on ccfraud.features.cc_trans_fg and fail.
from ccfraud.features import cc_trans_fg as cc_trans_features
from hsfs.feature import Feature

# Lower bound on `ts` used when transactions are read in a later cell.
last_processed_date = datetime(2025, 1, 1)
current_date = datetime(2025, 10, 5)

project = hopsworks.login()
fs = project.get_feature_store()

name = "cc_trans_fg"

# Handles to existing feature groups (version 1 of each).
trans_fg = fs.get_feature_group("credit_card_transactions", version=1)
cc_fraud_fg = fs.get_feature_group("cc_fraud", version=1)
card_details_fg = fs.get_feature_group("card_details", version=1)

# Feature group handle used by later cells (`cc_trans_fg.save()`, `cc_trans_fg.insert(...)`).
cc_trans_fg = fs.get_or_create_feature_group(
    name=name,
    primary_key=["t_id"],
    online_enabled=True,
    version=1,
    event_time="ts",
    features=[
        Feature("t_id", type="bigint"),
        Feature("cc_num", type="string"),
        Feature("merchant_id", type="string"),
        Feature("account_id", type="string"),
        Feature("amount", type="double"),
        Feature("ip_address", type="string"),
        Feature("card_present", type="boolean"),
        # Feature("haversine_distance", type="bool"),
        Feature("time_since_last_trans", type="bigint"),
        Feature("days_to_card_expiry", type="bigint"),
        Feature("is_fraud", type="boolean"),
        Feature("ts", type="timestamp"),
    ],
    # Referenced through the alias so the lookup does not depend on what
    # `cc_trans_fg` currently points to in the kernel.
    transformation_functions=[cc_trans_features.haversine_distance]
    # transformation_functions=[cc_trans_features.time_since_last_trans, cc_trans_features.haversine_distance]
)

2025-12-22 15:57:22,938 INFO: Initializing external client
2025-12-22 15:57:22,940 INFO: Base URL: https://eu-west.cloud.hopsworks.ai:443
2025-12-22 15:57:24,149 INFO: Python Engine initialized.

Logged in to project, explore it here https://eu-west.cloud.hopsworks.ai:443/p/120


In [4]:
# Best-effort save of the feature group: any exception raised by save() is
# printed and not re-raised, so the notebook continues either way.
try:
    cc_trans_fg.save()
except Exception as save_error:
    print(save_error)

Feature Group created successfully, explore it at 
https://eu-west.cloud.hopsworks.ai:443/p/120/fs/68/fg/2161


In [5]:
from hsfs.feature import Feature

# Read only the transactions whose `ts` is later than last_processed_date.
trans_df = (
    trans_fg
    .filter(Feature("ts") > last_processed_date)
    .read()
)
trans_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (27.72s) 


Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts
0,0,5927-6521-3681-9492,ACC_000746,MERCH_00084,47.73,46.198.33.251,False,2025-09-12 12:08:25
1,1,5470-7369-2928-3839,ACC_007613,MERCH_00298,9.51,84.166.253.21,False,2025-09-27 16:58:55
2,2,1886-7675-4892-3257,ACC_006639,MERCH_00438,81.49,201.31.96.116,False,2025-09-06 13:14:14
3,3,6219-2498-6352-7308,ACC_009115,MERCH_00472,102.38,125.73.92.88,True,2025-09-22 16:34:55
4,4,4822-2363-5523-6248,ACC_004159,MERCH_00237,3.19,84.115.234.215,True,2025-09-10 19:45:16
...,...,...,...,...,...,...,...,...
2000195,2000195,2962-7855-7801-3806,ACC_009157,MERCH_00403,16.87,31.175.37.140,True,2025-09-05 01:04:06
2000196,2000196,4379-3469-1569-5624,ACC_005129,MERCH_00133,9.82,186.79.27.101,True,2025-09-25 20:51:21
2000197,2000197,4379-3469-1569-5624,ACC_005129,MERCH_00074,48.28,119.150.118.153,True,2025-09-25 21:45:56
2000198,2000198,5999-3887-5209-3766,ACC_000378,MERCH_00344,11.14,81.41.130.33,True,2025-09-27 13:45:01


In [6]:
# Read the cc_fraud feature group with no filter applied, then display it.
fraud_df = cc_fraud_fg.read()
fraud_df

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.94s) 


Unnamed: 0,t_id,cc_num,explanation,ts
0,2000000,4090-3058-7387-5468,Chain attack: Multiple small transactions ($16...,2025-09-10 09:47:17
1,2000001,4090-3058-7387-5468,Chain attack: Multiple small transactions ($38...,2025-09-10 09:44:58
2,2000002,4090-3058-7387-5468,Chain attack: Multiple small transactions ($38...,2025-09-10 09:54:08
3,2000003,4090-3058-7387-5468,Chain attack: Multiple small transactions ($8....,2025-09-10 09:51:54
4,2000004,4090-3058-7387-5468,Chain attack: Multiple small transactions ($6....,2025-09-10 09:49:48
...,...,...,...,...
195,2000195,2962-7855-7801-3806,Geographic fraud: Card present transaction in ...,2025-09-05 01:04:06
196,2000196,4379-3469-1569-5624,Geographic fraud: Card present transaction in ...,2025-09-25 20:51:21
197,2000197,4379-3469-1569-5624,Geographic fraud: Card present transaction in ...,2025-09-25 21:45:56
198,2000198,5999-3887-5209-3766,Geographic fraud: Card present transaction in ...,2025-09-27 13:45:01


In [7]:
# Print the column dtypes and memory usage of the transactions frame.
trans_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000200 entries, 0 to 2000199
Data columns (total 8 columns):
 #   Column        Dtype         
---  ------        -----         
 0   t_id          int64         
 1   cc_num        object        
 2   account_id    object        
 3   merchant_id   object        
 4   amount        float64       
 5   ip_address    object        
 6   card_present  bool          
 7   ts            datetime64[us]
dtypes: bool(1), datetime64[us](1), float64(1), int64(1), object(4)
memory usage: 108.7+ MB


In [8]:
# Sort by cc_num and then ts, so each card's transactions are contiguous and in
# time order for the shift-based "previous transaction" columns computed next.
trans_df = trans_df.sort_values(["cc_num", "ts"])

In [9]:
# Timestamp of the previous transaction on the SAME card.
# The shift is done per cc_num group: a plain shift over the whole frame would
# give each card's first transaction the last timestamp of the preceding card.
# The first transaction of every card gets NaT. Relies on the frame already
# being ordered by ts within each card.
trans_df["prev_ts"] = trans_df.groupby("cc_num")["ts"].shift(1)
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_ts
1266506,1266506,1001-8827-6069-2529,ACC_001690,MERCH_00310,27.63,81.85.4.110,False,2025-09-05 03:20:33,NaT
489031,489031,1001-8827-6069-2529,ACC_001690,MERCH_00455,78.50,142.38.100.49,True,2025-09-05 04:19:04,2025-09-05 03:20:33
261308,261308,1001-8827-6069-2529,ACC_001690,MERCH_00218,111.27,92.58.65.233,True,2025-09-05 07:57:46,2025-09-05 04:19:04
92227,92227,1001-8827-6069-2529,ACC_001690,MERCH_00101,43.40,78.119.130.78,False,2025-09-05 10:54:23,2025-09-05 07:57:46
52369,52369,1001-8827-6069-2529,ACC_001690,MERCH_00007,86.62,81.177.55.227,True,2025-09-05 22:27:37,2025-09-05 10:54:23
...,...,...,...,...,...,...,...,...,...
1177874,1177874,9999-7987-9453-1349,ACC_003356,MERCH_00165,402.53,88.107.166.38,False,2025-10-03 21:38:11,2025-10-03 03:11:31
1627052,1627052,9999-7987-9453-1349,ACC_003356,MERCH_00091,92.04,117.229.9.149,True,2025-10-04 00:17:12,2025-10-03 21:38:11
44145,44145,9999-7987-9453-1349,ACC_003356,MERCH_00145,37.54,85.40.62.94,True,2025-10-04 15:24:56,2025-10-04 00:17:12
665037,665037,9999-7987-9453-1349,ACC_003356,MERCH_00449,18.61,81.1.98.190,False,2025-10-04 18:42:12,2025-10-04 15:24:56


In [10]:
# card_present flag of the previous transaction on the SAME card.
# Shifted per cc_num group so the value never leaks across card boundaries;
# the first transaction of every card gets a missing value.
trans_df["prev_card_present"] = trans_df.groupby("cc_num")["card_present"].shift(1)

In [11]:
# IP address of the previous transaction on the SAME card.
# Shifted per cc_num group so the value never leaks across card boundaries;
# the first transaction of every card gets a missing value.
trans_df["prev_ip_address"] = trans_df.groupby("cc_num")["ip_address"].shift(1)

In [12]:
# Label each transaction: is_fraud is True when its t_id appears in fraud_df, else False.
# Note: there is a separate 'batch fraud pipeline' that applies fraud reports which arrive later.
trans_df["is_fraud"] = trans_df["t_id"].isin(fraud_df["t_id"])
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_ts,prev_card_present,prev_ip_address,is_fraud
1266506,1266506,1001-8827-6069-2529,ACC_001690,MERCH_00310,27.63,81.85.4.110,False,2025-09-05 03:20:33,NaT,,,False
489031,489031,1001-8827-6069-2529,ACC_001690,MERCH_00455,78.50,142.38.100.49,True,2025-09-05 04:19:04,2025-09-05 03:20:33,False,81.85.4.110,False
261308,261308,1001-8827-6069-2529,ACC_001690,MERCH_00218,111.27,92.58.65.233,True,2025-09-05 07:57:46,2025-09-05 04:19:04,True,142.38.100.49,False
92227,92227,1001-8827-6069-2529,ACC_001690,MERCH_00101,43.40,78.119.130.78,False,2025-09-05 10:54:23,2025-09-05 07:57:46,True,92.58.65.233,False
52369,52369,1001-8827-6069-2529,ACC_001690,MERCH_00007,86.62,81.177.55.227,True,2025-09-05 22:27:37,2025-09-05 10:54:23,False,78.119.130.78,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1177874,1177874,9999-7987-9453-1349,ACC_003356,MERCH_00165,402.53,88.107.166.38,False,2025-10-03 21:38:11,2025-10-03 03:11:31,False,196.109.126.150,False
1627052,1627052,9999-7987-9453-1349,ACC_003356,MERCH_00091,92.04,117.229.9.149,True,2025-10-04 00:17:12,2025-10-03 21:38:11,False,88.107.166.38,False
44145,44145,9999-7987-9453-1349,ACC_003356,MERCH_00145,37.54,85.40.62.94,True,2025-10-04 15:24:56,2025-10-04 00:17:12,True,117.229.9.149,False
665037,665037,9999-7987-9453-1349,ACC_003356,MERCH_00449,18.61,81.1.98.190,False,2025-10-04 18:42:12,2025-10-04 15:24:56,True,85.40.62.94,False


In [13]:
# Count how many rows carry each is_fraud value (class balance of the label).
trans_df['is_fraud'].value_counts()

is_fraud
False    2000000
True         200
Name: count, dtype: int64

In [14]:
from ccfraud import features

# Derive time_since_last_trans from each row's ts and prev_ts.
# NOTE(review): the displayed output (e.g. 3511 for 03:20:33 -> 04:19:04)
# suggests the unit is seconds and that a missing prev_ts maps to 0 — confirm
# against the helper's implementation.
trans_df["time_since_last_trans"] = features.cc_trans_fg.time_since_last_trans(
    trans_df["ts"],
    trans_df["prev_ts"],
)

# prev_ts was only an intermediate input, so it is dropped again.
trans_df = trans_df.drop(columns="prev_ts")
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_card_present,prev_ip_address,is_fraud,time_since_last_trans
1266506,1266506,1001-8827-6069-2529,ACC_001690,MERCH_00310,27.63,81.85.4.110,False,2025-09-05 03:20:33,,,False,0
489031,489031,1001-8827-6069-2529,ACC_001690,MERCH_00455,78.50,142.38.100.49,True,2025-09-05 04:19:04,False,81.85.4.110,False,3511
261308,261308,1001-8827-6069-2529,ACC_001690,MERCH_00218,111.27,92.58.65.233,True,2025-09-05 07:57:46,True,142.38.100.49,False,13122
92227,92227,1001-8827-6069-2529,ACC_001690,MERCH_00101,43.40,78.119.130.78,False,2025-09-05 10:54:23,True,92.58.65.233,False,10597
52369,52369,1001-8827-6069-2529,ACC_001690,MERCH_00007,86.62,81.177.55.227,True,2025-09-05 22:27:37,False,78.119.130.78,False,41594
...,...,...,...,...,...,...,...,...,...,...,...,...
1177874,1177874,9999-7987-9453-1349,ACC_003356,MERCH_00165,402.53,88.107.166.38,False,2025-10-03 21:38:11,False,196.109.126.150,False,66400
1627052,1627052,9999-7987-9453-1349,ACC_003356,MERCH_00091,92.04,117.229.9.149,True,2025-10-04 00:17:12,False,88.107.166.38,False,9541
44145,44145,9999-7987-9453-1349,ACC_003356,MERCH_00145,37.54,85.40.62.94,True,2025-10-04 15:24:56,True,117.229.9.149,False,54464
665037,665037,9999-7987-9453-1349,ACC_003356,MERCH_00449,18.61,81.1.98.190,False,2025-10-04 18:42:12,True,85.40.62.94,False,11836


In [15]:
# from collections import Counter
# features.cc_trans_fg.test_ip_resolution(trans_df['ip_address'], root_dir)

In [16]:
# trans_df['haversine_distance'] = features.cc_trans_fg.haversine_distance(trans_df['card_present'], trans_df['prev_card_present'], 
#                                                                          trans_df['ip_address'], trans_df['prev_ip_address'], 
#                                                                          trans_df['time_since_last_trans'], root_dir)



In [17]:
# Every row gets days_to_card_expiry = 0; no card expiry date is consulted here.
# NOTE(review): card_details_fg is fetched in an earlier cell but is not used in
# the visible cells — presumably this column should be derived from it; confirm.
trans_df['days_to_card_expiry'] = 0

# Disabled: would drop the helper columns before insertion.
# trans_df = trans_df.drop(
#     columns=[
#         'prev_card_present',
#         'ip_addr',
#         'prev_ip_addr',
#         'account_id',
#         'prev_ip_address',
#         'prev_ts'
#     ],
#     errors='ignore'  # avoids errors if some columns are missing
# )
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_card_present,prev_ip_address,is_fraud,time_since_last_trans,days_to_card_expiry
1266506,1266506,1001-8827-6069-2529,ACC_001690,MERCH_00310,27.63,81.85.4.110,False,2025-09-05 03:20:33,,,False,0,0
489031,489031,1001-8827-6069-2529,ACC_001690,MERCH_00455,78.50,142.38.100.49,True,2025-09-05 04:19:04,False,81.85.4.110,False,3511,0
261308,261308,1001-8827-6069-2529,ACC_001690,MERCH_00218,111.27,92.58.65.233,True,2025-09-05 07:57:46,True,142.38.100.49,False,13122,0
92227,92227,1001-8827-6069-2529,ACC_001690,MERCH_00101,43.40,78.119.130.78,False,2025-09-05 10:54:23,True,92.58.65.233,False,10597,0
52369,52369,1001-8827-6069-2529,ACC_001690,MERCH_00007,86.62,81.177.55.227,True,2025-09-05 22:27:37,False,78.119.130.78,False,41594,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177874,1177874,9999-7987-9453-1349,ACC_003356,MERCH_00165,402.53,88.107.166.38,False,2025-10-03 21:38:11,False,196.109.126.150,False,66400,0
1627052,1627052,9999-7987-9453-1349,ACC_003356,MERCH_00091,92.04,117.229.9.149,True,2025-10-04 00:17:12,False,88.107.166.38,False,9541,0
44145,44145,9999-7987-9453-1349,ACC_003356,MERCH_00145,37.54,85.40.62.94,True,2025-10-04 15:24:56,True,117.229.9.149,False,54464,0
665037,665037,9999-7987-9453-1349,ACC_003356,MERCH_00449,18.61,81.1.98.190,False,2025-10-04 18:42:12,True,85.40.62.94,False,11836,0


In [18]:
# trans_df.rename(columns={'ts': 'event_time'}, inplace=True)

In [19]:
# trans_df['haversine_distance'].value_counts()

In [20]:
# Insert the engineered transactions frame into the cc_trans_fg feature group.
# This will also apply any on-demand transformations
cc_trans_fg.insert(trans_df)

Uploading Dataframe: 100.00% |█████████████████████████████████████████████████████████████████████| Rows 2000200/2000200 | Elapsed Time: 01:12 | Remaining Time: 00:00


Launching job: cc_trans_fg_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://eu-west.cloud.hopsworks.ai:443/p/120/jobs/named/cc_trans_fg_1_offline_fg_materialization/executions


(Job('cc_trans_fg_1_offline_fg_materialization', 'PYSPARK'), None)

In [21]:
# Display trans_df again after the insert call.
trans_df

Unnamed: 0,t_id,cc_num,account_id,merchant_id,amount,ip_address,card_present,ts,prev_card_present,prev_ip_address,is_fraud,time_since_last_trans,days_to_card_expiry
1266506,1266506,1001-8827-6069-2529,ACC_001690,MERCH_00310,27.63,81.85.4.110,False,2025-09-05 03:20:33,,,False,0,0
489031,489031,1001-8827-6069-2529,ACC_001690,MERCH_00455,78.50,142.38.100.49,True,2025-09-05 04:19:04,False,81.85.4.110,False,3511,0
261308,261308,1001-8827-6069-2529,ACC_001690,MERCH_00218,111.27,92.58.65.233,True,2025-09-05 07:57:46,True,142.38.100.49,False,13122,0
92227,92227,1001-8827-6069-2529,ACC_001690,MERCH_00101,43.40,78.119.130.78,False,2025-09-05 10:54:23,True,92.58.65.233,False,10597,0
52369,52369,1001-8827-6069-2529,ACC_001690,MERCH_00007,86.62,81.177.55.227,True,2025-09-05 22:27:37,False,78.119.130.78,False,41594,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177874,1177874,9999-7987-9453-1349,ACC_003356,MERCH_00165,402.53,88.107.166.38,False,2025-10-03 21:38:11,False,196.109.126.150,False,66400,0
1627052,1627052,9999-7987-9453-1349,ACC_003356,MERCH_00091,92.04,117.229.9.149,True,2025-10-04 00:17:12,False,88.107.166.38,False,9541,0
44145,44145,9999-7987-9453-1349,ACC_003356,MERCH_00145,37.54,85.40.62.94,True,2025-10-04 15:24:56,True,117.229.9.149,False,54464,0
665037,665037,9999-7987-9453-1349,ACC_003356,MERCH_00449,18.61,81.1.98.190,False,2025-10-04 18:42:12,True,85.40.62.94,False,11836,0
