In [2]:
import hopsworks
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

import math
import pandas as pd

from features.transactions import calculate_time_delta_t_minus_1, calculate_loc_delta_t_minus_1

In [3]:
# Connect to the Hopsworks Feature Store
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://pocs.cloud.hopsworks.ai/p/125
Connected. Call `.close()` to terminate connection gracefully.


In [4]:
# Read the historical transaction data
transactions_pdf = pd.read_csv("../../../../../RawData/historical_transactions.csv", parse_dates=['datetime'])

In [5]:
transactions_pdf = transactions_pdf[["tid", "datetime", "cc_num", "category", "amount", "latitude", 'longitude', 'city', 'fraud_label']]

# cc_num is more of an account_id rather than a proper credit card number, rename the column to avoid confusion
transactions_pdf.rename(columns={'cc_num': 'account_id'}, inplace=True) 

In [6]:
transactions_pdf

Unnamed: 0,tid,datetime,account_id,category,amount,latitude,longitude,city,fraud_label
0,889245b951dbe4af5e813fd3a796f690,2023-10-12 18:13:57,1fec5056a7aa1d952fde354e31fa3095,Grocery,42.51,42.425100,-71.066160,Malden,0
1,6ba02a6dbd3491939a42db9029d2efcd,2023-10-12 18:14:51,943571826a9ef8c079815b80f4870c5c,Domestic Transport,52.39,39.435340,-84.202990,Lebanon,0
2,18181617fc23876176c13e38bd35f711,2023-10-12 18:17:42,bf3296da48b23a6fba5315f90151ceab,Grocery,96.16,35.747880,-95.369690,Muskogee,0
3,d4335ac3dae3b103023e25287f250723,2023-10-12 18:44:07,a883ea3831e0a0bd58672655c586f990,Clothing,98.97,27.099780,-82.454260,Venice,0
4,35d25c232389667ff39b64d461f4ba63,2023-10-12 18:51:32,68a5869412e56139ebce00f393ba6fb5,Restaurant/Cafeteria,65.24,42.739200,-84.620810,Waverly,0
...,...,...,...,...,...,...,...,...,...
74803,330416731e4bfd843120a447478c13f7,2024-03-18 08:13:11,aca95724ff0b9ad2a73540234cae2d59,Cash Withdrawal,19.74,41.149146,-73.507012,New Canaan,0
74804,8e7d56e902f631f4f66514219ba9be48,2024-03-16 09:13:11,aca95724ff0b9ad2a73540234cae2d59,Cash Withdrawal,2.88,41.140591,-73.499096,New Canaan,0
74805,a8a4f239d25986aa2f99b3fa2635e41b,2024-03-14 10:13:11,aca95724ff0b9ad2a73540234cae2d59,Cash Withdrawal,440.57,41.143777,-73.506725,New Canaan,0
74806,1209e928850d0cc60966aaae7741864f,2024-03-12 11:13:11,aca95724ff0b9ad2a73540234cae2d59,Cash Withdrawal,0.38,41.153046,-73.510644,New Canaan,0


In [7]:
transactions_pdf['datetime'].min()

Timestamp('2023-08-14 03:08:49')

In [8]:
transactions_pdf['datetime'].max()

Timestamp('2024-04-09 17:48:28')

In [9]:
# Compute time delta from previous transaction
transactions_pdf = calculate_time_delta_t_minus_1(transactions_pdf)

In [10]:
# Compute location delta from previous transaction
transactions_pdf = calculate_loc_delta_t_minus_1(transactions_pdf)



In [11]:
# Compute feature to determine whether or not a transaction was made outside the city of residence of the user
# Retrieve user profiles information
profiles_fg = fs.get_feature_group("profiles", version=1)
profiles_pdf = profiles_fg.select(['account_id', 'city']).read()

# Join with transaction dataframe
transactions_pdf = transactions_pdf.merge(profiles_pdf, on='account_id')
transactions_pdf["outside_city"] = (transactions_pdf['city_x'] != transactions_pdf['city_y']).astype(int)

# Drop helper column
transactions_pdf.drop(['city_x', 'city_y', 'latitude', 'longitude'], axis=1, inplace=True)



Finished: Reading data from Hopsworks, using Hive (0.74s) 


In [12]:
transactions_pdf

Unnamed: 0,tid,datetime,account_id,category,amount,fraud_label,time_delta_t_minus_1,loc_delta_t_minus_1,outside_city
0,d5190d69c8566f6c4aa8ff4f77b27c4e,2023-08-14 03:08:49,f016ce59dda1ae8c8432b46724bdb18b,Cash Withdrawal,3.96,0,0.000000,0.000000,1
1,0ee904e165a00eeeeed17d8de5bb67be,2023-08-20 20:08:49,f016ce59dda1ae8c8432b46724bdb18b,Cash Withdrawal,79.01,0,6.708333,0.000063,1
2,ea68f07718f9a2dfdaf2d4064645270d,2023-08-27 13:08:49,f016ce59dda1ae8c8432b46724bdb18b,Cash Withdrawal,2518.07,0,6.708333,0.000027,1
3,7b28ed2beda5d31491633804ba1d1173,2023-09-03 06:08:49,f016ce59dda1ae8c8432b46724bdb18b,Cash Withdrawal,18.22,0,6.708333,0.000134,1
4,acf3429f123ae268fff5e6eb693f4dab,2023-09-09 23:08:49,f016ce59dda1ae8c8432b46724bdb18b,Cash Withdrawal,70.28,0,6.708333,0.000115,1
...,...,...,...,...,...,...,...,...,...
74803,d2260bc70656fe935ac3e787cb36472c,2024-04-01 04:25:38,6a14066d35c9ef416c62c4890e13bc50,Domestic Transport,14.15,0,5.289028,0.029674,1
74804,824eb005aba97cd80fd1da9974c10c26,2024-04-03 16:20:56,6a14066d35c9ef416c62c4890e13bc50,Grocery,54.54,0,2.496736,0.182812,1
74805,1aa84ba07d2c16a0c5d9c1c108f4851c,2024-04-04 14:02:51,6a14066d35c9ef416c62c4890e13bc50,Grocery,5.46,0,0.904109,0.208482,1
74806,595fa446f34513ae072c36e404ceabee,2024-04-04 16:48:05,6a14066d35c9ef416c62c4890e13bc50,Restaurant/Cafeteria,9.89,0,0.114745,0.567489,1


In [13]:
# Convert the 'trans_df' DataFrame to a Great Expectations DataFrame
ge_trans_df = ge.from_pandas(transactions_pdf)

# Retrieve the expectation suite associated with the ge DataFrame
expectation_suite = ge_trans_df.get_expectation_suite()

# Set the expectation suite name to "transactions_suite"
expectation_suite.expectation_suite_name = "transactions_suite"

# Check binary fraud_label column to be in set [0,1]
expectation_suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_distinct_values_to_be_in_set",
        kwargs={
            "column": "fraud_label",
            "value_set": [0, 1],
        }
    )
)

# Check amount column to be not negative
expectation_suite.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_between",
        kwargs={
            "column": "amount",
            "min_value": 0.0,
        }
    )
)

2024-04-11 22:08:40,657 INFO: 	0 expectation(s) included in expectation_suite.


{"kwargs": {"column": "amount", "min_value": 0.0}, "expectation_type": "expect_column_values_to_be_between", "meta": {}}

In [14]:
# Create the feature group metadata
transactions_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Credit card transaction data",
    primary_key=['tid'],
    event_time='datetime',
    online_enabled=True,
    expectation_suite=expectation_suite,
    statistics_config={'histograms': True, 'correlations': True},
    parents=[profiles_fg]
)

In [15]:
# Set data validation option to Strict
transactions_fg.expectation_suite.validation_ingestion_policy = "STRICT"

In [16]:
# Insert data into feature group
transactions_fg.insert(transactions_pdf)

Feature Group created successfully, explore it at 
https://pocs.cloud.hopsworks.ai/p/125/fs/73/fg/54
2024-04-11 22:08:46,710 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://pocs.cloud.hopsworks.ai/p/125/fs/73/fg/54


Uploading Dataframe: 0.00% |          | Rows 0/74808 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://pocs.cloud.hopsworks.ai/p/125/jobs/named/transactions_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7ffb242cb640>,
 {
   "evaluation_parameters": {},
   "meta": {
     "great_expectations_version": "0.15.12",
     "expectation_suite_name": "transactions_suite",
     "run_id": {
       "run_time": "2024-04-11T22:08:46.710364+00:00",
       "run_name": null
     },
     "batch_kwargs": {
       "ge_batch_id": "1157b8f0-f850-11ee-8e12-bea1135dd279"
     },
     "batch_markers": {},
     "batch_parameters": {},
     "validation_time": "20240411T220846.710299Z",
     "expectation_suite_meta": {
       "great_expectations_version": "0.15.12"
     }
   },
   "results": [
     {
       "result": {
         "observed_value": [
           0,
           1
         ],
         "element_count": 74808,
         "missing_count": null,
         "missing_percent": null
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       },
       "meta": {
         "ingestionResult": "INGESTED

In [17]:
# Update feature descriptions
feature_descriptions = [
    {"name": "tid", "description": "Transaction id"},
    {"name": "datetime", "description": "Transaction time"},
    {"name": "account_id", "description": "Account performing the transaction"},
    {"name": "amount", "description": "Dollar amount of the transaction"},
    {"name": "fraud_label", "description": "Whether the transaction was fraudulent or not"},
    {"name": "loc_delta_t_minus_1", "description": "Location of previous transaction"},
    {"name": "time_delta_t_minus_1", "description": "Time of previous transaction"},
    {"name": "outside_city", "description": "Whether the transaction was done outside the city of residence"},
]

for desc in feature_descriptions: 
    transactions_fg.update_feature_description(desc["name"], desc["description"])