In [1]:
import hopsworks
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

import math
import pandas as pd

from features.transactions import calculate_time_delta_t_minus_1, calculate_loc_delta_t_minus_1

In [2]:
# Connect to the Hopsworks Feature Store.
# `project` is the handle returned by the login call; `fs` is the feature store
# handle used further down to read the "profiles" feature group and to create
# the "transactions" feature group.
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://snurran.hops.works/p/15479
Connected. Call `.close()` to terminate connection gracefully.


In [3]:
# Load the historical transactions from CSV; the `datetime` column is parsed
# into timestamps at read time so it can be used for min/max and time deltas.
transactions_pdf = pd.read_csv(
    "../data_generation/historical_transactions.csv",
    parse_dates=["datetime"],
)

In [4]:
# Keep only the columns used by the rest of the notebook.
selected_columns = [
    "tid", "datetime", "cc_num", "category", "amount",
    "latitude", "longitude", "city", "fraud_label",
]

# cc_num is more of an account_id than a proper credit card number, so it is
# renamed to avoid confusion. Selection and rename are chained so the result
# is a fresh frame rather than a slice mutated in place.
transactions_pdf = transactions_pdf[selected_columns].rename(columns={"cc_num": "account_id"})

In [5]:
# Inspect the frame after column selection and the cc_num -> account_id rename
transactions_pdf

Unnamed: 0,tid,datetime,account_id,category,amount,latitude,longitude,city,fraud_label
0,4aa7ba7a5bcc3493152a33aa73a2f17f,2023-12-28 23:14:45,1766230b4f7602f856ffc82ef0a0ffd1,Grocery,78.31,36.025060,-86.779170,Brentwood Estates,0
1,1d6efad95e92a86e9671c196365277a0,2023-12-28 23:16:45,c30018ba5ca059426ca40184e3b4d833,Grocery,77.14,33.410120,-91.061770,Greenville,0
2,676cbe21703f5e05b2291369c559f71e,2023-12-28 23:21:02,818951f6e43105a228bddc51a19d8e28,Grocery,13.92,33.036990,-117.291980,Encinitas,0
3,3453b68fb35e454fcf792c4f5fddb093,2023-12-28 23:25:16,4fd1d53696a07c807e56c6af5c195430,Grocery,33.05,29.845760,-90.106740,Estelle,0
4,847e5d19ab8bcb8e91c2d174223a9ca5,2023-12-28 23:35:08,3d66cf2dc257be18dbed42327f64f753,Clothing,65.18,40.605380,-73.755130,Far Rockaway,0
...,...,...,...,...,...,...,...,...,...
71317,a08ade3a4a50207271118c89345cd036,2024-04-26 22:36:17,39ccf08351228fbc6073640bdfa58016,Cash Withdrawal,704.83,40.718469,-73.926426,Greenpoint,0
71318,12a6cdb78aa929b28b034087ecfcba26,2024-04-22 22:36:17,39ccf08351228fbc6073640bdfa58016,Cash Withdrawal,50.53,40.723734,-73.925561,Greenpoint,0
71319,d17a2f8dcab315bd778bd3996ff069fb,2024-04-18 22:36:17,39ccf08351228fbc6073640bdfa58016,Cash Withdrawal,61.86,40.722977,-73.926932,Greenpoint,0
71320,95e1e62e4a13ae964a98ebbe8409f98f,2024-04-14 22:36:17,39ccf08351228fbc6073640bdfa58016,Cash Withdrawal,412.33,40.713166,-73.921838,Greenpoint,0


In [6]:
# Earliest transaction timestamp present in the historical data
transactions_pdf['datetime'].min()

Timestamp('2023-10-21 15:38:55')

In [7]:
# Latest transaction timestamp present in the historical data
transactions_pdf['datetime'].max()

Timestamp('2024-06-25 22:55:20')

In [8]:
# Compute time delta from previous transaction
# The helper comes from features.transactions; its result replaces transactions_pdf.
# NOTE(review): presumably this adds the `time_delta_t_minus_1` column seen in the
# later display — confirm in features/transactions.py.
transactions_pdf = calculate_time_delta_t_minus_1(transactions_pdf)

In [9]:
# Compute location delta from previous transaction
# The helper comes from features.transactions; its result replaces transactions_pdf.
# NOTE(review): presumably this adds the `loc_delta_t_minus_1` column from
# latitude/longitude — confirm inputs and units in features/transactions.py.
transactions_pdf = calculate_loc_delta_t_minus_1(transactions_pdf)




In [11]:
# Compute feature to determine whether or not a transaction was made outside the city of residence of the user
# Retrieve user profiles information (only the two columns needed for the comparison)
profiles_fg = fs.get_feature_group("profiles", version=1)
profiles_pdf = profiles_fg.select(['account_id', 'city']).read()

# Join with transaction dataframe.
# Both frames carry a `city` column, so pandas suffixes them: `city_x` is the
# transaction's city and `city_y` is the city from the user's profile.
# `validate='many_to_one'` makes the merge raise if an account_id appears more
# than once in the profiles; without it such duplicates would silently duplicate
# transactions and break the uniqueness of `tid`.
transactions_pdf = transactions_pdf.merge(
    profiles_pdf,
    on='account_id',
    validate='many_to_one',
)

# 1 when the transaction city differs from the profile city, 0 otherwise
transactions_pdf["outside_city"] = (transactions_pdf['city_x'] != transactions_pdf['city_y']).astype(int)

# Drop the helper columns: both city columns and the raw coordinates are no
# longer needed once the derived features exist.
transactions_pdf = transactions_pdf.drop(columns=['city_x', 'city_y', 'latitude', 'longitude'])

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.49s) 


In [12]:
# Inspect the engineered frame: time/location deltas and outside_city are present,
# while the city and coordinate columns have been dropped
transactions_pdf

Unnamed: 0,tid,datetime,account_id,category,amount,fraud_label,time_delta_t_minus_1,loc_delta_t_minus_1,outside_city
0,0c6dc9bf4f4fc3d1750becd8666497f8,2023-10-21 15:38:55,adfd42c9cd52c042757273444091ec63,Cash Withdrawal,338.00,0,0.000000,0.000000,1
1,3ecd36103bbd750e8d2c0c3d8a31085b,2023-10-28 13:38:55,adfd42c9cd52c042757273444091ec63,Cash Withdrawal,186.69,0,6.916667,0.000085,1
2,d9a50c88c25741d79224337f357b288c,2023-11-04 11:38:55,adfd42c9cd52c042757273444091ec63,Cash Withdrawal,811.43,0,6.916667,0.000093,1
3,7e29091ac3b1efd7c8e9b250564ef2e1,2023-11-11 09:38:55,adfd42c9cd52c042757273444091ec63,Cash Withdrawal,51.54,0,6.916667,0.000113,1
4,37124bdff5219896436b907222921ea8,2023-11-18 07:38:55,adfd42c9cd52c042757273444091ec63,Cash Withdrawal,72.96,0,6.916667,0.000144,1
...,...,...,...,...,...,...,...,...,...
71317,b41df74deb2e0f452c26d0dd11960dfa,2024-06-04 22:03:04,34a9764349537470768267bcb646eb79,Health/Beauty,69.92,0,0.055694,0.604424,1
71318,2a52e7348ed101198797c1ac335fd5f4,2024-06-12 23:59:45,34a9764349537470768267bcb646eb79,Grocery,74.89,0,8.081030,0.630094,1
71319,66c2ce60e23306ab8e03489b0eab3082,2024-06-16 10:55:35,34a9764349537470768267bcb646eb79,Clothing,901.94,0,3.455440,0.355854,1
71320,9d8688756ea8190ad28f71f8450dd63e,2024-06-22 18:59:55,34a9764349537470768267bcb646eb79,Domestic Transport,75.07,0,6.336343,0.367888,1


In [13]:
# Wrap the transactions DataFrame in a Great Expectations dataset
ge_trans_df = ge.from_pandas(transactions_pdf)

# Take the expectation suite attached to that dataset and name it "transactions_suite"
expectation_suite = ge_trans_df.get_expectation_suite()
expectation_suite.expectation_suite_name = "transactions_suite"

# fraud_label is a binary flag: its distinct values must stay within [0, 1]
fraud_label_is_binary = ExpectationConfiguration(
    expectation_type="expect_column_distinct_values_to_be_in_set",
    kwargs={"column": "fraud_label", "value_set": [0, 1]},
)
expectation_suite.add_expectation(fraud_label_is_binary)

# amount must not be negative: only a lower bound is given, no upper bound
amount_is_non_negative = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_between",
    kwargs={"column": "amount", "min_value": 0.0},
)
# Left as the cell's last expression so the added configuration is displayed
expectation_suite.add_expectation(amount_is_non_negative)

2024-06-25 23:20:09,565 INFO: 	0 expectation(s) included in expectation_suite.


{"expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "amount", "min_value": 0.0}, "meta": {}}

In [14]:
# Create the feature group metadata
transactions_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Credit card transaction data",
    primary_key=['tid'],
    event_time='datetime',
    online_enabled=True,
    expectation_suite=expectation_suite,
    statistics_config={'histograms': True, 'correlations': True},
    parents=[profiles_fg]
)

In [15]:
# Set data validation option to Strict
transactions_fg.expectation_suite.validation_ingestion_policy = "STRICT"

In [16]:
# Insert data into feature group
transactions_fg.insert(transactions_pdf)

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/15479/fs/15427/fg/16398
2024-06-25 23:20:12,972 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://snurran.hops.works/p/15479/fs/15427/fg/16398


Uploading Dataframe: 0.00% |          | Rows 0/71322 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/15479/jobs/named/transactions_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7fb357cf8370>,
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_values_to_be_between",
         "kwargs": {
           "column": "amount",
           "min_value": 0.0
         },
         "meta": {
           "expectationId": 11268
         }
       },
       "result": {
         "element_count": 71322,
         "missing_count": 0,
         "missing_percent": 0.0,
         "unexpected_count": 0,
         "unexpected_percent": 0.0,
         "unexpected_percent_total": 0.0,
         "unexpected_percent_nonmissing": 0.0,
         "partial_unexpected_list": []
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-06-25T11:20:12.000971Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true

In [17]:
# Update feature descriptions
feature_descriptions = [
    {"name": "tid", "description": "Transaction id"},
    {"name": "datetime", "description": "Transaction time"},
    {"name": "account_id", "description": "Account performing the transaction"},
    {"name": "amount", "description": "Dollar amount of the transaction"},
    {"name": "fraud_label", "description": "Whether the transaction was fraudulent or not"},
    {"name": "loc_delta_t_minus_1", "description": "Location of previous transaction"},
    {"name": "time_delta_t_minus_1", "description": "Time of previous transaction"},
    {"name": "outside_city", "description": "Whether the transaction was done outside the city of residence"},
]

for desc in feature_descriptions: 
    transactions_fg.update_feature_description(desc["name"], desc["description"])