In [1]:
import hopsworks
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

import math
import pandas as pd

In [2]:
# Read the Know Your Customer data
profiles_pdf = pd.read_csv("../data_generation/kyc.csv", parse_dates=["birthdate", "cc_expiration_date"])

In [3]:
profiles_pdf

Unnamed: 0.1,Unnamed: 0,name,mail,birthdate,city,country_of_residence,cc_num,cc_provider,cc_type,cc_expiration_date
0,0,Zachary Thomas,ashley41@hotmail.com,2004-08-15,Waverly,US,9679a5dee3e743f75d83e83a819ce1e9,mastercard,credit,2028-03-01
1,1,Douglas Conley,umartin@hotmail.com,1926-02-18,Gates-North Gates,US,bb315e47787037748a8fcf8ab6d84243,visa,debit,2025-12-01
2,2,John Pace,nathaniellewis@gmail.com,1942-10-06,Odessa,US,f5a76796857fe0a2c6152f019d6fd265,mastercard,debit,2027-06-01
3,3,Michael Kirby,brianpearson@yahoo.com,1946-01-09,Pekin,US,b9ae1b2174c5333c3a2c8bab6d64c3b6,mastercard,debit,2024-04-01
4,4,Clinton Bauer,jodi08@hotmail.com,1994-02-04,Schererville,US,6799fb69ce9c3e2b94158d5307340cbd,visa,credit,2027-04-01
...,...,...,...,...,...,...,...,...,...,...
995,995,Todd Powers,rodriguezjacqueline@hotmail.com,1973-11-28,Muskogee,US,018ff07230cc505876224e941dfd6096,mastercard,credit,2025-01-01
996,996,Kelly Jordan,wigginstimothy@yahoo.com,1930-01-09,Watsonville,US,a202b33f5567fdd60362e27cd8fac112,mastercard,credit,2024-01-01
997,997,Craig Guerra,angela05@yahoo.com,1984-01-24,Far Rockaway,US,11221754bce5c6d08ce2dab1b0e4a707,mastercard,debit,2029-01-01
998,998,Elizabeth Young,wallacelisa@hotmail.com,2005-05-06,Sunnyside,US,cd1f52107719ba852554d32834b85a82,mastercard,debit,2029-04-01


In [4]:
# Keep only the needed columns

In [5]:
profiles_pdf = profiles_pdf[["cc_num", "cc_provider", "cc_type", "cc_expiration_date", "birthdate", "city"]]

# cc_num is more of an account_id rather than a proper credit card number, rename the column to avoid confusion
profiles_pdf.rename(columns={'cc_num': 'account_id'}, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [8]:
# Execution date
# This should be the current execution date during normal operation. I.e. When are the feature values computed
# However, for the sake of the example, and to make the PIT-correct join work on simulated data, 
# we pick the time of the earliest historical transaction.
execution_timestamp = pd.read_csv("../data_generation/historical_transactions.csv", parse_dates=['datetime'])['datetime'].min() 

In [9]:
# Compute the age of the credit card holder
profiles_pdf['age'] = (execution_timestamp - profiles_pdf['birthdate']).dt.days / 365
profiles_pdf.drop('birthdate', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
# Compute days to expiration
profiles_pdf['cc_expiration_days'] = (profiles_pdf['cc_expiration_date'] - pd.Timestamp.now()).dt.days
profiles_pdf.drop('cc_expiration_date', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
# The feature values are related to a specific account_id and a specific event_time
# Add event_time in the feature group DataFrame
profiles_pdf['event_time'] = execution_timestamp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
# Final profiles feature group content in DataFrame format
profiles_pdf

Unnamed: 0,account_id,cc_provider,cc_type,city,age,cc_expiration_days,event_time
0,9679a5dee3e743f75d83e83a819ce1e9,mastercard,credit,Waverly,19.194521,1344,2023-10-21 15:38:55
1,bb315e47787037748a8fcf8ab6d84243,visa,debit,Gates-North Gates,97.736986,523,2023-10-21 15:38:55
2,f5a76796857fe0a2c6152f019d6fd265,mastercard,debit,Odessa,81.095890,1070,2023-10-21 15:38:55
3,b9ae1b2174c5333c3a2c8bab6d64c3b6,mastercard,debit,Pekin,77.832877,-86,2023-10-21 15:38:55
4,6799fb69ce9c3e2b94158d5307340cbd,visa,credit,Schererville,29.728767,1009,2023-10-21 15:38:55
...,...,...,...,...,...,...,...
995,018ff07230cc505876224e941dfd6096,mastercard,credit,Muskogee,49.928767,189,2023-10-21 15:38:55
996,a202b33f5567fdd60362e27cd8fac112,mastercard,credit,Watsonville,93.843836,-177,2023-10-21 15:38:55
997,11221754bce5c6d08ce2dab1b0e4a707,mastercard,debit,Far Rockaway,39.767123,1650,2023-10-21 15:38:55
998,cd1f52107719ba852554d32834b85a82,mastercard,debit,Sunnyside,18.471233,1740,2023-10-21 15:38:55


In [13]:
# Connect to the Hopsworks Feature Store
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://snurran.hops.works/p/15479
Connected. Call `.close()` to terminate connection gracefully.


In [14]:
# Create an expectation suite to validate the profile feature data
# Convert the 'profiles_pdf' DataFrame to a Great Expectations DataFrame
ge_profiles_df = ge.from_pandas(profiles_pdf)

# Retrieve the expectation suite associated with the ge DataFrame
expectation_suite_profiles = ge_profiles_df.get_expectation_suite()

# Set the expectation suite name to "profiles_suite"
expectation_suite_profiles.expectation_suite_name = "profiles_suite"

2024-06-25 23:18:12,020 INFO: 	0 expectation(s) included in expectation_suite.


In [15]:
expectation_suite_profiles.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_distinct_values_to_be_in_set",
        kwargs={
            "column": "cc_type",
            "value_set": ['debit', 'credit'],
        }
    )
)

# Check for Nulls
expectation_suite_profiles.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_null",
            kwargs={
                "column": 'account_id',
                "mostly": 0.0,
            }
        )
    )

{"expectation_type": "expect_column_values_to_be_null", "kwargs": {"column": "account_id", "mostly": 0.0}, "meta": {}}

In [16]:
# Create the feature group metadata
profile_fg = fs.get_or_create_feature_group(
    name="profiles",
    version=1,
    description="Credit card holder demographic data",
    primary_key=['account_id'],
    event_time='event_time',
    online_enabled=True,
    expectation_suite=expectation_suite_profiles,
    statistics_config={'histograms': True, 'correlations': True}
)

In [17]:
# Set data validation option to Strict
profile_fg.expectation_suite.validation_ingestion_policy = "STRICT"

In [18]:
# Insert data into feature group
profile_fg.insert(profiles_pdf)

Feature Group created successfully, explore it at 
https://snurran.hops.works/p/15479/fs/15427/fg/16397
2024-06-25 23:18:16,705 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://snurran.hops.works/p/15479/fs/15427/fg/16397


Uploading Dataframe: 0.00% |          | Rows 0/1000 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: profiles_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.hops.works/p/15479/jobs/named/profiles_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7fdce7920d00>,
 {
   "success": true,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_column_distinct_values_to_be_in_set",
         "kwargs": {
           "column": "cc_type",
           "value_set": [
             "debit",
             "credit"
           ]
         },
         "meta": {
           "expectationId": 11265
         }
       },
       "result": {
         "observed_value": [
           "credit",
           "debit"
         ],
         "element_count": 1000,
         "missing_count": null,
         "missing_percent": null
       },
       "meta": {
         "ingestionResult": "INGESTED",
         "validationTime": "2024-06-25T11:18:16.000705Z"
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": true,
       "expectation_config": {
         "expect

In [19]:
feature_descriptions = [
    {"name": "age", "description": "Age of credit card holder"},
    {"name": "city", "description": "City of residence of the card holder"},
    {"name": "cc_expiration_days", "description": "Number of days until card expires"}
]

for desc in feature_descriptions: 
    profile_fg.update_feature_description(desc["name"], desc["description"])