In [1]:
import hopsworks
import great_expectations as ge
from great_expectations.core import ExpectationSuite, ExpectationConfiguration

import math
import pandas as pd

In [17]:
# Read the Know Your Customer data
profiles_pdf = pd.read_csv("../../../../../RawData/kyc.csv", parse_dates=["birthdate", "cc_expiration_date"])

In [18]:
profiles_pdf

Unnamed: 0.1,Unnamed: 0,name,mail,birthdate,city,country_of_residence,cc_num,cc_provider,cc_type,cc_expiration_date
0,0,Nichole Hines,susan63@yahoo.com,1967-07-14,Waverly,US,f8da05fb50092668f9ebcd9966cc272a,visa,debit,2025-07-01
1,1,Micheal Gordon,janet62@yahoo.com,1997-11-28,Port Huron,US,6d65aed0b44c5a3e0e7ee1b260a2dff4,visa,debit,2024-06-01
2,2,Michael Higgins,debbiewolf@yahoo.com,1963-06-03,Norco,US,932044f672047bc5ebe875a736a0c449,visa,debit,2027-08-01
3,3,Melanie Miller,erikperez@hotmail.com,1967-08-21,Indio,US,44c193e0045052fa5c17987ee1ecb6d4,mastercard,debit,2028-01-01
4,4,William Sanders,randy39@yahoo.com,1925-08-23,Hutchinson,US,f016ce59dda1ae8c8432b46724bdb18b,visa,credit,2027-03-01
...,...,...,...,...,...,...,...,...,...,...
995,995,Dylan Harvey,rmathews@gmail.com,1989-04-19,Bon Air,US,5445bafdac9defb83b1d73cfd12d437f,visa,debit,2026-11-01
996,996,Stephanie Washington,wwilson@hotmail.com,1975-02-26,Bon Air,US,491bca9c2e6d8b34d441185f869894e0,visa,credit,2027-05-01
997,997,Jose Ruiz,susanharrington@gmail.com,1938-02-08,West Chester,US,bfbcc52361f32756d04a10129e1d5d5e,visa,credit,2024-02-01
998,998,Mrs. Melissa Gomez,emilymontgomery@yahoo.com,2005-07-24,Santa Maria,US,c6f0690434b75f2cb05c1b03046d8ffa,mastercard,debit,2025-02-01


In [19]:
# Keep only the needed columns

In [20]:
profiles_pdf = profiles_pdf[["cc_num", "cc_provider", "cc_type", "cc_expiration_date", "birthdate", "city"]]

# cc_num is more of an account_id rather than a proper credit card number, rename the column to avoid confusion
profiles_pdf.rename(columns={'cc_num': 'account_id'}, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
# Execution date
# This should be the current execution date during normal operation. I.e. When are the feature values computed
# However, for the sake of the example, and to make the PIT-correct join work on simulated data, 
# we pick the time of the earliest historical transaction.
execution_timestamp = pd.read_csv("../../../../../RawData/historical_transactions.csv", parse_dates=['datetime'])['datetime'].min() 

In [21]:
# Compute the age of the credit card holder
profiles_pdf['age'] = (execution_timestamp - profiles_pdf['birthdate']).dt.days / 365
profiles_pdf.drop('birthdate', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [22]:
# Compute days to expiration
profiles_pdf['cc_expiration_days'] = (profiles_pdf['cc_expiration_date'] - pd.Timestamp.now()).dt.days
profiles_pdf.drop('cc_expiration_date', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
# The feature values are related to a specific account_id and a specific event_time
# Add event_time in the feature group DataFrame
profiles_pdf['event_time'] = execution_timestamp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [25]:
# Final profiles feature group content in DataFrame format
profiles_pdf

Unnamed: 0,account_id,cc_provider,cc_type,city,age,cc_expiration_days,event_time
0,f8da05fb50092668f9ebcd9966cc272a,visa,debit,Waverly,56.123288,445,2023-08-14 03:08:49
1,6d65aed0b44c5a3e0e7ee1b260a2dff4,visa,debit,Port Huron,25.726027,50,2023-08-14 03:08:49
2,932044f672047bc5ebe875a736a0c449,visa,debit,Norco,60.238356,1206,2023-08-14 03:08:49
3,44c193e0045052fa5c17987ee1ecb6d4,mastercard,debit,Indio,56.019178,1359,2023-08-14 03:08:49
4,f016ce59dda1ae8c8432b46724bdb18b,visa,credit,Hutchinson,98.041096,1053,2023-08-14 03:08:49
...,...,...,...,...,...,...,...
995,5445bafdac9defb83b1d73cfd12d437f,visa,debit,Bon Air,34.342466,933,2023-08-14 03:08:49
996,491bca9c2e6d8b34d441185f869894e0,visa,credit,Bon Air,48.495890,1114,2023-08-14 03:08:49
997,bfbcc52361f32756d04a10129e1d5d5e,visa,credit,West Chester,85.569863,-71,2023-08-14 03:08:49
998,c6f0690434b75f2cb05c1b03046d8ffa,mastercard,debit,Santa Maria,18.068493,295,2023-08-14 03:08:49


In [26]:
# Connect to the Hopsworks Feature Store
project = hopsworks.login()
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://pocs.cloud.hopsworks.ai/p/125
Connected. Call `.close()` to terminate connection gracefully.


In [27]:
# Create an expectation suite to validate the profile feature data
# Convert the 'profiles_pdf' DataFrame to a Great Expectations DataFrame
ge_profiles_df = ge.from_pandas(profiles_pdf)

# Retrieve the expectation suite associated with the ge DataFrame
expectation_suite_profiles = ge_profiles_df.get_expectation_suite()

# Set the expectation suite name to "profiles_suite"
expectation_suite_profiles.expectation_suite_name = "profiles_suite"

2024-04-11 19:42:35,159 INFO: 	0 expectation(s) included in expectation_suite.


In [28]:
expectation_suite_profiles.add_expectation(
    ExpectationConfiguration(
        expectation_type="expect_column_distinct_values_to_be_in_set",
        kwargs={
            "column": "cc_type",
            "value_set": ['debit', 'credit'],
        }
    )
)

# Check for Nulls
expectation_suite_profiles.add_expectation(
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_null",
            kwargs={
                "column": 'account_id',
                "mostly": 0.0,
            }
        )
    )

{"kwargs": {"column": "account_id", "mostly": 0.0}, "expectation_type": "expect_column_values_to_be_null", "meta": {}}

In [29]:
# Create the feature group metadata
profile_fg = fs.get_or_create_feature_group(
    name="profiles",
    version=1,
    description="Credit card holder demographic data",
    primary_key=['account_id'],
    event_time='event_time',
    online_enabled=True,
    expectation_suite=expectation_suite_profiles,
    statistics_config={'histograms': True, 'correlations': True}
)

In [30]:
# Set data validation option to Strict
profile_fg.expectation_suite.validation_ingestion_policy = "STRICT"

In [31]:
# Insert data into feature group
profile_fg.insert(profiles_pdf)

Feature Group created successfully, explore it at 
https://pocs.cloud.hopsworks.ai/p/125/fs/73/fg/51
2024-04-11 19:42:38,124 INFO: 	2 expectation(s) included in expectation_suite.
Validation succeeded.
Validation Report saved successfully, explore a summary at https://pocs.cloud.hopsworks.ai/p/125/fs/73/fg/51


Uploading Dataframe: 0.00% |          | Rows 0/1000 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: profiles_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://pocs.cloud.hopsworks.ai/p/125/jobs/named/profiles_1_offline_fg_materialization/executions


(<hsfs.core.job.Job at 0x7f26ec70fe50>,
 {
   "evaluation_parameters": {},
   "meta": {
     "great_expectations_version": "0.15.12",
     "expectation_suite_name": "profiles_suite",
     "run_id": {
       "run_time": "2024-04-11T19:42:38.124617+00:00",
       "run_name": null
     },
     "batch_kwargs": {
       "ge_batch_id": "a6dbb800-f83b-11ee-ad3c-bea1135dd279"
     },
     "batch_markers": {},
     "batch_parameters": {},
     "validation_time": "20240411T194238.124548Z",
     "expectation_suite_meta": {
       "great_expectations_version": "0.15.12"
     }
   },
   "results": [
     {
       "result": {
         "element_count": 1000,
         "unexpected_count": 1000,
         "unexpected_percent": 100.0,
         "unexpected_percent_total": 100.0,
         "partial_unexpected_list": []
       },
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       },
       "meta": {
         "ingestion

In [32]:
feature_descriptions = [
    {"name": "age", "description": "Age of credit card holder"},
    {"name": "city", "description": "City of residence of the card holder"},
    {"name": "cc_expiration_days", "description": "Number of days until card expires"}
]

for desc in feature_descriptions: 
    profile_fg.update_feature_description(desc["name"], desc["description"])