# Solutions to test yourself sections of tutorial

In [None]:
import featurebyte
from featurebyte import *
from featurebyte.config import Configurations
from featurebyte.api.data import Data
from featurebyte.api.change_view import ChangeView

# locate the Featurebyte configuration file so that it can be edited if required
config = Configurations()
print(config.config_file_path)

import pandas as pd
from datetime import datetime

# this notebook requires a featurebyte version newer than 0.1.0.dev508;
# print the installed version so that it can be checked
print(featurebyte.version)

# Tutorial: What types of data does featurebyte handle?

## Test yourself

1) What data type is each of these 6 tables?

* BANKCUSTOMER: slowly changing data
* STATEDETAILS: slowly changing data
* CREDITCARD: slowly changing data
* CARDTRANSACTIONS: event data
* CARDFRAUDSTATUS: slowly changing data
* CARDTRANSACTIONGROUPS: dimension data

2) Register each table

In [None]:
# tell Featurebyte we are using the beta testing data
# note: replace "beta_creditcard" with your own profile name
Configurations.use_profile("beta_creditcard")

# connect to the feature store and show its details
fs = FeatureStore.get("beta_user_featurestore")
fs.info()

In [None]:
# list the databases that the feature store connection can access
fs.list_databases()

In [None]:
# list the schemas inside the BETA_TESTING_DATASETS database
fs.list_schemas(database_name="BETA_TESTING_DATASETS")

In [None]:
# list the tables that are available in the CREDIT_CARD schema
display(
    fs.list_tables(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
    )
)

In [None]:
# show the column names and data types of the BANKCUSTOMER source table
display(
    fs.get_table(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
        table_name="BANKCUSTOMER",
    ).dtypes
)

In [None]:
# register BANKCUSTOMER only when no data with exactly that name exists yet;
# an exact comparison avoids the false positives of substring / regex matching
if not (Data.list().name == "BANKCUSTOMER").any():
    # register BankCustomer as slowly changing data
    BankCustomer = SlowlyChangingData.from_tabular_source(
        name="BANKCUSTOMER",
        surrogate_key_column="RowID",
        natural_key_column="BankCustomerID",
        effective_timestamp_column="ValidFrom",
        end_timestamp_column="ValidTo",
        record_creation_date_column="record_available_at",
        tabular_source=fs.get_table(
            database_name="BETA_TESTING_DATASETS",
            schema_name="CREDIT_CARD",
            table_name="BANKCUSTOMER"
        )
    )
    BankCustomer.save(conflict_resolution="retrieve")
else:
    # reuse the existing registration so that BankCustomer is always defined
    BankCustomer = SlowlyChangingData.get("BANKCUSTOMER")

In [None]:
# show the column names and data types of the STATEDETAILS source table
display(
    fs.get_table(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
        table_name="STATEDETAILS",
    ).dtypes
)

In [None]:
# register STATEDETAILS only when no data with exactly that name exists yet;
# an exact comparison avoids the false positives of substring / regex matching
if not (Data.list().name == "STATEDETAILS").any():
    # register StateDetails as slowly changing data
    StateDetails = SlowlyChangingData.from_tabular_source(
        name="STATEDETAILS",
        natural_key_column="StateCode",
        effective_timestamp_column="ValidFrom",
        end_timestamp_column="ValidTo",
        record_creation_date_column="record_available_at",
        tabular_source=fs.get_table(
            database_name="BETA_TESTING_DATASETS",
            schema_name="CREDIT_CARD",
            table_name="STATEDETAILS"
        )
    )
    StateDetails.save(conflict_resolution="retrieve")
else:
    # reuse the existing registration so that StateDetails is always defined
    StateDetails = SlowlyChangingData.get("STATEDETAILS")

In [None]:
# show the column names and data types of the CREDITCARD source table
display(
    fs.get_table(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
        table_name="CREDITCARD",
    ).dtypes
)

In [None]:
# register CREDITCARD only when no data with exactly that name exists yet;
# an exact comparison avoids the false positives of substring / regex matching
if not (Data.list().name == "CREDITCARD").any():
    # register CreditCard as slowly changing data
    CreditCard = SlowlyChangingData.from_tabular_source(
        name="CREDITCARD",
        surrogate_key_column="RowID",
        natural_key_column="AccountID",
        effective_timestamp_column="ValidFrom",
        end_timestamp_column="ValidTo",
        record_creation_date_column="record_available_at",
        tabular_source=fs.get_table(
            database_name="BETA_TESTING_DATASETS",
            schema_name="CREDIT_CARD",
            table_name="CREDITCARD"
        )
    )
    CreditCard.save(conflict_resolution="retrieve")
else:
    # reuse the existing registration so that CreditCard is always defined
    CreditCard = SlowlyChangingData.get("CREDITCARD")

In [None]:
# show the column names and data types of the CARDTRANSACTIONS source table
display(
    fs.get_table(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
        table_name="CARDTRANSACTIONS",
    ).dtypes
)

In [None]:
# register CARDTRANSACTIONS only when no data with exactly that name exists yet;
# an exact comparison avoids the false positives of substring / regex matching
if not (Data.list().name == "CARDTRANSACTIONS").any():
    # register CardTransactions as event data
    CardTransactions = EventData.from_tabular_source(
        name="CARDTRANSACTIONS",
        event_timestamp_column="Timestamp",
        event_id_column="CardTransactionID",
        record_creation_date_column="record_available_at",
        tabular_source=fs.get_table(
            database_name="BETA_TESTING_DATASETS",
            schema_name="CREDIT_CARD",
            table_name="CARDTRANSACTIONS"
        )
    )
    CardTransactions.save(conflict_resolution="retrieve")
else:
    # reuse the existing registration; the following cells need CardTransactions
    CardTransactions = EventData.get("CARDTRANSACTIONS")

In [None]:
# ask featurebyte to analyse the record creation timestamps of the card transactions
# so that a suitable feature job setting can be chosen; the result is kept in feature_job_analysis
feature_job_analysis = CardTransactions.create_new_feature_job_setting_analysis()

In [None]:
# define a more aggressive feature job setting that ignores the 0.5% of the data that arrives very late
aggressive_job_setting = FeatureJobSetting(
    blind_spot='120s',
    frequency='3600s',
    time_modulo_frequency='65s',
)

# make it the default feature job setting for the card transactions data
CardTransactions.update_default_feature_job_setting(aggressive_job_setting)

In [None]:
# show the column names and data types of the CARDFRAUDSTATUS source table
display(
    fs.get_table(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
        table_name="CARDFRAUDSTATUS",
    ).dtypes
)

In [None]:
# register CARDFRAUDSTATUS only when no data with exactly that name exists yet;
# an exact comparison avoids the false positives of substring / regex matching
if not (Data.list().name == "CARDFRAUDSTATUS").any():
    # register CardFraudStatus as slowly changing data
    CardFraudStatus = SlowlyChangingData.from_tabular_source(
        name="CARDFRAUDSTATUS",
        surrogate_key_column="RowID",
        natural_key_column="CardTransactionID",
        effective_timestamp_column="ValidFrom",
        end_timestamp_column="ValidTo",
        record_creation_date_column="record_available_at",
        tabular_source=fs.get_table(
            database_name="BETA_TESTING_DATASETS",
            schema_name="CREDIT_CARD",
            table_name="CARDFRAUDSTATUS"
        )
    )
    CardFraudStatus.save(conflict_resolution="retrieve")
else:
    # reuse the existing registration so that CardFraudStatus is always defined
    CardFraudStatus = SlowlyChangingData.get("CARDFRAUDSTATUS")

In [None]:
# show the column names and data types of the CARDTRANSACTIONGROUPS source table
display(
    fs.get_table(
        database_name="BETA_TESTING_DATASETS",
        schema_name="CREDIT_CARD",
        table_name="CARDTRANSACTIONGROUPS",
    ).dtypes
)

In [None]:
# register CARDTRANSACTIONGROUPS only when no data with exactly that name exists yet;
# an exact comparison avoids the false positives of substring / regex matching
if not (Data.list().name == "CARDTRANSACTIONGROUPS").any():
    # register CardTransactionGroups as dimension data
    CardTransactionGroups = DimensionData.from_tabular_source(
        name="CARDTRANSACTIONGROUPS",
        dimension_id_column="CardTransactionDescription",
        tabular_source=fs.get_table(
            database_name="BETA_TESTING_DATASETS",
            schema_name="CREDIT_CARD",
            table_name="CARDTRANSACTIONGROUPS"
        )
    )
    CardTransactionGroups.save(conflict_resolution="retrieve")
else:
    # reuse the existing registration so that CardTransactionGroups is always defined
    CardTransactionGroups = DimensionData.get("CARDTRANSACTIONGROUPS")

In [None]:
# list every data object that is registered, to confirm that all six tables are present
Data.list()

# Tutorial: How do I annotate data?

## Test yourself

1) Which entities are required to join the tables?

* BANKCUSTOMER
* STATE
* CREDITCARD
* CARDTRANSACTION
* CARDTRANSACTIONDESCRIPTION

2) Register each entity

In [None]:
# entity name and serving name for each entity, in registration order
entity_specs = [
    ("bankcustomer", "BANKCUSTOMERID"),
    ("creditcard", "ACCOUNTID"),
    ("cardtransaction", "CARDTRANSACTIONID"),
    ("cardtransactiondescription", "CARDTRANSACTIONDESCRIPTION"),
    ("state", "STATECODE"),
]

# create and save each entity; an already registered entity is retrieved instead
registered_entities = []
for entity_name, serving_name in entity_specs:
    new_entity = Entity(name=entity_name, serving_names=[serving_name])
    new_entity.save(conflict_resolution="retrieve")
    registered_entities.append(new_entity)

# keep one handle per entity
entity1, entity2, entity3, entity4, entity5 = registered_entities

In [None]:
# list the registered entities, to confirm that the five entities above were saved
display(Entity.list())

3) Annotate which columns represent entities

In [None]:
# fetch the registered bank customer table
bankCustomerTable = SlowlyChangingData.get('BANKCUSTOMER')

# tag each entity column with the entity that it identifies
for column_name, entity_name in [("BankCustomerID", "bankcustomer"), ("StateCode", "state")]:
    getattr(bankCustomerTable, column_name).as_entity(entity_name)

# show the table details, including the tagged entities
bankCustomerTable.info()

In [None]:
# fetch the registered credit card table
creditCardTable = SlowlyChangingData.get('CREDITCARD')

# tag each entity column with the entity that it identifies
for column_name, entity_name in [("AccountID", "creditcard"), ("BankCustomerID", "bankcustomer")]:
    getattr(creditCardTable, column_name).as_entity(entity_name)

# show the table details, including the tagged entities
creditCardTable.info()

In [None]:
# fetch the registered card transactions table
cardTransactionsTable = EventData.get('CARDTRANSACTIONS')

# tag each entity column with the entity that it identifies
card_transaction_entity_columns = [
    ("CardTransactionID", "cardtransaction"),
    ("AccountID", "creditcard"),
    ("CardTransactionDescription", "cardtransactiondescription"),
]
for column_name, entity_name in card_transaction_entity_columns:
    getattr(cardTransactionsTable, column_name).as_entity(entity_name)

# show the table details, including the tagged entities
cardTransactionsTable.info()

In [None]:
# fetch the registered card transaction groups table
cardTransactionGroupsTable = DimensionData.get('CARDTRANSACTIONGROUPS')

# the description column identifies the card transaction description entity
group_description_column = cardTransactionGroupsTable.CardTransactionDescription
group_description_column.as_entity("cardtransactiondescription")

# show the table details, including the tagged entity
cardTransactionGroupsTable.info()

In [None]:
# fetch the registered card fraud status table
cardFraudStatusTable = SlowlyChangingData.get('CARDFRAUDSTATUS')

# the transaction ID column identifies the card transaction entity
fraud_transaction_column = cardFraudStatusTable.CardTransactionID
fraud_transaction_column.as_entity("cardtransaction")

# show the table details, including the tagged entity
cardFraudStatusTable.info()

In [None]:
# fetch the registered state details table
stateDetailsTable = SlowlyChangingData.get('STATEDETAILS')

# the state code column identifies the state entity
state_code_column = stateDetailsTable.StateCode
state_code_column.as_entity("state")

# show the table details, including the tagged entity
stateDetailsTable.info()

In [None]:
# list the registered data again now that the entity columns have been tagged
Data.list()

# Tutorial: How do I view tabular data?

## Test yourself

1) Create a customer view and display sample rows

In [None]:
# fetch the registered customer data, then build a view on top of it
customer_data = SlowlyChangingData.get("BANKCUSTOMER")
customerView = SlowlyChangingView.from_slowly_changing_data(slowly_changing_data=customer_data)

# show a sample of rows from the customer view
customerView.sample()

2) Create a transaction view and display sample rows

In [None]:
# fetch the registered transaction data, then build a view on top of it
transaction_data = EventData.get("CARDTRANSACTIONS")
transactionView = EventView.from_event_data(event_data=transaction_data)

# show a sample of rows from the transaction view
transactionView.sample()

# Tutorial: How do I transform data?

## Test yourself

1) Calculate the population density for each US state

In [None]:
# build a view on top of the registered state details data
state_details_data = SlowlyChangingData.get("STATEDETAILS")
stateDetailsView = SlowlyChangingView.from_slowly_changing_data(slowly_changing_data=state_details_data)

# population density is the total population divided by the area of the state
total_population = stateDetailsView['TotalPopulation']
state_area = stateDetailsView['Area']
stateDetailsView['population_density'] = total_population / state_area

# show a sample of rows, limited to the columns involved in the calculation
density_columns = ['StateName', 'TotalPopulation', 'Area', 'population_density']
stateDetailsView[density_columns].sample()

2) Create a flag column for when repayments occur

In [None]:
# build a view on top of the registered card transactions data
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# a transaction is a repayment when its description is one of the three repayment descriptions
transaction_description = cardTransactionsView['CardTransactionDescription']
is_repayment = transaction_description == 'repayment'
is_amortization = transaction_description == 'amortization'
is_direct_debit_repayment = transaction_description == 'direct debit repayment'
cardTransactionsView['repayment_flag'] = is_repayment | is_amortization | is_direct_debit_repayment

# show a sample of 100 rows from the view
cardTransactionsView.sample(100)

# Tutorial: How do I filter data?

## Test yourself

1) Create a view of transactions showing only the fees charged

In [None]:
# build a view on top of the registered card transactions data
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# a fee is a transaction whose description contains "fee" but not "coffee"
fee_description = cardTransactionsView['CardTransactionDescription']
mentions_fee = fee_description.str.contains('fee')
mentions_coffee = fee_description.str.contains('coffee')
feeMask = mentions_fee & ~mentions_coffee

# keep only the fee transactions, as an independent copy of the view
feeView = cardTransactionsView[feeMask].copy()

# show a sample of rows from the fee view
feeView.sample()

2) Assign customers to market segments based upon their age at 31-Dec-2022:
    * under 18
    * 18-65
    * over 65

In [None]:
# build a view on top of the registered bank customer data
bank_customer_data = SlowlyChangingData.get("BANKCUSTOMER")
bankCustomerView = SlowlyChangingView.from_slowly_changing_data(slowly_changing_data=bank_customer_data)

# age reached by 31-Dec-2022 is 2022 minus the year of birth
birth_year = bankCustomerView['DateOfBirth'].dt.year
bankCustomerView['age'] = 2022 - birth_year

# masks for customers younger than 18 and customers older than 65
minorMask = bankCustomerView['age'] < 18
seniorMask = bankCustomerView['age'] > 65

# everyone starts as an adult, then the minors and seniors are relabelled
bankCustomerView['age_category'] = 'adult'
for age_mask, age_label in [(minorMask, 'minor'), (seniorMask, 'senior')]:
    bankCustomerView.age_category[age_mask] = age_label

# show a sample of rows, limited to the columns involved in the segmentation
age_columns = ['DateOfBirth', 'age', 'age_category']
bankCustomerView[age_columns].sample()

# Tutorial: How do I join data?

## Test yourself

1) Join the CreditCard view to the CardTransactions view

In [None]:
# build the credit card view
credit_card_data = SlowlyChangingData.get("CREDITCARD")
creditCardView = SlowlyChangingView.from_slowly_changing_data(slowly_changing_data=credit_card_data)

# build the card transactions view
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# add the credit card columns to the card transactions view (the join updates the view in place)
cardTransactionsView.join(creditCardView, rsuffix='_credit_card')

# show a sample of rows from the joined view
cardTransactionsView.sample()

2) Join the CardTransactions view to the CardFraudStatus view


In [None]:
# build the card transactions view
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# build the card fraud status view
card_fraud_status_data = SlowlyChangingData.get("CARDFRAUDSTATUS")
cardFraudStatusView = SlowlyChangingView.from_slowly_changing_data(slowly_changing_data=card_fraud_status_data)

# add the card transactions columns to the card fraud status view (the join updates the view in place)
cardFraudStatusView.join(cardTransactionsView, rsuffix='_card_transactions')

# show a sample of rows from the joined view
cardFraudStatusView.sample()

3) Join the CardTransactionGroups view to the CardTransactions view

In [None]:
# build the card transactions view
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# build the card transaction groups view
card_transaction_groups_data = DimensionData.get("CARDTRANSACTIONGROUPS")
cardTransactionGroupsView = DimensionView.from_dimension_data(dimension_data=card_transaction_groups_data)

# add the transaction group columns to the card transactions view (the join updates the view in place)
cardTransactionsView.join(cardTransactionGroupsView, rsuffix='_card_transaction_groups')

# show a sample of rows from the joined view
cardTransactionsView.sample()

# Tutorial: How do I aggregate data?

## Test yourself

1) Calculate the number of credit cards a customer has at a point in time

In [None]:
# build a credit card view from the table handle fetched when the entities were tagged
creditCardView = SlowlyChangingView.from_slowly_changing_data(slowly_changing_data=creditCardTable)

# a card counts as active while its closed_at value is missing
card_is_open = creditCardView['closed_at'].isnull()
activeCreditCardView = creditCardView[card_is_open]

# count the active credit cards of each customer as at the point in time
cards_by_customer = activeCreditCardView.groupby('BankCustomerID')
cardCount = cards_by_customer.aggregate_asat(
    None,
    AggFunc.COUNT,
    feature_name="card_count_by_customer_asat_now",
    backward=True,
)

2) Calculate the proportion of transactions that are negative over the past 30 days, grouped by credit card

In [None]:
# build the card transactions view
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# indicator column: 1 when the transaction amount is negative, 0 otherwise
amount_is_negative = cardTransactionsView['Amount'] < 0
cardTransactionsView['negative_transaction'] = amount_is_negative.astype(int)

# the average of the indicator is the proportion of negative transactions,
# calculated per credit card over the past 30 days
transactions_by_card = cardTransactionsView.groupby('AccountID')
negativeTransactionProportion = transactions_by_card.aggregate_over(
    'negative_transaction',
    AggFunc.AVG,
    feature_names=['negative_transactions_by_card_past_30_days'],
    windows=['30d'],
)

3) Calculate the entropy of transaction types over the past 90 days, grouped by credit card

In [None]:
# build the card transactions view
card_transactions_data = EventData.get("CARDTRANSACTIONS")
cardTransactionsView = EventView.from_event_data(event_data=card_transactions_data)

# count the transactions per description, for each credit card, over the past 90 days
descriptions_by_card = cardTransactionsView.groupby('AccountID', category='CardTransactionDescription')
cardTransactionDescriptionCount = descriptions_by_card.aggregate_over(
    None,
    AggFunc.COUNT,
    feature_names=['card_transaction_description_count_by_card_past_90_days'],
    windows=['90d'],
)

# the entropy of those counts measures how varied the transaction descriptions are
counts = cardTransactionDescriptionCount['card_transaction_description_count_by_card_past_90_days']
cardTransactionDescriptionEntropy = counts.cd.entropy()
cardTransactionDescriptionEntropy.name = 'card_transaction_description_entropy_by_card_past_90_days'

Show some sample results for these three aggregations

In [None]:
# get some customer IDs and account IDs where we know the account has been active in December 2022
mask = creditCardView.ValidTo.isnull() | (creditCardView.ValidTo.dt.year > 2022)
samples = creditCardView[mask].sample(5)

# create a feature group containing all of the aggregations
featureGroup = FeatureGroup([cardCount, negativeTransactionProportion, cardTransactionDescriptionEntropy])

# preview the feature values once per sampled row; iterating over the sampled values directly
# does not depend on the sample having exactly five rows or an index labelled 0 to 4
sample_previews = [
    featureGroup.preview(
        point_in_time_and_serving_name={
            "POINT_IN_TIME": "2023-01-01",
            "BANKCUSTOMERID": customer_id,
            "ACCOUNTID": account_id
        }
    )
    for customer_id, account_id in zip(samples['BankCustomerID'], samples['AccountID'])
]

# combine the single-row previews into one multi-row table
display(pd.concat(sample_previews))

# Tutorial: How do I declare features?

## Test yourself

This is the section of the tutorial where you apply what you have learned to a new dataset, in this case, the credit card dataset.

1) Create a feature that is the consistency of the past 30 days' transactions versus past 90 days on a credit card
2) Create a feature that is the similarity of the past 30 days' transaction groups for a credit card versus all customers

1) Create a feature that is the consistency of the past 30 days' transactions versus past 90 days on a credit card

In [None]:
# build a view of the card transactions
card_transactions_data = EventData.get("CARDTRANSACTIONS")
transactionsView = EventView.from_event_data(event_data=card_transactions_data)

# group the transactions by credit card, keeping one count per transaction description
descriptions_by_card = transactionsView.groupby("AccountID", category="CardTransactionDescription")

# declare a feature that counts the transactions per description over the past 30 days, grouped by credit card
customer_inventory30d = descriptions_by_card.aggregate_over(
    None,
    method=AggFunc.COUNT,
    windows=['30d'],
    feature_names=["CustomerInventory30d"],
)

# declare a feature that counts the transactions per description over the past 90 days, grouped by credit card
customer_inventory90d = transactionsView.groupby("AccountID", category="CardTransactionDescription").aggregate_over(
    None,
    method=AggFunc.COUNT,
    windows=['90d'],
    feature_names=["CustomerInventory90d"],
)

# How consistent is the transaction behavior on a credit card over time?
# compare the past 30 days' transaction counts with the past 90 days' counts using cosine similarity
inventory_30d = customer_inventory30d["CustomerInventory30d"]
inventory_90d = customer_inventory90d["CustomerInventory90d"]
customer_inventory_consistency = inventory_30d.cd.cosine_similarity(inventory_90d)
customer_inventory_consistency.name = "CustomerTransactionConsistency"

# get some account IDs where we know the account has been active in December 2022
transaction_timestamp = transactionsView['Timestamp'].dt
mask = (transaction_timestamp.year == 2022) & (transaction_timestamp.month == 12)
samples = transactionsView[mask].sample(5)

# create a feature group holding the two counts and their similarity
consistency = FeatureGroup([customer_inventory30d, customer_inventory90d, customer_inventory_consistency])

# preview the feature values once per sampled AccountID, then combine the previews into one table
consistency_previews = [
    consistency.preview(
        point_in_time_and_serving_name={
            "POINT_IN_TIME": "2023-01-01 00:00:00",
            "ACCOUNTID": account_id
        }
    )
    for account_id in samples['AccountID']
]
display(pd.concat(consistency_previews))

2) Create a feature that is the similarity of the past 30 days' transaction groups for a credit card versus all customers

In [None]:
# declare a feature that counts the transactions per description over the past 30 days
# across all credit cards (no grouping key)
descriptions_overall = transactionsView.groupby(by_keys=[], category="CardTransactionDescription")
all_inventory30d = descriptions_overall.aggregate_over(
    None,
    method=AggFunc.COUNT,
    windows=['30d'],
    feature_names=["AllInventory30d"],
)

# How similar is the transaction behavior on a credit card to that of all credit cards?
# compare the card's past 30 days' counts with the overall past 30 days' counts using cosine similarity
card_inventory_30d = customer_inventory30d["CustomerInventory30d"]
overall_inventory_30d = all_inventory30d["AllInventory30d"]
customer_inventory_similarity = card_inventory_30d.cd.cosine_similarity(overall_inventory_30d)
customer_inventory_similarity.name = "CustomerTransactionSimilarity"

# create a feature group holding the two counts and their similarity
similarity = FeatureGroup([customer_inventory30d, all_inventory30d, customer_inventory_similarity])

# preview the feature values once per sampled AccountID, then combine the previews into one table
similarity_previews = [
    similarity.preview(
        point_in_time_and_serving_name={
            "POINT_IN_TIME": "2023-01-01 00:00:00",
            "ACCOUNTID": account_id
        }
    )
    for account_id in samples['AccountID']
]
display(pd.concat(similarity_previews))

# Tutorial: How do I declare a target?

## Test yourself

1) Declare a target that is the total transaction amount over the next 30 days, grouped by customer

In [None]:
# work on a copy so that the transactions view itself is left unchanged
targetView = transactionsView.copy()

# join the credit card view to the copy, to get the customer ID of each transaction
customer_lookup_columns = ['BankCustomerID', 'AccountID']
targetView.join(creditCardView[customer_lookup_columns])

# show a sample of rows from the joined view
display(targetView.sample())

In [None]:
# total transaction amount per customer over a 30 day window, with 0 when there are no transactions
transactions_by_customer = targetView.groupby("BankCustomerID")
target1 = transactions_by_customer.aggregate_over(
    "Amount",
    method=AggFunc.SUM,
    windows=['30d'],
    feature_names=["Target1"],
    fill_value=0,
)

# get some customer IDs where we know at least one credit card account was active at 31-Dec-2022
valid_to = creditCardView['ValidTo']
mask = valid_to.isnull() | (valid_to.dt.year > 2022)
customer_samples = creditCardView[mask].sample(5)

# preview the target once per sampled BankCustomerID, then combine the previews into one table
target1_previews = [
    target1.preview(
        point_in_time_and_serving_name={
            "POINT_IN_TIME": "2023-01-01 00:00:00",
            "BANKCUSTOMERID": customer_id
        }
    )
    for customer_id in customer_samples['BankCustomerID']
]
display(pd.concat(target1_previews))

2) Declare a target that is whether a credit card will have an active record 30 days from now

In [None]:
# keep only the credit card records that have not been closed
mask1 = creditCardView['closed_at'].isnull()
active_card_records = creditCardView[mask1]

# every remaining record is an active card, so the flag is always 1
active_card_records['card_is_active'] = 1

# turn the flag column into a feature
active_flag_column = active_card_records.card_is_active
target2 = active_flag_column.as_feature(feature_name="Target2")

# preview the target once per sampled AccountID, then combine the previews into one table
target2_previews = [
    target2.preview(
        point_in_time_and_serving_name={
            "POINT_IN_TIME": "2023-01-01 00:00:00",
            "ACCOUNTID": account_id
        }
    )
    for account_id in samples['AccountID']
]
display(pd.concat(target2_previews))

# Tutorial: How do I align the entity level of features?

## Test yourself

1) get the entity for each of the following features
* population of the state that the customer lives in
* count of credit cards the customer has used over the past 90 days
* entropy of transaction description groups on transactions over the past 60 days, by customer
* largest single transaction over the past 30 days, by credit card

In [None]:
# join BankCustomerID to the transactions view
# note: both joins below modify transactionsView in place, so this cell should be run only once
# per transactionsView; later cells group by the BankCustomerID and TransactionGroup columns
transactionsView.join(creditCardView[['BankCustomerID', 'AccountID']])

# join transaction description groups to the transactions view, matching on the description column
transactionsView.join(cardTransactionGroupsView, on="CardTransactionDescription", how="left")

In [None]:
# total transaction amount per customer over a 30 day window, with 0 when there are no transactions
transactions_by_customer = transactionsView.groupby("BankCustomerID")
target_total_transactions = transactions_by_customer.aggregate_over(
    "Amount",
    method=AggFunc.SUM,
    windows=['30d'],
    feature_names=["Target_TotalTransactions"],
    fill_value=0,
)

# pick the single feature out of the feature group
target_total_transactions = target_total_transactions["Target_TotalTransactions"]

In [None]:
# create a feature that is the population of the state of residence of a customer
total_population_column = stateDetailsView["TotalPopulation"]
state_population = total_population_column.as_feature("StatePopulation")

# count the transactions on each credit card, for each customer, over the past 90 days
cards_by_customer = transactionsView.groupby('BankCustomerID', category='AccountID')
cardTransactionCounts = cards_by_customer.aggregate_over(
    None,
    AggFunc.COUNT,
    feature_names=['transaction_count_by_card_past_90_days'],
    windows=['90d'],
)
# the number of distinct credit cards in those counts is the number of cards the customer has used
card_counts = cardTransactionCounts['transaction_count_by_card_past_90_days']
activeCreditCardCount = card_counts.cd.unique_count(include_missing=False)
activeCreditCardCount.name = 'active_credit_card_count_by_customer_past_90_days'

# count the transactions in each transaction description group, for each customer, over the past 60 days
# note: the grouping key is BankCustomerID even though the feature names say "by_card"
groups_by_customer = transactionsView.groupby('BankCustomerID', category='TransactionGroup')
transactionGroupCount = groups_by_customer.aggregate_over(
    None,
    AggFunc.COUNT,
    feature_names=['card_transaction_group_count_by_card_past_60_days'],
    windows=['60d'],
)
# the entropy of those counts measures how varied the customer's transaction groups are
group_counts = transactionGroupCount["card_transaction_group_count_by_card_past_60_days"]
transactionGroupEntropy60d = group_counts.cd.entropy()
transactionGroupEntropy60d.name = "card_transaction_group_entropy_by_card_past_60_days"

# create a feature that is the largest transaction amount over the past 30 days by credit card
amounts_by_card = transactionsView.groupby('AccountID')
cardTransactionMax = amounts_by_card.aggregate_over(
    'Amount',
    AggFunc.MAX,
    feature_names=['transaction_max_by_card_past_30_days'],
    windows=['30d'],
)

In [None]:
# the features to save, in the order in which they are saved
features_to_save = [
    target_total_transactions,
    state_population,
    activeCreditCardCount,
    transactionGroupEntropy60d,
    cardTransactionMax['transaction_max_by_card_past_30_days'],
]

# save each feature; a feature that was already saved is retrieved instead
for feature_to_save in features_to_save:
    feature_to_save.save(conflict_resolution="retrieve")

In [None]:
# view the entities of each feature
stored_features = Feature.list()

# names of the entities that belong to the credit card use case
# (the original filter tested for 'cardtransactiond', which matches no registered entity)
credit_card_entities = ('creditcard', 'bankcustomer', 'state', 'cardtransactiondescription', 'cardtransaction')

# filter to only show the features that use at least one credit card use case entity
mask = stored_features.entities.apply(lambda x: any(entity_name in x for entity_name in credit_card_entities))
stored_features = stored_features[mask]

display(stored_features)

2) which features require aggregation?

In [None]:
# what is the entity level of the target?
# take the first entity of the first stored feature whose name contains "Target"
is_target_feature = stored_features.name.str.contains("Target")
target_feature_entities = stored_features.loc[is_target_feature].entities.values
target_entity = target_feature_entities[0][0]
print('The target entity is {}'.format(target_entity))

In [None]:
# which features have entities other than the target entity?
def has_other_entities(x, whitelisted):
    """Return True when x contains at least one entity other than whitelisted.

    x is an iterable of entity names and whitelisted is the single entity name
    that is allowed. An empty x gives False.
    """
    return any(entity != whitelisted for entity in x)
# keep the rows whose entities include anything other than the target entity;
# these are the features that need to be aggregated to the target entity level
stored_features.loc[[has_other_entities(x, target_entity) for x in stored_features.entities.values]]

3) apply MAX aggregation to all features that require aggregation

In [None]:
# create a feature that is the largest transaction amount over the past 30 days by customer
amounts_by_customer = transactionsView.groupby('BankCustomerID')
customerTransactionMax = amounts_by_customer.aggregate_over(
    'Amount',
    AggFunc.MAX,
    feature_names=['transaction_max_by_customer_past_30_days'],
    windows=['30d'],
)

# save the feature; a feature that was already saved is retrieved instead
customer_max_feature = customerTransactionMax['transaction_max_by_customer_past_30_days']
customer_max_feature.save(conflict_resolution="retrieve")

In [None]:
# view the entities of each feature
stored_features = Feature.list()

# names of the entities that belong to the credit card use case
# (the original filter tested for 'cardtransactiond', which matches no registered entity)
credit_card_entities = ('creditcard', 'bankcustomer', 'state', 'cardtransactiondescription', 'cardtransaction')

# filter to only show the features that use at least one credit card use case entity
mask = stored_features.entities.apply(lambda x: any(entity_name in x for entity_name in credit_card_entities))
stored_features = stored_features[mask]

display(stored_features)

# Tutorial: How do I declare a feature list?

## Test yourself

Build a feature list that combines these four features:
* population of the state that the customer lives in
* count of credit cards the customer has used over the past 90 days
* entropy of transaction description groups on transactions over the past 60 days, by customer
* largest single transaction over the past 30 days, by customer

In [None]:
# the four customer level features that make up the feature list
feature_list_name = 'Beta testing tutorial test yourself feature list'
feature_list_members = [
    state_population,
    activeCreditCardCount,
    transactionGroupEntropy60d,
    customerTransactionMax,
]
new_feature_list = FeatureList(feature_list_members, name=feature_list_name)

In [None]:
# list every registered entity
entities = Entity.list()

# filter to only show the credit card use case entities,
# by dropping entities whose name mentions one of the other tutorial datasets
other_dataset_words = ('french', 'grocery')
mask = entities.name.apply(lambda x: not any(word in x for word in other_dataset_words))
entities = entities[mask]

display(entities)

In [None]:
# save the feature list into the feature store; conflict_resolution="retrieve" is the same
# option used for every other save in this notebook
new_feature_list.save(conflict_resolution="retrieve")

In [None]:
# list all feature lists in the feature store, to confirm that the new feature list was saved
FeatureList.list()

# Tutorial: How do I change the status of a feature list?

## Test yourself

1) set each feature readiness to PRODUCTION_READY

In [None]:
# mark every feature in the feature list as production ready
for feat in new_feature_list.feature_objects.values():
    feat.update_readiness('PRODUCTION_READY')

2) set the feature list status to PUBLISHED

In [None]:
# set the status of the feature list to PUBLISHED
new_feature_list.update_status('PUBLISHED')

In [None]:
# list all feature lists in the feature store, to check the updated status
FeatureList.list()

# Tutorial: How do I create training data?

## Test yourself

This is the section of the tutorial where you apply what you have learned to a new dataset, in this case, the credit card dataset.

Materialize your credit card dataset feature list from the observation set file credit_card_observation_set.csv 

In [None]:
# display all feature lists in the feature store, to find the feature list to materialize
display(FeatureList.list())

# display all of the entities in the feature store, to see which serving names the observation set needs
display(Entity.list())

In [None]:
# load a pre-built observation set matching the defined context
observation_set = pd.read_csv("credit_card_observation_set.csv")

# use one timezone-naive point in time, 1-Dec-2022, for every observation; this replaces the
# POINT_IN_TIME values read from the file, so parsing them first would be discarded work
observation_set["POINT_IN_TIME"] = pd.to_datetime("2022-12-01", utc=True).tz_localize(None)
display(observation_set)

In [None]:
# materialize the feature values of the feature list for every row of the observation set
training_data = new_feature_list.get_historical_features(observation_set)
display(training_data)

Prepare the target for inclusion in the training data

In [None]:
# create a version of the observation set where the point in time is moved forwards 30 days;
# the original point in time is kept in OLD_POINT_IN_TIME so that it can be restored later
original_point_in_time = observation_set["POINT_IN_TIME"]
target_observation_set = observation_set.assign(
    OLD_POINT_IN_TIME=original_point_in_time,
    POINT_IN_TIME=original_point_in_time + pd.Timedelta(days=30),
)
display(target_observation_set)

In [None]:
# save the target1 feature group; a feature that was already saved is retrieved instead
target1.save(conflict_resolution="retrieve")

# create a feature list that contains only target1
target_feature_list_name = 'Beta testing tutorial test yourself target feature list'
target_feature_list = FeatureList([target1], name=target_feature_list_name)

# save the feature list into the feature store
target_feature_list.save(conflict_resolution="retrieve")

# mark every feature in the feature list as production ready
for feat in target_feature_list.feature_objects.values():
    feat.update_readiness('PRODUCTION_READY')

# set the status of the feature list to PUBLISHED
target_feature_list.update_status('PUBLISHED')

In [None]:
# materialize the target at the shifted points in time, then remove the STATECODE column
target_values = target_feature_list.get_historical_features(target_observation_set)
target_training_data = target_values.drop(columns=['STATECODE'])
display(target_training_data)

In [None]:
# put the original point in time back, then remove the helper column that carried it
target_training_data = (
    target_training_data
    .assign(POINT_IN_TIME=target_training_data['OLD_POINT_IN_TIME'])
    .drop(columns=['OLD_POINT_IN_TIME'])
)

In [None]:
# show the target values now that POINT_IN_TIME holds the original observation time again
display(target_training_data)

In [None]:
# add the target to the training data, matching rows on customer and point in time;
# a left merge keeps every row of the training data
merge_keys = ['BANKCUSTOMERID', 'POINT_IN_TIME']
training_data = training_data.merge(
    target_training_data,
    how='left',
    on=merge_keys,
)
display(training_data)

# Tutorial: How do I serve features?

## Test yourself

1) deploy your feature list

In [None]:
# deploy the new feature list, asking featurebyte to make it production ready as part of the deployment
new_feature_list.deploy(enable=True, make_production_ready=True)

In [None]:
# show the deployed feature lists whose name contains 'test yourself'
feature_lists = (
    FeatureList.list()
    .loc[lambda listing: listing.deployed == True]
    .loc[lambda listing: listing.name.str.contains('test yourself')]
)
display(feature_lists)

In [None]:
# show the online features
stored_features = Feature.list()
stored_features = stored_features[stored_features.online_enabled == True]

# names of the entities that belong to the credit card use case
# (the original filter tested for 'cardtransactiond', which matches no registered entity)
credit_card_entities = ('creditcard', 'bankcustomer', 'state', 'cardtransactiondescription', 'cardtransaction')

# filter to only show the features that use at least one credit card use case entity
mask = stored_features.entities.apply(lambda x: any(entity_name in x for entity_name in credit_card_entities))
stored_features = stored_features[mask]

display(stored_features)

In [None]:
# show the detailed information for the deployed feature list
new_feature_list.info(verbose=True)

2) call the API with BankCustomerID = 'a07d302c-17bb-4f57-8272-1e3be8795ad9' and StateCode = 'AK'

In [None]:
# list the features in the feature list; after deployment each one should be online_enabled
new_feature_list.list_features()

In [None]:
# get a Python template for consuming the feature serving API; its output is pasted into the next code cell
new_feature_list.get_online_serving_code(language="python")

Copy the online serving code that was generated above, paste it into the cell below, then run it

In [None]:
# replace the contents of this Python code cell with the output from new_feature_list.get_online_serving_code(language="python")

In [None]:
# show every feature list that is currently deployed
feature_lists = FeatureList.list().loc[lambda listing: listing.deployed == True]
display(feature_lists)

In [None]:
# disable the feature list deployment now that the exercise is finished
new_feature_list.deploy(enable=False)