In [1]:
import sys
from pathlib import Path


def is_google_colab() -> bool:
    if "google.colab" in str(get_ipython()):
        return True
    return False


def clone_repository() -> None:
    !git clone https://github.com/featurestorebook/mlfs-book.git
    %cd mlfs-book


def install_dependencies() -> None:
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml


if is_google_colab():
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")
else:
    root_dir = Path().absolute()
    # Strip ~/notebooks/ccfraud from PYTHON_PATH if notebook started in one of these subdirectories
    if root_dir.parts[-1:] == ('ccfraud',):
        root_dir = Path(*root_dir.parts[:-1])
    if root_dir.parts[-1:] == ('notebooks',):
        root_dir = Path(*root_dir.parts[:-1])
    root_dir = str(root_dir) 
    print("Local environment")

# Add the root directory to the `PYTHONPATH` to use the `recsys` Python module from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

⛳️ Local environment
Adding the following directory to the PYTHONPATH: /home/jdowling/Projects/mlfs-book/notebooks/ch06/cc-fraud


In [2]:
import hopsworks
import pandas as pd
from datetime import datetime
from datetime import timedelta
import polars as pl
from hopsworks.hsfs.feature import Feature
import mlfs.ccfraud.features
from mlfs.config import settings

fs = hopsworks.login(api_key_value=settings.HOPSWORKS_API_KEY.get_secret_value()).get_feature_store()

accounts_fg = fs.get_feature_group("account_details", version=1)
merchants_fg = fs.get_feature_group("merchant_details", version=1)
banks_fg = fs.get_feature_group("bank_details", version=1)
cards_fg = fs.get_feature_group("card_details", version=1)
transactions_fg = fs.get_feature_group("credit_card_transactions", version=1)
cc_fraud_fg = fs.get_feature_group("cc_fraud", version=1)

2024-12-30 22:49:25,903 INFO: Initializing external client
2024-12-30 22:49:25,903 INFO: Base URL: https://c.app.hopsworks.ai:443
2024-12-30 22:49:27,264 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/17565


In [3]:
# For full BACKFILL, change backfill=30
backfill=30
end_time = datetime.now() + timedelta(days=7)
start_time = end_time - timedelta(days=backfill)

In [4]:
banks_df = banks_fg.read()

/arrow/cpp/src/arrow/status.cc:137: DoAction result was not fully consumed: Cancelled: Flight cancelled call, with message: CANCELLED. Detail: Cancelled


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.06s) 


In [5]:
def df_by_ts(fg, ts_column, start_time, end_time):
    return fg.filter((Feature(ts_column) > start_time) & (Feature(ts_column) < end_time)).read(dataframe_type="polars") 

In [6]:
transactions_df = df_by_ts(transactions_fg, "ts", start_time, end_time)
print(transactions_df)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (6.71s) 
shape: (510_000, 7)
┌─────────────────┬────────┬─────────────┬────────┬────────────────┬────────────────┬──────────────┐
│ t_id            ┆ cc_num ┆ merchant_id ┆ amount ┆ ip_address     ┆ ts             ┆ card_present │
│ ---             ┆ ---    ┆ ---         ┆ ---    ┆ ---            ┆ ---            ┆ ---          │
│ str             ┆ str    ┆ str         ┆ f64    ┆ str            ┆ datetime[μs,   ┆ bool         │
│                 ┆        ┆             ┆        ┆                ┆ Etc/UTC]       ┆              │
╞═════════════════╪════════╪═════════════╪════════╪════════════════╪════════════════╪══════════════╡
│ 6496e803-4aec-4 ┆ 4356   ┆ 1368        ┆ 123.69 ┆ 115.90.194.134 ┆ 2024-12-18 06: ┆ false        │
│ cf5-9f0a-b6f851 ┆        ┆             ┆        ┆ /32            ┆ 42:00.058492   ┆              │
│ …               ┆        ┆             ┆        ┆                ┆ UTC            ┆ 

In [7]:
transactions_df.count()

t_id,cc_num,merchant_id,amount,ip_address,ts,card_present
u32,u32,u32,u32,u32,u32,u32
510000,510000,510000,510000,510000,510000,510000


In [8]:
transactions_df.head()

t_id,cc_num,merchant_id,amount,ip_address,ts,card_present
str,str,str,f64,str,"datetime[μs, Etc/UTC]",bool
"""6496e803-4aec-4cf5-9f0a-b6f851…","""4356""","""1368""",123.69,"""115.90.194.134/32""",2024-12-18 06:42:00.058492 UTC,False
"""1a74ceab-ca03-4667-8707-9c7adb…","""5244""","""1250""",66.62,"""154.178.147.0""",2024-12-18 23:23:48.058492 UTC,False
"""e4f25e22-d9ac-4e6e-98d4-af53fe…","""593""","""110""",113.86,"""73.167.241.177""",2024-12-23 22:34:14.058492 UTC,False
"""bf1691fe-02ee-4582-bb59-772ecb…","""6914""","""1990""",192.46,"""77.198.32.44""",2024-12-17 02:18:40.058492 UTC,False
"""2d360a07-4bf9-4b76-b398-9c661b…","""8137""","""451""",164.67,"""109.32.0.0/11""",2024-12-20 02:36:53.058492 UTC,False


In [9]:
fraud_reports_df = cc_fraud_fg.read(dataframe_type="polars")
fraud_reports_df.head()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.36s) 


report_id,t_id,report_time,fraud_type
str,str,"datetime[μs, Etc/UTC]",str
"""30fa98fd-b245-4eac-a6a2-7452ba…","""27d1491e-1d80-46f9-8698-45c726…",2024-12-26 23:25:08.058492 UTC,"""Card Not Present"""
"""e65aa076-b721-44f9-b913-0978e7…","""1ed41244-e91f-402d-b5b1-b7afd6…",2024-12-27 02:23:21.058492 UTC,"""Skimming"""
"""cc4fb7cb-346c-44b4-9094-3e81d9…","""1f302bc7-47dc-4c35-adf3-1f3e5c…",2024-12-26 23:05:32.058492 UTC,"""Lost/Stolen"""
"""b03362f6-4b68-4f59-ac1a-e98bc6…","""53fcffaf-a93f-495e-8df2-e9e456…",2024-12-27 17:22:08.058492 UTC,"""Card Not Present"""
"""0db8d35f-d19a-4632-a058-099fea…","""ea37e9a1-c0d7-484c-8cee-feb547…",2024-12-26 21:41:40.058492 UTC,"""Lost/Stolen"""


In [10]:
fraud_reports_df.count()

report_id,t_id,report_time,fraud_type
u32,u32,u32,u32
10000,10000,10000,10000


In [11]:
labels_df = transactions_df.join(fraud_reports_df, on="t_id", how="left")

# Add the "is_fraud" column based on "fraud_type"
labels_df = labels_df.with_columns(
    (pl.col("fraud_type").is_not_null().cast(pl.Int8)).alias("is_fraud")
)

# Drop columns that are not needed
labels_df = labels_df.drop(["report_id", "report_time", "fraud_type"])
labels_df.head()

t_id,cc_num,merchant_id,amount,ip_address,ts,card_present,is_fraud
str,str,str,f64,str,"datetime[μs, Etc/UTC]",bool,i8
"""6496e803-4aec-4cf5-9f0a-b6f851…","""4356""","""1368""",123.69,"""115.90.194.134/32""",2024-12-18 06:42:00.058492 UTC,False,0
"""1a74ceab-ca03-4667-8707-9c7adb…","""5244""","""1250""",66.62,"""154.178.147.0""",2024-12-18 23:23:48.058492 UTC,False,0
"""e4f25e22-d9ac-4e6e-98d4-af53fe…","""593""","""110""",113.86,"""73.167.241.177""",2024-12-23 22:34:14.058492 UTC,False,0
"""bf1691fe-02ee-4582-bb59-772ecb…","""6914""","""1990""",192.46,"""77.198.32.44""",2024-12-17 02:18:40.058492 UTC,False,0
"""2d360a07-4bf9-4b76-b398-9c661b…","""8137""","""451""",164.67,"""109.32.0.0/11""",2024-12-20 02:36:53.058492 UTC,False,0


In [12]:
labels_df.count()

t_id,cc_num,merchant_id,amount,ip_address,ts,card_present,is_fraud
u32,u32,u32,u32,u32,u32,u32,u32
510000,510000,510000,510000,510000,510000,510000,510000


In [13]:
# Group by "is_fraud" and count occurrences
grouped_df = labels_df.group_by("is_fraud").agg(
    pl.count().alias("count")  # Count all rows in each group
)

# Display the result
print(grouped_df)


shape: (2, 2)
┌──────────┬────────┐
│ is_fraud ┆ count  │
│ ---      ┆ ---    │
│ i8       ┆ u32    │
╞══════════╪════════╡
│ 1        ┆ 10000  │
│ 0        ┆ 500000 │
└──────────┴────────┘


In [14]:
print(labels_df.describe)

<bound method DataFrame.describe of shape: (510_000, 8)
┌─────────────┬────────┬─────────────┬────────┬─────────────┬─────────────┬─────────────┬──────────┐
│ t_id        ┆ cc_num ┆ merchant_id ┆ amount ┆ ip_address  ┆ ts          ┆ card_presen ┆ is_fraud │
│ ---         ┆ ---    ┆ ---         ┆ ---    ┆ ---         ┆ ---         ┆ t           ┆ ---      │
│ str         ┆ str    ┆ str         ┆ f64    ┆ str         ┆ datetime[μs ┆ ---         ┆ i8       │
│             ┆        ┆             ┆        ┆             ┆ , Etc/UTC]  ┆ bool        ┆          │
╞═════════════╪════════╪═════════════╪════════╪═════════════╪═════════════╪═════════════╪══════════╡
│ 6496e803-4a ┆ 4356   ┆ 1368        ┆ 123.69 ┆ 115.90.194. ┆ 2024-12-18  ┆ false       ┆ 0        │
│ ec-4cf5-9f0 ┆        ┆             ┆        ┆ 134/32      ┆ 06:42:00.05 ┆             ┆          │
│ a-b6f851…   ┆        ┆             ┆        ┆             ┆ 8492 UTC    ┆             ┆          │
│ 1a74ceab-ca ┆ 5244   ┆ 1250      

In [None]:
# def fraud_rate_by_num_days(col, df, days):
#     """
#     Computes the fraud_rate.
#     """
#     day = 86400

#     # Define a window that looks at the last 30 days for each merchant, sliding 1 day at a time
#     window_spec = Window.partitionBy(f"{col}").orderBy(F.col("transaction_time").cast("long")).rangeBetween(-days * day, 0)

#     # Compute total transactions and fraud transactions within the 7-day window
#     df = df.withColumn(f"total_transactions_{days}d", F.count("t_id").over(window_spec)) \
#            .withColumn(f"fraud_transactions_{days}d", F.sum("is_fraud").over(window_spec)) \
#            .withColumn(f"{col}_fraud_rate_last_{days}_days", F.col(f"fraud_transactions_{days}d") / F.col(f"total_transactions_{days}d"))
    
#     return df.drop(f"total_transactions_{days}d", f"fraud_transactions_{days}d")
    


In [15]:
import polars as pl
from datetime import timedelta

def fraud_rate_by_num_days(col, df, ts_column, days):
    """
    Computes the fraud_rate over a sliding window of `days` days.
    """
    # Ensure the transaction_time column is in datetime format
    df = df.with_columns(
        pl.col(ts_column).cast(pl.Datetime)
    )

    df = df.sort(ts_column)

    # Calculate the rolling window aggregations
    rolling_df = (
        df.rolling(
            index_column=ts_column,
            period=f"{days}d",
            group_by=col,  # Group by the specified column
            closed="right"
        )
        .agg([
            pl.count("t_id").alias(f"total_transactions_{days}d"),
            pl.sum("is_fraud").alias(f"fraud_transactions_{days}d"),
        ])
    )

    # Compute fraud rate
    rolling_df = rolling_df.with_columns(
        (
            pl.col(f"fraud_transactions_{days}d") / pl.col(f"total_transactions_{days}d")
        ).alias(f"{col}_fraud_rate_last_{days}_days")
    )

    # Drop intermediate columns
    rolling_df = rolling_df.drop(
        [f"total_transactions_{days}d", f"fraud_transactions_{days}d"]
    )

    return rolling_df


In [16]:
merchant_df = labels_df
merchant_df = fraud_rate_by_num_days("merchant_id", merchant_df, "ts", 3)
# merchant_df = merchant_df.drop(["t_id", "customer_id", "card_id", "location", "amount", "transaction_category_name", "is_fraud"])
merchant_df.head()

merchant_id,ts,merchant_id_fraud_rate_last_3_days
str,datetime[μs],f64
"""42""",2024-12-16 19:45:48.058492,0.0
"""42""",2024-12-16 21:54:25.058492,0.0
"""42""",2024-12-17 00:34:08.058492,0.0
"""42""",2024-12-17 02:18:17.058492,0.0
"""42""",2024-12-17 02:42:19.058492,0.0


In [17]:
merchant_df.count()

merchant_id,ts,merchant_id_fraud_rate_last_3_days
u32,u32,u32
510000,510000,510000


In [None]:
# merchant_df = merchant_df.withColumn("transaction_date", F.to_date(F.col("transaction_time")))

# merchant_df = merchant_df.groupBy("transaction_date", "merchant_id") \
#     .agg(
#         F.avg("merchant_id_fraud_rate_last_3_days").alias("avg_fraud_rate_last_3_days"),
#     )

# merchant_df.show()

In [None]:
# Add a column for transaction_date by extracting the date part of transaction_time
merchant_df = merchant_df.with_columns(
    pl.col("ts").cast(pl.Datetime).dt.date().alias("transaction_date")
)

# Group by transaction_date and merchant_id and calculate the average fraud rate
merchant_df = merchant_df.group_by(["transaction_date", "merchant_id"]).agg(
    pl.col("merchant_id_fraud_rate_last_3_days").mean().alias("avg_fraud_rate_last_3_days")
)

# Display the resulting DataFrame
print(merchant_df)

In [None]:
print(merchant_df.schema)

In [None]:
merchant_df.count()

In [None]:
# filtered_df = merchant_df.filter(
#     (F.col("avg_fraud_rate_last_3_days") != 0)
#     #& (F.col("avg_fraud_rate_last_30_days") != 0)
# )
# filtered_df.show()

In [None]:
filtered_df = merchant_df.filter(
    pl.col("avg_fraud_rate_last_3_days") != 0
)

print(filtered_df)

In [None]:
filtered_df.count()

In [None]:
print(labels_df)

In [None]:
location_df = labels_df

location_df = fraud_rate_by_num_days("ip_address", location_df, "ts", 3)
location_df.head()

In [None]:
# location_df = location_df.withColumn("transaction_date", F.to_date(F.col("transaction_time")))

# location_df = location_df.groupBy("transaction_date", "location") \
#     .agg(
#         F.avg("location_fraud_rate_last_3_days").alias("location_fraud_rate_last_3_days"),
#     )

# location_df.show()

# # Add a column for transaction_date by extracting the date part of transaction_time
# location_df = location_df.with_columns(
#     pl.col("ts").cast(pl.Datetime).dt.date().alias("transaction_date")
# )

# # Group by transaction_date and location and calculate the average fraud rate
# location_df = location_df.group_by(["transaction_date", "ip_address"]).agg(
#     pl.col("location_fraud_rate_last_3_days").mean().alias("location_fraud_rate_last_3_days")
# )

# print(location_df)

In [None]:
# Eye-ball the fraud rates for correctness
# filtered_df = location_df.filter(F.col("location_fraud_rate_last_3_days") > 0)
# filtered_df.select("transaction_date", "location", "location_fraud_rate_last_3_days").show()

# filtered_df = location_df.filter(
#     pl.col("location_fraud_rate_last_3_days") > 0
# ).select(
#     ["transaction_date", "location", "location_fraud_rate_last_3_days"]
# )

# print(filtered_df)

In [None]:
# # Define a window partitioned by credit_card_id and ordered by transaction_time
# window_spec = Window.partitionBy("card_id").orderBy("transaction_time")
# # Add a new column 'prev_transaction_time' using the lag function
# labels_df = labels_df.withColumn("prev_transaction_time", F.lag(F.col("transaction_time"), 1).over(window_spec))
# # Show the resulting DataFrame
# labels_df.show()

labels_df = labels_df.sort(["cc_num", "ts"]).with_columns(
    pl.col("ts")
    .shift(1)
    .over("cc_num")
    .alias("prev_transaction_ts")
)

print(labels_df)

In [None]:
# Some transactions have no previous transaction
# labels_df.filter(F.isnull(F.col("prev_transaction_time"))).count()
null_count = labels_df.filter(
    pl.col("prev_transaction_ts").is_null()
).height

print(null_count)

In [None]:
# For transactions with no previous transaction, set timestamp for prev transaction to be 7 days beforehand
# labels_df = labels_df.withColumn(
#     "prev_transaction_time", 
#     F.when(F.col("prev_transaction_time").isNull(), F.date_sub(F.col("transaction_time"), 3))
#     .otherwise(F.col("prev_transaction_time"))
# )

# labels_df.filter(F.isnull(F.col("prev_transaction_time"))).count()

# Replace null values in 'prev_transaction_time' with 'transaction_time' minus 3 days
labels_df = labels_df.with_columns(
    pl.when(pl.col("prev_transaction_ts").is_null())
    .then(pl.col("ts") - pl.duration(days=3))
    .otherwise(pl.col("prev_transaction_ts"))
    .alias("prev_transaction_ts")
)

# Count rows where 'prev_transaction_time' is still null
null_count = labels_df.filter(
    pl.col("prev_transaction_ts").is_null()
).height

print(null_count)


In [None]:
print(labels_df.schema)

In [None]:
# transactions_df = transactions_fg.read() 

# customer_lifetime_spend_df = transactions_df.groupBy("customer_id").agg(F.sum("amount").alias("total_spend"))

# customer_lifetime_spend_df = customer_lifetime_spend_df.drop("t_id", "cc_num", "amount", "transaction_category_name")

# customer_lifetime_spend_df.show()

# Read the transactions DataFrame
transactions_df = transactions_fg.read()
transactions_df = pl.from_pandas(transactions_df)

# Calculate the total spend per customer
customer_lifetime_spend_df = transactions_df.group_by("cc_num").agg(
    pl.col("amount").sum().alias("total_spend")
)

# Drop unnecessary columns (no-op since they are not in the grouped DataFrame)
customer_lifetime_spend_df = customer_lifetime_spend_df.select(["cc_num", "total_spend"])

# Display the resulting DataFrame
print(customer_lifetime_spend_df)


In [None]:
# No event-time for this feature group
customer_features_fg = fs.get_or_create_feature_group(
    name="customer_fg",
    version=1,
    primary_key=['cc_num'],
    event_time='last_modified',
    online_enabled=True,
    description="Customer batch fetaures"
)

customer_features_fg.insert(customer_lifetime_spend_df)

In [None]:
merchant_features_fg = fs.get_or_create_feature_group(
    name="merchant_fgc",
    version=1,
    primary_key=['merchant_id'],
    event_time="last_modified",
    online_enabled=True,
    description="Merchant batch features"
)

merchant_features_fg.insert(merchant_df)

In [None]:
# Add the event_time (7 days ago) for the time window (days 14-7 ago) for measuring fraud at locations

# location_features_fg = fs.get_or_create_feature_group(
#     name="batch_location_features",
#     version=1,
#     primary_key=['location'],
#     event_time="transaction_date",
#     online_enabled=True,
#     description="Transaction location batch features"
# )
# location_features_fg.insert(location_df, wait=True)

In [None]:
import great_expectations as ge

expectation_suite = ge.core.ExpectationSuite(
    "new_expectation_suite",
    expectations=[
        ge.core.ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_between",
            kwargs={
                "column": "amount",
                "min_value": 0,
                "max_value": 100_000
            }
        )
    ]
)

In [None]:

#
# ON-DEMAND TRANSFORMATIONS
#
@hopsworks.udf(int, drop=['prev_transaction_ts'])
def time_since_last_transaction_secs(prev_transaction_ts: pd.Series, ts: pd.Series) -> pd.Series:
    return (ts - prev_transaction_ts).dt.total_seconds().astype(int)



In [None]:
labels_fg = fs.create_feature_group(
    name="cc_trans_fg",
    version=1,
    primary_key=['t_id'],
    event_time = 'ts',
    description="Transaction fraud labels",
    online_enabled=True,
    parents=[transactions_fg],
    transformation_functions=[time_since_last_transaction_secs],
    expectation_suite=expectation_suite
)

labels_fg.insert(labels_df)

In [None]:
bank_fg = fs.create_feature_group(
    name="bank_fg",
    version=1,
    primary_key=['bank_id'],
    event_time = 'last_modified',
    description="Bank feature data",
    online_enabled=True
)

bank_fg.insert(banks_df)