In [3]:
from typing import Callable

import polars as pl
from faker import Faker

fake = Faker()

%load_ext autoreload
%autoreload 2

In [4]:
errors = (
    pl.read_csv("/home/grantham/windmark/data/transactions.v1.csv")
    .get_column("Errors?")
    .drop_nulls()
    .unique()
    .to_list()
)

unique_errors = set("".join(errors).split(","))

unique_errors.remove("")

print(unique_errors)

{'Bad PIN', 'Bad Zipcode', 'Bad Card Number', 'Insufficient Balance', 'Bad Expiration', 'Technical Glitch', 'Bad CVV'}


In [5]:
df = pl.read_csv("/home/grantham/windmark/data/transactions.v1.csv").filter(pl.col("User") < 500)

df.columns = [col.lower().replace("?", "").replace(" ", "_") for col in df.columns]

print(df.head())


def mock(dataframe: pl.DataFrame, name: str, generator: Callable = fake.unique.company):
    values: list[str] = dataframe.get_column(name).cast(pl.String).unique().to_list()

    mapping = {value: generator() for value in values}

    return dataframe.select(pl.exclude(name), pl.col(name).replace(mapping).alias(name))


df = mock(dataframe=df, name="merchant_name", generator=fake.unique.company)
df = mock(dataframe=df, name="card", generator=fake.unique.credit_card_number)
df = mock(dataframe=df, name="user", generator=fake.unique.name)

output = (
    df
    # .filter(pl.col("user") < 100)
    .select(pl.all(), timeparts=pl.col("time").str.split(":"))
    .select(
        pl.all(),
        hour=pl.col("timeparts").list.first(),
        minute=pl.col("timeparts").list.last(),
    )
    .select(
        pl.all(),
        timestamp=pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"), pl.col("minute")),
    )
    .select(
        "use_chip",
        "merchant_state",
        "merchant_city",
        "mcc",
        "card",
        "timestamp",
        *[
            pl.col("errors").fill_null("").str.contains(error).alias(f'has_{error.lower().replace(" ", "_")}')
            for error in unique_errors
        ],
        amount=pl.col("amount").str.strip_prefix("$").str.to_decimal(),
        merchant_name=pl.col("merchant_name").cast(pl.Utf8),
        target=pl.col("is_fraud"),
        transaction_id=pl.col("timestamp").cast(pl.Utf8),
        customer_id=pl.col("user"),
    )
    .sort("timestamp")
    .select(
        pl.all(),
        order_id=pl.col("transaction_id").cum_count().over("customer_id"),
        timedelta=pl.col("timestamp").sub(pl.col("timestamp").shift()).dt.seconds(),
    )
    # .write_parquet("quarter_ledger.parquet")
)

shape: (5, 15)
┌──────┬──────┬──────┬───────┬───┬─────────┬──────┬────────┬──────────┐
│ user ┆ card ┆ year ┆ month ┆ … ┆ zip     ┆ mcc  ┆ errors ┆ is_fraud │
│ ---  ┆ ---  ┆ ---  ┆ ---   ┆   ┆ ---     ┆ ---  ┆ ---    ┆ ---      │
│ i64  ┆ i64  ┆ i64  ┆ i64   ┆   ┆ f64     ┆ i64  ┆ str    ┆ str      │
╞══════╪══════╪══════╪═══════╪═══╪═════════╪══════╪════════╪══════════╡
│ 0    ┆ 0    ┆ 2002 ┆ 9     ┆ … ┆ 91750.0 ┆ 5300 ┆ null   ┆ No       │
│ 0    ┆ 0    ┆ 2002 ┆ 9     ┆ … ┆ 91754.0 ┆ 5411 ┆ null   ┆ No       │
│ 0    ┆ 0    ┆ 2002 ┆ 9     ┆ … ┆ 91754.0 ┆ 5411 ┆ null   ┆ No       │
│ 0    ┆ 0    ┆ 2002 ┆ 9     ┆ … ┆ 91754.0 ┆ 5651 ┆ null   ┆ No       │
│ 0    ┆ 0    ┆ 2002 ┆ 9     ┆ … ┆ 91750.0 ┆ 5912 ┆ null   ┆ No       │
└──────┴──────┴──────┴───────┴───┴─────────┴──────┴────────┴──────────┘


  timedelta=pl.col("timestamp").sub(pl.col("timestamp").shift()).dt.seconds(),


In [6]:
output.columns

['use_chip',
 'merchant_state',
 'merchant_city',
 'mcc',
 'card',
 'timestamp',
 'has_bad_pin',
 'has_bad_zipcode',
 'has_bad_card_number',
 'has_insufficient_balance',
 'has_bad_expiration',
 'has_technical_glitch',
 'has_bad_cvv',
 'amount',
 'merchant_name',
 'target',
 'transaction_id',
 'customer_id',
 'order_id',
 'timedelta']