In [1]:
from typing import Callable

import polars as pl
import polars.selectors as cs
from faker import Faker

fake = Faker()

%load_ext autoreload
%autoreload 2

In [2]:
datapath = "/home/grantham/windmark/data/ledgers/trxns.csv"

df = pl.read_csv(datapath).filter(pl.col("User") < 500)

In [3]:
df.head()

User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
i64,i64,i64,i64,i64,str,str,str,i64,str,str,f64,i64,str,str
0,0,2002,9,1,"""06:21""","""$134.09""","""Swipe Transaction""",3527213246127876953,"""La Verne""","""CA""",91750.0,5300,,"""No"""
0,0,2002,9,1,"""06:42""","""$38.48""","""Swipe Transaction""",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""06:22""","""$120.34""","""Swipe Transaction""",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""17:45""","""$128.95""","""Swipe Transaction""",3414527459579106770,"""Monterey Park""","""CA""",91754.0,5651,,"""No"""
0,0,2002,9,3,"""06:23""","""$104.71""","""Swipe Transaction""",5817218446178736267,"""La Verne""","""CA""",91750.0,5912,,"""No"""


In [4]:
df.get_column("Use Chip").unique()

Use Chip
str
"""Chip Transaction"""
"""Online Transaction"""
"""Swipe Transaction"""


In [5]:
errors = (
    pl.read_csv(datapath)
    .get_column("Errors?")
    .drop_nulls()
    .unique()
    .to_list()
)

unique_errors = set("".join(errors).split(","))

unique_errors.remove("")

print(unique_errors)

{'Technical Glitch', 'Bad Expiration', 'Insufficient Balance', 'Bad Zipcode', 'Bad PIN', 'Bad CVV', 'Bad Card Number'}


In [6]:
df = pl.read_csv(datapath).filter(pl.col("User") < 500)

df.columns = [col.lower().replace("?", "").replace(" ", "_") for col in df.columns]

def mock(dataframe: pl.DataFrame, name: str, generator: Callable = fake.unique.company):
    values: list[str] = dataframe.get_column(name).cast(pl.String).unique().to_list()

    mapping = {value: generator() for value in values}

    return dataframe.select(pl.exclude(name), pl.col(name).replace_strict(mapping).alias(name))


df = mock(dataframe=df, name="merchant_name", generator=fake.unique.company)
df = mock(dataframe=df, name="card", generator=fake.unique.credit_card_number)
df = mock(dataframe=df, name="user", generator=fake.unique.name)

ledger = (
    df
    # .filter(pl.col("user") < 100)
    .select(pl.all(), timeparts=pl.col("time").str.split(":"))
    .select(
        pl.all(),
        hour=pl.col("timeparts").list.first(),
        minute=pl.col("timeparts").list.last(),
    )
    .select(
        pl.all(),
        timestamp=pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"), pl.col("minute")),
    )
    .select(
        "use_chip",
        "merchant_state",
        "merchant_city",
        pl.col("mcc").cast(pl.String),
        "card",
        "timestamp",
        *[
            pl.col("errors").fill_null("").str.contains(error).cast(pl.String).alias(f'has_{error.lower().replace(" ", "_")}')
            for error in unique_errors
        ],
        amount=pl.col("amount").str.strip_prefix("$").str.to_decimal(),
        merchant_name=pl.col("merchant_name").cast(pl.Utf8),
        is_fraud=pl.col("is_fraud"),
        transaction_id=pl.col("timestamp").cast(pl.Utf8),
        customer_id=pl.col("user"),
    )
    .sort("timestamp")
    .select(
        pl.all(),
        order_id=pl.col("transaction_id").cum_count().over("customer_id"),
        timedelta=pl.col("timestamp").sub(pl.col("timestamp").shift().over("customer_id")).dt.total_seconds(),
        tenure=pl.col("timestamp").sub(pl.col("timestamp").first().over("customer_id")).dt.total_seconds(),
    )
)

In [7]:
ledger.head()

use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_technical_glitch,has_bad_expiration,has_insufficient_balance,has_bad_zipcode,has_bad_pin,has_bad_cvv,has_bad_card_number,amount,merchant_name,is_fraud,transaction_id,customer_id,order_id,timedelta,tenure
str,str,str,str,str,datetime[μs],str,str,str,str,str,str,str,"decimal[*,2]",str,str,str,str,u32,i64,i64
"""Swipe Transaction""","""CA""","""Sacramento""","""3058""","""6563748441765294""",1991-11-25 06:55:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",355.71,"""Lopez, Ponce and Mendoza""","""No""","""1991-11-25 06:55:00.000000""","""Maria Rivas""",1,,0
"""Swipe Transaction""","""CA""","""San Jose""","""4829""","""6563748441765294""",1991-12-01 06:51:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",100.0,"""Chan, Williams and Stone""","""No""","""1991-12-01 06:51:00.000000""","""Maria Rivas""",2,518160.0,518160
"""Swipe Transaction""","""CA""","""Watsonville""","""5411""","""6563748441765294""",1991-12-01 09:36:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",2.1,"""Nelson PLC""","""No""","""1991-12-01 09:36:00.000000""","""Maria Rivas""",3,9900.0,528060
"""Swipe Transaction""","""CA""","""Watsonville""","""5411""","""6563748441765294""",1991-12-01 09:44:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",48.93,"""Sullivan, Le and Lewis""","""No""","""1991-12-01 09:44:00.000000""","""Maria Rivas""",4,480.0,528540
"""Swipe Transaction""","""CA""","""Watsonville""","""5912""","""6563748441765294""",1991-12-01 10:21:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",52.23,"""Perez-Hendricks""","""No""","""1991-12-01 10:21:00.000000""","""Maria Rivas""",5,2220.0,530760


In [8]:
from zlib import crc32

def assign_split(column: str) -> pl.Expr:

    seed = (
        pl.col(column)
        .cast(pl.String)
        .map_elements(lambda x: float(crc32(str.encode(x)) & 0xFFFFFFFF), return_dtype=pl.Float32)
        .mul(1 / 2**32)
    )

    return (
        pl.when(seed.is_between(0.0, 0.6))
        .then(pl.lit("train"))
        .when(seed.is_between(0.6, 0.8))
        .then(pl.lit("validate"))
        .when(seed.is_between(0.8, 1.0))
        .then(pl.lit("test"))
        .otherwise(pl.lit("train"))
    )

In [9]:
fields = [
    'use_chip',
    'merchant_state',
    'merchant_city',
    'mcc',
    'card',
    'timestamp',
    'has_technical_glitch',
    'has_bad_pin',
    'has_bad_zipcode',
    'has_insufficient_balance',
    'has_bad_cvv',
    'has_bad_card_number',
    'has_bad_expiration',
    'amount',
    'merchant_name',
    'timedelta',
    'tenure',
]

In [10]:
lifestreams = (
    ledger.select(
        *[field for field in fields],
        'is_fraud',
        'transaction_id',
        'customer_id',
        'order_id',
        split=assign_split('customer_id'),
    )
    .select(
        (cs.numeric() - cs.by_name('order_id')).qcut(16).cast(pl.String),
        'order_id',
        ~cs.numeric()
    )
    .sort("customer_id", "order_id")
    .group_by("customer_id", maintain_order=True)
    .agg(
        *[field for field in fields],
        'transaction_id',
        is_fraud=pl.col('is_fraud'),
        split=pl.col("split").last(),
    )
    .iter_slices(5)
)


In [11]:
for index, lifestream in enumerate(lifestreams):
    lifestream.write_ndjson(f"../data/lifestreams/trxns_discrete/trxns-{index}.ndjson")

In [12]:
lifestream

customer_id,use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_technical_glitch,has_bad_pin,has_bad_zipcode,has_insufficient_balance,has_bad_cvv,has_bad_card_number,has_bad_expiration,amount,merchant_name,timedelta,tenure,transaction_id,is_fraud,split
str,list[str],list[str],list[str],list[str],list[str],list[datetime[μs]],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str
"""Wesley Le""","[""Online Transaction"", ""Chip Transaction"", … ""Chip Transaction""]","[null, ""RI"", … ""RI""]","["" ONLINE"", ""Cranston"", … ""Cranston""]","[""7393"", ""5411"", … ""5812""]","[""3544317050053029"", ""3544317050053029"", … ""3544317050053029""]","[2020-02-01 07:29:00, 2020-02-01 09:31:00, … 2020-02-28 11:52:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(30.39, 38.41]"", ""(9.85, 13.67]"", … ""(0.5, 2.58]""]","[""Malone Group"", ""Stevens-Anderson"", … ""White, Cochran and Ferrell""]","[null, ""(6720, 9660]"", … ""(23520, 32040]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(-inf, 34646763.75]""]","[""2020-02-01 07:29:00.000000"", ""2020-02-01 09:31:00.000000"", … ""2020-02-28 11:52:00.000000""]","[""No"", ""No"", … ""No""]","""test"""
"""William Harvey""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Online Transaction""]","[""TX"", ""TX"", … null]","[""Del Valle"", ""Seguin"", … "" ONLINE""]","[""3260"", ""5541"", … ""5311""]","[""345509619558905"", ""345509619558905"", … ""345509619558905""]","[2002-08-18 06:30:00, 2002-09-01 11:48:00, … 2020-02-28 23:58:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(13.67, 18.22]"", … ""(38.41, 46.68]""]","[""Horton-Elliott"", ""Hughes-Mclaughlin"", … ""Steele, Williamson and Alvarez""]","[null, ""(85680, inf]"", … ""(23520, 32040]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(542457660, inf]""]","[""2002-08-18 06:30:00.000000"", ""2002-09-01 11:48:00.000000"", … ""2020-02-28 23:58:00.000000""]","[""No"", ""No"", … ""No""]","""train"""
"""William Martinez""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""FL"", ""FL"", … ""FL""]","[""Fort Lauderdale"", ""Valrico"", … ""Melbourne""]","[""3000"", ""3174"", … ""5814""]","[""4871824726483"", ""4871824726483"", … ""4871824726483""]","[1999-05-22 11:16:00, 1999-11-21 00:12:00, … 2020-02-28 19:12:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(140, inf]"", … ""(46.68, 56.44]""]","[""Rosario-Ruiz"", ""Yang and Sons"", … ""Crawford, Mathews and Lopez""]","[null, ""(85680, inf]"", … ""(13140, 17640]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(542457660, inf]""]","[""1999-05-22 11:16:00.000000"", ""1999-11-21 00:12:00.000000"", … ""2020-02-28 19:12:00.000000""]","[""No"", ""No"", … ""No""]","""train"""
"""William Thompson""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""VT"", ""IL"", … ""IL""]","[""Ludlow"", ""Yorkville"", … ""Yorkville""]","[""3504"", ""5812"", … ""5211""]","[""3544317050053029"", ""3544317050053029"", … ""3544317050053029""]","[2011-09-13 20:24:00, 2012-01-01 08:18:00, … 2020-02-28 12:31:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(5.6, 9.85]"", … ""(56.44, 67.18]""]","[""Mullins-Bryant"", ""Sanders LLC"", … ""Burns-Olson""]","[null, ""(85680, inf]"", … ""(4260, 6720]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(251737020, 282492675]""]","[""2011-09-13 20:24:00.000000"", ""2012-01-01 08:18:00.000000"", … ""2020-02-28 12:31:00.000000""]","[""No"", ""No"", … ""No""]","""test"""
"""Yvonne Rogers""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Swipe Transaction""]","[""MO"", ""MO"", … ""MO""]","[""Maryland Heights"", ""Bridgeton"", … ""Maryland Heights""]","[""8021"", ""5311"", … ""5814""]","[""3544317050053029"", ""3544317050053029"", … ""3544317050053029""]","[2020-02-01 16:18:00, 2020-02-03 01:13:00, … 2020-02-28 18:22:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(38.41, 46.68]"", … ""(38.41, 46.68]""]","[""Robinson PLC"", ""Hancock-Thomas"", … ""Lloyd-Allison""]","[null, ""(85680, inf]"", … ""(17640, 23520]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(-inf, 34646763.75]""]","[""2020-02-01 16:18:00.000000"", ""2020-02-03 01:13:00.000000"", … ""2020-02-28 18:22:00.000000""]","[""No"", ""No"", … ""No""]","""test"""


In [13]:
lifestream

customer_id,use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_technical_glitch,has_bad_pin,has_bad_zipcode,has_insufficient_balance,has_bad_cvv,has_bad_card_number,has_bad_expiration,amount,merchant_name,timedelta,tenure,transaction_id,is_fraud,split
str,list[str],list[str],list[str],list[str],list[str],list[datetime[μs]],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],str
"""Wesley Le""","[""Online Transaction"", ""Chip Transaction"", … ""Chip Transaction""]","[null, ""RI"", … ""RI""]","["" ONLINE"", ""Cranston"", … ""Cranston""]","[""7393"", ""5411"", … ""5812""]","[""3544317050053029"", ""3544317050053029"", … ""3544317050053029""]","[2020-02-01 07:29:00, 2020-02-01 09:31:00, … 2020-02-28 11:52:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(30.39, 38.41]"", ""(9.85, 13.67]"", … ""(0.5, 2.58]""]","[""Malone Group"", ""Stevens-Anderson"", … ""White, Cochran and Ferrell""]","[null, ""(6720, 9660]"", … ""(23520, 32040]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(-inf, 34646763.75]""]","[""2020-02-01 07:29:00.000000"", ""2020-02-01 09:31:00.000000"", … ""2020-02-28 11:52:00.000000""]","[""No"", ""No"", … ""No""]","""test"""
"""William Harvey""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Online Transaction""]","[""TX"", ""TX"", … null]","[""Del Valle"", ""Seguin"", … "" ONLINE""]","[""3260"", ""5541"", … ""5311""]","[""345509619558905"", ""345509619558905"", … ""345509619558905""]","[2002-08-18 06:30:00, 2002-09-01 11:48:00, … 2020-02-28 23:58:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(13.67, 18.22]"", … ""(38.41, 46.68]""]","[""Horton-Elliott"", ""Hughes-Mclaughlin"", … ""Steele, Williamson and Alvarez""]","[null, ""(85680, inf]"", … ""(23520, 32040]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(542457660, inf]""]","[""2002-08-18 06:30:00.000000"", ""2002-09-01 11:48:00.000000"", … ""2020-02-28 23:58:00.000000""]","[""No"", ""No"", … ""No""]","""train"""
"""William Martinez""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""FL"", ""FL"", … ""FL""]","[""Fort Lauderdale"", ""Valrico"", … ""Melbourne""]","[""3000"", ""3174"", … ""5814""]","[""4871824726483"", ""4871824726483"", … ""4871824726483""]","[1999-05-22 11:16:00, 1999-11-21 00:12:00, … 2020-02-28 19:12:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(140, inf]"", … ""(46.68, 56.44]""]","[""Rosario-Ruiz"", ""Yang and Sons"", … ""Crawford, Mathews and Lopez""]","[null, ""(85680, inf]"", … ""(13140, 17640]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(542457660, inf]""]","[""1999-05-22 11:16:00.000000"", ""1999-11-21 00:12:00.000000"", … ""2020-02-28 19:12:00.000000""]","[""No"", ""No"", … ""No""]","""train"""
"""William Thompson""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""VT"", ""IL"", … ""IL""]","[""Ludlow"", ""Yorkville"", … ""Yorkville""]","[""3504"", ""5812"", … ""5211""]","[""3544317050053029"", ""3544317050053029"", … ""3544317050053029""]","[2011-09-13 20:24:00, 2012-01-01 08:18:00, … 2020-02-28 12:31:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(5.6, 9.85]"", … ""(56.44, 67.18]""]","[""Mullins-Bryant"", ""Sanders LLC"", … ""Burns-Olson""]","[null, ""(85680, inf]"", … ""(4260, 6720]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(251737020, 282492675]""]","[""2011-09-13 20:24:00.000000"", ""2012-01-01 08:18:00.000000"", … ""2020-02-28 12:31:00.000000""]","[""No"", ""No"", … ""No""]","""test"""
"""Yvonne Rogers""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Swipe Transaction""]","[""MO"", ""MO"", … ""MO""]","[""Maryland Heights"", ""Bridgeton"", … ""Maryland Heights""]","[""8021"", ""5311"", … ""5814""]","[""3544317050053029"", ""3544317050053029"", … ""3544317050053029""]","[2020-02-01 16:18:00, 2020-02-03 01:13:00, … 2020-02-28 18:22:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""(140, inf]"", ""(38.41, 46.68]"", … ""(38.41, 46.68]""]","[""Robinson PLC"", ""Hancock-Thomas"", … ""Lloyd-Allison""]","[null, ""(85680, inf]"", … ""(17640, 23520]""]","[""(-inf, 34646763.75]"", ""(-inf, 34646763.75]"", … ""(-inf, 34646763.75]""]","[""2020-02-01 16:18:00.000000"", ""2020-02-03 01:13:00.000000"", … ""2020-02-28 18:22:00.000000""]","[""No"", ""No"", … ""No""]","""test"""
