In [1]:
from typing import Callable

import polars as pl
from faker import Faker

fake = Faker()

%load_ext autoreload
%autoreload 2

In [2]:
datapath = "/home/grantham/windmark/data/ledgers/trxns.csv"

df = pl.read_csv(datapath).filter(pl.col("User") < 500)

In [3]:
df.head()

User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
i64,i64,i64,i64,i64,str,str,str,i64,str,str,f64,i64,str,str
0,0,2002,9,1,"""06:21""","""$134.09""","""Swipe Transaction""",3527213246127876953,"""La Verne""","""CA""",91750.0,5300,,"""No"""
0,0,2002,9,1,"""06:42""","""$38.48""","""Swipe Transaction""",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""06:22""","""$120.34""","""Swipe Transaction""",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""17:45""","""$128.95""","""Swipe Transaction""",3414527459579106770,"""Monterey Park""","""CA""",91754.0,5651,,"""No"""
0,0,2002,9,3,"""06:23""","""$104.71""","""Swipe Transaction""",5817218446178736267,"""La Verne""","""CA""",91750.0,5912,,"""No"""


In [4]:
df.get_column("Use Chip").unique()

Use Chip
str
"""Swipe Transaction"""
"""Online Transaction"""
"""Chip Transaction"""


In [5]:
errors = (
    pl.read_csv(datapath)
    .get_column("Errors?")
    .drop_nulls()
    .unique()
    .to_list()
)

unique_errors = set("".join(errors).split(","))

unique_errors.remove("")

print(unique_errors)

{'Insufficient Balance', 'Technical Glitch', 'Bad Zipcode', 'Bad Expiration', 'Bad CVV', 'Bad PIN', 'Bad Card Number'}


In [6]:
df = pl.read_csv(datapath).filter(pl.col("User") < 500)

df.columns = [col.lower().replace("?", "").replace(" ", "_") for col in df.columns]

def mock(dataframe: pl.DataFrame, name: str, generator: Callable = fake.unique.company):
    values: list[str] = dataframe.get_column(name).cast(pl.String).unique().to_list()

    mapping = {value: generator() for value in values}

    return dataframe.select(pl.exclude(name), pl.col(name).replace_strict(mapping).alias(name))


df = mock(dataframe=df, name="merchant_name", generator=fake.unique.company)
df = mock(dataframe=df, name="card", generator=fake.unique.credit_card_number)
df = mock(dataframe=df, name="user", generator=fake.unique.name)

ledger = (
    df
    # .filter(pl.col("user") < 100)
    .select(pl.all(), timeparts=pl.col("time").str.split(":"))
    .select(
        pl.all(),
        hour=pl.col("timeparts").list.first(),
        minute=pl.col("timeparts").list.last(),
    )
    .select(
        pl.all(),
        timestamp=pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"), pl.col("minute")),
    )
    .select(
        "use_chip",
        "merchant_state",
        "merchant_city",
        pl.col("mcc").cast(pl.String),
        "card",
        "timestamp",
        *[
            pl.col("errors").fill_null("").str.contains(error).cast(pl.String).alias(f'has_{error.lower().replace(" ", "_")}')
            for error in unique_errors
        ],
        amount=pl.col("amount").str.strip_prefix("$").str.to_decimal(),
        merchant_name=pl.col("merchant_name").cast(pl.Utf8),
        is_fraud=pl.col("is_fraud"),
        transaction_id=pl.col("timestamp").cast(pl.Utf8),
        customer_id=pl.col("user"),
    )
    .sort("timestamp")
    .select(
        pl.all(),
        order_id=pl.col("transaction_id").cum_count().over("customer_id"),
        timedelta=pl.col("timestamp").sub(pl.col("timestamp").shift().over("customer_id")).dt.total_seconds(),
        tenure=pl.col("timestamp").sub(pl.col("timestamp").first().over("customer_id")).dt.total_seconds(),
    )
)

In [7]:
ledger.head()

use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_insufficient_balance,has_technical_glitch,has_bad_zipcode,has_bad_expiration,has_bad_cvv,has_bad_pin,has_bad_card_number,amount,merchant_name,is_fraud,transaction_id,customer_id,order_id,timedelta,tenure
str,str,str,str,str,datetime[μs],str,str,str,str,str,str,str,"decimal[*,2]",str,str,str,str,u32,i64,i64
"""Swipe Transaction""","""CA""","""Sacramento""","""3058""","""4765121420225254273""",1991-11-25 06:55:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",355.71,"""Garcia, Smith and Clayton""","""No""","""1991-11-25 06:55:00.000000""","""Thomas Macdonald""",1,,0
"""Swipe Transaction""","""CA""","""San Jose""","""4829""","""4765121420225254273""",1991-12-01 06:51:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",100.0,"""Patterson, White and Oconnor""","""No""","""1991-12-01 06:51:00.000000""","""Thomas Macdonald""",2,518160.0,518160
"""Swipe Transaction""","""CA""","""Watsonville""","""5411""","""4765121420225254273""",1991-12-01 09:36:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",2.1,"""Thompson, Castillo and Smith""","""No""","""1991-12-01 09:36:00.000000""","""Thomas Macdonald""",3,9900.0,528060
"""Swipe Transaction""","""CA""","""Watsonville""","""5411""","""4765121420225254273""",1991-12-01 09:44:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",48.93,"""Espinoza, Lang and Solis""","""No""","""1991-12-01 09:44:00.000000""","""Thomas Macdonald""",4,480.0,528540
"""Swipe Transaction""","""CA""","""Watsonville""","""5912""","""4765121420225254273""",1991-12-01 10:21:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",52.23,"""Green-Ward""","""No""","""1991-12-01 10:21:00.000000""","""Thomas Macdonald""",5,2220.0,530760


In [8]:
from zlib import crc32

def assign_split(column: str) -> pl.Expr:

    seed = (
        pl.col(column)
        .cast(pl.String)
        .map_elements(lambda x: float(crc32(str.encode(x)) & 0xFFFFFFFF), return_dtype=pl.Float32)
        .mul(1 / 2**32)
    )

    return (
        pl.when(seed.is_between(0.0, 0.6))
        .then(pl.lit("train"))
        .when(seed.is_between(0.6, 0.8))
        .then(pl.lit("validate"))
        .when(seed.is_between(0.8, 1.0))
        .then(pl.lit("test"))
        .otherwise(pl.lit("train"))
    )

In [9]:
fields = [
    'use_chip',
    'merchant_state',
    'merchant_city',
    'mcc',
    'card',
    'timestamp',
    'has_technical_glitch',
    'has_bad_pin',
    'has_bad_zipcode',
    'has_insufficient_balance',
    'has_bad_cvv',
    'has_bad_card_number',
    'has_bad_expiration',
    'amount',
    'merchant_name',
    # 'is_fraud',
    # 'transaction_id',
    # 'customer_id',
    # 'order_id',
    'timedelta',
    'tenure',
]

In [10]:
lifestreams = (
    ledger.select(
        *[field for field in fields],
        'is_fraud',
        'transaction_id',
        'customer_id',
        'order_id',
        split=assign_split('customer_id'),
    )
    .sort("customer_id", "order_id")
    .group_by("customer_id", maintain_order=True)
    .agg(
        *[field for field in fields],
        'transaction_id',
        size=pl.len().cast(pl.Int32),
        is_fraud=pl.col('is_fraud'),
        split=pl.col("split").last(),
    )
    .iter_slices(5)
)

In [11]:
for index, lifestream in enumerate(lifestreams):
    lifestream.write_ndjson(f"../data/lifestreams/trxns/trxns-{index}.ndjson")

In [12]:
lifestream

customer_id,use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_technical_glitch,has_bad_pin,has_bad_zipcode,has_insufficient_balance,has_bad_cvv,has_bad_card_number,has_bad_expiration,amount,merchant_name,timedelta,tenure,transaction_id,size,is_fraud,split
str,list[str],list[str],list[str],list[str],list[str],list[datetime[μs]],list[str],list[str],list[str],list[str],list[str],list[str],list[str],"list[decimal[*,2]]",list[str],list[i64],list[i64],list[str],i32,list[str],str
"""Zachary Booth""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Swipe Transaction""]","[""NY"", ""NY"", … ""NY""]","[""Horseheads"", ""Canaseraga"", … ""Brooktondale""]","[""7832"", ""5499"", … ""5211""]","[""5163165729772924"", ""5163165729772924"", … ""4241445643534330""]","[2020-01-02 11:48:00, 2020-01-03 21:08:00, … 2020-02-28 12:34:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[24.47, -96.00, … 46.44]","[""Thomas-Vang"", ""Dawson, Castillo and Mcdowell"", … ""Powell, Smith and Bryant""]","[null, 120000, … 75480]","[0, 120000, … 4927560]","[""2020-01-02 11:48:00.000000"", ""2020-01-03 21:08:00.000000"", … ""2020-02-28 12:34:00.000000""]",86,"[""No"", ""No"", … ""No""]","""train"""
"""Zachary Hutchinson""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""OK"", ""LA"", … ""NC""]","[""Roland"", ""Abbeville"", … ""Goldsboro""]","[""3000"", ""5541"", … ""5411""]","[""4829359803135"", ""4829359803135"", … ""4829359803135""]","[2009-06-12 07:18:00, 2009-07-01 12:04:00, … 2020-02-28 11:38:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[1161.92, 11.44, … 11.57]","[""Boyd-Rivera"", ""Spears-Taylor"", … ""Stewart, Mccarthy and Rose""]","[null, 1658760, … 1260]","[0, 1658760, … 338098800]","[""2009-06-12 07:18:00.000000"", ""2009-07-01 12:04:00.000000"", … ""2020-02-28 11:38:00.000000""]",18284,"[""No"", ""No"", … ""No""]","""train"""
"""Zachary Michael""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""TX"", ""TX"", … ""TX""]","[""Houston"", ""Houston"", … ""Diboll""]","[""5541"", ""5912"", … ""5541""]","[""4829359803135"", ""4829359803135"", … ""4829359803135""]","[2006-10-01 09:52:00, 2006-10-01 11:01:00, … 2020-02-28 09:48:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[12.24, 112.48, … 11.05]","[""Spears-Taylor"", ""Avila-Fitzpatrick"", … ""Spears-Taylor""]","[null, 4140, … 360]","[0, 4140, … 423186960]","[""2006-10-01 09:52:00.000000"", ""2006-10-01 11:01:00.000000"", … ""2020-02-28 09:48:00.000000""]",10978,"[""No"", ""No"", … ""No""]","""train"""
"""Zachary Perez""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""ID"", ""UT"", … ""ID""]","[""Sagle"", ""Kaysville"", … ""Fernwood""]","[""7832"", ""5300"", … ""7538""]","[""4241445643534330"", ""4241445643534330"", … ""4829359803135""]","[2003-10-01 11:00:00, 2003-10-01 13:15:00, … 2020-02-28 18:52:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[27.12, 42.81, … 33.50]","[""Thomas-Vang"", ""Bird, Foley and Lee"", … ""Villegas-Ortiz""]","[null, 8100, … 5340]","[0, 8100, … 517909920]","[""2003-10-01 11:00:00.000000"", ""2003-10-01 13:15:00.000000"", … ""2020-02-28 18:52:00.000000""]",21468,"[""No"", ""No"", … ""No""]","""train"""
"""Zachary Thompson""","[""Chip Transaction"", ""Chip Transaction"", … ""Swipe Transaction""]","[""OH"", ""OH"", … ""OH""]","[""Columbus"", ""Columbus"", … ""Columbus""]","[""5812"", ""5814"", … ""5812""]","[""5163165729772924"", ""5163165729772924"", … ""4241445643534330""]","[2020-01-01 18:35:00, 2020-01-01 20:15:00, … 2020-02-28 19:16:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[63.60, 66.80, … 56.18]","[""Kim Inc"", ""Blake, Gray and Chandler"", … ""Kim Inc""]","[null, 6000, … 21840]","[0, 6000, … 5013660]","[""2020-01-01 18:35:00.000000"", ""2020-01-01 20:15:00.000000"", … ""2020-02-28 19:16:00.000000""]",55,"[""No"", ""No"", … ""No""]","""train"""
