In [1]:
from typing import Callable

import polars as pl
from faker import Faker

fake = Faker()

%load_ext autoreload
%autoreload 2

In [2]:
datapath = "/home/grantham/windmark/data/ledgers/trxns.csv"

df = pl.read_csv(datapath).filter(pl.col("User") < 500)

In [3]:
df.head()

User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
i64,i64,i64,i64,i64,str,str,str,i64,str,str,f64,i64,str,str
0,0,2002,9,1,"""06:21""","""$134.09""","""Swipe Transaction""",3527213246127876953,"""La Verne""","""CA""",91750.0,5300,,"""No"""
0,0,2002,9,1,"""06:42""","""$38.48""","""Swipe Transaction""",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""06:22""","""$120.34""","""Swipe Transaction""",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""17:45""","""$128.95""","""Swipe Transaction""",3414527459579106770,"""Monterey Park""","""CA""",91754.0,5651,,"""No"""
0,0,2002,9,3,"""06:23""","""$104.71""","""Swipe Transaction""",5817218446178736267,"""La Verne""","""CA""",91750.0,5912,,"""No"""


In [4]:
df.get_column("Use Chip").unique()

Use Chip
str
"""Online Transaction"""
"""Chip Transaction"""
"""Swipe Transaction"""


In [5]:
errors = (
    pl.read_csv(datapath)
    .get_column("Errors?")
    .drop_nulls()
    .unique()
    .to_list()
)

unique_errors = set("".join(errors).split(","))

unique_errors.remove("")

print(unique_errors)

{'Insufficient Balance', 'Bad Expiration', 'Bad Card Number', 'Bad Zipcode', 'Technical Glitch', 'Bad CVV', 'Bad PIN'}


In [6]:
df = pl.read_csv(datapath).filter(pl.col("User") < 500)

df.columns = [col.lower().replace("?", "").replace(" ", "_") for col in df.columns]

def mock(dataframe: pl.DataFrame, name: str, generator: Callable = fake.unique.company):
    values: list[str] = dataframe.get_column(name).cast(pl.String).unique().to_list()

    mapping = {value: generator() for value in values}

    return dataframe.select(pl.exclude(name), pl.col(name).replace_strict(mapping).alias(name))


df = mock(dataframe=df, name="merchant_name", generator=fake.unique.company)
df = mock(dataframe=df, name="card", generator=fake.unique.credit_card_number)
df = mock(dataframe=df, name="user", generator=fake.unique.name)

ledger = (
    df
    # .filter(pl.col("user") < 100)
    .select(pl.all(), timeparts=pl.col("time").str.split(":"))
    .select(
        pl.all(),
        hour=pl.col("timeparts").list.first(),
        minute=pl.col("timeparts").list.last(),
    )
    .select(
        pl.all(),
        timestamp=pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"), pl.col("minute")),
    )
    .select(
        "use_chip",
        "merchant_state",
        "merchant_city",
        pl.col("mcc").cast(pl.String),
        "card",
        "timestamp",
        *[
            pl.col("errors").fill_null("").str.contains(error).cast(pl.String).alias(f'has_{error.lower().replace(" ", "_")}')
            for error in unique_errors
        ],
        amount=pl.col("amount").str.strip_prefix("$").str.to_decimal(),
        merchant_name=pl.col("merchant_name").cast(pl.Utf8),
        is_fraud=pl.col("is_fraud"),
        transaction_id=pl.col("timestamp").cast(pl.Utf8),
        customer_id=pl.col("user"),
    )
    .sort("timestamp")
    .select(
        pl.all(),
        order_id=pl.col("transaction_id").cum_count().over("customer_id"),
        timedelta=pl.col("timestamp").sub(pl.col("timestamp").shift().over("customer_id")).dt.total_seconds(),
        tenure=pl.col("timestamp").sub(pl.col("timestamp").first().over("customer_id")).dt.total_seconds(),
    )
)

In [7]:
ledger.head()

use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_insufficient_balance,has_bad_expiration,has_bad_card_number,has_bad_zipcode,has_technical_glitch,has_bad_cvv,has_bad_pin,amount,merchant_name,is_fraud,transaction_id,customer_id,order_id,timedelta,tenure
str,str,str,str,str,datetime[μs],str,str,str,str,str,str,str,"decimal[*,2]",str,str,str,str,u32,i64,i64
"""Swipe Transaction""","""CA""","""Sacramento""","""3058""","""4905302247175118""",1991-11-25 06:55:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",355.71,"""Bates, Sanders and Kim""","""No""","""1991-11-25 06:55:00.000000""","""Nathan Ramirez""",1,,0
"""Swipe Transaction""","""CA""","""San Jose""","""4829""","""4905302247175118""",1991-12-01 06:51:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",100.0,"""Owens, Diaz and Lee""","""No""","""1991-12-01 06:51:00.000000""","""Nathan Ramirez""",2,518160.0,518160
"""Swipe Transaction""","""CA""","""Watsonville""","""5411""","""4905302247175118""",1991-12-01 09:36:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",2.1,"""Cox-Davidson""","""No""","""1991-12-01 09:36:00.000000""","""Nathan Ramirez""",3,9900.0,528060
"""Swipe Transaction""","""CA""","""Watsonville""","""5411""","""4905302247175118""",1991-12-01 09:44:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",48.93,"""Mitchell-Macdonald""","""No""","""1991-12-01 09:44:00.000000""","""Nathan Ramirez""",4,480.0,528540
"""Swipe Transaction""","""CA""","""Watsonville""","""5912""","""4905302247175118""",1991-12-01 10:21:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",52.23,"""Davis-Garcia""","""No""","""1991-12-01 10:21:00.000000""","""Nathan Ramirez""",5,2220.0,530760


In [8]:
from zlib import crc32

def assign_split(column: str) -> pl.Expr:

    seed = (
        pl.col(column)
        .cast(pl.String)
        .map_elements(lambda x: float(crc32(str.encode(x)) & 0xFFFFFFFF), return_dtype=pl.Float32)
        .mul(1 / 2**32)
    )

    return (
        pl.when(seed.is_between(0.0, 0.6))
        .then(pl.lit("train"))
        .when(seed.is_between(0.6, 0.8))
        .then(pl.lit("validate"))
        .when(seed.is_between(0.8, 1.0))
        .then(pl.lit("test"))
        .otherwise(pl.lit("train"))
    )

In [9]:
fields = [
    'use_chip',
    'merchant_state',
    'merchant_city',
    'mcc',
    'card',
    'timestamp',
    'has_technical_glitch',
    'has_bad_pin',
    'has_bad_zipcode',
    'has_insufficient_balance',
    'has_bad_cvv',
    'has_bad_card_number',
    'has_bad_expiration',
    'amount',
    'merchant_name',
    # 'is_fraud',
    # 'transaction_id',
    # 'customer_id',
    # 'order_id',
    'timedelta',
    'tenure',
]

In [10]:
lifestreams = (
    ledger.select(
        *[field for field in fields],
        'is_fraud',
        'transaction_id',
        'customer_id',
        'order_id',
        split=assign_split('customer_id'),
    )
    .sort("customer_id", "order_id")
    .group_by("customer_id", maintain_order=True)
    .agg(
        *[field for field in fields],
        'transaction_id',
        size=pl.len().cast(pl.Int32),
        is_fraud=pl.col('is_fraud'),
        split=pl.col("split").last(),
    )
    .iter_slices(5)
)

In [11]:
for index, lifestream in enumerate(lifestreams):
    lifestream.write_ndjson(f"../data/lifestreams/trxns/trxns-{index}.ndjson")

In [12]:
lifestream

customer_id,use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_technical_glitch,has_bad_pin,has_bad_zipcode,has_insufficient_balance,has_bad_cvv,has_bad_card_number,has_bad_expiration,amount,merchant_name,timedelta,tenure,transaction_id,size,is_fraud,split
str,list[str],list[str],list[str],list[str],list[str],list[datetime[μs]],list[str],list[str],list[str],list[str],list[str],list[str],list[str],"list[decimal[*,2]]",list[str],list[i64],list[i64],list[str],i32,list[str],str
"""William Cummings""","[""Swipe Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""PA"", ""PA"", … ""PA""]","[""Myerstown"", ""Mount Union"", … ""Mount Union""]","[""8049"", ""5621"", … ""5813""]","[""6527766522221636"", ""6527766522221636"", … ""6527766522221636""]","[2011-01-02 08:09:00, 2011-01-03 16:44:00, … 2020-02-27 19:37:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[130.29, 24.74, … 7.26]","[""Garza-Stewart"", ""Taylor-Russell"", … ""Dawson, Key and Smith""]","[null, 117300, … 108900]","[0, 117300, … 288876480]","[""2011-01-02 08:09:00.000000"", ""2011-01-03 16:44:00.000000"", … ""2020-02-27 19:37:00.000000""]",3992,"[""No"", ""No"", … ""No""]","""train"""
"""William Fisher""","[""Chip Transaction"", ""Chip Transaction"", … ""Chip Transaction""]","[""MD"", ""DE"", … ""DE""]","[""Great Mills"", ""Wilmington"", … ""Wilmington""]","[""3132"", ""5411"", … ""5411""]","[""6527766522221636"", ""6527766522221636"", … ""6527766522221636""]","[2018-03-21 22:35:00, 2018-04-01 13:50:00, … 2020-02-28 13:31:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[661.36, 133.05, … 204.18]","[""Wilson, Floyd and Stewart"", ""Kane, Norman and Ruiz"", … ""Wright, Alvarez and Jackson""]","[null, 918900, … 22740]","[0, 918900, … 61224960]","[""2018-03-21 22:35:00.000000"", ""2018-04-01 13:50:00.000000"", … ""2020-02-28 13:31:00.000000""]",2403,"[""No"", ""No"", … ""No""]","""train"""
"""William Schwartz""","[""Chip Transaction"", ""Chip Transaction"", … ""Chip Transaction""]","[""CA"", ""CA"", … ""CA""]","[""Redlands"", ""Redlands"", … ""Redlands""]","[""5411"", ""5541"", … ""5499""]","[""6527766522221636"", ""6527766522221636"", … ""6527766522221636""]","[2020-02-01 07:00:00, 2020-02-01 14:14:00, … 2020-02-28 14:28:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[2.94, 96.00, … 66.00]","[""Brewer-Caldwell"", ""Floyd-Kennedy"", … ""Hanna, Peterson and Adkins""]","[null, 26040, … 360]","[0, 26040, … 2359680]","[""2020-02-01 07:00:00.000000"", ""2020-02-01 14:14:00.000000"", … ""2020-02-28 14:28:00.000000""]",48,"[""No"", ""No"", … ""No""]","""validate"""
"""William Stewart""","[""Swipe Transaction"", ""Online Transaction"", … ""Online Transaction""]","[""CA"", null, … null]","[""Fresno"", "" ONLINE"", … "" ONLINE""]","[""3000"", ""4899"", … ""7996""]","[""3536821597735758"", ""587669652634"", … ""3536821597735758""]","[2002-05-08 14:24:00, 2003-04-01 05:17:00, … 2020-02-28 16:24:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[518.75, 111.88, … 132.04]","[""Duffy-Hill"", ""Vaughan-Salinas"", … ""Jones, Henry and Sosa""]","[null, 28306380, … 23340]","[0, 28306380, … 562039200]","[""2002-05-08 14:24:00.000000"", ""2003-04-01 05:17:00.000000"", … ""2020-02-28 16:24:00.000000""]",14351,"[""No"", ""No"", … ""No""]","""train"""
"""William Williams""","[""Chip Transaction"", ""Swipe Transaction"", … ""Chip Transaction""]","[""PA"", ""PA"", … ""PA""]","[""Doylestown"", ""Chalfont"", … ""Doylestown""]","[""5814"", ""5812"", … ""4121""]","[""587669652634"", ""3593128805672876"", … ""587669652634""]","[2020-01-01 06:47:00, 2020-01-01 07:06:00, … 2020-02-28 18:11:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[19.93, 21.49, … 24.54]","[""Dougherty, Burgess and Robertson"", ""Kerr-Lynch"", … ""Franco, Shaw and Webb""]","[null, 1140, … 34500]","[0, 1140, … 5052240]","[""2020-01-01 06:47:00.000000"", ""2020-01-01 07:06:00.000000"", … ""2020-02-28 18:11:00.000000""]",182,"[""No"", ""No"", … ""No""]","""train"""
