In [1]:
from typing import Callable

import polars as pl
from faker import Faker

fake = Faker()

%load_ext autoreload
%autoreload 2

In [2]:
df = pl.read_csv("/home/grantham/windmark/data/transactions.v1.csv").filter(pl.col("User") < 500)

In [3]:
df.head()

User,Card,Year,Month,Day,Time,Amount,Use Chip,Merchant Name,Merchant City,Merchant State,Zip,MCC,Errors?,Is Fraud?
i64,i64,i64,i64,i64,str,str,str,i64,str,str,f64,i64,str,str
0,0,2002,9,1,"""06:21""","""$134.09""","""Swipe Transact…",3527213246127876953,"""La Verne""","""CA""",91750.0,5300,,"""No"""
0,0,2002,9,1,"""06:42""","""$38.48""","""Swipe Transact…",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""06:22""","""$120.34""","""Swipe Transact…",-727612092139916043,"""Monterey Park""","""CA""",91754.0,5411,,"""No"""
0,0,2002,9,2,"""17:45""","""$128.95""","""Swipe Transact…",3414527459579106770,"""Monterey Park""","""CA""",91754.0,5651,,"""No"""
0,0,2002,9,3,"""06:23""","""$104.71""","""Swipe Transact…",5817218446178736267,"""La Verne""","""CA""",91750.0,5912,,"""No"""


In [4]:
df.get_column("Use Chip").unique()

Use Chip
str
"""Swipe Transact…"
"""Online Transac…"
"""Chip Transacti…"


In [5]:
errors = (
    pl.read_csv("/home/grantham/windmark/data/transactions.v1.csv")
    .get_column("Errors?")
    .drop_nulls()
    .unique()
    .to_list()
)

unique_errors = set("".join(errors).split(","))

unique_errors.remove("")

print(unique_errors)

{'Bad CVV', 'Bad Card Number', 'Technical Glitch', 'Bad Expiration', 'Insufficient Balance', 'Bad Zipcode', 'Bad PIN'}


In [6]:
df = pl.read_csv("/home/grantham/windmark/data/transactions.v1.csv").filter(pl.col("User") < 500)

df.columns = [col.lower().replace("?", "").replace(" ", "_") for col in df.columns]

def mock(dataframe: pl.DataFrame, name: str, generator: Callable = fake.unique.company):
    values: list[str] = dataframe.get_column(name).cast(pl.String).unique().to_list()

    mapping = {value: generator() for value in values}

    return dataframe.select(pl.exclude(name), pl.col(name).replace(mapping).alias(name))


df = mock(dataframe=df, name="merchant_name", generator=fake.unique.company)
df = mock(dataframe=df, name="card", generator=fake.unique.credit_card_number)
df = mock(dataframe=df, name="user", generator=fake.unique.name)

ledger = (
    df
    # .filter(pl.col("user") < 100)
    .select(pl.all(), timeparts=pl.col("time").str.split(":"))
    .select(
        pl.all(),
        hour=pl.col("timeparts").list.first(),
        minute=pl.col("timeparts").list.last(),
    )
    .select(
        pl.all(),
        timestamp=pl.datetime(pl.col("year"), pl.col("month"), pl.col("day"), pl.col("hour"), pl.col("minute")),
    )
    .select(
        "use_chip",
        "merchant_state",
        "merchant_city",
        pl.col("mcc").cast(pl.String),
        "card",
        "timestamp",
        *[
            pl.col("errors").fill_null("").str.contains(error).cast(pl.String).alias(f'has_{error.lower().replace(" ", "_")}')
            for error in unique_errors
        ],
        amount=pl.col("amount").str.strip_prefix("$").str.to_decimal(),
        merchant_name=pl.col("merchant_name").cast(pl.Utf8),
        is_fraud=pl.col("is_fraud"),
        transaction_id=pl.col("timestamp").cast(pl.Utf8),
        customer_id=pl.col("user"),
    )
    .sort("timestamp")
    .select(
        pl.all(),
        order_id=pl.col("transaction_id").cum_count().over("customer_id"),
        timedelta=pl.col("timestamp").sub(pl.col("timestamp").shift().over("customer_id")).dt.total_seconds(),
        tenure=pl.col("timestamp").sub(pl.col("timestamp").first().over("customer_id")).dt.total_seconds(),
    )
)

In [7]:
ledger.head()

use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_bad_cvv,has_bad_card_number,has_technical_glitch,has_bad_expiration,has_insufficient_balance,has_bad_zipcode,has_bad_pin,amount,merchant_name,is_fraud,transaction_id,customer_id,order_id,timedelta,tenure
str,str,str,str,str,datetime[μs],str,str,str,str,str,str,str,"decimal[*,2]",str,str,str,str,u32,i64,i64
"""Swipe Transact…","""CA""","""Sacramento""","""3058""","""675957263535""",1991-11-25 06:55:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",355.71,"""Jackson-Vazque…","""No""","""1991-11-25 06:…","""Joseph Bailey""",1,,0
"""Swipe Transact…","""CA""","""San Jose""","""4829""","""675957263535""",1991-12-01 06:51:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",100.0,"""Reyes Group""","""No""","""1991-12-01 06:…","""Joseph Bailey""",2,518160.0,518160
"""Swipe Transact…","""CA""","""Watsonville""","""5411""","""675957263535""",1991-12-01 09:36:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",2.1,"""Hernandez-Duar…","""No""","""1991-12-01 09:…","""Joseph Bailey""",3,9900.0,528060
"""Swipe Transact…","""CA""","""Watsonville""","""5411""","""675957263535""",1991-12-01 09:44:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",48.93,"""Payne, Howard …","""No""","""1991-12-01 09:…","""Joseph Bailey""",4,480.0,528540
"""Swipe Transact…","""CA""","""Watsonville""","""5912""","""675957263535""",1991-12-01 10:21:00,"""false""","""false""","""false""","""false""","""false""","""false""","""false""",52.23,"""Simmons, Graha…","""No""","""1991-12-01 10:…","""Joseph Bailey""",5,2220.0,530760


In [8]:
from zlib import crc32

def assign_split(column: str) -> pl.Expr:

    seed = (
        pl.col(column)
        .cast(pl.String)
        .map_elements(lambda x: float(crc32(str.encode(x)) & 0xFFFFFFFF), return_dtype=pl.Float32)
        .mul(1 / 2**32)
    )

    return (
        pl.when(seed.is_between(0.0, 0.6))
        .then(pl.lit("train"))
        .when(seed.is_between(0.6, 0.8))
        .then(pl.lit("validate"))
        .when(seed.is_between(0.8, 1.0))
        .then(pl.lit("test"))
        .otherwise(pl.lit("train"))
    )

In [9]:
fields = [
    'use_chip',
    'merchant_state',
    'merchant_city',
    'mcc',
    'card',
    'timestamp',
    'has_technical_glitch',
    'has_bad_pin',
    'has_bad_zipcode',
    'has_insufficient_balance',
    'has_bad_cvv',
    'has_bad_card_number',
    'has_bad_expiration',
    'amount',
    'merchant_name',
    # 'is_fraud',
    # 'transaction_id',
    # 'customer_id',
    # 'order_id',
    'timedelta',
    'tenure',
]

In [19]:
lifestreams = (
    ledger.select(
        *[field for field in fields],
        'is_fraud',
        'transaction_id',
        'customer_id',
        'order_id',
        split=assign_split('customer_id'),
    )
    .sort("customer_id", "order_id")
    .group_by("customer_id", maintain_order=True)
    .agg(
        *[field for field in fields],
        'transaction_id',
        size=pl.len().cast(pl.Int32),
        is_fraud=pl.col('is_fraud'),
        split=pl.col("split").last(),
    )
    .iter_slices(5)
)

In [17]:
x = next(iter(lifestreams))

In [21]:
for index, lifestream in enumerate(lifestreams):
    lifestream.write_avro(f"lifestreams/lifestream-{index}.avro", name="lifestream")

In [18]:
x

customer_id,use_chip,merchant_state,merchant_city,mcc,card,timestamp,has_technical_glitch,has_bad_pin,has_bad_zipcode,has_insufficient_balance,has_bad_cvv,has_bad_card_number,has_bad_expiration,amount,merchant_name,timedelta,tenure,transaction_id,size,target,split
str,list[str],list[str],list[str],list[str],list[str],list[datetime[μs]],list[str],list[str],list[str],list[str],list[str],list[str],list[str],"list[decimal[*,2]]",list[str],list[i64],list[i64],list[str],i32,list[str],str
"""Adam Wright""","[""Swipe Transaction"", ""Online Transaction"", … ""Swipe Transaction""]","[""CA"", null, … ""AZ""]","[""Calabasas"", "" ONLINE"", … ""Chandler""]","[""3006"", ""4511"", … ""4121""]","[""349944068043057"", ""349944068043057"", … ""4634009271034""]","[2001-05-14 13:45:00, 2001-12-02 16:58:00, … 2020-02-28 23:00:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[977.28, 976.14, … 12.86]","[""Riley, Ball and Johnson"", ""Knight, Brown and Cook"", … ""Owen, Jackson and Allen""]","[null, 17464380, … 2460]","[0, 17464380, … 593082900]","[""2001-05-14 13:45:00.000000"", ""2001-12-02 16:58:00.000000"", … ""2020-02-28 23:00:00.000000""]",30392,"[""No"", ""No"", … ""No""]","""train"""
"""Adrian Norris""","[""Swipe Transaction"", ""Online Transaction"", … ""Swipe Transaction""]","[""GA"", null, … ""GA""]","[""Centerville"", "" ONLINE"", … ""Thomson""]","[""3132"", ""4722"", … ""5541""]","[""349944068043057"", ""349944068043057"", … ""4634009271034""]","[2005-06-10 14:34:00, 2006-02-28 21:48:00, … 2020-02-28 20:07:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[378.91, 613.41, … 72.81]","[""Vance-Lowery"", ""Rose-Taylor"", … ""Henderson Group""]","[null, 22749240, … 3480]","[0, 22749240, … 464506380]","[""2005-06-10 14:34:00.000000"", ""2006-02-28 21:48:00.000000"", … ""2020-02-28 20:07:00.000000""]",12598,"[""No"", ""No"", … ""No""]","""train"""
"""Alejandra Bail…","[""Chip Transaction"", ""Swipe Transaction"", … ""Online Transaction""]","[""CA"", ""CA"", … null]","[""Union City"", ""Sunol"", … "" ONLINE""]","[""5812"", ""5541"", … ""4121""]","[""4634009271034"", ""4634009271034"", … ""4634009271034""]","[2020-01-01 08:15:00, 2020-01-01 08:43:00, … 2020-02-28 15:11:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[5.14, 2.91, … 40.98]","[""Ewing-Brewer"", ""Martinez, Richardson and Young"", … ""Frazier-Clark""]","[null, 1680, … 2460]","[0, 1680, … 5036160]","[""2020-01-01 08:15:00.000000"", ""2020-01-01 08:43:00.000000"", … ""2020-02-28 15:11:00.000000""]",225,"[""No"", ""No"", … ""No""]","""test"""
"""Alejandra Cald…","[""Chip Transaction"", ""Chip Transaction"", … ""Chip Transaction""]","[""CA"", ""CA"", … ""CA""]","[""Redding"", ""Redding"", … ""Redding""]","[""5541"", ""5912"", … ""5499""]","[""349944068043057"", ""349944068043057"", … ""349944068043057""]","[2020-01-01 06:32:00, 2020-01-02 07:25:00, … 2020-02-28 17:48:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[4.33, 13.31, … 1.45]","[""Bird, Hicks and Mitchell"", ""Mitchell-Johnson"", … ""Mcdonald, Duncan and Conley""]","[null, 89580, … 40680]","[0, 89580, … 5051760]","[""2020-01-01 06:32:00.000000"", ""2020-01-02 07:25:00.000000"", … ""2020-02-28 17:48:00.000000""]",171,"[""No"", ""No"", … ""No""]","""validate"""
"""Alex Hudson""","[""Chip Transaction"", ""Chip Transaction"", … ""Chip Transaction""]","[""CT"", ""NJ"", … ""NJ""]","[""Uncasville"", ""Jersey City"", … ""Jersey City""]","[""7995"", ""4829"", … ""4829""]","[""4634009271034"", ""4634009271034"", … ""4634009271034""]","[2020-01-01 13:40:00, 2020-01-01 14:22:00, … 2020-02-28 14:20:00]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[""false"", ""false"", … ""false""]","[30.08, 0, … 0]","[""Richardson, Nelson and Miller"", ""Reyes Group"", … ""Reyes Group""]","[null, 2520, … 780]","[0, 2520, … 5013600]","[""2020-01-01 13:40:00.000000"", ""2020-01-01 14:22:00.000000"", … ""2020-02-28 14:20:00.000000""]",225,"[""No"", ""No"", … ""No""]","""train"""
