In [1]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display settings: render up to 1,000 rows and 1,000 columns, and up to
# 600 characters per cell, before pandas truncates the output.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Black code formatter (optional): registers the `lab_black` IPython extension.
# NOTE(review): `%load_ext` raises if the package is not installed, so this line
# is only "optional" in the sense that it can be deleted -- confirm it is
# available in the target environment.
%load_ext lab_black

# Auto-reload imports: with mode 2, imported modules are reloaded before each
# cell runs, so edits to local `.py` files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Configuration for the raw-data load.
fp: str = "../../data/trans_TAGS_17.parquet"  # tagged-transactions parquet file
N: int = 850_000  # number of rows drawn from the file
RANDOM_STATE: int = 123  # seed shared by every sampling step in this notebook

# Read the whole file, then keep a seeded, shuffled random sample of N rows.
df: pl.DataFrame = (
    pl.read_parquet(fp)
    .sample(n=N, seed=RANDOM_STATE, shuffle=True)
)

df.head(5)

customer_id,nuban,date,description,amount,type,tags
str,i64,str,str,f64,str,list[str]
"""38969""",1,"""2022-10-05""","""REV-ETZ-09FG22…",25.0,"""C""","[""balance"", ""behavioural.accountSweep"", … ""transactionpattern.mostFrequentBalanceRange""]"
"""38865""",1,"""2022-04-04""","""ABDULRAHEEM F…",20000.0,"""C""","[""balance"", ""behavioural.accountSweep"", … ""transactionpattern.mostFrequentBalanceRange""]"
"""39550""",1,"""2022-01-09""","""YELLO DIGITAL …",6000.0,"""D""","[""balance"", ""behavioural.accountSweep"", … ""transactionpattern.mostFrequentBalanceRange""]"
"""39430""",1,"""2022-11-07""","""ASSISTEDDEPOSI…",500.0,"""C""","[""balance"", ""behavioural.accountSweep"", … ""transactionpattern.mostFrequentBalanceRange""]"
"""39942""",1,"""2022-03-28""","""TRF/Pat/FRM PA…",20200.0,"""D""","[""balance"", ""behavioural.accountSweep"", … ""transactionpattern.mostFrequentBalanceRange""]"


In [3]:
# Unnest the `tags` list column so that every row carries a single tag; the
# remaining columns are repeated for each tag of the original row.
df_exp = df.explode("tags")

# `print` is rich's print here (see the imports cell), not the builtin.
print(df_exp.head(5))

In [4]:
# Tags removed from the exploded frame before the per-tag sampling below.
# Kept as one de-duplicated list (the previous hand-written chain listed
# "cashflow.closingBalance" twice) so adding or removing a tag is a one-line
# change.
EXCLUDED_TAGS: List[str] = [
    "balance",
    "behavioural.inflowOutflowRate",
    "credit",
    "debit",
    "behavioural.accountSweep",
    "transactionpattern.mostFrequentBalanceRange",
    "transactionpattern.mostFrequentTransactionRange",
    "transactionpattern.transactionLessThan10000",
    "transactionpattern.transactionBetween10000And100000",
    "income.medianIncome",
    "transactionpattern.recurringExpense",
    "income.averageOtherIncome",
    "income.numberOtherIncomePayments",
    "transactionpattern.transactionBetween100000And500000",
    "behavioural.topRecipient",
    "behavioural.topDepositor",
    "income.averageSalary",
    "income.salary",
    "income.numberSalaryPayments",
    "income.salaryFrequency",
    "transactionpattern.transactionGreaterThan500000",
    "cashflow.closingBalance",
    "cashflow.lastDay",
    "cashflow.firstDay",
    "income.lastSalaryDate",
    "spend.flightRisk",
    "spend.eatingOut",
    "spend.savingsAndInvestments",
    "spend.entertainment",
    "spend.internationalTransactionsSpend",
    "spend.chequeWithdrawal",
]

# Keep only rows whose tag is present and not in the exclusion list.
# The explicit null check preserves the behaviour of the former chain of `!=`
# comparisons: comparing a null tag yields null, and `filter` drops such rows.
# Spelling it out keeps the result independent of how `is_in` treats nulls.
df_exp = df_exp.filter(
    pl.col("tags").is_not_null() & ~pl.col("tags").is_in(EXCLUDED_TAGS)
)

In [5]:
# Frequency of each remaining tag, most frequent first (top 30 rows shown).
df_exp.get_column("tags").value_counts(sort=True).head(30)

tags,counts
str,u32
"""spend.spendOnT…",218949
"""spend.airtime""",93829
"""spend.ussdTran…",92486
"""spend.posSpend…",82786
"""spend.webSpend…",81539
"""spend.bankChar…",42631
"""spend.atmSpend…",39292
"""income.salaryE…",20408
"""behavioural.lo…",13048
"""income.gigWork…",10921


In [6]:
# Same tag frequencies, printed column-wise so the full tag names are visible
# instead of being truncated by the table display.
(
    df_exp.get_column("tags")
    .value_counts(sort=True)
    .head(20)
    .glimpse(max_items_per_column=20)
)

Rows: 16
Columns: 2
$ tags   <str> 'spend.spendOnTransfers', 'spend.airtime', 'spend.ussdTransactions', 'spend.posSpend', 'spend.webSpend', 'spend.bankCharges', 'spend.atmSpend', 'income.salaryEarner', 'behavioural.loanRepayments', 'income.gigWorker', 'spend.bills', 'spend.mobileSpend', 'spend.cashWithdrawal', 'spend.gambling', 'behavioural.loanAmount', 'spend.shopping'
$ counts <u32> 218949, 93829, 92486, 82786, 81539, 42631, 39292, 20408, 13048, 10921, 9946, 8010, 7372, 5919, 5803, 3048



In [7]:
N_SAMPLE: int = 3_000  # target number of rows drawn for each tag


def sample_rows_for_tag(
    frame: pl.DataFrame,
    tag: str,
    n: int = N_SAMPLE,
    seed: int = RANDOM_STATE,
) -> pl.DataFrame:
    """Return a seeded, shuffled random sample of the rows carrying ``tag``.

    Params:
    -------
        frame (pl.DataFrame): Exploded transactions with a string ``tags`` column.
        tag (str): Exact tag value to select.
        n (int): Number of rows to draw. Capped at the number of matching rows,
            because sampling without replacement raises when more rows are
            requested than exist.
        seed (int): Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns:
    --------
        pl.DataFrame: At most ``n`` rows of ``frame`` whose ``tags`` equals ``tag``.
    """
    tagged: pl.DataFrame = frame.filter(pl.col("tags") == tag)
    # min(...) only takes effect for rare tags; for tags with >= n rows the
    # call is identical to sampling exactly n rows.
    return tagged.sample(n=min(n, tagged.height), shuffle=True, seed=seed)


# One sample per tag. The variable names are kept because later cells
# reference them individually.
df_spend_on_trf = sample_rows_for_tag(df_exp, "spend.spendOnTransfers")
df_spend_on_airtime = sample_rows_for_tag(df_exp, "spend.airtime")
df_spend_on_ussd = sample_rows_for_tag(df_exp, "spend.ussdTransactions")
df_spend_on_pos = sample_rows_for_tag(df_exp, "spend.posSpend")
df_spend_on_web = sample_rows_for_tag(df_exp, "spend.webSpend")
df_spend_on_charges = sample_rows_for_tag(df_exp, "spend.bankCharges")
df_spend_on_atm = sample_rows_for_tag(df_exp, "spend.atmSpend")
df_salary = sample_rows_for_tag(df_exp, "income.salaryEarner")
df_loan_repay = sample_rows_for_tag(df_exp, "behavioural.loanRepayments")
df_gig_worker = sample_rows_for_tag(df_exp, "income.gigWorker")
df_spend_on_bills = sample_rows_for_tag(df_exp, "spend.bills")
df_spend_on_mobile = sample_rows_for_tag(df_exp, "spend.mobileSpend")
df_spend_on_wdrwl = sample_rows_for_tag(df_exp, "spend.cashWithdrawal")
df_spend_on_gambling = sample_rows_for_tag(df_exp, "spend.gambling")
df_loan_amt = sample_rows_for_tag(df_exp, "behavioural.loanAmount")
df_spend_on_shopping = sample_rows_for_tag(df_exp, "spend.shopping")

In [8]:
# Per-tag samples, in the order in which they are stacked.
tag_samples: List[pl.DataFrame] = [
    df_spend_on_trf,
    df_spend_on_airtime,
    df_spend_on_ussd,
    df_spend_on_pos,
    df_spend_on_web,
    df_spend_on_charges,
    df_spend_on_atm,
    df_salary,
    df_loan_repay,
    df_gig_worker,
    df_spend_on_bills,
    df_spend_on_mobile,
    df_spend_on_wdrwl,
    df_spend_on_gambling,
    df_loan_amt,
    df_spend_on_shopping,
]
stacked: pl.DataFrame = pl.concat(tag_samples, how="vertical")

# Shuffle all rows (a full-size sample) so the tags are interleaved, then spell
# out the single-letter transaction type codes.
df_final: pl.DataFrame = stacked.sample(
    n=stacked.height, shuffle=True, seed=RANDOM_STATE
).with_columns(pl.col("type").map_dict({"C": "Credit", "D": "Debit"}))

df_final.head(5)

customer_id,nuban,date,description,amount,type,tags
str,i64,str,str,f64,str,str
"""39345""",1,"""2021-12-30""","""CSH DEP FJB213…",100500.0,"""Credit""","""behavioural.lo…"
"""38857""",1,"""2022-02-13""","""ATM CASH WDL R…",10000.0,"""Debit""","""spend.cashWith…"
"""40107""",1,"""2022-05-26""","""POS/WEB PMT PA…",5300.0,"""Debit""","""spend.webSpend…"
"""40788""",1,"""2022-01-25""","""ATM CASH WDL R…",12000.0,"""Debit""","""spend.cashWith…"
"""38974""",1,"""2022-08-04""","""TRF/Medical/dr…",21926.880859,"""Debit""","""spend.spendOnT…"


In [9]:
# Spot-check the shopping sample. Converted to pandas for display -- presumably
# so the pandas `max_colwidth` setting shows the descriptions in full.
df_spend_on_shopping.head(5).to_pandas()

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags
0,40980,1,2022-11-15,ATM WDL @1232405F RITEHEALTH SUPERMARKET2 STERLING NG REF:002569/001076902483,37.630001,D,spend.shopping
1,40921,1,2022-11-04,"TRF/Forte Oil Supermarket, Orozo Road/FRM KALAT SOUL TO JAMES DANJUMA - 011",10026.879883,D,spend.shopping
2,40806,1,2022-11-30,TRF//FRM UDOAGWA MARTIN CHUKWULOBE TO Martin Chukwulobe Udoagwa- 305,400053.75,D,spend.shopping
3,40187,1,2022-05-28,POS/WEB PMT T OVERCOMER STORES 004053 2070328K NG,3500.0,D,spend.shopping
4,39509,1,2022-02-24,TRF//FRM OBAFEMI SUNDAY AMOS TO APATA COMPUTER VILLAGE STORES- 076,49446.878906,D,spend.shopping


In [10]:
# Persist the shuffled per-tag sample as parquet, written through pyarrow.
path: str = "../06_Transformers/my_data/training_data_2.parquet"
df_final.write_parquet(path, use_pyarrow=True)