In [1]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
import seaborn as sns
import time
import plotly.express as px

In [2]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

import torch
print("CUDA available:", torch.cuda.is_available())

print("GPU in use:", torch.cuda.current_device())
torch.cuda.empty_cache()

2024-11-28 20:21:07.531080: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-28 20:21:07.531149: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-28 20:21:07.533076: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-28 20:21:07.545451: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  1
CUDA available: True
GPU in use: 0


# Reading Data + EDA

In [24]:
inflows = pd.read_parquet('../../data/ucsd-inflows.pqt')
inflows.category = inflows.category.apply(lambda x: 'PAYCHECK' if x == 'PAYCHECK_PLACEHOLDER' else x)
inflows.posted_date = pd.to_datetime(inflows['posted_date'])

In [25]:
inflows.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.0,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-07-29,EXTERNAL_TRANSFER


In [26]:
# Check values add up

# inflows['category_check'] = inflows.category.apply(lambda x: 'PAYCHECK' if x == 'PAYCHECK_PLACEHOLDER' else x)
# inflows.category.value_counts().loc[['PAYCHECK', 'PAYCHECK_PLACEHOLDER']].sum() == inflows.category_check.value_counts().loc[['PAYCHECK']].values[0]

#### Transactions Per Consumer

In [27]:
inflows_consumer_cnt = inflows.groupby('prism_consumer_id').count()[['category']]
inflows_consumer_cnt.head()

Unnamed: 0_level_0,category
prism_consumer_id,Unnamed: 1_level_1
0,91
2,113
4,152
7,129
9,375


In [28]:
print(f'Average # of transactions per consumer: {inflows_consumer_cnt.category.mean():.4f}')

Average # of transactions per consumer: 172.5336


#### Sum of Inflows by Consumer

In [29]:
inflows_consumer_sum = inflows.groupby('prism_consumer_id')[['amount']].sum()

inflows_consumer_sum.head()

Unnamed: 0_level_0,amount
prism_consumer_id,Unnamed: 1_level_1
0,110312.43
2,349639.03
4,462557.9
7,250214.09
9,215342.01


In [30]:
print(f'Average inflow sum per consumer: ${inflows_consumer_sum.amount.mean():.4f}')

Average inflow sum per consumer: $126760.4207


#### Income Estimates

In [31]:
inflows.category.value_counts()

category
EXTERNAL_TRANSFER        156533
SELF_TRANSFER            110437
DEPOSIT                   61345
PAYCHECK                  59225
MISCELLANEOUS             55648
REFUND                    23220
INVESTMENT_INCOME         17325
SMALL_DOLLAR_ADVANCE      13621
OTHER_BENEFITS             7708
TAX                        3405
LOAN                       2513
UNEMPLOYMENT_BENEFITS      1961
INSURANCE                   174
Name: count, dtype: int64

In [32]:
# relevant income categories
income_cats = ['EXTERNAL_TRANSFER', 'DEPOSIT', 'PAYCHECK', 'INVESTMENT_INCOME', 'OTHER_BENEFITS', 'UNEMPLOYMENT_BENEFITS']
inflows_rel = inflows[inflows.category.isin(income_cats)]

In [33]:
print(f'Originally {inflows.shape[0]} records, Now {inflows_rel.shape[0]} records')

Originally 513115 records, Now 304097 records


In [34]:
inflows_rel_consumer_sum = inflows_rel.groupby('prism_consumer_id')[['amount']].sum()

print(f'Average rough income sum per consumer: ${inflows_rel_consumer_sum.amount.mean():.4f}')
print(f'Median rough income sum per consumer: ${inflows_rel_consumer_sum.amount.median():.4f}')

Average rough income sum per consumer: $87154.2950
Median rough income sum per consumer: $37786.2100


In [35]:
sunburst_income = px.sunburst(
    inflows_rel,
    path=["category"], 
    values="amount",
    title="Major Sources of Income",
    width=400, height=400
)

sunburst_income.write_html('../../result/sunburst_income_sources.html')
sunburst_income.show()


# Recognizing Regularity with Timing and Amount

In [36]:
inflows_rel.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-10-25,EXTERNAL_TRANSFER
3,0,acc_0,EXTERNAL_TRANSFER,277.0,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-07-29,EXTERNAL_TRANSFER
5,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-12-23,EXTERNAL_TRANSFER


In [37]:
inflows_rel.memo.nunique()

7

In [93]:
def inflows_freq(data, consumer_id):
    """
    Identifies recurring inflows for a specific consumer in a dataset.

    Parameters:
    - dataset (pd.DataFrame): A dataframe containing consumer inflows with columns:
        ['consumer_id', 'account_id', 'memo', 'amount', 'posted_date']
    - consumer_id (int): The ID of the consumer to analyze.

    Returns:
    - pd.DataFrame: A dataframe with the following columns for recurring inflows:
        ['memo', 'avg_amount', 'frequency']
    """
    # Filter dataset for the specific consumer
    consumer_data = data[data['prism_consumer_id'] == consumer_id]
    
    # Ensure posted_date is datetime
    # consumer_data['posted_date'] = pd.to_datetime(consumer_data['posted_date'])
    
    # Group by memo to analyze recurring patterns for each category
    recurring_transactions = []
    reoccurrences = {}
    
    for memo, group in consumer_data.groupby('memo'):
        group = group.sort_values('posted_date')
        
        # Calculate time differences between consecutive transactions
        group['time_diff'] = group['posted_date'].diff().dt.days
        
        # Define frequency tolerances
        frequency_mapping = {
            'weekly': (6, 8),
            '2 weeks': (12, 16),
            'monthly': (27, 34)
        }
        
        # Check for recurring patterns
        for freq, (lower, upper) in frequency_mapping.items():
            freq_group = group[(group['time_diff'] >= lower) & (group['time_diff'] <= upper)]
            
            if not freq_group.empty:
                # Calculate average amount
                avg_amount = freq_group['amount'].mean()
                
                # Check if all amounts fall within 20% of the average
                within_20_percent = freq_group['amount'].apply(
                    lambda x: abs(x - avg_amount) <= 0.2 * avg_amount
                ).all()
                
                if within_20_percent:
                    recurring_transactions.append({
                        'memo': memo,
                        'avg_amount': round(avg_amount, 2),
                        'frequency': freq
                    })
                    # Add indices of recurring transactions to the dictionary
                    reoccurrences[memo] = freq_group.index.tolist()
                    display(freq_group)

    
    # Create output dataframe
    recurring_df = pd.DataFrame(recurring_transactions)
    
    return recurring_df, reoccurrences

In [94]:
curr_id = inflows_rel.prism_consumer_id.sample(1).values[0]
curr_id = 100

inflows_reg, inflows_rec = inflows_freq(inflows_rel, curr_id)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
8952,100,acc_284,DEPOSIT,126.47,2020-07-24,DEPOSIT,7.0


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
9001,100,acc_283,DEPOSIT,5336.49,2021-04-26,DEPOSIT,14.0


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
8829,100,acc_283,INVESTMENT_INCOME,0.04,2020-09-25,INVESTMENT_INCOME,8.0


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
8965,100,acc_288,INVESTMENT_INCOME,0.02,2021-04-30,INVESTMENT_INCOME,30.0
9076,100,acc_288,INVESTMENT_INCOME,0.03,2021-05-28,INVESTMENT_INCOME,28.0


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
8898,100,acc_283,OTHER_BENEFITS,2286.96,2020-07-30,OTHER_BENEFITS,30.0
8940,100,acc_283,OTHER_BENEFITS,2286.96,2020-08-31,OTHER_BENEFITS,32.0
8993,100,acc_283,OTHER_BENEFITS,2286.96,2020-09-30,OTHER_BENEFITS,30.0
8969,100,acc_283,OTHER_BENEFITS,2286.96,2020-10-29,OTHER_BENEFITS,29.0
8851,100,acc_283,OTHER_BENEFITS,2286.96,2020-11-30,OTHER_BENEFITS,32.0
8856,100,acc_283,OTHER_BENEFITS,2316.18,2020-12-30,OTHER_BENEFITS,30.0
8943,100,acc_283,OTHER_BENEFITS,2316.18,2021-01-29,OTHER_BENEFITS,30.0
8935,100,acc_283,OTHER_BENEFITS,2316.18,2021-02-26,OTHER_BENEFITS,28.0
8944,100,acc_283,OTHER_BENEFITS,2316.18,2021-03-31,OTHER_BENEFITS,33.0
9057,100,acc_283,OTHER_BENEFITS,2316.18,2021-04-29,OTHER_BENEFITS,29.0


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,time_diff
8834,100,acc_284,PAYCHECK,166.03,2021-01-14,PAYCHECK,28.0


In [95]:
inflows_reg

Unnamed: 0,memo,avg_amount,frequency
0,DEPOSIT,126.47,weekly
1,DEPOSIT,5336.49,2 weeks
2,INVESTMENT_INCOME,0.04,weekly
3,INVESTMENT_INCOME,0.02,monthly
4,OTHER_BENEFITS,2298.64,monthly
5,PAYCHECK,166.03,monthly


In [96]:
inflows_rec

{'DEPOSIT': [9001],
 'INVESTMENT_INCOME': [8965, 9076],
 'OTHER_BENEFITS': [8898,
  8940,
  8993,
  8969,
  8851,
  8856,
  8943,
  8935,
  8944,
  9057,
  8836],
 'PAYCHECK': [8834]}

In [97]:
inflows.iloc[inflows_rec['OTHER_BENEFITS']]

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
8898,100,acc_283,OTHER_BENEFITS,2286.96,2020-07-30,OTHER_BENEFITS
8940,100,acc_283,OTHER_BENEFITS,2286.96,2020-08-31,OTHER_BENEFITS
8993,100,acc_283,OTHER_BENEFITS,2286.96,2020-09-30,OTHER_BENEFITS
8969,100,acc_283,OTHER_BENEFITS,2286.96,2020-10-29,OTHER_BENEFITS
8851,100,acc_283,OTHER_BENEFITS,2286.96,2020-11-30,OTHER_BENEFITS
8856,100,acc_283,OTHER_BENEFITS,2316.18,2020-12-30,OTHER_BENEFITS
8943,100,acc_283,OTHER_BENEFITS,2316.18,2021-01-29,OTHER_BENEFITS
8935,100,acc_283,OTHER_BENEFITS,2316.18,2021-02-26,OTHER_BENEFITS
8944,100,acc_283,OTHER_BENEFITS,2316.18,2021-03-31,OTHER_BENEFITS
9057,100,acc_283,OTHER_BENEFITS,2316.18,2021-04-29,OTHER_BENEFITS


In [90]:
for memo, group in inflows_rel.groupby('memo'):
    print(memo)
    display(group)

DEPOSIT


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
23,0,acc_0,DEPOSIT,537.5000,2022-08-08,DEPOSIT
49,0,acc_0,DEPOSIT,2313.2000,2022-05-06,DEPOSIT
52,0,acc_0,DEPOSIT,524.4800,2022-09-12,DEPOSIT
91,2,acc_3,DEPOSIT,19.0000,2021-04-08,DEPOSIT
101,2,acc_3,DEPOSIT,320.0000,2021-10-25,DEPOSIT
...,...,...,...,...,...,...
512806,5940,acc_9523,DEPOSIT,225.0000,2023-01-24,DEPOSIT
512813,5940,acc_9523,DEPOSIT,20.0000,2023-01-31,DEPOSIT
512815,5940,acc_9523,DEPOSIT,275.0000,2023-01-31,DEPOSIT
512876,5941,acc_9524,DEPOSIT,130.0000,2022-08-18,DEPOSIT


EXTERNAL_TRANSFER


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
1,0,acc_0,EXTERNAL_TRANSFER,100.0000,2022-10-25,EXTERNAL_TRANSFER
3,0,acc_0,EXTERNAL_TRANSFER,277.0000,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.0000,2022-07-29,EXTERNAL_TRANSFER
5,0,acc_0,EXTERNAL_TRANSFER,100.0000,2022-12-23,EXTERNAL_TRANSFER
8,0,acc_0,EXTERNAL_TRANSFER,75.0000,2022-03-23,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
513110,5941,acc_9524,EXTERNAL_TRANSFER,8.6600,2023-01-21,EXTERNAL_TRANSFER
513111,5941,acc_9524,EXTERNAL_TRANSFER,267.1300,2023-01-23,EXTERNAL_TRANSFER
513112,5941,acc_9524,EXTERNAL_TRANSFER,2.0000,2023-01-24,EXTERNAL_TRANSFER
513113,5941,acc_9524,EXTERNAL_TRANSFER,207.1600,2023-01-24,EXTERNAL_TRANSFER


INVESTMENT_INCOME


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
7,0,acc_1,INVESTMENT_INCOME,0.0400,2023-01-20,INVESTMENT_INCOME
11,0,acc_1,INVESTMENT_INCOME,0.0400,2022-05-18,INVESTMENT_INCOME
14,0,acc_1,INVESTMENT_INCOME,0.0400,2022-02-17,INVESTMENT_INCOME
21,0,acc_1,INVESTMENT_INCOME,0.0400,2022-08-17,INVESTMENT_INCOME
43,0,acc_1,INVESTMENT_INCOME,0.0400,2022-03-17,INVESTMENT_INCOME
...,...,...,...,...,...,...
512757,5940,acc_9523,INVESTMENT_INCOME,0.0300,2022-12-31,INVESTMENT_INCOME
512801,5940,acc_9523,INVESTMENT_INCOME,288.3600,2023-01-20,INVESTMENT_INCOME
512814,5940,acc_9523,INVESTMENT_INCOME,0.0400,2023-01-31,INVESTMENT_INCOME
512817,5940,acc_9523,INVESTMENT_INCOME,228.7300,2023-02-01,INVESTMENT_INCOME


OTHER_BENEFITS


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
485,9,acc_18,OTHER_BENEFITS,1763.7100,2021-04-01,OTHER_BENEFITS
517,9,acc_18,OTHER_BENEFITS,1117.4300,2020-07-31,OTHER_BENEFITS
557,9,acc_18,OTHER_BENEFITS,1117.4300,2020-10-01,OTHER_BENEFITS
571,9,acc_18,OTHER_BENEFITS,631.6700,2020-12-31,OTHER_BENEFITS
584,9,acc_18,OTHER_BENEFITS,1763.7100,2021-04-30,OTHER_BENEFITS
...,...,...,...,...,...,...
508306,5892,acc_9475,OTHER_BENEFITS,1678.0000,2022-11-17,OTHER_BENEFITS
508313,5892,acc_9475,OTHER_BENEFITS,152.6400,2022-12-01,OTHER_BENEFITS
508330,5892,acc_9475,OTHER_BENEFITS,165.9200,2022-12-30,OTHER_BENEFITS
510002,5910,acc_9493,OTHER_BENEFITS,5172.5000,2021-04-07,OTHER_BENEFITS


PAYCHECK


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.0200,2022-03-18,PAYCHECK
6,0,acc_0,PAYCHECK,2547.3500,2023-02-03,PAYCHECK
16,0,acc_0,PAYCHECK,2528.4000,2022-10-20,PAYCHECK
17,0,acc_0,PAYCHECK,2547.3500,2023-01-20,PAYCHECK
25,0,acc_0,PAYCHECK,2331.7100,2022-05-05,PAYCHECK
...,...,...,...,...,...,...
511992,5929,acc_9512,PAYCHECK,728.8900,2023-01-24,PAYCHECK
512713,5940,acc_9523,PAYCHECK,30.0000,2022-11-30,PAYCHECK
512750,5940,acc_9523,PAYCHECK,30.0000,2022-12-28,PAYCHECK
512782,5940,acc_9523,PAYCHECK,40.0000,2023-01-11,PAYCHECK


PAYCHECK_PLACEHOLDER


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
48,0,acc_0,PAYCHECK_PLACEHOLDER,143.0000,2022-07-11,PAYCHECK
272,4,acc_5,PAYCHECK_PLACEHOLDER,15.9900,2022-03-25,PAYCHECK
345,4,acc_5,PAYCHECK_PLACEHOLDER,1110.5100,2022-03-11,PAYCHECK
364,7,acc_14,PAYCHECK_PLACEHOLDER,5792.0800,2022-10-06,PAYCHECK
367,7,acc_14,PAYCHECK_PLACEHOLDER,5919.3400,2023-02-23,PAYCHECK
...,...,...,...,...,...,...
512777,5940,acc_9523,PAYCHECK_PLACEHOLDER,2235.7100,2023-01-06,PAYCHECK
512786,5940,acc_9523,PAYCHECK_PLACEHOLDER,100.0000,2023-01-11,PAYCHECK
512800,5940,acc_9523,PAYCHECK_PLACEHOLDER,1138.8700,2023-01-20,PAYCHECK
512807,5940,acc_9523,PAYCHECK_PLACEHOLDER,150.0000,2023-01-25,PAYCHECK


UNEMPLOYMENT_BENEFITS


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
507,9,acc_18,UNEMPLOYMENT_BENEFITS,535.0000,2021-02-03,UNEMPLOYMENT_BENEFITS
511,9,acc_18,UNEMPLOYMENT_BENEFITS,535.0000,2021-02-24,UNEMPLOYMENT_BENEFITS
525,9,acc_18,UNEMPLOYMENT_BENEFITS,2140.0000,2021-05-05,UNEMPLOYMENT_BENEFITS
537,9,acc_18,UNEMPLOYMENT_BENEFITS,300.0000,2021-02-24,UNEMPLOYMENT_BENEFITS
546,9,acc_18,UNEMPLOYMENT_BENEFITS,535.0000,2021-01-20,UNEMPLOYMENT_BENEFITS
...,...,...,...,...,...,...
508429,5894,acc_9477,UNEMPLOYMENT_BENEFITS,300.0000,2021-08-16,UNEMPLOYMENT_BENEFITS
508432,5894,acc_9477,UNEMPLOYMENT_BENEFITS,119.0000,2021-08-25,UNEMPLOYMENT_BENEFITS
508434,5894,acc_9477,UNEMPLOYMENT_BENEFITS,300.0000,2021-08-30,UNEMPLOYMENT_BENEFITS
508435,5894,acc_9477,UNEMPLOYMENT_BENEFITS,202.0000,2021-09-08,UNEMPLOYMENT_BENEFITS
