In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
inflows = pd.read_parquet('data/ucsd-inflows.pqt')
outflows = pd.read_parquet('data/ucsd-outflows.pqt')

display(inflows.head(), outflows.head())

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.0,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-07-29,EXTERNAL_TRANSFER


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.6,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.0,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.0,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES


## WEEK 2 - Train Test Split

In [3]:
# WEEK 2

In [4]:
from sklearn.model_selection import train_test_split

test_size = 0.2 

inflows_train, inflows_test = train_test_split(inflows, test_size=test_size, random_state=42)
outflows_train, outflows_test = train_test_split(outflows, test_size=test_size, random_state=42)

In [5]:
# need to sample by consumer instead of transaction

In [6]:
# Get unique customer_ids
ids = outflows.prism_consumer_id.unique()

# Split customer_ids into training and testing sets
train_ids, test_ids = train_test_split(ids, test_size=0.25)

In [7]:
outflows_train = outflows[outflows['prism_consumer_id'].isin(train_ids)]
outflows_test = outflows[outflows['prism_consumer_id'].isin(test_ids)]

In [8]:
outflows_train.shape, outflows_test.shape

((1957406, 6), (640082, 6))

In [9]:
print(f'# of unique IDS: {len(outflows.prism_consumer_id.unique())}, # of unique train IDS: {len(outflows_train.prism_consumer_id.unique())}, # of unique test IDS: {len(outflows_test.prism_consumer_id.unique())}')

# of unique IDS: 2968, # of unique train IDS: 2226, # of unique test IDS: 742


In [10]:
outflows.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,prism_consumer_id,amount
count,2597488.0,2597488.0
mean,3084.22934,145.1264
std,1820.57135,1697.87473
min,0.0,0.0
25%,1369.0,9.63
50%,3245.0,24.26
75%,4733.0,66.36
max,5943.0,654853.21


In [11]:
outflows_train.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,prism_consumer_id,amount
count,1957406.0,1957406.0
mean,3118.28999,142.87479
std,1830.8066,1512.20332
min,0.0,0.0
25%,1376.0,9.71
50%,3275.0,24.3
75%,4798.0,66.42
max,5940.0,461421.0


In [12]:
outflows_test.describe().apply(lambda s: s.apply('{0:.5f}'.format))

Unnamed: 0,prism_consumer_id,amount
count,640082.0,640082.0
mean,2980.06997,152.01193
std,1784.88101,2169.18952
min,7.0,0.0
25%,1344.0,9.36
50%,3160.0,24.16
75%,4458.0,66.24
max,5943.0,654853.21


In [13]:
# include statistical tests to ensure distributions are similar

## WEEK 3 - Memo Cleaning

In [14]:
outflows.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.6,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.0,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.0,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES


In [15]:
outflows.category.unique()

array(['LOAN', 'ATM_CASH', 'FOOD_AND_BEVERAGES', 'ENTERTAINMENT',
       'GENERAL_MERCHANDISE', 'ESSENTIAL_SERVICES', 'GROCERIES',
       'EXTERNAL_TRANSFER', 'AUTOMOTIVE', 'UNCATEGORIZED',
       'CREDIT_CARD_PAYMENT', 'SELF_TRANSFER', 'PETS',
       'HEALTHCARE_MEDICAL', 'INSURANCE', 'ACCOUNT_FEES',
       'HOME_IMPROVEMENT', 'TRAVEL', 'MORTGAGE', 'OVERDRAFT', 'EDUCATION',
       'RENT', 'TAX', 'CHILD_DEPENDENTS', 'GIFTS_DONATIONS',
       'BILLS_UTILITIES', 'PAYCHECK', 'BNPL', 'AUTO_LOAN'], dtype=object)

In [16]:
print(outflows.loc[~(outflows.memo == outflows.category), 'memo'].values[501:1000])
starting_num = len(memos.unique())

['Amazon Prime'
 'PURCHASE AUTHORIZED ON 12/24 NAILS R US HENDERSON NV SXXXXXXXXXXXXXXX CARD XXXX'
 'Ace Hardware' 'Albertsons'
 'PURCHASE AUTHORIZED ON 08/14 SMITHS FOOD #434 XXXX N. HENDERSON NV PXXXXXXXXXXXXXXXXX CARD XXXX'
 'PURCHASE AUTHORIZED ON 12/25 SIRIYA THAI RESTAU HENDERSON NV SXXXXXXXXXXXXXXX CARD XXXX'
 'Albertsons' 'Albertsons' 'Sephora'
 'PURCHASE AUTHORIZED ON 05/20 Amazon Digit*2R0DF amzn.com/bill WA SXXXXXXXXXXXXXXX CARD XXXX'
 'PURCHASE AUTHORIZED ON 10/02 KATE SPADE OUTLET LAS VEGAS NV SXXXXXXXXXXXXXXX CARD XXXX'
 'PURCHASE AUTHORIZED ON 05/28 TST* TACOTARIAN - LAS VEGAS NV SXXXXXXXXXXXXXXX CARD XXXX'
 'Chick-fil-A'
 'PURCHASE AUTHORIZED ON 09/17 NAILS R US HENDERSON NV SXXXXXXXXXXXXXXX CARD XXXX'
 'Albertsons'
 'RECURRING PAYMENT INTL AUTHORIZED ON 01/06 Brown Thomas Dublin IRL SXXXXXXXXXXXXXXX CARD XXXX'
 'Albertsons'
 'PURCHASE AUTHORIZED ON 04/01 TST* SETTEBELLO PI NV SXXXXXXXXXXXXXXX CARD XXXX'
 'Amazon'
 'PURCHASE AUTHORIZED ON 09/11 STAR NURSERY XXXX LAS VEG

NameError: name 'memos' is not defined

In [49]:
og_memos = outflows.loc[~(outflows.memo == outflows.category), 'memo']
memos = outflows.loc[~(outflows.memo == outflows.category), 'memo']

In [63]:
memos = memos.str.lower() \
    .str.replace(r'\b\w*x{2,}\w*\b', '', regex=True) \
    .str.replace(r'\b(0[1-9]|1[0-2])\/[0-9]{2}\b', '', regex=True) \
    .str.replace(r"[,'*#_-]", '', regex=True) \
    .str.replace(r'~', '', regex=True) \
    .str.replace('purchase.* authorized on', '', regex=True) \
    .str.replace('tst', '') \
    .str.replace('checkcard', '') \
    .str.strip()

In [65]:
memos

2                              casa del rio  exp fairlawn oh
4                                         buffalo wild wings
6                                                  oculus ca
7                                      los girasoles stow oh
8                                        buzzis laundry 1 oh
                                 ...                        
2597457    debit card withdrawal purchaseamazon primeti40...
2597462    pos withdrawalaz lot quiktrip   e indian schoo...
2597465    pos withdrawalwalmart   e mckellips rd mesa az...
2597468    withdrawal salt river projetype: online pmt co...
2597476    pos withdrawalfrysfooddrg 1 435 s. e mesa az  ...
Name: memo, Length: 1306452, dtype: object

In [66]:
memos[memos.str.contains('@')].values[:1000]

array(['gsuitebesmer. cc@google.comca  recurring',
       'purchase  scbjj scbjj dknakagawa@gmca  recurring',
       'gsuitebesmer. cc@google.comca  recurring',
       'purchase  scbjj scbjj dknakagawa@gmca  recurring',
       'gsuitebesmer. cc@google.comca  recurring',
       'scbjj scbjj dknakagawa@gmca  recurring',
       'scbjj scbjj dknakagawa@gmca  recurring',
       'scbjj scbjj dknakagawa@gmca  recurring',
       'gsuitebesmer. cc@google.comca', 'autoplicity ll support@autop',
       'merchant bnkcd | fee / | judith wiswell atty @',
       'merchant bnkcd | fee / | judith wiswell atty @',
       'merchant bnkcd | fee / | judith wiswell atty @',
       'merchant bnkcd | fee / | judith wiswell atty @',
       'merchant bnkcd | discount / | judith wiswell atty @',
       'merchant bnkcd | fee / | judith wiswell atty @',
       'merchant bnkcd | discount / | judith wiswell atty @',
       'merchant bnkcd | fee / | judith wiswell atty @',
       'merchant bnkcd | fee / | judith wisw

In [64]:
outflows.loc[~(outflows.memo == outflows.category), 'memo'].unique().__len__(), memos.unique().__len__(), 

(528766, 320731)

In [None]:
# steps:
## did more complex preprocessing first 
    # lemmatizer? (might not be good for this because it could stem words it shouldn't
    # same thing for stop words
    # remove dates (regex search mm/yy) and addresses 
## simple preprocessing
    # lowercase 
    # remove punctuation (,-*#_')
    # remove XXXX, even amount of X's
    # remove purchase authorized on
    # remove purchase, checkcard
# should we preprocess memos that are the same as category? -- no


In [24]:
with open('memo_original.txt', 'w') as f:
    inspect = outflows.memo.unique()
    f.write(',\n'.join([str(x) for x in sorted(inspect)]))