# Prism Data

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
inflows = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-inflows.pqt')

In [3]:
outflows = pd.read_parquet('/uss/hdsi-prismdata/q1-ucsd-outflows.pqt')

In [4]:

inflows.head(5)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.0,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.0,2022-07-29,EXTERNAL_TRANSFER


In [5]:
outflows.head(5)

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.6,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.0,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.0,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES


## Data Exploration

*inflows*

In [6]:
inflows['prism_consumer_id'].nunique()

2974

In [7]:
inflows['memo'].value_counts()

memo
EXTERNAL_TRANSFER        156533
SELF_TRANSFER            110437
DEPOSIT                   61345
MISCELLANEOUS             55648
PAYCHECK                  33138
PAYCHECK_PLACEHOLDER      26087
REFUND                    23220
INVESTMENT_INCOME         17325
SMALL_DOLLAR_ADVANCE      13621
OTHER_BENEFITS             7708
TAX                        3405
LOAN                       2513
UNEMPLOYMENT_BENEFITS      1961
INSURANCE                   174
Name: count, dtype: int64

In [8]:
inflows[inflows['memo'] == inflows['category']]

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.00,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-07-29,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
513110,5941,acc_9524,EXTERNAL_TRANSFER,8.66,2023-01-21,EXTERNAL_TRANSFER
513111,5941,acc_9524,EXTERNAL_TRANSFER,267.13,2023-01-23,EXTERNAL_TRANSFER
513112,5941,acc_9524,EXTERNAL_TRANSFER,2.00,2023-01-24,EXTERNAL_TRANSFER
513113,5941,acc_9524,EXTERNAL_TRANSFER,207.16,2023-01-24,EXTERNAL_TRANSFER


In [9]:
len(inflows)

513115

In [10]:
inflows['memo'].nunique()

14

In [11]:

inflows['category'].nunique()

14

**Number of unique merchants in each category**

In [12]:
 inflows[['category','memo']].groupby('category').nunique()

Unnamed: 0_level_0,memo
category,Unnamed: 1_level_1
DEPOSIT,1
EXTERNAL_TRANSFER,1
INSURANCE,1
INVESTMENT_INCOME,1
LOAN,1
MISCELLANEOUS,1
OTHER_BENEFITS,1
PAYCHECK,1
PAYCHECK_PLACEHOLDER,1
REFUND,1


*outflows*

In [13]:
outflows['memo'].value_counts()

memo
EXTERNAL_TRANSFER                                                                    320998
AUTOMOTIVE                                                                           208579
ATM_CASH                                                                             117651
UNCATEGORIZED                                                                        117409
LOAN                                                                                  90945
                                                                                      ...  
Par Gators Dockside -                                                                     1
Chilis Mandarin                                                                           1
Chilis Bay Meadows                                                                        1
Southside Liquor                                                                          1
POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS RD MESA AZ  Card 15 #XXXX  MC

In [14]:
list(outflows['category'].unique())

['LOAN',
 'ATM_CASH',
 'FOOD_AND_BEVERAGES',
 'ENTERTAINMENT',
 'GENERAL_MERCHANDISE',
 'ESSENTIAL_SERVICES',
 'GROCERIES',
 'EXTERNAL_TRANSFER',
 'AUTOMOTIVE',
 'UNCATEGORIZED',
 'CREDIT_CARD_PAYMENT',
 'SELF_TRANSFER',
 'PETS',
 'HEALTHCARE_MEDICAL',
 'INSURANCE',
 'ACCOUNT_FEES',
 'HOME_IMPROVEMENT',
 'TRAVEL',
 'MORTGAGE',
 'OVERDRAFT',
 'EDUCATION',
 'RENT',
 'TAX',
 'CHILD_DEPENDENTS',
 'GIFTS_DONATIONS',
 'BILLS_UTILITIES',
 'PAYCHECK',
 'BNPL',
 'AUTO_LOAN']

In [173]:
outflows[outflows['category'] == 'BILLS_UTILITIES']['memo'].unique()

array(['BILLS_UTILITIES'], dtype=object)

**Uncategorized Category**

In [174]:
outflows[outflows['category'] == 'UNCATEGORIZED']

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
27,0,acc_0,UNCATEGORIZED,310.64,2022-02-22,UNCATEGORIZED
65,0,acc_0,UNCATEGORIZED,150.00,2022-03-11,UNCATEGORIZED
75,0,acc_0,UNCATEGORIZED,2.20,2022-08-01,UNCATEGORIZED
84,0,acc_0,UNCATEGORIZED,405.65,2022-03-29,UNCATEGORIZED
119,0,acc_0,UNCATEGORIZED,547.84,2022-08-23,UNCATEGORIZED
...,...,...,...,...,...,...
2597446,5941,acc_9524,UNCATEGORIZED,251.51,2023-01-13,UNCATEGORIZED
2597463,5941,acc_9524,UNCATEGORIZED,804.27,2023-01-18,UNCATEGORIZED
2597467,5941,acc_9524,UNCATEGORIZED,39.64,2023-01-19,UNCATEGORIZED
2597472,5941,acc_9524,UNCATEGORIZED,0.00,2023-01-20,UNCATEGORIZED


In [175]:
outflows[['category','memo']].groupby('category').nunique()

Unnamed: 0_level_0,memo
category,Unnamed: 1_level_1
ACCOUNT_FEES,1
ATM_CASH,1
AUTOMOTIVE,1
AUTO_LOAN,1
BILLS_UTILITIES,1
BNPL,1
CHILD_DEPENDENTS,1
CREDIT_CARD_PAYMENT,1
EDUCATION,2122
ENTERTAINMENT,1


In [176]:
outflows['memo'].value_counts()

memo
EXTERNAL_TRANSFER                                                                    320998
AUTOMOTIVE                                                                           208579
ATM_CASH                                                                             117651
UNCATEGORIZED                                                                        117409
LOAN                                                                                  90945
                                                                                      ...  
Par Gators Dockside -                                                                     1
Chilis Mandarin                                                                           1
Chilis Bay Meadows                                                                        1
Southside Liquor                                                                          1
POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS RD MESA AZ  Card 15 #XXXX  MC

In [177]:
list(outflows['category'].unique())

['LOAN',
 'ATM_CASH',
 'FOOD_AND_BEVERAGES',
 'ENTERTAINMENT',
 'GENERAL_MERCHANDISE',
 'ESSENTIAL_SERVICES',
 'GROCERIES',
 'EXTERNAL_TRANSFER',
 'AUTOMOTIVE',
 'UNCATEGORIZED',
 'CREDIT_CARD_PAYMENT',
 'SELF_TRANSFER',
 'PETS',
 'HEALTHCARE_MEDICAL',
 'INSURANCE',
 'ACCOUNT_FEES',
 'HOME_IMPROVEMENT',
 'TRAVEL',
 'MORTGAGE',
 'OVERDRAFT',
 'EDUCATION',
 'RENT',
 'TAX',
 'CHILD_DEPENDENTS',
 'GIFTS_DONATIONS',
 'BILLS_UTILITIES',
 'PAYCHECK',
 'BNPL',
 'AUTO_LOAN']

In [178]:

outflows[outflows['category'] == 'BILLS_UTILITIES']['memo'].unique()

array(['BILLS_UTILITIES'], dtype=object)

**Uncategorized Transactions**

In [179]:

outflows[outflows['category'] == 'UNCATEGORIZED']

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
27,0,acc_0,UNCATEGORIZED,310.64,2022-02-22,UNCATEGORIZED
65,0,acc_0,UNCATEGORIZED,150.00,2022-03-11,UNCATEGORIZED
75,0,acc_0,UNCATEGORIZED,2.20,2022-08-01,UNCATEGORIZED
84,0,acc_0,UNCATEGORIZED,405.65,2022-03-29,UNCATEGORIZED
119,0,acc_0,UNCATEGORIZED,547.84,2022-08-23,UNCATEGORIZED
...,...,...,...,...,...,...
2597446,5941,acc_9524,UNCATEGORIZED,251.51,2023-01-13,UNCATEGORIZED
2597463,5941,acc_9524,UNCATEGORIZED,804.27,2023-01-18,UNCATEGORIZED
2597467,5941,acc_9524,UNCATEGORIZED,39.64,2023-01-19,UNCATEGORIZED
2597472,5941,acc_9524,UNCATEGORIZED,0.00,2023-01-20,UNCATEGORIZED


In [180]:
outflows[['category','memo']].groupby('category').nunique()

Unnamed: 0_level_0,memo
category,Unnamed: 1_level_1
ACCOUNT_FEES,1
ATM_CASH,1
AUTOMOTIVE,1
AUTO_LOAN,1
BILLS_UTILITIES,1
BNPL,1
CHILD_DEPENDENTS,1
CREDIT_CARD_PAYMENT,1
EDUCATION,2122
ENTERTAINMENT,1


In [181]:
# double checking 

# outflows[outflows['category']=='TAX']

In [182]:
merchant_cat = ['EDUCATION', 'FOOD_AND_BEVERAGES', 'GENERAL_MERCHANDISE', 'GROCERIES', 'MORTGAGE','OVERDRAFT', 'PETS', 'RENT', 'TRAVEL']

In [183]:
merchant_df = outflows[outflows['category'].isin(merchant_cat)][['category','memo']].reset_index()
merchant_df

Unnamed: 0,index,category,memo
0,2,FOOD_AND_BEVERAGES,TST* Casa Del Rio - Exp Fairlawn OH 09/24
1,4,FOOD_AND_BEVERAGES,Buffalo Wild Wings
2,6,GENERAL_MERCHANDISE,Oculus CA 04/16
3,7,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW OH 03/08
4,8,GENERAL_MERCHANDISE,BUZZIS LAUNDRY 1 OH 03/28
...,...,...,...
1306447,2597457,GENERAL_MERCHANDISE,DEBIT CARD WITHDRAWAL PURCHASEAmazon Prime*TI4...
1306448,2597462,EDUCATION,POS WITHDRAWALAZ LOT QUIKTRIP XXXX XXXX E INDI...
1306449,2597465,FOOD_AND_BEVERAGES,POS WITHDRAWALWAL-MART #XXXX XXXX E MCKELLIPS ...
1306450,2597468,FOOD_AND_BEVERAGES,WITHDRAWAL Salt River ProjeTYPE: ONLINE PMT CO...


In [184]:
merchant_df.groupby(['category','memo']).count().sort_values(by=['category','index'], ascending=False).groupby('category').head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,index
category,memo,Unnamed: 2_level_1
TRAVEL,Uber,7989
TRAVEL,Uber Eats,5691
TRAVEL,Lyft,4648
RENT,SOUTHERN INVESTO WEB PMTS,47
RENT,NEWREZ-SHELLPOIN WEB PMTS XXXXXXXXXX WEB ID: XXXXXXXXXX,37
RENT,CHECKCARD XXXX GRACELAND RENTAL XXX-XXXXXXX KY XXXXXXXXXXXXXXXXXXXXXXX,30
PETS,PetSmart,1450
PETS,CHEWY.COM,255
PETS,Pet Supplies Plus,228
OVERDRAFT,Overdraft Item Fee,374


## Chosen Categories(5)

- **FOOD_AND_BEVERAGES**
- **GENERAL_MERCHANDISE**
- **GROCERIES**
- **PETS**
- **TRAVEL**

In [185]:
chosen_cat = ['FOOD_AND_BEVERAGES', 'GENERAL_MERCHANDISE', 'GROCERIES', 'PETS', 'TRAVEL']

In [186]:
chosen_df = outflows[outflows['category'].isin(chosen_cat)][['category','memo']].reset_index()
chosen_df.groupby(['category','memo']).count().sort_values(by=['category','index'], ascending=False).groupby('category').head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,index
category,memo,Unnamed: 2_level_1
TRAVEL,Uber,7989
TRAVEL,Uber Eats,5691
TRAVEL,Lyft,4648
PETS,PetSmart,1450
PETS,CHEWY.COM,255
PETS,Pet Supplies Plus,228
GROCERIES,Walmart,31619
GROCERIES,Kroger,8423
GROCERIES,Target,8343
GENERAL_MERCHANDISE,Amazon,31725


 ***Most Common Merchants by Category (Top 3)***

- **`TRAVEL`** :`Uber, Uber Eats, Lyft`
-  **`PETS`**  : `PetSmart, CHEWY.COM, Pet Supplies Plus`
- **`GROCERIES`** : `Walmart, Kroger, Target`
- **`GENERAL_MERCHANDISE`**: `Amazon, 7-Eleven, Circle K`
- **`FOOD_AND_BEVERAGES`** : `McDonald's, Starbucks, Chick-fil-a`

## Train-Test Split 

In [187]:
from sklearn.model_selection import train_test_split

In [188]:
inflows

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.00,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-07-29,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
513110,5941,acc_9524,EXTERNAL_TRANSFER,8.66,2023-01-21,EXTERNAL_TRANSFER
513111,5941,acc_9524,EXTERNAL_TRANSFER,267.13,2023-01-23,EXTERNAL_TRANSFER
513112,5941,acc_9524,EXTERNAL_TRANSFER,2.00,2023-01-24,EXTERNAL_TRANSFER
513113,5941,acc_9524,EXTERNAL_TRANSFER,207.16,2023-01-24,EXTERNAL_TRANSFER


In [189]:
inflows_consumers = inflows['prism_consumer_id'].unique()
inflows_consumers

array([   0,    2,    4, ..., 5939, 5940, 5941])

In [190]:
in_train_users, in_test_users = train_test_split(inflows_consumers, test_size=0.2)#, random_state=42)
len(in_train_users), len(in_test_users)

(2379, 595)

In [191]:
595/2379

0.25010508617065996

In [192]:
in_train_df = inflows[inflows['prism_consumer_id'].isin(in_train_users)]
in_test_df = inflows[inflows['prism_consumer_id'].isin(in_test_users)]

In [193]:
in_train_df

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,PAYCHECK,2477.02,2022-03-18,PAYCHECK
1,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-10-25,EXTERNAL_TRANSFER
2,0,acc_0,MISCELLANEOUS,6.29,2022-08-26,MISCELLANEOUS
3,0,acc_0,EXTERNAL_TRANSFER,277.00,2022-06-03,EXTERNAL_TRANSFER
4,0,acc_0,EXTERNAL_TRANSFER,100.00,2022-07-29,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
513110,5941,acc_9524,EXTERNAL_TRANSFER,8.66,2023-01-21,EXTERNAL_TRANSFER
513111,5941,acc_9524,EXTERNAL_TRANSFER,267.13,2023-01-23,EXTERNAL_TRANSFER
513112,5941,acc_9524,EXTERNAL_TRANSFER,2.00,2023-01-24,EXTERNAL_TRANSFER
513113,5941,acc_9524,EXTERNAL_TRANSFER,207.16,2023-01-24,EXTERNAL_TRANSFER


In [194]:
in_test_df

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
893,12,acc_27,EXTERNAL_TRANSFER,300.00,2021-04-14,EXTERNAL_TRANSFER
894,12,acc_28,DEPOSIT,1952.52,2020-12-15,DEPOSIT
895,12,acc_29,INVESTMENT_INCOME,0.32,2021-03-31,INVESTMENT_INCOME
896,12,acc_28,EXTERNAL_TRANSFER,2000.00,2021-05-07,EXTERNAL_TRANSFER
897,12,acc_27,EXTERNAL_TRANSFER,300.00,2021-01-19,EXTERNAL_TRANSFER
...,...,...,...,...,...,...
509728,5903,acc_9486,REFUND,24.73,2023-01-27,REFUND
509729,5903,acc_9486,MISCELLANEOUS,175.17,2023-02-01,MISCELLANEOUS
509730,5903,acc_9486,MISCELLANEOUS,816.88,2023-02-01,MISCELLANEOUS
509731,5903,acc_9486,EXTERNAL_TRANSFER,407.74,2023-02-03,EXTERNAL_TRANSFER


In [195]:
len(in_train_df), len(in_test_df)

(414496, 98619)

In [196]:
99026/414089

0.2391418269985438

In [197]:
in_train_df.groupby('prism_consumer_id').size().mean(), in_test_df.groupby('prism_consumer_id').size().mean()

(174.23118957545188, 165.74621848739497)

In [198]:
in_train_df['amount'].mean(), in_test_df['amount'].mean()

(761.5781556154947, 621.7300109512365)

In [199]:

in_train_df['amount'].median(), in_test_df['amount'].median()

(100.0, 98.25)

*outflows*

In [200]:
outflows_consumers = outflows['prism_consumer_id'].unique()
outflows_consumers 


array([   0,    2,    4, ..., 5939, 5940, 5941])

In [201]:
out_train_users, out_test_users = train_test_split(outflows_consumers, test_size=0.2, random_state=42)
len(out_train_users), len(out_test_users)

(2374, 594)

In [202]:
594/2374

0.2502106149957877

In [203]:
out_train_df = outflows[outflows['prism_consumer_id'].isin(out_train_users)]
out_test_df = outflows[outflows['prism_consumer_id'].isin(out_test_users)]
len(out_train_df), len(out_test_df)


(2097805, 499683)

In [204]:
len(out_test_df)/len(out_train_df)

0.23819325437779013

In [205]:
out_train_df.groupby('prism_consumer_id').size().mean(), out_test_df.groupby('prism_consumer_id').size().mean()


(883.6583824768323, 841.2171717171717)

In [206]:
out_train_df['amount'].mean(), out_test_df['amount'].mean()

(138.0927030586732, 174.65576219323057)

In [207]:
results = []

for i in range(10):
    consumers = outflows['prism_consumer_id'].unique()
    train_consumers, test_consumers = train_test_split(consumers, test_size=0.2, random_state=i)

    # create train/test
    out_train_df = outflows[outflows['prism_consumer_id'].isin(train_consumers)]
    out_test_df = outflows[outflows['prism_consumer_id'].isin(test_consumers)]

    train_mean = out_train_df['amount'].mean()
    test_mean = out_test_df['amount'].mean()
    
    results.append({
        'run': i+1,
        'train_mean': train_mean,
        'test_mean': test_mean,
        'test_greater_than_train': test_mean > train_mean
    })
results_df = pd.DataFrame(results)
print("test mean > train mean:", results_df['test_greater_than_train'].sum())

results_df

test mean > train mean: 5


Unnamed: 0,run,train_mean,test_mean,test_greater_than_train
0,1,143.084348,153.033872,True
1,2,145.634828,143.09366,False
2,3,148.333782,132.00657,False
3,4,135.336009,189.045829,True
4,5,146.713798,138.832393,False
5,6,141.546613,159.40071,True
6,7,145.845616,142.237054,False
7,8,148.314072,132.605315,False
8,9,138.849937,168.705635,True
9,10,141.326526,160.93103,True


**We split the data at the consumer level so that all transactions from a single consumer are placed entirely in either the training or test set.**

**The train and test splits appear to be unbiased. In 10 random runs, the test mean was greater than the train mean 5 times, which is roughly half. This indicates that both splits likely represent the same underlying data distribution.**

### ellie's memo cleaning

In [242]:
# out_train_df['memo'].head(50)

In [243]:
import re
import pandas as pd

def clean_memo(text: str) -> str:
    if pd.isna(text):
        return text

    text = text.strip().lower()

    # remove email addresses
    text = re.sub(r'\b([\w\._-]+)@[\w\.-]+\b', r'\1', text)

    # remove tst
    text = re.sub(r'\btst\*', '', text)

    # remove ref/conf/id/payment etc.
    text = re.sub(r'\b(?:ref|conf|id|paymntid|pmt info|payment id|web id|bnf)[#:\s=]*\w*', '', text)

    # remove long X sequences
    text = re.sub(r'x{2,}[a-z0-9]*', '', text)

    # remove dates
    text = re.sub(r'\b\d{1,2}/\d{1,2}(?:/\d{2,4})?\b', '', text)

    # remove 2-letter state abbreviations
    text = re.sub(
        r'\b(?:al|ak|az|ar|ca|co|ct|de|fl|ga|hi|id|il|in|ia|ks|ky|la|me|md|ma|mi|mn|ms|mo|mt|ne|nv|nh|nj|nm|ny|nc|nd|oh|ok|or|pa|ri|sc|sd|tn|tx|ut|vt|va|wa|wv|wi|wy)\b',
        '', text)

    # remove long numeric codes
    text = re.sub(r'\b\d{2,}\b', '', text)

    # remove alphanumeric ids (letters + numbers that are 4–15 chars)
    text = re.sub(r'\b(?=\w*[a-z])(?=\w*\d)[a-z0-9]{4,15}\b', '', text)

    # remove street suffixes
    text = re.sub(r'\b(?:st|rd|ave|blvd|pkwy|plz)\b', '', text)

    # remove only the prefixes 'www' or 'https' but keep domain
    text = re.sub(r'\b(?:https?:\/\/|https?\.|www\.)', '', text)

    # remove unwanted words/phrases
    text = re.sub(r'\bpurchase authorized on\b', '', text)
    text = re.sub(r'\b(authorized|payment)\b', '', text)

    # fix wendy’s → wendys
    text = re.sub(r"’|'|`", '', text)

    # add space after certain words if stuck to other words
    text = re.sub(r'(grubhub|doordash)(\w)', r'\1 \2', text)
    text = re.sub(r'purchaseamazon', 'purchase amazon', text)

    # remove dd if followed by doordash
    text = re.sub(r'\bdd doordash\b', 'doordash', text)

    # replacements / normalization
    replacements = {
        'bk': 'bank',
        'wal-mart': 'walmart',
        'wal mart': 'walmart',
        'business to business': 'b2b',
        'pypl': 'paypal',
        'amzn': 'amazon',
        'burgerkin': 'burger king'
    }
    for k, v in replacements.items():
        text = re.sub(rf'\b{k}\b', v, text)

    # remove company suffixes except 'inc'
    text = re.sub(r'\b(?:llc|corp|co)\b', '', text)

    # normalize punctuation
    text = re.sub(r'[_/]', ' ', text)
    text = re.sub(r'(?<!\w)-(?!\w)', ' ', text)
    text = re.sub(r"[!@#$%^&*+=?:;\"',.<>~`|\\]+", ' ', text)

    # collapse duplicate words
    text = re.sub(r'\b(\w+)( \1\b)+', r'\1', text)

    # normalize spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text



In [244]:
# test runs
df = pd.DataFrame({
    "memo": [
        "TST* Casa Del Rio - Exp Fairlawn OH 9/24",
        "APPLE.COM/BILL CA 04/07",
        "payment to user@email.com REF#12345",
        "Zelle payment from john_doe@email.com 06/12",
        "TGI FRIDAYS XXXX STOW OH 12/31",
        "LOS GIRASOLES STOW OH 03/08",
        "www.elliewang.com",
        "https.elliewang.com",
        "doordash doordash dashmart",
        "wal mart",
        "chick-fil-a",
        "Chick-fiwoodstock",
        "purchase authorized on june",
        "dd doordash",
        "doordashhellofresh",
        "wendy's",
        "purchase authorized on amazon"
    ]
})

df["cleaned_memo"] = df["memo"].apply(clean_memo)
print(df)

                                           memo               cleaned_memo
0      TST* Casa Del Rio - Exp Fairlawn OH 9/24  casa del rio exp fairlawn
1                       APPLE.COM/BILL CA 04/07             apple com bill
2           payment to user@email.com REF#12345                    to user
3   Zelle payment from john_doe@email.com 06/12        zelle from john doe
4                TGI FRIDAYS XXXX STOW OH 12/31           tgi fridays stow
5                   LOS GIRASOLES STOW OH 03/08         los girasoles stow
6                             www.elliewang.com              elliewang com
7                           https.elliewang.com              elliewang com
8                    doordash doordash dashmart          doordash dashmart
9                                      wal mart                    walmart
10                                  chick-fil-a                chick-fil-a
11                            Chick-fiwoodstock          chick-fiwoodstock
12                  purch

In [245]:
# out_train_df.loc[0:49, 'memo'].apply(clean_memo)


In [246]:
subset = out_train_df.sample(n=50, random_state=None).copy()
subset['cleaned_memo'] = subset['memo'].apply(clean_memo)
subset[['memo', 'cleaned_memo']]


Unnamed: 0,memo,cleaned_memo
1847223,GIANT-EAG XXXX Mahonin Youngstow,giant-eag mahonin youngstow
73490,EXTERNAL_TRANSFER,external transfer
2120224,POS Debit - Visa Check Card XXXX - UBER EATS H...,pos debit visa check card uber eats help uber ...
1305386,CHECKCARD XXXX TST* 5 BOROUGH BA CLIVE IA XXXX...,checkcard 5 borough ba clive
356959,CHECKCARD XXXX CAFE RIO XXXX EASTVALE EASTVALE...,checkcard cafe rio eastvale
997075,EXTERNAL_TRANSFER,external transfer
1004440,ESSENTIAL_SERVICES,essential services
304871,Starbucks,starbucks
667436,CHECKCARD XXXX SAKURA SUSHI PHOENIX AZ XXXXXXX...,checkcard sakura sushi phoenix
2125427,EXTERNAL_TRANSFER,external transfer


In [247]:
mask = out_train_df['memo'].str.contains('dd', case=False, na=False)
out_train_df.loc[mask, 'memo'] = out_train_df.loc[mask, 'memo'].apply(clean_memo)
out_train_df.loc[mask, 'memo'].head(50)

1275                             usps change of add s card
2152                            usps change of address usa
5521                   checkcard dnh godaddy com recurring
6720               checkcard dnh godaddy com gaz recurring
9778                       dda purchase walmart old bridge
9782                visa dda pur pasquales pizza iv linden
9783                       visa dda pur americanmuscle com
9785                    visa dda pur bagel world manalapan
9805                           visa dda pur apple com bill
9809                        visa dda pur moe s englishtown
9810             dda purchase tjma 0 u s rte 9 englishtown
9822                    visa dda pur bagel world manalapan
9832     visa dda pur big bob s philly cheeses englishtown
9842              dda purchase marshalls trotters freehold
9844             visa dda pur cool smoothie bubbl freehold
9850           dda purchase wm superc walmart sup freehold
9872                   visa dda pur zacks deli englishto

### Cleaning Banking Transactional Memos

In [50]:
out_train_df

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.60,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.00,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.00,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...
2597483,5941,acc_9524,ATM_CASH,8.42,2023-01-25,ATM_CASH
2597484,5941,acc_9524,ATM_CASH,2.06,2023-01-25,ATM_CASH
2597485,5941,acc_9524,ATM_CASH,262.88,2023-01-25,ATM_CASH
2597486,5941,acc_9524,ATM_CASH,10.00,2023-01-25,ATM_CASH


In [51]:
test_snippet = out_train_df['memo'].iloc[:50]
test_snippet

0                                            LOAN
1                                        ATM_CASH
2       TST* Casa Del Rio - Exp Fairlawn OH 09/24
3                                            LOAN
4                              Buffalo Wild Wings
5                                   ENTERTAINMENT
6                                 Oculus CA 04/16
7                     LOS GIRASOLES STOW OH 03/08
8                       BUZZIS LAUNDRY 1 OH 03/28
9                       BUZZIS LAUNDRY 1 OH 02/13
10                                  ENTERTAINMENT
11                 TGI FRIDAYS XXXX STOW OH 12/31
12    TST* The Basement Sp Cuyahoga Fall OH 06/06
13                                         Lowe's
14                            PIADA - 39 OH 08/23
15                                        GrubHub
16                             ESSENTIAL_SERVICES
17                 HARDEES XXXXXXX AKRON OH 05/29
18    MARKET DI XXXX State Cuyahoga Fall OH 04/06
19             SWENSONS - MONTROSE AKRON OH 06/29


In [71]:
out_train_df[out_train_df['memo'].str.contains('')]['memo'].to_list()[0:5]

["Debit Purchase -visa Card XXXXmcdonald's FXXXXsimi Valley Ca",
 "Debit Purchase -visa Card XXXXmcdonald's FXXXXthousand Oakca",
 "Debit Purchase -visa Card XXXXmcdonald's FXXXXthousand Oakca",
 "Debit Purchase -visa Card XXXXmcdonald's FXXXXnewbury Parkca",
 "Debit Purchase -visa Card XXXXmcdonald's FXXXXcalabasas Ca"]

In [86]:
out_train_df[out_train_df['memo'].str.contains(r'[\u200B\u200C\u200D\u2060\uFEFF\u00A0\u180E]', regex=True)]['memo'].to_list()[0:5]

['DBT CRD XXXX\xa012/05/22 XXXXXXXX DUTCHBROS180 NAMPA \xa0ID C# XXXX',
 'DBT CRD XXXX\xa012/18/22 XXXXXXXX MCDONALD\xa0S FXXXXX NAMPA \xa0 ID C# XXXX',
 'POS DEB XXXX\xa012/24/22 XXXXXXXX\xa0FRED-MEYER #XXXX FRED MEY\xa0NAMPA ID Card#\xa0XXXX',
 'DBT CRD XXXX\xa011/17/22 XXXXXXXX MCDONALD\xa0S FXXXXX NAMPA \xa0 ID C# XXXX',
 'DBT CRD XXXX\xa012/23/22 XXXXXXXX SONIC\xa0DRIVE IN #XXXX NAMPA \xa0 ID C# XXXX']

In [108]:
def cleaning(series):
    series = series.str.lower() # CASE NORMALIZATION
    series = series.apply(lambda x: x.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')) # CONVERT INTO UTF-8 

    # REMOVE @ BUT EXTRACT THE EMAIL DOMAIN
    def extract_email_domain(text):
        def replacer(match):
            full_email = match.group(0)
            domain = full_email.split('@')[1]  # take part after '@'
            return domain
        return re.sub(r'[\w\.-]+@[\w\.-]+', replacer, text)
    series = series.apply(extract_email_domain)

    # REMOVE ALL . UNLESS IT'S IN A WEBSITE OR EMAIL DOMAIN (remove standalone dots - not part of word/num)
    series = series.str.replace(r'(?<!\w)\.(?!\w)', '', regex=True)

    # REMOVE ALL # AND CHARACTERS AFTER IT (likely numbers)
    series = series.str.replace(r'#.*', '', regex=True)

    # REMOVE ALL % (not useful)
    series = series.str.replace('%', '')

    # REMOVE * (usually follows TST, so we can just use that)
    series = series.str.replace('*', '')

    # REMOVE PARENTHESES
    series = series.str.replace(r'[()]', '', regex=True)

    # REMOVE / AND REPLACE WITH ' '- occurs most commonly in dates but doesn't provide much info to repaying loans
    series = series.str.replace(r'\b\d{1,2}/\d{1,2}(?:/\d{2,4})?\b', '', regex=True)
    series = series.str.replace('/', ' ', regex=True)

    # REPLACE & WITH 'AND' UNLESS IT APPEARS IN 'AT&T' OR 'H&M'
    def replace_ampersand(text):
        if 'h&m' not in text and 'at&t' not in text:
            return text.replace('&', 'and')
        return text
    series = series.apply(replace_ampersand)

    # REMOVE ALL APOSTROPHES
    series = series.str.replace("'", "")

    # REPLACE '_' WITH ' '
    series = series.str.replace('_', ' ')

    # REMOVE ALL $
    series = series.str.replace('$', '')

    # REMOVE ALL :
    series = series.str.replace(':', '')

    # remove standalone sequence of + digits surrounded by word boundaries
    # chose 4 because after removing dashes from dates, we end up with many unneeded 4-digit numbers
    series = series.replace(r'\b\d{4,}\b', '', regex=True) 

    # REMOVE XXXX IF THERE ARE 4+ Xs
    series = series.str.replace(r'x{4,}', '', regex=True)

    # REMOVE ALL DASHES UNLESS SURROUNDED BY ALPHANUMERIC TEXT (detects phone numbers, codes, some merchant names)
    series = series.str.replace(r'(?<!\w)-{1,}(?!\w)', '', regex=True)

    # REMOVE INVISIBLE/ZERO-WIDTH CHARACTERS
    # series = series.str.replace(r'[\u200B\u200C\u200D\u2060\uFEFF\u00A0\u180E]', ' ', regex=True)

    # SPECIAL CASE: DOORDASH
    series = series.str.replace(r'(?<=doordash)(?=[A-Za-z])', ' ', regex=True)

    # REMOVE REDUNDANT WHITESPACE
    series = series.replace(r'\s+', ' ', regex=True).str.strip()

    return series

In [109]:
cleaning(test_snippet).to_list()

['loan',
 'atm cash',
 'tst casa del rio exp fairlawn oh',
 'loan',
 'buffalo wild wings',
 'entertainment',
 'oculus ca',
 'los girasoles stow oh',
 'buzzis laundry 1 oh',
 'buzzis laundry 1 oh',
 'entertainment',
 'tgi fridays stow oh',
 'tst the basement sp cuyahoga fall oh',
 'lowes',
 'piada 39 oh',
 'grubhub',
 'essential services',
 'hardees akron oh',
 'market di state cuyahoga fall oh',
 'swensons montrose akron oh',
 'great clips',
 'external transfer',
 'apple.com bill ca',
 'automotive',
 'apple.com bill ca',
 'automotive',
 'apple.com bill ca',
 'uncategorized',
 'los girasoles stow oh',
 'wing warehouse cuyah cuyahoga fall oh',
 'winking lizard 30 ma oh',
 'entertainment',
 'external transfer',
 'longhorn steak cuyahoga fall oh',
 'on tap cuyahoga fa cuyahoga fall oh',
 'tst the basement sp cuyahoga fall oh',
 'great clips',
 'automotive',
 'oculus ca',
 'credit card payment',
 'automotive',
 'credit card payment',
 'home depot',
 'credit card payment',
 'lowes',
 'falls 

In [111]:
# cleaning(out_train_df[out_train_df['memo'].str.contains(r'[\u200B\u200C\u200D\u2060\uFEFF\u00A0\u180E]', regex=True)]['memo']).to_list()

In [112]:
# Test series
test_memos = pd.Series([
    "POS PURCHASE - STARBUCKS 04/25",               # normal purchase with dash and date
    "Payment to user@google.com",                   # email extraction
    "H&M Store #12345",                             # & and # handling
    "T.J.Maxx (0425)",                              # parentheses, dot handling
    "Transfer 1234-5678",                           # long digits and dash
    "ACH CREDIT PAYROLL 50%",                       # % removal
    "VENMO PAYMENT TO john.doe@venmo.com",          # email domain + dot
    "ATM WDL $200*",                                # asterisk removal
    "AT&T Bill Payment 09/23",                      # & preserved in AT&T
    "Online subscription: Netflix.com",             # dot in domain,
    " POS PURCHASE STARBUCKS 0425 ",
    "ACH CREDIT 12345678"
])

In [113]:
cleaning(test_memos).to_list()

['pos purchase starbucks',
 'payment to google.com',
 'h&m store',
 't.j.maxx',
 'transfer',
 'ach credit payroll 50',
 'venmo payment to venmo.com',
 'atm wdl 200',
 'at&t bill payment',
 'online subscription netflix.com',
 'pos purchase starbucks',
 'ach credit']

In [114]:
# EXAMPLE OF EXTRACTING EMAIL DOMAINS

def extract_email_domain(text):
    def replacer(match):
        full_email = match.group(0)
        domain = full_email.split('@')[1]  # take part after '@'
        return domain
    return re.sub(r'[\w\.-]+@[\w\.-]+', replacer, text)
     
extract_email_domain('happy-cat88@gmail.com')

'gmail.com'

In [115]:
def replace_ampersand(text):
    if 'h&m' not in text and 'at&t' not in text:
        return text.replace('&', 'and')
    return text
replace_ampersand('the cat & i')

'the cat and i'

In [117]:
out_train_df

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category
0,0,acc_0,LOAN,900.60,2022-07-05,LOAN
1,0,acc_0,ATM_CASH,80.00,2022-03-25,ATM_CASH
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES
3,0,acc_0,LOAN,634.00,2023-01-10,LOAN
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES
...,...,...,...,...,...,...
2597483,5941,acc_9524,ATM_CASH,8.42,2023-01-25,ATM_CASH
2597484,5941,acc_9524,ATM_CASH,2.06,2023-01-25,ATM_CASH
2597485,5941,acc_9524,ATM_CASH,262.88,2023-01-25,ATM_CASH
2597486,5941,acc_9524,ATM_CASH,10.00,2023-01-25,ATM_CASH


In [119]:
out_train_df['cleaned_memos'] = cleaning(out_train_df['memo'])
out_train_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  out_train_df['cleaned_memos'] = cleaning(out_train_df['memo'])


Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,cleaned_memos
0,0,acc_0,LOAN,900.60,2022-07-05,LOAN,loan
1,0,acc_0,ATM_CASH,80.00,2022-03-25,ATM_CASH,atm cash
2,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,18.42,2022-09-26,FOOD_AND_BEVERAGES,tst casa del rio exp fairlawn oh
3,0,acc_0,LOAN,634.00,2023-01-10,LOAN,loan
4,0,acc_0,Buffalo Wild Wings,26.47,2022-09-12,FOOD_AND_BEVERAGES,buffalo wild wings
...,...,...,...,...,...,...,...
2597483,5941,acc_9524,ATM_CASH,8.42,2023-01-25,ATM_CASH,atm cash
2597484,5941,acc_9524,ATM_CASH,2.06,2023-01-25,ATM_CASH,atm cash
2597485,5941,acc_9524,ATM_CASH,262.88,2023-01-25,ATM_CASH,atm cash
2597486,5941,acc_9524,ATM_CASH,10.00,2023-01-25,ATM_CASH,atm cash


In [122]:
out_train_df[['memo', 'cleaned_memos']].to_csv('cleaned_data.csv', index=False)