In [1]:
# load processed df
# Runs the upstream notebook inside this kernel so the names it defines are
# available to the cells below. Later cells use `outflows_cleaned` and `pd`,
# neither of which is defined in this notebook.
# NOTE(review): presumably both come from this %run; confirm against
# 01_memos_func_renamed.ipynb.
from IPython.utils.capture import capture_output

# capture_output() swallows whatever the upstream notebook prints/displays so
# it does not clutter this notebook.
with capture_output():
    %run 01_memos_func_renamed.ipynb

## Train/Test Split

In [2]:
from sklearn.model_selection import train_test_split

# Distinct consumers present in the cleaned outflows table; the split below is
# done on these IDs rather than on individual transactions.
unique_consumer_ids = outflows_cleaned['prism_consumer_id'].unique()
n_unique_consumers = len(unique_consumer_ids)
print(f'Unique Consumer IDs: {n_unique_consumers}')


Unique Consumer IDs: 2952


In [3]:
from sklearn.model_selection import train_test_split

# Split on consumer IDs (not rows) so that all transactions of a given
# consumer land in exactly one of the two partitions.
unique_consumer_ids = outflows_cleaned['prism_consumer_id'].unique()
outflows_train_ids, outflows_test_ids = train_test_split(
    unique_consumer_ids, test_size=0.25, random_state=42
)

# Row masks selecting each partition's transactions.
in_train = outflows_cleaned['prism_consumer_id'].isin(outflows_train_ids)
in_test = outflows_cleaned['prism_consumer_id'].isin(outflows_test_ids)

# .copy() so later column assignments do not touch `outflows_cleaned`.
train_df = outflows_cleaned.loc[in_train].copy()
test_df = outflows_cleaned.loc[in_test].copy()

for label, frame in (('Train', train_df), ('Test', test_df)):
    print(f'{label} Shape: {frame.shape}')

Train Shape: (969666, 6)
Test Shape: (336786, 6)


## TF/IDF Features

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, terms must appear in at least 2 memos, vocabulary capped
# at 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
)

# Vocabulary and IDF weights are learned from the training memos only; the
# test memos are merely projected onto that vocabulary.
train_memos = train_df['memo_clean']
test_memos = test_df['memo_clean']
X_train_tfidf = vectorizer.fit_transform(train_memos)
X_test_tfidf = vectorizer.transform(test_memos)

## Additional Features

### OHE For Day of the Week
Create `day_of_week` feature (0 = Monday, 6 = Sunday)

In [5]:
def add_day_of_week_dummies(frame):
    """Return `frame` with one-hot columns `day_0` .. `day_6` appended.

    The weekday (0 = Monday, 6 = Sunday) is derived from `posted_date` and
    encoded as a categorical with a fixed set of seven levels, so all seven
    `day_*` columns are always produced, even when a weekday is absent from
    `frame`. Calling `pd.get_dummies` on the raw integers would only emit
    columns for weekdays that actually occur, which can leave train and test
    with different column sets.
    """
    day_of_week = pd.Categorical(
        frame['posted_date'].dt.dayofweek,
        categories=list(range(7)),
    )
    return pd.get_dummies(
        frame.assign(day_of_week=day_of_week),
        columns=['day_of_week'],
        prefix='day',
        dtype=int,
    )


train_df = add_day_of_week_dummies(train_df)
test_df  = add_day_of_week_dummies(test_df)

train_df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,amount,posted_date,category,memo_clean,day_0,day_1,day_2,day_3,day_4,day_5,day_6
2,0,acc_0,18.42,2022-09-26,FOOD_AND_BEVERAGES,TST CASA DEL RIO EXP FAIRLAWN,1,0,0,0,0,0,0
4,0,acc_0,26.47,2022-09-12,FOOD_AND_BEVERAGES,BUFFALO WILD WINGS,1,0,0,0,0,0,0
6,0,acc_0,11.73,2022-04-18,GENERAL_MERCHANDISE,OCULUS,1,0,0,0,0,0,0
7,0,acc_0,30.04,2022-03-09,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW,0,0,1,0,0,0,0
8,0,acc_0,4.16,2022-03-29,GENERAL_MERCHANDISE,BUZZIS LAUNDRY,0,1,0,0,0,0,0


### Is Weekend?
Create `is_weekend` feature (1 = Weekend, 0 = Weekday)

In [6]:
# The weekday is recomputed from `posted_date` here (the intermediate
# `day_of_week` column was consumed by the one-hot encoding above).
# dayofweek: 0 = Monday ... 5 = Saturday, 6 = Sunday.
for frame in (train_df, test_df):
    frame['is_weekend'] = frame['posted_date'].dt.dayofweek.ge(5).astype(int)

train_df[['posted_date','is_weekend']].head()

Unnamed: 0,posted_date,is_weekend
2,2022-09-26,0
4,2022-09-12,0
6,2022-04-18,0
7,2022-03-09,0
8,2022-03-29,0


### Whole Dollar Amount?
Create `is_whole_dollar` (1 = the amount has no cents, 0 = otherwise) to help distinguish transactions that came from an ATM or were transfer payments

In [7]:
import numpy as np

# 1 when the amount has no fractional part (remainder of division by 1 is 0),
# 0 otherwise.
for frame in (train_df, test_df):
    frame['is_whole_dollar'] = frame['amount'].mod(1).eq(0).astype(int)

train_df[['amount', 'is_whole_dollar']].head()

Unnamed: 0,amount,is_whole_dollar
2,18.42,0
4,26.47,0
6,11.73,0
7,30.04,0
8,4.16,0


### Month and Day
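Create one-hot `month_1` … `month_12` features from `posted_date` to capture trends in seasonal spending habits. (A day-of-month feature for typical monthly billing cycles is mentioned here but is not created in the cell below.)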

In [8]:
def add_month_dummies(frame):
    """Return `frame` with one-hot columns `month_1` .. `month_12` appended.

    The calendar month is derived from `posted_date` and encoded as a
    categorical with a fixed set of twelve levels, so all twelve `month_*`
    columns are always produced, even when a month is absent from `frame`.
    Calling `pd.get_dummies` on the raw integers would only emit columns for
    months that actually occur, which can leave train and test with different
    column sets.
    """
    month = pd.Categorical(
        frame['posted_date'].dt.month,
        categories=list(range(1, 13)),
    )
    return pd.get_dummies(
        frame.assign(month=month),
        columns=['month'],
        prefix='month',
        dtype=int,
    )


train_df = add_month_dummies(train_df)
test_df  = add_month_dummies(test_df)

train_df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,amount,posted_date,category,memo_clean,day_0,day_1,day_2,day_3,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
2,0,acc_0,18.42,2022-09-26,FOOD_AND_BEVERAGES,TST CASA DEL RIO EXP FAIRLAWN,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,acc_0,26.47,2022-09-12,FOOD_AND_BEVERAGES,BUFFALO WILD WINGS,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,0,acc_0,11.73,2022-04-18,GENERAL_MERCHANDISE,OCULUS,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,0,acc_0,30.04,2022-03-09,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
8,0,acc_0,4.16,2022-03-29,GENERAL_MERCHANDISE,BUZZIS LAUNDRY,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


### Log Amount
Create `amount_log` to account for the skewed dollar amounts

In [9]:
# log(1 + amount) compresses the long right tail of the dollar amounts.
# NOTE(review): assumes amounts are greater than -1; log1p is undefined below
# that. Confirm against the cleaning step.
for frame in (train_df, test_df):
    frame['amount_log'] = np.log1p(frame['amount'])

train_df[['amount', 'amount_log']].head()

Unnamed: 0,amount,amount_log
2,18.42,2.966303
4,26.47,3.313095
6,11.73,2.543961
7,30.04,3.435277
8,4.16,1.640937


### Contains Education Keyword?
Create one `kw_<keyword>` indicator column per education-related keyword, marking (1/0) whether that keyword appears anywhere in the memo (case-insensitive)

In [10]:
education_keywords = [
    'scho', 'univ', 'college', 'education', 'academy', 'transcript', 'parchment',
    'student', 'study', 'teach', 'learn', 'edu', 'isd',
]

# One 0/1 column per keyword: 1 when the keyword occurs anywhere in the memo,
# ignoring case.
for frame in (train_df, test_df):
    for keyword in education_keywords:
        col = f"kw_{keyword}"
        frame[col] = frame['memo_clean'].str.contains(keyword, case=False).astype(int)

# Preview a few training memos that triggered at least one keyword.
edu_flag_cols = [f"kw_{k}" for k in education_keywords]
has_any_keyword = train_df[edu_flag_cols].any(axis=1)
train_df.loc[has_any_keyword, ['memo_clean'] + edu_flag_cols].head()

Unnamed: 0,memo_clean,kw_scho,kw_univ,kw_college,kw_education,kw_academy,kw_transcript,kw_parchment,kw_student,kw_study,kw_teach,kw_learn,kw_edu,kw_isd
2129,UNIVERSITY FLORIDA GAINESVILLE USA,0,1,0,0,0,0,0,0,0,0,0,0,0
3275,SOUTH CENTRAL COLLEGE,0,0,1,0,0,0,0,0,0,0,0,0,0
4617,MLT SCHOOL,1,0,0,0,0,0,0,0,0,0,0,0,0
16109,UNIVERSITY PACKAGE MURFREESBORO,0,1,0,0,0,0,0,0,0,0,0,0,0
16634,UNIVERSITY PACKAGE WINE A MURFREESBORO PIN,0,1,0,0,0,0,0,0,0,0,0,0,0


### Combine Features
Combine the TF-IDF features and the date/amount/keyword features into a single sparse feature matrix.

In [11]:
from scipy.sparse import hstack, csr_matrix

# Pull the labels out as arrays, then remove the label column from both
# frames so it cannot be picked up as a feature.
label_col = 'category'
y_train = train_df[label_col].to_numpy()
y_test  = test_df[label_col].to_numpy()
train_df = train_df.drop(columns=[label_col])
test_df  = test_df.drop(columns=[label_col])

# Hand-crafted feature columns, in the order they are appended after the
# TF-IDF block: weekday one-hots, month one-hots, scalar features, keyword flags.
kw_cols = [f"kw_{k}" for k in education_keywords]
day_cols = [f'day_{d}' for d in range(7)]
month_cols = [f'month_{m}' for m in range(1, 13)]
scalar_cols = ['is_weekend', 'is_whole_dollar', 'amount_log']
num_cols = day_cols + month_cols + scalar_cols + kw_cols

# Convert the dense hand-crafted block to sparse so it can be stacked with
# the sparse TF-IDF matrix.
X_train_num = csr_matrix(train_df[num_cols].to_numpy(dtype=np.float32))
X_test_num  = csr_matrix(test_df[num_cols].to_numpy(dtype=np.float32))

# Column-wise concatenation: [TF-IDF | hand-crafted features].
X_train = hstack((X_train_tfidf, X_train_num), format='csr')
X_test  = hstack((X_test_tfidf, X_test_num), format='csr')

In [12]:
# Sanity checks on the engineered features. Unlike a bare print, these fail
# loudly if train and test have drifted apart.
day_cols = [f'day_{i}' for i in range(7)]
mcols = [f'month_{i}' for i in range(1,13)]

for split_name, frame in (('train', train_df), ('test', test_df)):
    # Same one-hot columns present in both splits.
    missing = set(day_cols + mcols) - set(frame.columns)
    assert not missing, f'{split_name}: missing one-hot columns {sorted(missing)}'
    # Each one-hot group should have exactly one 1 per row.
    assert (frame[day_cols].sum(axis=1) == 1).all(), f'{split_name}: day one-hot is not exactly one per row'
    assert (frame[mcols].sum(axis=1) == 1).all(), f'{split_name}: month one-hot is not exactly one per row'

# Shapes line up: one feature row per label, and identical feature width
# (TF-IDF vocabulary + hand-crafted columns) in train and test.
assert X_train.shape[0] == len(y_train) == len(train_df)
assert X_test.shape[0] == len(y_test) == len(test_df)
assert X_train.shape[1] == X_test.shape[1] == X_train_tfidf.shape[1] + len(num_cols)

print(X_train_tfidf.shape, X_test_tfidf.shape)


(969666, 5000) (336786, 5000)


In [13]:
# Display the combined sparse training matrix (TF-IDF columns followed by the
# hand-crafted feature columns) to check its shape and sparsity.
X_train

<969666x5035 sparse matrix of type '<class 'numpy.float64'>'
	with 6372320 stored elements in Compressed Sparse Row format>

In [14]:
# Preview the training frame after feature engineering; `category` has been
# dropped from it by the feature-combination cell.
train_df.head()


Unnamed: 0,prism_consumer_id,prism_account_id,amount,posted_date,memo_clean,day_0,day_1,day_2,day_3,day_4,...,kw_education,kw_academy,kw_transcript,kw_parchment,kw_student,kw_study,kw_teach,kw_learn,kw_edu,kw_isd
2,0,acc_0,18.42,2022-09-26,TST CASA DEL RIO EXP FAIRLAWN,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,acc_0,26.47,2022-09-12,BUFFALO WILD WINGS,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,acc_0,11.73,2022-04-18,OCULUS,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,acc_0,30.04,2022-03-09,LOS GIRASOLES STOW,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,acc_0,4.16,2022-03-29,BUZZIS LAUNDRY,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Simple Model - Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the combined sparse features. max_iter is raised to
# 5000 to give the solver more iterations than the library default.
log_reg_model = LogisticRegression(max_iter=5000)
log_reg_model = log_reg_model.fit(X_train, y_train)  # fit() returns the estimator itself

# Predicted categories for the held-out consumers.
y_pred_lr = log_reg_model.predict(X_test)

In [17]:
# Evaluate model
from sklearn.metrics import accuracy_score, f1_score, classification_report


def _print_split_metrics(split_name, y_true, y_pred):
    """Print accuracy, weighted F1 and the per-class report for one split.

    `split_name` is the label prefixed to the printed metric names
    (e.g. "Train" or "Test").
    """
    print(f"{split_name} Accuracy:", accuracy_score(y_true, y_pred))
    print(f"{split_name} Weighted F1:", f1_score(y_true, y_pred, average='weighted'))
    # zero_division=0 reports 0 for classes with no predicted samples.
    print(classification_report(y_true, y_pred, zero_division=0))


# Training-set predictions are computed here; test-set predictions
# (`y_pred_lr`) come from the model-fitting cell.
y_pred_train = log_reg_model.predict(X_train)

_print_split_metrics("Train", y_train, y_pred_train)
_print_split_metrics("Test", y_test, y_pred_lr)

Train Accuracy: 0.95148329424771
Train Weighted F1: 0.951503496820387
                     precision    recall  f1-score   support

          EDUCATION       0.91      0.72      0.80      3329
 FOOD_AND_BEVERAGES       0.92      0.97      0.94    357992
GENERAL_MERCHANDISE       0.97      0.94      0.95    391492
          GROCERIES       0.97      0.96      0.97    162754
           MORTGAGE       0.97      0.98      0.98       710
          OVERDRAFT       0.99      0.98      0.98      2433
               PETS       0.99      0.93      0.96      6599
               RENT       0.94      0.86      0.90      2518
             TRAVEL       0.98      0.93      0.96     41839

           accuracy                           0.95    969666
          macro avg       0.96      0.92      0.94    969666
       weighted avg       0.95      0.95      0.95    969666

Test Accuracy: 0.9310778951619129
Test Weighted F1: 0.9311362629715593
                     precision    recall  f1-score   support

 