In [12]:
# load processed df
# Executes the preprocessing notebook in this kernel while capturing its output, so
# nothing it prints or displays shows up here.
# `test_df` and `y_test` are used by later cells but are not defined in the cells
# visible here — presumably this run provides them (TODO confirm).
# NOTE(review): because the output is captured and discarded, problems reported by the
# preprocessing run may go unnoticed until a later cell fails on a missing name.
from IPython.utils.capture import capture_output

with capture_output():
    %run 03_preprocessing.ipynb

# Trying a pre-trained BERT model from Hugging Face

## Try predicting with the HF model

In [13]:
import os
import warnings

# Silence Python warnings and set TensorFlow's C++ log level. The environment variable
# is assigned before `transformers` is imported — presumably so it is already in place
# if that import pulls in TensorFlow.
warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the model
model_name = "kuro-08/bert-transaction-categorization"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

# Sample transaction description
transaction = "Starbucks"
inputs = tokenizer(transaction, return_tensors="pt", truncation=True, padding=True)

# Predict the category.
# This is inference only, so skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits
# Index of the highest-scoring class; this is a class id, not a label name.
predicted_category = logits.argmax(-1).item()

print(f"Predicted category: {predicted_category}")


Predicted category: 2


## Apply the model to our dataset

In [14]:
import time

# Wall-clock reference point; a later cell reads the clock again and prints the difference.
start = time.time()

In [15]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report

# Use the GPU when one is available, otherwise stay on the CPU, then move the
# already-loaded model there and put it in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Class index -> label name for the first HF model's output. A label's position in
# the list is its class index.
# NOTE(review): the order is hand-written here — confirm it against the model's own label list.
HF_CATEGORIES = dict(enumerate([
    "Utilities",
    "Health",
    "Dining",
    "Travel",
    "Education",
    "Subscription",
    "Family",
    "Food",
    "Festivals",
    "Culture",
    "Apparel",
    "Transportation",
    "Investment",
    "Shopping",
    "Groceries",
    "Documents",
    "Grooming",
    "Entertainment",
    "Social Life",
    "Beauty",
    "Rent",
    "Money transfer",
    "Salary",
    "Tourism",
    "Household",
]))

# Translate the HF model's label names into our category names.
# Only the labels with an obvious counterpart are listed; every other label is
# handled by the fallback in `map_hf`. MORTGAGE, OVERDRAFT and PETS never appear
# as targets here, so this mapping cannot produce them.
HF_TO_ESSENTIAL = {
    "Education": "EDUCATION",
    # eating and drinking
    "Food": "FOOD_AND_BEVERAGES",
    "Dining": "FOOD_AND_BEVERAGES",
    "Groceries": "GROCERIES",
    # anything about getting around
    "Travel": "TRAVEL",
    "Transportation": "TRAVEL",
    "Tourism": "TRAVEL",
    "Rent": "RENT",
}


def map_hf(cat):
    """Return our category for an HF label name, or GENERAL_MERCHANDISE when it is not listed."""
    if cat in HF_TO_ESSENTIAL:
        return HF_TO_ESSENTIAL[cat]
    return "GENERAL_MERCHANDISE"

def hf_predict(memos, batch_size=64):
    """Predict a class index for every memo using the notebook-level model.

    Uses the global `tokenizer`, `model` and `device` as they are bound at call time.

    Args:
        memos: iterable of memo strings (list, pandas Series, generator, ...).
            It is copied into a plain list once, so batching is purely positional
            and does not depend on the container's own slicing rules.
        batch_size: number of memos tokenized and scored per forward pass.

    Returns:
        1-D int64 NumPy array with one predicted class index per memo, in input
        order. Empty input gives an empty int64 array.
    """
    memos = list(memos)
    preds = []
    with torch.no_grad():
        for i in range(0, len(memos), batch_size):
            batch = memos[i:i+batch_size]
            toks = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors="pt")
            # Tokenizer output must live on the same device as the model.
            toks = {k: v.to(device) for k, v in toks.items()}
            logits = model(**toks).logits
            preds.extend(logits.argmax(dim=1).tolist())
    # Explicit dtype keeps the result integer-typed even when there are no predictions.
    return np.array(preds, dtype=np.int64)

# Score the test memos with the first HF model; missing memos are sent as empty strings.
init_prediction = hf_predict(test_df['memo_clean'].fillna(""))

# class index -> HF label name -> one of our categories
hf_cats = [HF_CATEGORIES[class_id] for class_id in init_prediction]
hf_mapped = np.array([map_hf(hf_label) for hf_label in hf_cats])

print(
    "HF BERT (mapped):",
    classification_report(y_test, hf_mapped, zero_division=0),
    sep="\n",
)

HF BERT (mapped):
                     precision    recall  f1-score   support

          EDUCATION       0.24      0.69      0.35      1170
 FOOD_AND_BEVERAGES       0.58      0.27      0.37    124002
GENERAL_MERCHANDISE       0.53      0.75      0.62    132571
          GROCERIES       0.26      0.20      0.23     56577
           MORTGAGE       0.00      0.00      0.00       409
          OVERDRAFT       0.00      0.00      0.00       953
               PETS       0.00      0.00      0.00      2667
               RENT       0.08      0.19      0.11       629
             TRAVEL       0.27      0.59      0.37     17808

           accuracy                           0.47    336786
          macro avg       0.22      0.30      0.23    336786
       weighted avg       0.48      0.47      0.44    336786



In [16]:
# Elapsed wall-clock time since `start` was set. The timed cell only runs a
# pre-trained model forward and scores it, so this is inference time, not training time.
end = time.time()
print(f"inference took {end-start:.2f} seconds")

training took 175.65 seconds


# Another HF Model

In [17]:
# Smoke-test a second Hugging Face classifier on a few hand-written memos.
MODEL = "mgrella/autonlp-bank-transaction-classification-5521155"

# NOTE(review): `model` and `device` are rebound here. `hf_predict` reads the global
# `model`, `tokenizer` and `device` at call time, so calling it after this cell has run
# would pair the first model's tokenizer with this second model.
tok   = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL).eval()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

texts = ["Starbucks purchase", "amazon", "walmart"]  # examples
enc = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

# Forward pass without autograd; softmax turns the logits into per-class probabilities.
with torch.no_grad():
    logits = model(**enc).logits
probs = torch.softmax(logits, dim=-1)

# Translate each row's top class index into the label name stored in the model config.
id2label = model.config.id2label
preds = [id2label[i] for i in probs.argmax(dim=-1).tolist()]
for t, p in zip(texts, preds):
    print(p, " <- ", t)

Category.EATING_OUT_OTHER  <-  Starbucks purchase
Category.OTHER_OTHER  <-  amazon
Category.SHOPPING_OTHER  <-  walmart


## Check the model's labels so we can map them to our categories

In [18]:
# Label names of the currently loaded `model`, in class-index order.
labels = [
    model.config.id2label[class_idx]
    for class_idx in range(model.config.num_labels)
]
print(f"{len(labels)} labels")
print(labels)

76 labels
['Category.BILLS_SUBSCRIPTIONS_BILLS', 'Category.BILLS_SUBSCRIPTIONS_INTERNET_PHONE', 'Category.BILLS_SUBSCRIPTIONS_OTHER', 'Category.BILLS_SUBSCRIPTIONS_SUBSCRIPTIONS', 'Category.CREDIT_CARDS_CREDIT_CARDS', 'Category.EATING_OUT_COFFEE_SHOPS', 'Category.EATING_OUT_OTHER', 'Category.EATING_OUT_RESTAURANTS', 'Category.EATING_OUT_TAKEAWAY_RESTAURANTS', 'Category.HEALTH_WELLNESS_AID_EXPENSES', 'Category.HEALTH_WELLNESS_DRUGS', 'Category.HEALTH_WELLNESS_GYMS', 'Category.HEALTH_WELLNESS_MEDICAL_EXPENSES', 'Category.HEALTH_WELLNESS_OTHER', 'Category.HEALTH_WELLNESS_WELLNESS_RELAX', 'Category.HOUSING_FAMILY_APPLIANCES', 'Category.HOUSING_FAMILY_CHILDHOOD', 'Category.HOUSING_FAMILY_FURNITURE', 'Category.HOUSING_FAMILY_GROCERIES', 'Category.HOUSING_FAMILY_INSURANCES', 'Category.HOUSING_FAMILY_MAINTENANCE_RENOVATION', 'Category.HOUSING_FAMILY_OTHER', 'Category.HOUSING_FAMILY_RENTS', 'Category.HOUSING_FAMILY_SERVANTS', 'Category.HOUSING_FAMILY_VETERINARY', 'Category.LEISURE_BOOKS', 'Cate

In [19]:
# Hand-written lookup from the second model's label names (76 entries) to our target
# categories. Labels without a dedicated target are sent to GENERAL_MERCHANDISE.
# NOTE: no entry maps to EDUCATION, so anything derived from this table can never
# predict EDUCATION unless some rows are remapped (see the LEISURE section).
map76_to9 = {
    # Bills & subscriptions → GENERAL_MERCHANDISE
    "Category.BILLS_SUBSCRIPTIONS_BILLS": "GENERAL_MERCHANDISE",
    "Category.BILLS_SUBSCRIPTIONS_INTERNET_PHONE": "GENERAL_MERCHANDISE",
    "Category.BILLS_SUBSCRIPTIONS_OTHER": "GENERAL_MERCHANDISE",
    "Category.BILLS_SUBSCRIPTIONS_SUBSCRIPTIONS": "GENERAL_MERCHANDISE",

    # Credit card payments (not overdraft) → GENERAL_MERCHANDISE
    "Category.CREDIT_CARDS_CREDIT_CARDS": "GENERAL_MERCHANDISE",

    # Eating out → FOOD_AND_BEVERAGES
    "Category.EATING_OUT_COFFEE_SHOPS": "FOOD_AND_BEVERAGES",
    "Category.EATING_OUT_OTHER": "FOOD_AND_BEVERAGES",
    "Category.EATING_OUT_RESTAURANTS": "FOOD_AND_BEVERAGES",
    "Category.EATING_OUT_TAKEAWAY_RESTAURANTS": "FOOD_AND_BEVERAGES",

    # Health & wellness (no dedicated bucket) → GENERAL_MERCHANDISE
    "Category.HEALTH_WELLNESS_AID_EXPENSES": "GENERAL_MERCHANDISE",
    "Category.HEALTH_WELLNESS_DRUGS": "GENERAL_MERCHANDISE",
    "Category.HEALTH_WELLNESS_GYMS": "GENERAL_MERCHANDISE",
    "Category.HEALTH_WELLNESS_MEDICAL_EXPENSES": "GENERAL_MERCHANDISE",
    "Category.HEALTH_WELLNESS_OTHER": "GENERAL_MERCHANDISE",
    "Category.HEALTH_WELLNESS_WELLNESS_RELAX": "GENERAL_MERCHANDISE",

    # Housing & family
    "Category.HOUSING_FAMILY_APPLIANCES": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_CHILDHOOD": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_FURNITURE": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_GROCERIES": "GROCERIES",
    "Category.HOUSING_FAMILY_INSURANCES": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_MAINTENANCE_RENOVATION": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_OTHER": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_RENTS": "RENT",
    "Category.HOUSING_FAMILY_SERVANTS": "GENERAL_MERCHANDISE",
    "Category.HOUSING_FAMILY_VETERINARY": "PETS",

    # Leisure (no dedicated bucket) → GENERAL_MERCHANDISE
    # If you want EDUCATION coverage, you can optionally map BOOKS/MAGAZINES to EDUCATION instead.
    "Category.LEISURE_BOOKS": "GENERAL_MERCHANDISE",
    "Category.LEISURE_CINEMA": "GENERAL_MERCHANDISE",
    "Category.LEISURE_CLUB_ASSOCIATIONS": "GENERAL_MERCHANDISE",
    "Category.LEISURE_GAMBLING": "GENERAL_MERCHANDISE",
    "Category.LEISURE_MAGAZINES_NEWSPAPERS": "GENERAL_MERCHANDISE",
    "Category.LEISURE_MOVIES_MUSICS": "GENERAL_MERCHANDISE",
    "Category.LEISURE_OTHER": "GENERAL_MERCHANDISE",
    "Category.LEISURE_SPORT_EVENTS": "GENERAL_MERCHANDISE",
    "Category.LEISURE_THEATERS_CONCERTS": "GENERAL_MERCHANDISE",
    "Category.LEISURE_VIDEOGAMES": "GENERAL_MERCHANDISE",

    # Loans & mortgages
    "Category.MORTGAGES_LOANS_LOANS": "GENERAL_MERCHANDISE",
    "Category.MORTGAGES_LOANS_MORTGAGES": "MORTGAGE",

    # Other/Profits (cash/checks/income) → GENERAL_MERCHANDISE (or drop if you exclude income)
    "Category.OTHER_CASH": "GENERAL_MERCHANDISE",
    "Category.OTHER_CHECKS": "GENERAL_MERCHANDISE",
    "Category.OTHER_OTHER": "GENERAL_MERCHANDISE",
    "Category.PROFITS_PROFITS": "GENERAL_MERCHANDISE",

    # Shopping → GENERAL_MERCHANDISE
    "Category.SHOPPING_ACCESSORIZE": "GENERAL_MERCHANDISE",
    "Category.SHOPPING_CLOTHING": "GENERAL_MERCHANDISE",
    "Category.SHOPPING_FOOTWEAR": "GENERAL_MERCHANDISE",
    "Category.SHOPPING_HI_TECH": "GENERAL_MERCHANDISE",
    "Category.SHOPPING_OTHER": "GENERAL_MERCHANDISE",
    "Category.SHOPPING_SPORT_ARTICLES": "GENERAL_MERCHANDISE",

    # Taxes & services
    "Category.TAXES_SERVICES_BANK_FEES": "OVERDRAFT",
    "Category.TAXES_SERVICES_DEFAULT_PAYMENTS": "GENERAL_MERCHANDISE",
    "Category.TAXES_SERVICES_MONEY_ORDERS": "GENERAL_MERCHANDISE",
    "Category.TAXES_SERVICES_OTHER": "GENERAL_MERCHANDISE",
    "Category.TAXES_SERVICES_PROFESSIONAL_ACTIVITY": "GENERAL_MERCHANDISE",
    "Category.TAXES_SERVICES_PROFIT_DEDUCTION": "GENERAL_MERCHANDISE",
    "Category.TAXES_SERVICES_TAXES": "GENERAL_MERCHANDISE",

    # Transfers (bank/internal/investments/income/refunds/savings/rent incomes) → GENERAL_MERCHANDISE
    # If your eval excludes income/transfers, you can drop these rows instead.
    "Category.TRANSFERS_BANK_TRANSFERS": "GENERAL_MERCHANDISE",
    "Category.TRANSFERS_GIFTS_DONATIONS": "GENERAL_MERCHANDISE",
    "Category.TRANSFERS_INVESTMENTS": "GENERAL_MERCHANDISE",
    "Category.TRANSFERS_OTHER": "GENERAL_MERCHANDISE",
    "Category.TRANSFERS_REFUNDS": "GENERAL_MERCHANDISE",
    "Category.TRANSFERS_RENT_INCOMES": "GENERAL_MERCHANDISE",
    "Category.TRANSFERS_SAVINGS": "GENERAL_MERCHANDISE",

    # Travel & transportation → TRAVEL
    "Category.TRAVELS_TRANSPORTATION_BUSES": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_CAR_RENTAL": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_FLIGHTS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_FUEL": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_HOTELS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_OTHER": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_PARKING_URBAN_TRANSPORTS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_TAXIS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_TOLLS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_TRAINS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_TRAVELS_HOLIDAYS": "TRAVEL",
    "Category.TRAVELS_TRANSPORTATION_VEHICLE_MAINTENANCE": "TRAVEL",

    # Wages/salary (income) → GENERAL_MERCHANDISE (or drop if excluding income)
    "Category.WAGES_PENSION": "GENERAL_MERCHANDISE",
    "Category.WAGES_PROFESSIONAL_COMPENSATION": "GENERAL_MERCHANDISE",
    "Category.WAGES_SALARY": "GENERAL_MERCHANDISE",
}

In [20]:
# Reset the wall-clock reference point; a later cell prints the time elapsed since here.
start = time.time()

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, classification_report

MODEL = "mgrella/autonlp-bank-transaction-classification-5521155"

# Our target label set. Order matters: `bucket_idxs` is built in this order.
allowed = [
    "EDUCATION",
    "FOOD_AND_BEVERAGES",
    "GENERAL_MERCHANDISE",
    "GROCERIES",
    "MORTGAGE",
    "OVERDRAFT",
    "PETS",
    "RENT",
    "TRAVEL",
]

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load tokenizer and classifier, switch to evaluation mode, move to the chosen device.
tok = AutoTokenizer.from_pretrained(MODEL)
m = AutoModelForSequenceClassification.from_pretrained(MODEL)
m.eval()
m.to(device)

id2label = m.config.id2label

# For each of our labels, collect the model class indices that `map76_to9` assigns to it.
# A label that nothing maps to ends up with an empty list.
bucket_idxs = {
    target: [idx for idx, name in id2label.items() if map76_to9.get(name) == target]
    for target in allowed
}

def predict_9(texts):
    """Classify memos into the `allowed` label set using the second model.

    The model's per-class logits are collapsed into one logit per entry of
    `allowed` by taking the maximum over the class indices in `bucket_idxs`;
    entries with no indices stay at -inf, so their probability is zero.

    Args:
        texts: list of memo strings.

    Returns:
        (pred_labels, probs9): the chosen label per input, and the softmax over
        the collapsed logits as a tensor with one row per input and one column
        per entry of `allowed`.
    """
    with torch.no_grad():
        enc = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        logits76 = m(**enc).logits  # one column per model class

        n_rows = logits76.size(0)
        # Start from -inf everywhere so buckets without members can never win.
        logits9 = torch.full((n_rows, len(allowed)), float("-inf"), device=logits76.device)
        for col, nine in enumerate(allowed):
            members = bucket_idxs[nine]
            if not members:
                continue
            logits9[:, col] = logits76[:, members].amax(dim=1)

        probs9 = torch.softmax(logits9, dim=1)
        winners = probs9.argmax(dim=1).tolist()
        pred_labels = [allowed[k] for k in winners]
        return pred_labels, probs9

# get predictions
from sklearn.metrics import classification_report

BATCH = 64

# Missing memos become empty strings, the same way the first model was fed, so the
# tokenizer never receives NaN. The column is converted once, outside the loop.
memos = test_df["memo_clean"].fillna("").tolist()

preds = []
for i in range(0, len(memos), BATCH):
    batch = memos[i:i+BATCH]
    # Only the labels are needed; the probabilities are dropped. Indexing instead of
    # unpacking avoids assigning to `_`, which IPython uses for the last cell output.
    pred_labels = predict_9(batch)[0]
    preds.extend(pred_labels)

true_labels = y_test

# `labels=allowed` fixes the row order and keeps classes that were never predicted.
print(
    classification_report(
        true_labels,
        preds,
        labels=allowed,
        digits=4,
        zero_division=0
    )
)

                     precision    recall  f1-score   support

          EDUCATION     0.0000    0.0000    0.0000      1170
 FOOD_AND_BEVERAGES     0.5991    0.5582    0.5779    124002
GENERAL_MERCHANDISE     0.5525    0.8057    0.6555    132571
          GROCERIES     0.8143    0.0078    0.0155     56577
           MORTGAGE     0.0000    0.0000    0.0000       409
          OVERDRAFT     0.0000    0.0000    0.0000       953
               PETS     0.0000    0.0000    0.0000      2667
               RENT     0.0000    0.0000    0.0000       629
             TRAVEL     0.3740    0.5745    0.4531     17808

           accuracy                         0.5544    336786
          macro avg     0.2600    0.2162    0.1891    336786
       weighted avg     0.5946    0.5544    0.4974    336786



In [22]:
# Elapsed wall-clock time since `start` was reset. The timed cell loads a pre-trained
# model and runs it over the test memos, so this is inference time, not training time.
end = time.time()
print(f"inference took {end-start:.2f} seconds")

training took 236.32 seconds
