In [54]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
  AutoTokenizer,
  AutoModelForSequenceClassification,
  Trainer,
  TrainingArguments,
  pipeline
)
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split
import os
from langchain_huggingface import HuggingFaceEndpoint  # New import
import csv
import re
import nltk

In [55]:
file_path = "data/rawdata/expenses_2024/24_months_of_capone_transactions.csv"
try:
    df_cc = pd.read_csv("data/rawdata/expenses_2024/24_months_of_capone_transactions.csv")
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"Error loading dataset: {e}")

Dataset loaded successfully.


In [56]:
# Preprocessing function to strip out prefixes and sufixes
def preprocess_description_strip(text):
    text = text.lower()
    text = text.strip()
    text = re.sub(r"^\d+\s", "", text)
    text = re.sub(r"^sq\s?", "", text)
    text = re.sub(r"\d+$", "", text)
    text = re.sub(r"(amazon|walmart).*", r"\1", text)  # Keep only "amazon" or "walmart"
    return text

In [57]:
# Preprocessing function to normalize the descriptions

def preprocess_description_train(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers (adjust if needed)
    text = text.strip()  # Remove extra whitespace
    # Add more specific grouping/truncation logic here based on your review
    return text

In [58]:
# Preprocess the data
df_cc = df_cc[["Description", "Category"]].dropna()  # Select relevant columns and drop missing values
df_cc_unique = df_cc.drop_duplicates()
df_cc_unique['Description'] = df_cc_unique['Description'].apply(preprocess_description_strip)
df_cc_unique['Description'] = df_cc_unique['Description'].apply(preprocess_description_train)
df_cc_unique = df_cc_unique.rename(columns={"Description": "text", "Category": "labels"}) # Rename columns for consistency
df_cc_unique.to_csv("training_categories.csv", index=False)

# Convert labels to category type
df_cc_unique["labels"] = df_cc_unique["labels"].astype("category")

# Create the mappings
id2label = dict(enumerate(df_cc_unique["labels"].cat.categories))
label2id = {v: k for k, v in id2label.items()}

# Convert labels to numerical codes
df_cc_unique["labels"] = df_cc_unique["labels"].cat.codes

# Define the labels variable
labels = df_cc_unique["labels"].unique()

# Display the mappings and the processed DataFrame
print("id2label:", id2label)
print("label2id:", label2id)

id2label: {0: 'Airfare', 1: 'Car Rental', 2: 'Dining', 3: 'Entertainment', 4: 'Gas/Automotive', 5: 'Health Care', 6: 'Healthcare', 7: 'Insurance', 8: 'Internet', 9: 'Lodging', 10: 'Merchandise', 11: 'Other', 12: 'Other Services', 13: 'Other Travel', 14: 'Payment/Credit', 15: 'Phone/Cable', 16: 'Professional Services', 17: 'Utilities'}
label2id: {'Airfare': 0, 'Car Rental': 1, 'Dining': 2, 'Entertainment': 3, 'Gas/Automotive': 4, 'Health Care': 5, 'Healthcare': 6, 'Insurance': 7, 'Internet': 8, 'Lodging': 9, 'Merchandise': 10, 'Other': 11, 'Other Services': 12, 'Other Travel': 13, 'Payment/Credit': 14, 'Phone/Cable': 15, 'Professional Services': 16, 'Utilities': 17}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cc_unique['Description'] = df_cc_unique['Description'].apply(preprocess_description_strip)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cc_unique['Description'] = df_cc_unique['Description'].apply(preprocess_description_train)


In [59]:
# Split the data into training and validation sets
train_df, eval_df = train_test_split(df_cc_unique, test_size=0.2, random_state=42)

# Convert Pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

### Training

In [60]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
import pandas as pd
import numpy as np
import evaluate


In [61]:
# 2. Load model with label mappings
num_labels = len(labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize function
def tokenize_function(examples):
    return tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=128
    )
# 3. Tokenize function
def tokenize_function(examples):
    return tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=128
    )

# 4. Tokenize training and eval datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# 5. Load model with label mappings
num_labels = len(labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# 6. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=25,  # Adjust based on your GPU memory
    per_device_eval_batch_size=25,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

# 7. Define metrics
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# 8. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)

# 9. Train model
trainer.train()

# 10. Evaluate model
trainer.evaluate()

# 11. Save model
trainer.save_model("fine_tuned_transaction_classifier")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 688/688 [00:00<00:00, 21346.01 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 173/173 [00:00<00:00, 28268.13 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.624931,0.554913
2,No log,1.334246,0.624277
3,No log,1.276715,0.635838
4,No log,1.210516,0.676301
5,No log,1.212929,0.693642


In [62]:
print("Label to ID mapping:", label2id)
print("ID to Label mapping:", id2label)

Label to ID mapping: {'Airfare': 0, 'Car Rental': 1, 'Dining': 2, 'Entertainment': 3, 'Gas/Automotive': 4, 'Health Care': 5, 'Healthcare': 6, 'Insurance': 7, 'Internet': 8, 'Lodging': 9, 'Merchandise': 10, 'Other': 11, 'Other Services': 12, 'Other Travel': 13, 'Payment/Credit': 14, 'Phone/Cable': 15, 'Professional Services': 16, 'Utilities': 17}
ID to Label mapping: {0: 'Airfare', 1: 'Car Rental', 2: 'Dining', 3: 'Entertainment', 4: 'Gas/Automotive', 5: 'Health Care', 6: 'Healthcare', 7: 'Insurance', 8: 'Internet', 9: 'Lodging', 10: 'Merchandise', 11: 'Other', 12: 'Other Services', 13: 'Other Travel', 14: 'Payment/Credit', 15: 'Phone/Cable', 16: 'Professional Services', 17: 'Utilities'}


### Predict Testing

In [63]:
# 1. Load new data (assuming it's in a CSV called 'new_transactions.csv')
new_data = pd.read_csv("data/rawdata/expenses_2024/2024-12-14_bank_card_transaction.csv") 

In [64]:
import re
import nltk

# Ensure stopwords are downloaded
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

# Define the text cleaning function
def clean_text(text):
    # Remove "TST*" prefix
    text = re.sub(r'^TST\*', '', text)
    
    # Remove numeric codes
    text = re.sub(r'\d+', '', text)
    
    # Remove unwanted characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers (adjust if needed)
    text = re.sub(r"^\d+\s", "", text)
    text = re.sub(r"^sq\s?", "", text)
    text = re.sub(r"\d+$", "", text)
    text = re.sub(r"(amazon|walmart).*", r"\1", text)  # Keep only "amazon" or "walmart"
    
    # Remove double quotes
    text = text.replace('"', '')
    
    # Remove extra spaces
    text = ' '.join(text.split())
    
    # Remove stop words (optional)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    
    return text

# Load the new data
new_data = pd.read_csv("data/rawdata/expenses_2024/2024-12-14_bank_card_transaction.csv")

# Apply the cleaning function to the 'Name' column
new_data["Cleaned_Name"] = new_data["Name"].apply(clean_text)

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("fine_tuned_transaction_classifier")

# Define the id2label dictionary based on your training data
# Ensure this dictionary matches the one used during training


# Create a pipeline for prediction (set function_to_apply to 'softmax')
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, function_to_apply='softmax')

# Make predictions using the cleaned text data
predictions = classifier(new_data["Cleaned_Name"].tolist())

# Extract the predicted labels (numerical codes)
#predicted_codes = [int(prediction['label'].replace('LABEL_', '')) for prediction in predictions]

# Extract labels from the predictions
labels = [prediction['label'] for prediction in predictions]

# Add the labels as a new column in the DataFrame
new_data['Predicted_Category'] = labels

# Print or save the updated DataFrame
print(new_data[['Cleaned_Name', 'Predicted_Category']])
# new_data.to_csv("categorized_transactions.csv", index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Holden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Device set to use mps:0


                             Cleaned_Name Predicted_Category
0             py coal yard cafe ithaca ny             Dining
1                     rens mart ithaca ny        Merchandise
2                       usps po ithaca ny     Other Services
3                   fedex offic ithaca ny     Other Services
4                        target ithaca ny     Other Services
5                             etsy inc ny        Merchandise
6                google one ai premium ca        Phone/Cable
7               cayuga medical associa ny        Health Care
8                                  amazon        Merchandise
9   carnegie mellon univer httpswwwcmu pa      Entertainment
10                                 amazon        Merchandise
11                                 amazon        Merchandise
12                          payment thank        Merchandise
13            courseraorg httpswwwcour ca      Entertainment
14                                 amazon        Merchandise
15           collegetown

### Predict Full 2024 Data

In [65]:
# 1. Load 2024 Data
full_data = pd.read_csv("data/rawdata/expenses_2024/2024-12-17_credit_card_12_31_2023_2024.csv")
# 2. Apply the cleaning function to the 'Name' column
full_data["Cleaned_Name"] = full_data["Name"].apply(clean_text)

In [73]:
# 3. Load the tokenizer and model same as before
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("fine_tuned_transaction_classifier")

# 4. Create a new pipeline for prediction (set function_to_apply to 'softmax')
classifier_full = pipeline("text-classification", model=model, tokenizer=tokenizer, function_to_apply='softmax')

# Make predictions using the cleaned text data
predictions_full = classifier_full(full_data["Cleaned_Name"].tolist())

# Extract the predicted labels (numerical codes)
#predicted_codes = [int(predictions_full['label'].replace('LABEL_', '')) for prediction in predictions]

# Extract labels from the predictions_full
labels_full = [prediction['label'] for prediction in predictions_full]

# Add the labels as a new column in the DataFrame
full_data['Predicted_Category'] = labels_full

# (Optional) Print or save the updated DataFrame
#full_data

Device set to use mps:0


In [74]:
full_data[['Cleaned_Name', 'Predicted_Category']].drop_duplicates().sort_values(by='Cleaned_Name').head(5)

Unnamed: 0,Cleaned_Name,Predicted_Category
43,adams corners caf ithaca ny,Dining
41,amazon,Merchandise
25,amzn mktp usbsku amzncombill wa,Merchandise
16,amzn mktp usddota amzncombill wa,Merchandise
12,amzn mktp usrae amzncombill wa,Merchandise


### Conclusion

This worked pretty well overall. Having a larger training dataset would have been ideal but that would mean that I would have to spend a lot more money :D or get a third party dataset. Overall, I like this fine tuning approach to local. 

#### Next steps
- On this would be to see if an LLM can do better
- An evaluation of the category
- Compile all the data and see what the difference is in spending per category

#### Notable issues
- Some notable exceptions are Apple.com was categrized as dining.
- A lot of things get categorized into Amazon Merchandise
- I should be scrubbing out payments and credits

I tried increasing the number of epochs but that did not seem to make an observable.