In [33]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.utils import resample
import os
from datasets import Dataset

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vanisingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vanisingh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Importing the dataset to train the model on

This model is trained on the data in *accountactivity.csv*. This is an accumulation of many credit card statements for the *Date*, *Description*, and *Amount* columns. The transactions were then manually labeled in the *LABEL* column.

In [34]:
data_path = "accountactivity.csv"
df = pd.read_csv(data_path, encoding='utf-8', encoding_errors='replace')
df.head()

Unnamed: 0,DATE,DESCRIPTION,AMOUNT,LABEL
0,2024-11-29,Amazon,6.36,Retail
1,2024-11-28,THAI SPICE RESTAURAN,51.47,Dining
2,2024-11-28,Amazon,10.44,Retail
3,2024-11-28,Lyft,12.63,Transportation
4,2024-11-27,Target,9.17,Retail


### Cleaning the Data

remove stopwords and punctuation from the *DESCRIPTION* column to prepare fore text analysis.

In [35]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def text_processing(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    
    # Tokenize, remove stopwords, and lemmatize
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["DESCRIPTION"] = df["DESCRIPTION"].apply(text_processing)

### Dropping all columns that do not have a value for LABEL

Remove all rows that do not have a value for the target column of the model.

In [36]:
df = df.dropna(subset=["LABEL"])

### Getting value counts for each LABEL category

In [37]:
df['LABEL'].value_counts()

LABEL
Dining            445
Other             216
Retail            213
Grocery           190
Gas                67
Transportation     62
Travel             44
Subscription       37
Mobility           36
Entertainment      26
Pharmaceutical     18
Name: count, dtype: int64

The output clearly shows that the data is **imbalanced**

### Getting the data ready for training the model

In [38]:
# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Encode labels as integers (0 = Retail, 1 = Dining, 2 = Travel)
label_encoder = LabelEncoder()
df["LABEL"] = label_encoder.fit_transform(df["LABEL"])
num_classes = len(label_encoder.classes_)

# Mapping from numeric values to original labels
label_mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

print("Label mapping:", label_mapping)

Label mapping: {0: 'Dining', 1: 'Entertainment', 2: 'Gas', 3: 'Grocery', 4: 'Mobility', 5: 'Other', 6: 'Pharmaceutical', 7: 'Retail', 8: 'Subscription', 9: 'Transportation', 10: 'Travel'}


### Downloading the tokenizer

In [39]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )



### Split the data into train and test

In [40]:
X_train, X_val, y_train, y_val = train_test_split(df["DESCRIPTION"],
                                                  df["LABEL"], test_size=0.2, random_state=42)

### Using SMOTE to oversample under repersented categories

In [41]:
df = pd.concat([X_train, y_train], axis=1)
label_counts = df['LABEL'].value_counts()

# Find the label with the maximum frequency
max_count = int(label_counts.median())

# Initialize an empty DataFrame to hold the oversampled data
oversampled_df = pd.DataFrame()

# For each label, resample the data to match the maximum count
for label, count in label_counts.items():
    label_df = df[df['LABEL'] == label]
    # If the label has fewer rows, oversample it
    if count < max_count:
        oversampled_label_df = resample(label_df, 
                                        replace=True,     # Allow sampling with replacement
                                        n_samples=max_count,  # To match the majority class
                                        random_state=42)  # For reproducibility
        oversampled_df = pd.concat([oversampled_df, oversampled_label_df])
    else:
        # If the label has more or equal rows, keep it as is
        oversampled_df = pd.concat([oversampled_df, label_df])

# Shuffling the DataFrame
oversampled_df_train = oversampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [42]:
X_train = oversampled_df['DESCRIPTION']
y_train = oversampled_df['LABEL']

### Tokenizing the data

Define a custom PyTorch Dataset class, TransactionDataset, to preprocess and encode textual data and labels for training. It tokenizes input texts, converts labels to tensors, and prepares batched data loaders for training and validation with specified batch sizes and shuffling configurations.

In [43]:
class TransactionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.encodings = tokenize_data(texts, tokenizer, max_length)
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [44]:
train_dataset = TransactionDataset(X_train.tolist(), y_train, tokenizer)
val_dataset = TransactionDataset(X_val.tolist(), y_val, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

### Loading the model, optimizer and scheduler for learning rate

Used a dynamic learning rate to ensure better convergence.

In [45]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_classes)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [46]:
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.1, patience=2, verbose=True)



### Defining the train and evaluate function for training BERT

Train the model for a specified number of epochs, computing the training loss and evaluating the model on validation data after each epoch.
Log training loss, validation loss, and validation accuracy, saving them into a history DataFrame.
Save the model's state after every epoch.

In [47]:
def train_model(model, train_dataloader, val_dataloader, epochs, optimizer, scheduler, device):
    # Initialize a DataFrame to store losses
    history = {"epoch": [], "train_loss": [], "val_loss": [], "val_accuracy": []}

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        print(f"Epoch {epoch + 1}/{epochs}")

        for batch in tqdm(train_dataloader, desc="Training"):
            batch = {key: val.to(device) for key, val in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Evaluate the model on validation data
        val_loss, val_accuracy = evaluate_model(model, val_dataloader, device)

        # Log losses
        train_loss = total_loss / len(train_dataloader)
        history["epoch"].append(epoch + 1)
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["val_accuracy"].append(val_accuracy)

        print(f"  Training Loss: {train_loss:.4f}")
        print(f"  Validation Loss: {val_loss:.4f}")
        print(f"  Validation Accuracy: {val_accuracy:.4f}")

        # Step the scheduler
        scheduler.step(val_loss)  # Adjust based on the scheduler type (see below)

        epoch_save_path = f"models/model_epoch_{epoch + 1}.pt"
        torch.save(model.state_dict(), epoch_save_path)
        print(f"  Model saved at: {epoch_save_path}")

    # Convert the history dictionary to a DataFrame
    history_df = pd.DataFrame(history)
    return history_df

Evaluate the model on validation data, computing the average loss and accuracy without updating the model's weights.

In [48]:
def evaluate_model(model, val_dataloader, device):
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating"):
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    avg_loss = val_loss / len(val_dataloader)
    accuracy = correct / total
    return avg_loss, accuracy

### Train the Model

For each epoch, saves the model in a *models* folder

In [49]:
history_df = train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=15,
    optimizer=optimizer,
    device=device,
    scheduler=scheduler
)

Epoch 1/15


Training:  39%|████████████████████████████████████████████▍                                                                     | 58/149 [02:01<03:10,  2.10s/it]


KeyboardInterrupt: 

### Visualizing the results

Choose the model associated with the smallest validation loss

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(history_df["epoch"], history_df["train_loss"], label="Training Loss", marker="o")
plt.plot(history_df["epoch"], history_df["val_loss"], label="Validation Loss", marker="o")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training and Validation Loss per Epoch")
plt.legend()
plt.grid()
plt.show()