In [1]:
# load processed df
from IPython.utils.capture import capture_output

with capture_output():
    %run 03_preprocessing.ipynb

# Advanced NLP Models: Neural Network and BERT

This notebook trains advanced models including Deep Neural Networks and BERT-based transformers for transaction categorization.

In [2]:
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Number of classes: {len(np.unique(y_train))}")
print(f"Class distribution in training set:")
print(pd.Series(y_train).value_counts())

Training set shape: (969666, 5022)
Test set shape: (336786, 5022)
Number of classes: 9
Class distribution in training set:
GENERAL_MERCHANDISE    391492
FOOD_AND_BEVERAGES     357992
GROCERIES              162754
TRAVEL                  41839
PETS                     6599
EDUCATION                3329
RENT                     2518
OVERDRAFT                2433
MORTGAGE                  710
Name: count, dtype: int64


## GPU Configuration

Check for GPU availability and configure both TensorFlow and PyTorch to use it.

In [15]:
import tensorflow as tf
import torch

# Check TensorFlow GPU
print("TensorFlow GPU Check:")
print(f"  GPU Available: {tf.config.list_physical_devices('GPU')}")
if tf.config.list_physical_devices('GPU'):
    print(f"  GPU Devices: {[gpu.name for gpu in tf.config.list_physical_devices('GPU')]}")
    # Enable memory growth to avoid allocating all GPU memory at once
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
    print("  ✓ GPU memory growth enabled")
else:
    print("  ⚠ No GPU detected for TensorFlow")

# Check PyTorch GPU
print("\nPyTorch GPU Check:")
print(f"  CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"  CUDA Device: {torch.cuda.get_device_name(0)}")
    print(f"  CUDA Version: {torch.version.cuda}")
    device = torch.device("cuda")
    print(f"  ✓ Using device: {device}")
else:
    device = torch.device("cpu")
    print(f"  ⚠ CUDA not available, using CPU")

print(f"\nDevice for BERT training: {device}")

TensorFlow GPU Check:
  GPU Available: []
  ⚠ No GPU detected for TensorFlow

PyTorch GPU Check:
  CUDA Available: True
  CUDA Device: NVIDIA GeForce RTX 4090 Laptop GPU
  CUDA Version: 12.1
  ✓ Using device: cuda

Device for BERT training: cuda


**Note:** TensorFlow doesn't detect GPU because it needs proper CUDA setup. Since PyTorch detects your RTX 4090, we'll use it for BERT training which will be **significantly faster** than CPU. The Neural Network model can still run efficiently on CPU with the current TensorFlow setup.

## 1. Neural Network with TensorFlow/Keras

A deep learning approach using a neural network with dense layers and dropout for regularization.

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import time

# Convert sparse matrices to dense for neural network
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Encode labels
nn_label_encoder = LabelEncoder()
y_train_nn = nn_label_encoder.fit_transform(y_train)
y_test_nn = nn_label_encoder.transform(y_test)

n_classes = len(np.unique(y_train))
input_dim = X_train_dense.shape[1]

print(f"Building Neural Network...")
print(f"Input dimension: {input_dim}")
print(f"Number of classes: {n_classes}")

# Build model
nn_model = keras.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(n_classes, activation='softmax')
])

nn_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

nn_model.summary()

Building Neural Network...
Input dimension: 5022
Number of classes: 9


In [7]:
# Train the neural network
print(f"\nTraining Neural Network...")
start_time = time.time()

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = nn_model.fit(
    X_train_dense, y_train_nn,
    validation_split=0.2,
    epochs=30,
    batch_size=128,
    callbacks=[early_stopping],
    verbose=1
)

training_time = time.time() - start_time
print(f"\nTraining completed in {training_time:.2f} seconds")


Training Neural Network...
Epoch 1/30
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 17ms/step - accuracy: 0.9352 - loss: 0.1832 - val_accuracy: 0.9190 - val_loss: 0.2336
Epoch 2/30
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 17ms/step - accuracy: 0.9531 - loss: 0.1303 - val_accuracy: 0.9216 - val_loss: 0.2314
Epoch 3/30
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 16ms/step - accuracy: 0.9562 - loss: 0.1197 - val_accuracy: 0.9231 - val_loss: 0.2306
Epoch 4/30
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 15ms/step - accuracy: 0.9584 - loss: 0.1131 - val_accuracy: 0.9227 - val_loss: 0.2290
Epoch 5/30
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 16ms/step - accuracy: 0.9598 - loss: 0.1088 - val_accuracy: 0.9218 - val_loss: 0.2404
Epoch 6/30
[1m6061/6061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 18ms/step - accuracy: 0.9609 - loss: 0.1053 - val_accuracy:

In [8]:
# Evaluate Neural Network
y_pred_nn = nn_model.predict(X_test_dense)
y_pred_nn_classes = np.argmax(y_pred_nn, axis=1)
y_pred_nn_labels = nn_label_encoder.inverse_transform(y_pred_nn_classes)

print(f"\n{'='*50}")
print(f"Neural Network Results")
print(f"{'='*50}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nn_labels):.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_nn_labels))

[1m10525/10525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step

Neural Network Results
Accuracy: 0.9315

Classification Report:
                     precision    recall  f1-score   support

          EDUCATION       0.85      0.49      0.62      1170
 FOOD_AND_BEVERAGES       0.89      0.96      0.92    124002
GENERAL_MERCHANDISE       0.96      0.91      0.93    132571
          GROCERIES       0.96      0.93      0.94     56577
           MORTGAGE       0.95      0.87      0.91       409
          OVERDRAFT       0.99      0.98      0.98       953
               PETS       0.99      0.92      0.95      2667
               RENT       0.74      0.83      0.78       629
             TRAVEL       0.96      0.92      0.94     17808

           accuracy                           0.93    336786
          macro avg       0.92      0.87      0.89    336786
       weighted avg       0.93      0.93      0.93    336786



## 2. BERT-based Models

Let's try fine-tuning pre-trained BERT models for transaction categorization. We'll use DistilBERT (a lighter version) for efficiency.

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Prepare data for BERT - need the original text with categories
# Get from outflows_cleaned which has both memo_clean and category
bert_train_df = outflows_cleaned[outflows_cleaned['prism_consumer_id'].isin(outflows_train_ids)].copy()
bert_test_df = outflows_cleaned[outflows_cleaned['prism_consumer_id'].isin(outflows_test_ids)].copy()

# Use a subset for faster training
sample_size = 50000
bert_train_df = bert_train_df.sample(n=min(sample_size, len(bert_train_df)), random_state=42)
bert_test_df = bert_test_df.sample(n=min(10000, len(bert_test_df)), random_state=42)

print(f"BERT training samples: {len(bert_train_df)}")
print(f"BERT test samples: {len(bert_test_df)}")

BERT training samples: 50000
BERT test samples: 10000


In [12]:
# Create label mapping
unique_labels = sorted(bert_train_df['category'].unique())
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Label mapping:")
for label, idx in label2id.items():
    print(f"  {label}: {idx}")

Label mapping:
  EDUCATION: 0
  FOOD_AND_BEVERAGES: 1
  GENERAL_MERCHANDISE: 2
  GROCERIES: 3
  MORTGAGE: 4
  OVERDRAFT: 5
  PETS: 6
  RENT: 7
  TRAVEL: 8


In [13]:
# Prepare datasets
bert_train_dataset = Dataset.from_dict({
    'text': bert_train_df['memo_clean'].tolist(),
    'label': [label2id[label] for label in bert_train_df['category'].tolist()]
})

bert_test_dataset = Dataset.from_dict({
    'text': bert_test_df['memo_clean'].tolist(),
    'label': [label2id[label] for label in bert_test_df['category'].tolist()]
})

print(f"Train dataset: {bert_train_dataset}")
print(f"Test dataset: {bert_test_dataset}")

Train dataset: Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})
Test dataset: Dataset({
    features: ['text', 'label'],
    num_rows: 10000
})


In [14]:
# Load DistilBERT tokenizer and tokenize datasets
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

bert_train_tokenized = bert_train_dataset.map(tokenize_function, batched=True)
bert_test_tokenized = bert_test_dataset.map(tokenize_function, batched=True)

print("Datasets tokenized successfully")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Datasets tokenized successfully


In [17]:
# Load DistilBERT model and move to GPU if available
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

if torch.cuda.is_available():
    bert_model = bert_model.to(device)
    print(f"✓ Model loaded with {len(unique_labels)} labels and moved to GPU")
else:
    print(f"Model loaded with {len(unique_labels)} labels (CPU)")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Model loaded with 9 labels and moved to GPU


In [18]:
# Define training arguments with GPU optimization
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32 if torch.cuda.is_available() else 16,  # Larger batch on GPU
    per_device_eval_batch_size=32 if torch.cuda.is_available() else 16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir='./logs',
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Enable mixed precision on GPU
    dataloader_num_workers=0,  # Windows compatibility
    no_cuda=not torch.cuda.is_available(),  # Use GPU if available
)

print(f"✓ Training configured to use {'GPU with FP16' if torch.cuda.is_available() else 'CPU'}")

✓ Training configured to use GPU with FP16


In [19]:
# Define compute metrics function
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

print("Metrics function defined")

Metrics function defined


In [20]:
# Create Trainer and train the model
trainer = Trainer(
    model=bert_model,
    args=training_args,
    train_dataset=bert_train_tokenized,
    eval_dataset=bert_test_tokenized,
    compute_metrics=compute_metrics,
)

print("Starting BERT training...")
start_time = time.time()

trainer.train()

bert_training_time = time.time() - start_time
print(f"\nBERT training completed in {bert_training_time:.2f} seconds")

Starting BERT training...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2056,0.222107,0.9274,0.926494,0.9258,0.9274
2,0.1298,0.232667,0.934,0.933459,0.934482,0.934
3,0.0828,0.249376,0.9362,0.935752,0.936758,0.9362


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



BERT training completed in 317.76 seconds


In [22]:
# Evaluate BERT on full test set
print("Evaluating BERT model on full test set...")
bert_eval_results = trainer.evaluate()

print(f"\n{'='*50}")
print(f"BERT (DistilBERT) Results")
print(f"{'='*50}")
print(f"Accuracy: {bert_eval_results['eval_accuracy']:.4f}")
print(f"F1 Score: {bert_eval_results['eval_f1']:.4f}")
print(f"Precision: {bert_eval_results['eval_precision']:.4f}")
print(f"Recall: {bert_eval_results['eval_recall']:.4f}")


Evaluating BERT model on full test set...

BERT (DistilBERT) Results
Accuracy: 0.9362
F1 Score: 0.9358
Precision: 0.9368
Recall: 0.9362


## 3. RoBERTa Model

RoBERTa is an optimized version of BERT with improved training methodology, often performing better on classification tasks.

In [31]:
# Load RoBERTa model
roberta_model_name = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)

# Tokenize datasets for RoBERTa
def roberta_tokenize_function(examples):
    return roberta_tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

roberta_train_tokenized = bert_train_dataset.map(roberta_tokenize_function, batched=True)
roberta_test_tokenized = bert_test_dataset.map(roberta_tokenize_function, batched=True)

print("RoBERTa datasets tokenized successfully")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

RoBERTa datasets tokenized successfully


In [32]:
# Load RoBERTa model for sequence classification
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

if torch.cuda.is_available():
    roberta_model = roberta_model.to(device)
    print(f"✓ RoBERTa loaded with {len(unique_labels)} labels and moved to GPU")
else:
    print(f"RoBERTa loaded with {len(unique_labels)} labels (CPU)")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ RoBERTa loaded with 9 labels and moved to GPU


In [33]:
# Define training arguments for RoBERTa
roberta_training_args = TrainingArguments(
    output_dir="./results_roberta",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32 if torch.cuda.is_available() else 16,
    per_device_eval_batch_size=32 if torch.cuda.is_available() else 16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir='./logs_roberta',
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,
    no_cuda=not torch.cuda.is_available(),
)

print(f"✓ RoBERTa training configured to use {'GPU with FP16' if torch.cuda.is_available() else 'CPU'}")

✓ RoBERTa training configured to use GPU with FP16


In [34]:
# Create Trainer and train RoBERTa
roberta_trainer = Trainer(
    model=roberta_model,
    args=roberta_training_args,
    train_dataset=roberta_train_tokenized,
    eval_dataset=roberta_test_tokenized,
    compute_metrics=compute_metrics,
)

print("Starting RoBERTa training...")
start_time = time.time()

roberta_trainer.train()

roberta_training_time = time.time() - start_time
print(f"\nRoBERTa training completed in {roberta_training_time:.2f} seconds")

Starting RoBERTa training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3031,0.29708,0.9087,0.907777,0.907644,0.9087
2,0.2058,0.282049,0.9173,0.916942,0.91757,0.9173
3,0.16,0.273245,0.9221,0.922129,0.922351,0.9221


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



RoBERTa training completed in 477.02 seconds


In [35]:
# Evaluate RoBERTa
print("Evaluating RoBERTa model...")
roberta_eval_results = roberta_trainer.evaluate()

print(f"\n{'='*50}")
print(f"RoBERTa Results")
print(f"{'='*50}")
print(f"Accuracy: {roberta_eval_results['eval_accuracy']:.4f}")
print(f"F1 Score: {roberta_eval_results['eval_f1']:.4f}")
print(f"Precision: {roberta_eval_results['eval_precision']:.4f}")
print(f"Recall: {roberta_eval_results['eval_recall']:.4f}")

Evaluating RoBERTa model...



RoBERTa Results
Accuracy: 0.9221
F1 Score: 0.9221
Precision: 0.9224
Recall: 0.9221


## Model Comparison

Compare all three models side by side.

In [36]:
# Model Comparison
print(f"\n{'='*50}")
print(f"Model Comparison Summary")
print(f"{'='*50}")
results_df = pd.DataFrame({
    'Model': ['Neural Network', 'BERT (DistilBERT)', 'RoBERTa'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_nn_labels),
        bert_eval_results['eval_accuracy'],
        roberta_eval_results['eval_accuracy']
    ],
    'F1 Score': [
        precision_recall_fscore_support(y_test, y_pred_nn_labels, average='weighted')[2],
        bert_eval_results['eval_f1'],
        roberta_eval_results['eval_f1']
    ]
})

results_df = results_df.sort_values('Accuracy', ascending=False).reset_index(drop=True)
results_df


Model Comparison Summary


Unnamed: 0,Model,Accuracy,F1 Score
0,BERT (DistilBERT),0.9362,0.935752
1,Neural Network,0.931535,0.931525
2,RoBERTa,0.9221,0.922129
