# ELECTRA LLM

## Considerations:

- __Combining Features__: Use ELECTRA embeddings for the memo text, and concatenate them with normalized versions of your numeric features (amount and date) for a combined representation.
- __Optimizing for Inference__: Quantization (e.g., 8-bit) can improve inference times without a significant loss in accuracy, which helps meet your latency requirements.
- __Class Imbalance__: During fine-tuning, focus on weighted cross-entropy or focal loss to handle the imbalance effectively.


In [13]:
# !pip install transformers datasets evaluate accelerate

In [2]:
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
import numpy as np
import matplotlib.pyplot as plt
import re
from datasets import Dataset, ClassLabel
from sklearn.preprocessing import LabelEncoder
import pickle


In [3]:
outflows = pd.read_parquet('ucsd-outflows.pqt')
outflows = outflows[outflows.category != outflows.memo].reset_index(drop=True)
outflows.posted_date = pd.to_datetime(outflows.posted_date)

In [4]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
outflows['amount'] = scaler.fit_transform(outflows[['amount']])

outflows['day_of_week'] = outflows.posted_date.dt.dayofweek
outflows['month_num'] = outflows.posted_date.dt.month
outflows['is_weekend'] = outflows.day_of_week >= 5

outflows.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,day_of_week,month_num,is_weekend
0,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,0.0001,2022-09-26,FOOD_AND_BEVERAGES,0,9,False
1,0,acc_0,Buffalo Wild Wings,0.0001,2022-09-12,FOOD_AND_BEVERAGES,0,9,False
2,0,acc_0,Oculus CA 04/16,0.0,2022-04-18,GENERAL_MERCHANDISE,0,4,False
3,0,acc_0,LOS GIRASOLES STOW OH 03/08,0.0001,2022-03-09,FOOD_AND_BEVERAGES,2,3,False
4,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,0.0,2022-03-29,GENERAL_MERCHANDISE,1,3,False


In [5]:
categories = outflows.category.unique()
categories

array(['FOOD_AND_BEVERAGES', 'GENERAL_MERCHANDISE', 'GROCERIES', 'PETS',
       'TRAVEL', 'MORTGAGE', 'OVERDRAFT', 'EDUCATION', 'RENT'],
      dtype=object)

In [6]:
id2label = {i: categories[i] for i in range(len(categories))}
label2id = {categories[i]: i for i in range(len(categories))}

In [7]:
id2label

{0: 'FOOD_AND_BEVERAGES',
 1: 'GENERAL_MERCHANDISE',
 2: 'GROCERIES',
 3: 'PETS',
 4: 'TRAVEL',
 5: 'MORTGAGE',
 6: 'OVERDRAFT',
 7: 'EDUCATION',
 8: 'RENT'}

In [8]:
label2id

{'FOOD_AND_BEVERAGES': 0,
 'GENERAL_MERCHANDISE': 1,
 'GROCERIES': 2,
 'PETS': 3,
 'TRAVEL': 4,
 'MORTGAGE': 5,
 'OVERDRAFT': 6,
 'EDUCATION': 7,
 'RENT': 8}

In [9]:
labels = list(label2id.keys())
labels

['FOOD_AND_BEVERAGES',
 'GENERAL_MERCHANDISE',
 'GROCERIES',
 'PETS',
 'TRAVEL',
 'MORTGAGE',
 'OVERDRAFT',
 'EDUCATION',
 'RENT']

In [10]:
outflows['category_id'] = outflows.category.apply(lambda c: label2id[c])
outflows.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,day_of_week,month_num,is_weekend,category_id
0,0,acc_0,TST* Casa Del Rio - Exp Fairlawn OH 09/24,0.0001,2022-09-26,FOOD_AND_BEVERAGES,0,9,False,0
1,0,acc_0,Buffalo Wild Wings,0.0001,2022-09-12,FOOD_AND_BEVERAGES,0,9,False,0
2,0,acc_0,Oculus CA 04/16,0.0,2022-04-18,GENERAL_MERCHANDISE,0,4,False,1
3,0,acc_0,LOS GIRASOLES STOW OH 03/08,0.0001,2022-03-09,FOOD_AND_BEVERAGES,2,3,False,0
4,0,acc_0,BUZZIS LAUNDRY 1 OH 03/28,0.0,2022-03-29,GENERAL_MERCHANDISE,1,3,False,1


## Tokenize and Prepare Features

In [11]:
from transformers import AutoTokenizer

In [12]:
# outflows.columns

In [13]:
outflows = outflows.drop(columns=['prism_consumer_id', 'prism_account_id', 'category'])

outflows = outflows[['memo', 'category_id']]

In [14]:
# dataset[0]

In [15]:
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator", clean_up_tokenization_spaces=True)
tokenizer

ElectraTokenizerFast(name_or_path='google/electra-small-discriminator', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [16]:
def tokenize_function(examples):
    # Tokenize the text
    tokens = tokenizer(examples["memo"], padding="max_length", truncation=True)
    
    # Add additional features
    tokens['amount'] = examples['amount']
    tokens['day_of_week'] = examples['day_of_week']
    tokens['month_num'] = examples['month_num']
    tokens['is_weekend'] = examples['is_weekend']
    return tokens


In [17]:
memos = outflows.memo.values.tolist()

tokenized_memos = tokenizer(memos, truncation=True) # potentially create col in df and feed that to MemoDataset

In [18]:
import torch
class MemoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [19]:
test_size = int(len(memos) * 0.25)
train_size = len(memos) - test_size

In [20]:
# outflows.head()

In [21]:
X_train = tokenized_memos[:train_size]
X_test = tokenized_memos[train_size:]

y_train = outflows.category_id.values.tolist()[:train_size]
y_test = outflows.category_id.values.tolist()[train_size:]

In [22]:
train = MemoDataset(X_train, y_train)
test = MemoDataset(X_test, y_test)

In [23]:
train.__len__()

979839

In [29]:
# tokenized_dataset = dataset.map(tokenize_function, batched=True,) # 4 cause broken pipe error
# num_proc=1 --> subprocessses keep failing (does not take too long though)

# tokenized_dataset = dataset.map(lambda examples: tokenizer(examples["memo"], padding="max_length", truncation=True), batched=True,) # 4 cause broken pipe error


In [30]:
# tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", 'memo', "category_id"])

In [51]:
# tokenized_dataset.rename_column("category_id", "labels")


In [27]:
# train_test_split = tokenized_dataset.train_test_split(test_size=0.25)
# train = train_test_split['train']
# test = train_test_split['test']


In [28]:
# train.num_rows, test.num_rows

In [29]:
# train.features

In [None]:
# train['token_type_ids'][:10]

# Defining Model

### Model Training

In [24]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

import torch
print("CUDA available:", torch.cuda.is_available())

print("GPU in use:", torch.cuda.current_device())
torch.cuda.empty_cache()

2024-11-08 02:12:00.666732: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-08 02:12:00.666803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-08 02:12:00.668441: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 02:12:00.678081: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  1
CUDA available: True
GPU in use: 0


In [25]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, ElectraForSequenceClassification


In [26]:
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm

In [27]:
# !export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"


In [28]:
num_labels = outflows.category_id.nunique() # should be 9
print(f'number of labels = {num_labels}')


number of labels = 9


In [29]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [30]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=ElectraTokenizerFast(name_or_path='google/electra-small-discriminator', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='

In [31]:
model = ElectraForSequenceClassification.from_pretrained(
    "SALT-NLP/FLANG-ELECTRA", num_labels=num_labels, id2label=id2label, label2id=label2id, #problem_type="multi_label_classification"
)


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at SALT-NLP/FLANG-ELECTRA and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
training_args = TrainingArguments(
    output_dir="./electra_llm/results",              # output directory where model checkpoints will be saved
    num_train_epochs=3,                  # number of epochs
    # fp16=True,                           # enable mixed precisioning, may speed up training
    per_device_train_batch_size=8,       # batch size for training
    per_device_eval_batch_size=64,       # batch size for evaluation
    warmup_steps=500,                    # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                   # strength of weight decay
    logging_dir="./electra_llm/logs",    # directory to store logs
    logging_steps=10,                    # log every 10 steps
    eval_strategy="epoch",               # evaluate after each epoch
    save_strategy="epoch",               # save model after each epoch
    load_best_model_at_end=True,         # load the best model when finished training
    metric_for_best_model="accuracy",    # metric to determine the best model
    logging_first_step=True,             # log first step
    # use_cpu=False,
    dataloader_num_workers=8,             # have 8 cpus to use
    report_to='tensorboard',
    # disable_tqdm=False,                   # Enable tqdm progress bar
)


In [33]:
# google/electra-small-discriminator
# model = AutoModelForSequenceClassification.from_pretrained("SALT-NLP/FLANG-ELECTRA", num_labels=num_labels, id2label=id2label, label2id=label2id)


In [34]:
# !pip install --upgrade transformers

In [35]:
trainer = Trainer(
    model=model,                          # your model
    args=training_args,                   # training arguments
    data_collator=data_collator,          # collator for batching
    compute_metrics=compute_metrics,      # metrics function
    processing_class=tokenizer,           # tokenizer
    train_dataset=train,          # training dataset
    eval_dataset=test             # evaluation dataset
)

model.to('cuda')
model.gradient_checkpointing_enable()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

In [36]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

AttributeError: Caught AttributeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
           ^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_9218/609361498.py", line 8, in __getitem__
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
                                                        ^^^^^^^^^^^^^^^^^^^^
AttributeError: 'list' object has no attribute 'items'


# Model Evaluation

In [None]:
results = trainer.evaluate(test_dataset)

In [None]:
model.save_pretrained("./electra_llm/final_model")
tokenizer.save_pretrained("./electra_llm/final_model")

In [157]:
import torch
from transformers import AutoTokenizer, ElectraForSequenceClassification

In [158]:
tokenizer = AutoTokenizer.from_pretrained("bhadresh-savani/electra-base-emotion")
model = ElectraForSequenceClassification.from_pretrained("bhadresh-savani/electra-base-emotion", problem_type="multi_label_classification")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/336 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

In [159]:
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_ids = torch.arange(0, logits.shape[-1])[torch.sigmoid(logits).squeeze(dim=0) > 0.5]

In [161]:
num_labels = len(model.config.id2label)
model = ElectraForSequenceClassification.from_pretrained(
    "bhadresh-savani/electra-base-emotion", num_labels=num_labels, problem_type="multi_label_classification"
)

labels = torch.sum(
    torch.nn.functional.one_hot(predicted_class_ids[None, :].clone(), num_classes=num_labels), dim=1
).to(torch.float)
loss = model(**inputs, labels=labels).loss

In [168]:
text = "I'm am not feeling sad."

# Tokenize the input text
inputs = tokenizer(text, return_tensors="pt")

# Get predictions without gradient calculation
with torch.no_grad():
    logits = model(**inputs).logits

# Apply sigmoid to get probabilities
probabilities = torch.sigmoid(logits).squeeze()

# Set a threshold (e.g., 0.5) to decide on the predicted labels
threshold = 0.5
predicted_class_ids = torch.where(probabilities > threshold)[0]

# Map predicted class IDs to class names
predicted_labels = [model.config.id2label[class_id.item()] for class_id in predicted_class_ids]

print("Predicted labels:", predicted_labels)


Predicted labels: ['sadness']


In [165]:
labels

tensor([[0., 1., 1., 0., 0., 0.]])