# Basic Business Event Extraction Using FinBERT

In [None]:
!pip install datasets



In [None]:
# import statements

# to be able to access files from my drive
from google.colab import drive

# to be able to read the files and load data
import csv
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from collections import Counter

# to be able to extract the text from URLS
from bs4 import BeautifulSoup
import requests

# to be able to run and finetune the FinBERT model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, TFAutoModelForSequenceClassification
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.nn import CrossEntropyLoss

# to address imbalance
from sklearn.model_selection import train_test_split

# if I decide to go back to tensorflow
#import tensorflow as tf
#from tensorflow import keras

# to save model
import os

# to be able to validate the fine tuning
from sklearn.metrics import classification_report, confusion_matrix

# for URL processing which we don't use
# to be able to extract the text from URLS
from bs4 import BeautifulSoup
import requests
# to clean the text
import re

In [None]:
# Mount Google Drive at /content/drive so the dataset and checkpoint paths
# used later in this notebook resolve inside the Colab runtime.
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset to finetune FinBERT for event detection
- load training data available from
  - https://drive.google.com/drive/folders/1xKjd9hzA8UTn2DXVIYYnX5TngNAMom19
  - https://github.com/Zhihan1996/TradeTheEvent/tree/main/data  
  
- Full sentences
- Stratified split

In [None]:
# Locations of the EDT event-detection files on Google Drive, keyed by split.
# NOTE: the 'test' key points at dev.txt (the development split).
file_path_EDT = {
    'train':'/content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/train.txt',
    'test': '/content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/dev.txt',
    }

In [None]:
# Sentence-level fine-tuning: read the raw training file into memory.
# NOTE(review): process_file (defined further down in this cell) opens the file
# itself and keeps its own local lists, so nothing visible in this notebook
# reads the module-level names created here -- confirm before removing them.
with open(file_path_EDT['train'], 'r') as file:
    lines = file.readlines()

# Accumulators for the parsed sentences and for the tokens / labels of the
# sentence currently being read.
sentences = []
current_sentence = []
current_labels = []

# Collapse the token-level tags of one sentence into a single sentence label
def get_summary_label(labels):
  """Return the sentence-level label for a sequence of token-level tags.

  The most frequent tag other than 'O' wins; on a tie the tag that appears
  first in `labels` is chosen, so the result is deterministic.  'O' is
  returned only when no other tag is present (or when `labels` is empty).

  Args:
    labels: iterable of tag strings such as 'O' or 'I-A'.

  Returns:
    The complete tag string selected for the sentence.
  """
  # Count event tags only: 'O' usually dominates a sentence and would
  # otherwise always win the frequency vote.
  event_counts = Counter(label for label in labels if label != 'O')
  if not event_counts:
    return 'O'
  # most_common(1) yields [(tag, count)]; return the whole tag, not its
  # first character.
  return event_counts.most_common(1)[0][0]


# Function to process a file and convert it to a DataFrame
def process_file(file_path):
    """Parse a token-per-line tagged file into a sentence-level DataFrame.

    Every non-blank line is expected to hold `token<TAB>label`; a blank line
    ends the current sentence.  A sentence still open at end of file is
    emitted as well.

    Args:
        file_path: path of the tab-separated file to read.

    Returns:
        pd.DataFrame with one row per sentence: column '0' is the tokens
        joined by single spaces, column '1' is the label returned by
        get_summary_label for that sentence's token labels.

    Raises:
        ValueError: if a non-blank line contains no tab separator.
    """
    sentences = []
    current_sentence = []
    current_labels = []

    def _flush():
        # Emit the sentence collected so far (if any) and reset the buffers.
        if current_sentence:
            sentences.append({
                '0': ' '.join(current_sentence),
                '1': get_summary_label(current_labels),
            })
            current_sentence.clear()
            current_labels.clear()

    # Stream the file line by line instead of loading it all into memory.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()  # Remove leading/trailing whitespace/newlines
            if not line:  # Empty line indicates end of a sentence
                _flush()
                continue

            # Split on the LAST tab so a token that itself contains a tab
            # still parses; the label is always the final field.
            word, separator, label = line.rpartition('\t')
            if not separator:
                raise ValueError(
                    f'{file_path}:{line_number}: expected "token<TAB>label", got {line!r}'
                )
            current_sentence.append(word)
            current_labels.append(label)

    # If there's a sentence left after the last line, add it
    _flush()

    return pd.DataFrame(sentences)

In [None]:
# Parse each split into a DataFrame: column '0' holds the sentence text and
# column '1' the sentence label chosen by get_summary_label.
df_train = process_file(file_path_EDT['train'])
df_test = process_file(file_path_EDT['test'])

In [None]:
# attempting to address balance issues
# Combine train and test for stratification.  ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it both halves keep their own
# 0-based index and row labels are duplicated.
df = pd.concat([df_train, df_test], ignore_index=True)

# Perform stratified split: both parts keep the label proportions of column '1'.
train_df, test_df = train_test_split(
    df,
    stratify=df['1'],
    test_size=0.2,
    random_state=42
)

# for the sentence labeling approach
# preserve_index=False keeps the shuffled pandas index from being carried into
# the datasets as an extra column next to '0' and '1'.
dataset_EDT = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'test': Dataset.from_pandas(test_df, preserve_index=False)
})

In [None]:
# Sanity check: show the first training row (text in '0', label in '1').
dataset_EDT['train'][0]

{'0': 'Altimar Acquisition Corporation Announces Pricing of $250 Million Initial Public Offering NEW YORK , Oct . 22 , 2020 / / Altimar Acquisition Corporation ( the "Company" ) , a special purpose acquisition company , announced the pricing of its initial public offering of 25,000,000 units at a price of $10.00 per unit . The units will be listed on The New York Stock Exchange and trade under the ticker symbol "ATACU" beginning October 23 , 2020 . Each unit consists of one Class A ordinary share of the Company and one-third of one redeemable warrant . Each whole warrant entitles the holder thereof to purchase one Class A ordinary share of the Company at a price of $11.50 per share . Once the securities comprising the units begin separate trading , the Class A ordinary shares and warrants are expected to be listed on The New York Stock Exchange under the symbols "ATAC" and "ATACW" , respectively . The Company is sponsored by Altimar Sponsor , LLC , an affiliate of HPS Investment Partne

## Label Mapping

In [None]:
# create a label mapping source: https://github.com/Zhihan1996/TradeTheEvent/tree/main/data
label_map = {
    'I-A': 0, # Acquisition
    'I-CT': 1, # Clinical Trial
    'I-RD': 2, # Regular Dividend
    'I-DC': 3, # Dividend Cut
    'I-DI': 4, # Dividend Increase
    'I-GI': 5, # Guidance Increase
    'I-NC': 6, # New Contract
    'I-RSS': 7, # Reverse Stock Split
    'I-SD': 8, # Special Dividend
    'I-SR': 9, # Stock Repurchase
    'I-SS': 10, # Stock Split(SS)
    'O': 11, # No Event
}

# Tags that occur in the data files under a different spelling than in
# label_map.  The files contain 'I-GC', which is not a key above.
# NOTE(review): presumably 'I-GC' is the guidance event class listed as
# 'I-GI' -- verify against the dataset before relying on class 5.
LABEL_ALIASES = {
    'I-GC': 'I-GI',
}

# helper function to map the labels
def map_labels(example):
  """Replace the string tag in example['1'] with its integer class id.

  Args:
    example: one dataset row; its '1' field holds either a tag string or an
      integer id produced by an earlier run of this function.

  Returns:
    The same `example` dict with '1' set to an integer id.  Tags that are
    neither in label_map nor in LABEL_ALIASES are reported on stdout and
    mapped to the 'O' (no event) id.
  """
  raw_label = example['1']

  # Already an id: leave it alone so that re-running the mapping cell does not
  # turn every label into 'O'.
  if isinstance(raw_label, int) and not isinstance(raw_label, bool):
    return example

  canonical_label = LABEL_ALIASES.get(raw_label, raw_label)
  try:
    example['1'] = label_map[canonical_label]
  except KeyError as e:
    example['1'] = label_map['O']
    print(f'KeyError: {e} - Key not found in label_map')
  return example

# Apply map_labels to every row of every split, replacing the tag string in
# column '1' with its integer class id.
dataset_EDT = dataset_EDT.map(map_labels)

Map:   0%|          | 0/7774 [00:00<?, ? examples/s]

KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found i

Map:   0%|          | 0/1944 [00:00<?, ? examples/s]

KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map
KeyError: 'I-GC' - Key not found in label_map


In [None]:
# Sanity check after label mapping: show the second training row.
dataset_EDT['train'][1]

{'0': "Qorvo Awarded DoD Contract to Advance Copper-Pillar-on-GaN Technology Program will enable smaller footprint in phased array , communication and electronic warfare systems GREENSBORO , N.C. , May 20 , 2020 ( ) Qorvo ( Nasdaq:QRVO ) , a leading provider of innovative RF solutions that connect the world , has been awarded a three-year contract to further advance the development of copper-pillar-on-GaN flip-chip technology . This Department of Defense ( DoD ) program will create a high-yield domestic foundry to mature the copper flip assembly process , which enables vertical die stacking in space-constrained phased array radar systems and other defense electronics . Conventional module-level integration on printed wiring boards is performed by wire bonding and uses metal housing and chip-and-wire modules . This wastes space while adding weight and cost to the system . Copper pillar flip-chip technology enables vertical stacking of die to integrate many more components , resulting in

## Tokenization

In [None]:
# load model and tokenizer
# NOTE(review): the tokenizer comes from 'yiyanghkust/finbert-tone' while the
# weights below come from "yiyanghkust/finbert-pretrain" -- confirm the two
# checkpoints share a vocabulary.
model_name = 'yiyanghkust/finbert-tone'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Earlier attempt, kept for reference: loading `model_name` with num_labels=3
# worked, but changing it to 12 raised an error.
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Load the pretraining checkpoint with a 12-way classification head (one output
# per entry in label_map).  The load log reports the classifier weights as
# newly initialized, so the head has to be trained before it is useful.
model = AutoModelForSequenceClassification.from_pretrained(
    "yiyanghkust/finbert-pretrain",
    num_labels=12,
    ignore_mismatched_sizes=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build a tokenization function for the text column
# Here you have to choose if you return tensor for pytorch or tensorflow
# NOTE: the rows are long passages, so with max_length=20 only the first 20
# word pieces of each text reach the model; pass a larger max_length to keep more.
def tokenize_data(batch, max_length=20):
    """Tokenize the text column and attach the labels as PyTorch tensors.

    Args:
        batch: mapping with '0' (texts) and '1' (integer labels).  Normally a
            batch as passed by `Dataset.map(..., batched=True)`, i.e. two
            parallel lists.  A single row (a `str` in '0' and one label in
            '1') is also accepted and treated as a batch of one.
        max_length: every sequence is padded / truncated to this many tokens.

    Returns:
        dict with 'input_ids' and 'attention_mask' tensors of shape
        (batch_size, max_length) and a 'labels' tensor of shape (batch_size,).
    """
    texts = batch['0']
    labels = batch['1']

    # A lone string would otherwise be iterated character by character and
    # tokenized as one "text" per character.
    if texts is None or isinstance(texts, str):
        texts = [texts]
        labels = [labels]

    # handle invalid texts
    valid_texts = []
    for text in texts:
        if isinstance(text, str):
            valid_texts.append(text)
        else:
            print(f"Warning: Invalid or missing text for example: {text}")
            # NOTE(review): this placeholder is an ordinary string, not the
            # tokenizer's own unknown token -- confirm that is intended.
            valid_texts.append('[UNKNOWN]')

    # tokenize the valid texts
    tokenized_output = tokenizer(
        valid_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt' # decide tensorflow or pytorch
    )

    # return the tokenized data in the correct format
    return {
        'input_ids': tokenized_output['input_ids'],
        'attention_mask': tokenized_output['attention_mask'],
        'labels': torch.tensor(labels)
    }

In [None]:
# check to make sure it is working the way you think before running all of it
# tokenize_data works on batches (lists of texts and labels), so take a slice:
# slicing returns a dict of lists, whereas indexing a single row returns a bare
# string that would be iterated character by character.
example_test = dataset_EDT['train'][:2]
output_test = tokenize_data(example_test)
print(output_test, type(output_test['input_ids']))

{'input_ids': tensor([[   3,   11,    4,  ...,    0,    0,    0],
        [   3, 4612,    4,  ...,    0,    0,    0],
        [   3,  599,    4,  ...,    0,    0,    0],
        ...,
        [   3, 4619,    4,  ...,    0,    0,    0],
        [   3, 5674,    4,  ...,    0,    0,    0],
        [   3,   58,    4,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor(11)} <class 'torch.Tensor'>


In [None]:
# Tokenize every split; tokenize_data receives the rows in batches of 6 and
# its returned fields are added to the dataset as new columns.
tokenized_data = dataset_EDT.map(
    tokenize_data,
    batched = True,
    batch_size = 6
    )

Map:   0%|          | 0/7774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1944 [00:00<?, ? examples/s]

In [None]:
# Check the Python type of the stored input_ids before any output format is set.
print(type(tokenized_data['train'][0]['input_ids']))

<class 'list'>


In [None]:
# Return the three model inputs as torch tensors when rows are accessed.
tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Same check as before set_format, to confirm rows now come back as tensors.
type(tokenized_data['train'][0]['input_ids'])

torch.Tensor

## Train Model
- load optimizer
- select loss
- create a small train dataset
- build a function to train the model
- test the pipeline
- train the full model

In [None]:
# Pytorch loss and optimizers
# AdamW over every parameter currently registered on `model`, learning rate 1e-5.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Cross-entropy loss object handed to train_checkpoints as `loss_fn`.
loss_fn = CrossEntropyLoss()

In [None]:
# create a sample dataset
# First 100 rows of each split, used to smoke-test the training loop quickly.
small_train_dataset = tokenized_data['train'].select(range(100))
small_train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=16)
small_eval_dataset = tokenized_data['test'].select(range(100))
small_eval_dataloader = DataLoader(small_eval_dataset, batch_size=16)

In [None]:
# Evaluation loader over the full test split: sequential order, no class
# re-balancing, batches of 16.
eval_dataloader = DataLoader(tokenized_data['test'], batch_size=16)

In [None]:
# dataloader with attempt to balance weights

# per-sample weights: the inverse frequency of each sample's class
def calculate_sample_weights(labels):
    """Return one weight per sample equal to 1 / (size of that sample's class).

    Args:
        labels: sequence of non-negative integer class ids, one per sample.

    Returns:
        1-D float tensor with the same length as `labels`.
    """
    label_ids = torch.tensor(labels)
    # bincount gives the number of samples per class id; reciprocal() promotes
    # the integer counts to floating point.
    inverse_frequency = torch.bincount(label_ids).reciprocal()
    # Look up each sample's class weight by its label id.
    return inverse_frequency[label_ids]

# Weight every training row by the inverse frequency of its class and draw
# rows with replacement, so each class is sampled at a similar rate.
sample_weights = calculate_sample_weights(tokenized_data['train']['1'])
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement = True)

# Reuse the sampler built above instead of constructing a second, identical one.
train_loader = DataLoader(tokenized_data['train'], batch_size=16, sampler=sampler)

In [None]:
# peek at the label mix of the first batch produced by the weighted sampler
first_batch = next(iter(train_loader), None)
if first_batch is not None:
  print(first_batch['labels'])

tensor([ 8,  0,  2,  1,  9,  0,  9, 10,  9,  2,  0,  2, 11,  9,  9,  0])


In [None]:
# Directory where train_checkpoints writes the per-epoch checkpoints and the
# best model; change this path to where you want the checkpoints saved.
checkpoint_dir = '/content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy'

In [None]:
# attempt to build in checkpoints, evaluate, and save the model

def _batch_loss(model, loss_fn, batch, device):
    """Run one forward pass on `batch` and return the scalar loss tensor."""
    # The model lives on `device`; its inputs have to be moved there as well.
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    if loss_fn is None:
        return outputs.loss
    return loss_fn(outputs.logits, labels)


def train_checkpoints(model, optimizer, loss_fn, train_dataloader, eval_dataloader, epochs=3, save_dir=None):
    """Train `model`, evaluate after every epoch and write checkpoints.

    After each epoch a checkpoint (epoch number, model and optimizer state,
    average train / eval loss) is written to
    `checkpoint_finbert-edt_epoch_<n>.pth`; whenever the evaluation loss
    improves on the best value seen during THIS call the same payload is also
    written to `best_model.pt`.  The best loss starts at infinity on every
    call, so a new call overwrites a `best_model.pt` left by an earlier one.

    Args:
        model: sequence-classification model whose output exposes `.loss`
            and `.logits`.
        optimizer: optimizer that holds the parameters to update.
        loss_fn: callable `(logits, labels) -> loss`; pass None to use the
            loss computed by the model itself.
        train_dataloader: yields dict batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        eval_dataloader: same batch format, used for the evaluation loss.
        epochs: number of passes over `train_dataloader`.
        save_dir: directory for the checkpoint files; defaults to the
            module-level `checkpoint_dir`.

    Returns:
        None.  Progress is printed and checkpoints are written to disk.
    """
    # set up the directory that the checkpoints are actually written to
    save_dir = checkpoint_dir if save_dir is None else save_dir
    os.makedirs(save_dir, exist_ok=True)

    # initialize the highest possible loss
    best_eval_loss = float('inf')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_dataloader:
            optimizer.zero_grad()

            loss = _batch_loss(model, loss_fn, batch, device)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # save and print average training loss
        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}')

        # Evaluation loop: no gradient tracking, dropout disabled by eval()
        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for batch in eval_dataloader:
                eval_loss += _batch_loss(model, loss_fn, batch, device).item()

        # save and print average evaluation loss
        avg_eval_loss = eval_loss / len(eval_dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Evaluation Loss: {avg_eval_loss}')

        # Build the payload once; it is used for both files below.
        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'eval_loss': avg_eval_loss
        }

        # Save the model after each epoch
        checkpoint_path = os.path.join(save_dir, f'checkpoint_finbert-edt_epoch_{epoch + 1}.pth')
        torch.save(checkpoint, checkpoint_path)
        print(f'Checkpoint saved. Location: {checkpoint_path}')

        # Save the best model
        if avg_eval_loss < best_eval_loss:
            best_checkpoint_path = os.path.join(save_dir, 'best_model.pt')
            torch.save(checkpoint, best_checkpoint_path)
            best_eval_loss = avg_eval_loss
            print(f'Best model updated and saved at {best_checkpoint_path}')

In [None]:
# train a sample to make sure everything runs
# Uses the 100-row loaders and the default of 3 epochs.
train_checkpoints(model, optimizer, loss_fn, small_train_dataloader, small_eval_dataloader)

Epoch 1/3, Training Loss: 1.9998880284173148
Epoch 1/3, Evaluation Loss: 1.6927329472133092
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 2/3, Training Loss: 1.551113281931196
Epoch 2/3, Evaluation Loss: 1.3171438319342477
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_2.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 3/3, Training Loss: 1.2406273909977503
Epoch 3/3, Evaluation Loss: 1.11692955664226
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_3.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt


In [None]:
# Path of the best checkpoint written by train_checkpoints.  Derived from
# checkpoint_dir so the two locations cannot drift apart.
saved_model_path = os.path.join(checkpoint_dir, 'best_model.pt')

In [None]:
# load the checkpoint
# map_location='cpu' lets a checkpoint written on a GPU runtime load on a
# CPU-only one; load_state_dict then copies the values into the model's
# existing parameters.  weights_only=True restricts unpickling to tensors and
# plain containers, which is all the checkpoint dict holds.
checkpoint = torch.load(saved_model_path, map_location='cpu', weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])

# Set the model to evaluation mode
model.eval()

  checkpoint = torch.load(saved_model_path)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# generate tokenized test data
# Adjacent string literals are concatenated by Python, which keeps the long
# passage on several source lines without embedding the source indentation in
# the text (a backslash continuation inside the quotes would).
test_news_text = [
    "Merger announced acquistion alerted ",
    "Quarterly profits at an all time low",
    ('When deciding whether to buy, sell, or hold a stock, '
     'investors often rely on analyst recommendations. Media reports about rating '
     'changes by these brokerage-firm-employed (or sell-side)'),
    'generated income profit earnings statements'
]

# Pad to the longest text in this list and truncate at 20 tokens.
tokenized_test_news = tokenizer(
    test_news_text,
    padding=True,
    truncation=True,
    max_length = 20, # same max length as training
    return_tensors='pt'
    )

In [None]:
# Put the model and the tokenized sample texts on one device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
tokenized_test_inputs = {
    name: tensor.to(device) for name, tensor in tokenized_test_news.items()
}

# Forward pass only; gradients are not needed for prediction
with torch.no_grad():
    outputs = model(**tokenized_test_inputs)

# Raw scores per class
logits = outputs.logits

# Normalize the scores to probabilities over the classes
probabilities = logits.softmax(dim=-1)

# Most probable class id for each text
predicted_classes = probabilities.argmax(dim=-1)

print("Predicted classes:", predicted_classes.tolist())


Predicted classes: [11, 11, 11, 11]


In [None]:
# (rows, columns) of the parsed training file, before it was merged with the
# dev file for the stratified split.
df_train.shape

(7770, 2)

In [None]:
# train the full model
# Class-balanced training loader, full test split for evaluation, 3 epochs.
train_checkpoints(model, optimizer, loss_fn, train_loader, eval_dataloader)

Epoch 1/3, Training Loss: 0.5217996328041999
Epoch 1/3, Evaluation Loss: 0.37304810346027867
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 2/3, Training Loss: 0.059511496825335325
Epoch 2/3, Evaluation Loss: 0.25366745151762593
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_2.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 3/3, Training Loss: 0.04358067246031307
Epoch 3/3, Evaluation Loss: 0.2021788966032814
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_3.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt


In [None]:
# train with more epochs overnight
# The same model and optimizer objects are passed again, so this run starts
# from whatever state they are in when the cell is executed.
train_checkpoints(model, optimizer, loss_fn, train_loader, eval_dataloader, epochs = 10)

Epoch 1/10, Training Loss: 0.03215366709735935
Epoch 1/10, Evaluation Loss: 0.21325457555654107
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 2/10, Training Loss: 0.02714702533845456
Epoch 2/10, Evaluation Loss: 0.19450519974801506
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_2.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 3/10, Training Loss: 0.023279656933950177
Epoch 3/10, Evaluation Loss: 0.20113644263798708
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_3.pth
Epoch 4/10, Training Loss: 0.015495812440858273
Epoch 4/10, Evaluation Loss: 0.2704284057788337
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert

## LoRA Version

In [None]:
!pip install peft # if needed



In [None]:
# imports
from peft import LoraConfig, get_peft_model, TaskType

# Configure LoRA
lora_config = LoraConfig(
    r=8, # low rank factor
    lora_alpha=16, # scaling
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS
)

# add LoRA to the model
model_lora = get_peft_model(model, lora_config)

# Freeze original parameters and only fine-tune LoRA weights: a parameter is
# trainable exactly when its name contains "lora".
for name, param in model_lora.named_parameters():
  param.requires_grad = "lora" in name

# The optimizer created earlier was built from model.parameters() before the
# adapter weights existed, so it does not contain them, and every parameter it
# does contain has just been frozen.  Rebind `optimizer` to the trainable LoRA
# parameters so the training calls below actually update them.
optimizer = AdamW(
    [param for param in model_lora.parameters() if param.requires_grad],
    lr=1e-5
)

In [None]:
# train a sample to make sure everything runs
# NOTE(review): `optimizer` only updates the parameters it was constructed
# with -- confirm it holds model_lora's trainable parameters at this point.
train_checkpoints(model_lora, optimizer, loss_fn, small_train_dataloader, small_eval_dataloader)

Epoch 1/3, Training Loss: 0.15604976511427335
Epoch 1/3, Evaluation Loss: 0.20620145608804055
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 2/3, Training Loss: 0.10414654828075852
Epoch 2/3, Evaluation Loss: 0.13843706037317002
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_2.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 3/3, Training Loss: 0.05242473206349781
Epoch 3/3, Evaluation Loss: 0.1336783395547952
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_3.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt


In [None]:
# test it on sample text
# Put the LoRA model and the tokenized sample texts on one device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_lora.to(device)
tokenized_test_inputs = {
    name: tensor.to(device) for name, tensor in tokenized_test_news.items()
}

# Forward pass only; gradients are not needed for prediction
with torch.no_grad():
    outputs = model_lora(**tokenized_test_inputs)

# Raw scores per class
logits = outputs.logits

# Normalize the scores to probabilities over the classes
probabilities = logits.softmax(dim=-1)

# Most probable class id for each text
predicted_classes = probabilities.argmax(dim=-1)

print("Predicted classes:", predicted_classes.tolist())

Predicted classes: [11, 11, 11, 11]


In [None]:
# train the full LoRA model
# NOTE(review): checkpoints go to the same file names as the earlier runs
# (train_checkpoints uses fixed names under checkpoint_dir), so this run
# overwrites them -- confirm that is intended.
train_checkpoints(model_lora, optimizer, loss_fn, train_loader, eval_dataloader)

Epoch 1/3, Training Loss: 0.033484284629002825
Epoch 1/3, Evaluation Loss: 0.17073443106787858
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model.pt
Epoch 2/3, Training Loss: 0.027338229597894927
Epoch 2/3, Evaluation Loss: 0.20998629447469702
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_2.pth
Epoch 3/3, Training Loss: 0.02198450396090378
Epoch 3/3, Evaluation Loss: 0.1747253921005081
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-edt_epoch_3.pth
