# Lora FinBERT fine tuning
lora is subject to sensitivities about balance. This model undersamples to generate a balanced dataset.

In [None]:
!pip install datasets
!pip install accelerate
!pip install peft

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# import statements

# to be able to access files from my drive
from google.colab import drive

# to be able to read the files and load data
import csv
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from collections import Counter

# to be able to extract the text from URLS
from bs4 import BeautifulSoup
import requests

# to be able to run and finetune the FinBERT model
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, TFAutoModelForSequenceClassification
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.nn import CrossEntropyLoss

# to address imbalance
from sklearn.model_selection import train_test_split

# to configure LoRA
from peft import LoraConfig, get_peft_model, TaskType

# to save model
import os

# to be able to validate the fine tuning
from sklearn.metrics import classification_report, confusion_matrix

# for URL processing which we don't use
# to be able to extract the text from URLS
from bs4 import BeautifulSoup
import requests
# to clean the text
import re

In [None]:
# mount my google drive
drive.mount('/content/drive')

Mounted at /content/drive


## Load Dataset to finetune FinBERT for event detection
- load training data available from
  - https://drive.google.com/drive/folders/1xKjd9hzA8UTn2DXVIYYnX5TngNAMom19
  - https://github.com/Zhihan1996/TradeTheEvent/tree/main/data  
  
- Full sentences
- Stratefied Split

In [None]:
# load files from my drive
file_path_EDT = {
    'train':'/content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/train.txt',
    'test': '/content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/dev.txt',
    }

In [None]:
# for setence level fine tuning
# helper functions to process setences
with open(file_path_EDT['train'], 'r') as file:
    lines = file.readlines()

# Initialize variables
sentences = []
current_sentence = []
current_labels = []

# alternative funciton to get max value label
def get_summary_label(labels):
  # get unique values
  unique_labels = list(set(labels))

  # if len greater than 1
  if len(unique_labels) > 1 and 'O' in unique_labels:
    unique_labels.remove('O')
    if len(unique_labels) > 1:
      label_counts = Counter(labels)  # Count occurrences of each label
      max_label = label_counts.most_common(1)[0][0]
      return max_label[0]
    else:
      return unique_labels[0]
  else:
      return unique_labels[0]


# Function to process a file and convert it to a DataFrame
def process_file(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    sentences = []
    current_sentence = []
    current_labels = []

    # Iterate over each line in the file
    for line in lines:
        line = line.strip()  # Remove leading/trailing whitespace/newlines
        if line:  # If the line is not empty, process it
            word, label = line.split('\t')
            current_sentence.append(word)  # Store the word
            current_labels.append(label)  # Store the label
        else:  # Empty line indicates end of a sentence
            if current_sentence:
                sentence = ' '.join(current_sentence)
                summary_label = get_summary_label(current_labels)
                sentences.append({'0': sentence, '1':summary_label})
                current_sentence = []  # Reset current sentence
                current_labels = []  # Reset current labels

    # If there's a sentence left after the last line, add it
    if current_sentence:
        sentence = ' '.join(current_sentence)
        summary_label = get_summary_label(current_labels)
        sentences.append({'0': sentence, '1': summary_label})

    return pd.DataFrame(sentences)

In [None]:
# processing for sentence level, using highest frequency
df_train = process_file(file_path_EDT['train'])
df_test = process_file(file_path_EDT['test'])

In [None]:
# remove missing label
print(df_train.shape)
df_train = df_train[df_train['1'] != 'I-GC']
print(df_train.shape)

(7770, 2)
(7692, 2)


In [None]:
# remove missing label test
print(df_test.shape)
df_test = df_test[df_test['1'] != 'I-GC']
print(df_test.shape)

(1948, 2)
(1928, 2)


In [None]:
# undersample to create a balanced dataframe
# 10 is the smallest value for test
df_test = df_test.groupby('1').apply(lambda x: x.sample(n=min(len(x), 10), random_state=42))
# 40 is the smallest value for train
df_train = df_train.groupby('1').apply(lambda x: x.sample(n=min(len(x), 40), random_state=42))

  df_test = df_test.groupby('1').apply(lambda x: x.sample(n=min(len(x), 10), random_state=42))
  df_train = df_train.groupby('1').apply(lambda x: x.sample(n=min(len(x), 40), random_state=42))


In [None]:
df_test['1'].value_counts(), df_train['1'].value_counts()

(1
 I-A      10
 I-CT     10
 I-DC     10
 I-DI     10
 I-NC     10
 I-RD     10
 I-RSS    10
 I-SD     10
 I-SR     10
 I-SS     10
 O        10
 Name: count, dtype: int64,
 1
 I-A      40
 I-CT     40
 I-DC     40
 I-DI     40
 I-NC     40
 I-RD     40
 I-RSS    40
 I-SD     40
 I-SR     40
 I-SS     40
 O        40
 Name: count, dtype: int64)

In [None]:
# attempting to address balance issues
# Combine train and test for stratification
df = pd.concat([df_train, df_test])

# Perform stratified split
train_df, test_df = train_test_split(
    df,
    stratify=df['1'],
    test_size=0.2,
    random_state=42
)

# for the sentence labeling approach
# EITHER run this cell or word labeling cell
dataset_EDT = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'test': Dataset.from_pandas(test_df)
})

In [None]:
# check to see the counts are still valid
pd.Series(dataset_EDT['train']['1']).value_counts()

Unnamed: 0,count
I-NC,40
I-CT,40
I-DI,40
I-A,40
I-SR,40
I-SS,40
I-RD,40
O,40
I-RSS,40
I-DC,40


In [None]:
dataset_EDT['train'][0]

{'0': 'Aegion Corporation Awarded $6.9 Million Two-Year Wastewater Pipeline Rehabilitation Contract in Miami-Dade County ST . LOUIS , Oct . 09 , 2019 ( ) Aegion Corporation ( NASDAQ:AEGN ) today announced that its subsidiary , Insituform Technologies , LLC , has been awarded a two-year term contract valued at $6.9 million from Miami-Dade County in Florida . The contract includes an option for the county to renew for an additional two years on a yearly basis . Insituform will receive several work releases beginning in late 2019 and expects to rehabilitate more than 26 miles of 8- to 36-inch diameter wastewater pipelines using Insituform cured-in-place pipe ( CIPP ) throughout the county over the two-year period . Charles R . Gordon , Aegion s President and CEO , said , Insituform has held the annual contract for CIPP in Miami-Dade for the past four years and has completed a majority of the CIPP installation work in the area for the past five years . We are pleased to continue to work wi

## Label Mapping

In [None]:
# create a label mapping source: https://github.com/Zhihan1996/TradeTheEvent/tree/main/data
label_map = {
    'I-A': 0, # Acquisition
    'I-CT': 1, # Clinical Trial
    'I-RD': 2, # Regular Dividend
    'I-DC': 3, # Dividend Cut
    'I-DI': 4, # Dividend Increase
    'I-GI': 5, # Guidance Increase
    'I-NC': 6, # New Contract
    'I-RSS': 7, # Reverse Stock Split
    'I-SD': 8, # Special Dividend
    'I-SR': 9, # Stock Repurchase
    'I-SS': 10, # Stock Split(SS)
    'O': 11, # No Event
}

# helper function to map the labels
def map_labels(example):
  try:
    example['1'] = label_map[example['1']]
  except KeyError as e:
    example['1'] = label_map['O']
    print(f'KeyError: {e} - Key not found in label_map')
  return example

# create a column for the label
dataset_EDT = dataset_EDT.map(map_labels)

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [None]:
dataset_EDT['train'][1]

{'0': 'Avenue Therapeutics Announces Positive Topline Data from Second Pivotal Phase 3 Study of Intravenous Tramadol in the Management of Postoperative Pain Management to host a conference call today at 9 am EDT NEW YORK , June 03 , 2019 ( ) Avenue Therapeutics , Inc . ( NASDAQ: ATXI ) ( Avenue ) , a specialty pharmaceutical company focused on the development and commercialization of intravenous ( IV ) tramadol , today announced that its second pivotal Phase 3 trial of IV tramadol achieved the primary endpoint of a statistically significant improvement in Sum of Pain Intensity Difference over 24 hours ( SPID24 ) compared to placebo in patients with postoperative pain following abdominoplasty surgery . In addition , the trial met all of its key secondary endpoints . The study also includes a standard-of-care IV opioid as an active comparator: IV morphine 4 mg . In this study , IV tramadol also demonstrated similar efficacy and safety to that of IV morphine . The strong safety and effica

## Model Creation
- load model and tokenizer
- configure lora

In [None]:
# load model and tokenizer
model_name = 'yiyanghkust/finbert-tone'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # getting an error if I try to change to 12
# ignoring errors and hopefully I can adjust in fine tuning
model = AutoModelForSequenceClassification.from_pretrained(
    "yiyanghkust/finbert-pretrain",
    num_labels=12,
    ignore_mismatched_sizes=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yiyanghkust/finbert-pretrain and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Configure LoRA
lora_config = LoraConfig(
    r=8, # low rank factor
    lora_alpha=16, # scaling
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS
)

# add LoRA to the model
model_lora = get_peft_model(model, lora_config)

# Freeze original parameters and only fine-tune LoRA weights
for name, param in model_lora.named_parameters():
  if "lora" not in name:
    param.requires_grad = False
  else:
    param.requires_grad = True

In [97]:
for name, param in model_lora.named_parameters():
  if "lora" in name:
    print(name)
  else:
    print('no')

no
no
no
no
no
no
no
base_model.model.bert.encoder.layer.0.attention.self.query.lora_A.default.weight
base_model.model.bert.encoder.layer.0.attention.self.query.lora_B.default.weight
no
no
no
no
base_model.model.bert.encoder.layer.0.attention.self.value.lora_A.default.weight
base_model.model.bert.encoder.layer.0.attention.self.value.lora_B.default.weight
no
no
no
no
no
no
no
no
no
no
no
no
base_model.model.bert.encoder.layer.1.attention.self.query.lora_A.default.weight
base_model.model.bert.encoder.layer.1.attention.self.query.lora_B.default.weight
no
no
no
no
base_model.model.bert.encoder.layer.1.attention.self.value.lora_A.default.weight
base_model.model.bert.encoder.layer.1.attention.self.value.lora_B.default.weight
no
no
no
no
no
no
no
no
no
no
no
no
base_model.model.bert.encoder.layer.2.attention.self.query.lora_A.default.weight
base_model.model.bert.encoder.layer.2.attention.self.query.lora_B.default.weight
no
no
no
no
base_model.model.bert.encoder.layer.2.attention.self.value.lo

## Tokenize Data

In [None]:
# Build a tokenization function for the text column
# Here you have to choose if you return tensor for pytorch or tensorflow
# the text is only about one word each so I set the max length to 20
def tokenize_data(batch, max_length=20):
    # handle invalid texts
    texts = batch['0']
    valid_texts = []
    for text in texts:
      if text is not None and isinstance(text, str):
        valid_texts.append(text)
      else:
        print(f"Warning: Invalid or missing text for example: {text}")
        valid_texts.append('[UNKNOWN]')

    # tokenize the valid te4xts
    tokenized_output = tokenizer(
        valid_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt' # decide tensorflow or pytorch
    )

    # return the tokenized data in the correct format
    return {
        'input_ids': tokenized_output['input_ids'],
        'attention_mask': tokenized_output['attention_mask'],
        'labels': torch.tensor(batch['1'])
    }

In [None]:
# check to make sure it is working the way you think before running all of it
example_test = dataset_EDT['train'][0]
output_test = tokenize_data(example_test)
print(output_test, type(output_test['input_ids']))

{'input_ids': tensor([[   3,   11,    4,  ...,    0,    0,    0],
        [   3, 2178,    4,  ...,    0,    0,    0],
        [   3, 3549,    4,  ...,    0,    0,    0],
        ...,
        [   3,    4,    0,  ...,    0,    0,    0],
        [   3, 1506,    4,  ...,    0,    0,    0],
        [   3, 2859,    4,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor(6)} <class 'torch.Tensor'>


In [None]:
tokenized_data = dataset_EDT.map(
    tokenize_data,
    batched = True,
    batch_size = 6
    )

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [None]:
print(type(tokenized_data['train'][0]['input_ids']))

<class 'list'>


In [None]:
tokenized_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
type(tokenized_data['train'][0]['input_ids'])

torch.Tensor

## Train Model
- load optimizer
- select loss
- create weighted Random Sampler
- build a function to train the model
- test the pipeline
- train the full model

In [None]:
# Pytorch loss and optimizers
# set learning rate
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = CrossEntropyLoss()

In [None]:
# dataloader no consideration for balance for eval
eval_dataloader = DataLoader(tokenized_data['test'], batch_size=16)

In [None]:
# dataloader with attempt to balance weights in training

# calculate sample weights
def calculate_sample_weights(labels):
    labels = torch.tensor(labels)
    class_counts = torch.bincount(labels)
    class_weights = 1.0/ class_counts
    sample_weights = class_weights[labels]
    return sample_weights

sample_weights = calculate_sample_weights(tokenized_data['train']['1'])
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement = True)

train_loader = DataLoader(tokenized_data['train'], batch_size=16, sampler=WeightedRandomSampler(sample_weights, len(sample_weights)))

In [None]:
# look at label distribution in each first batch
for batch in train_loader:
  print(batch['labels'])
  break

tensor([ 0, 11,  4,  0,  8,  4,  8,  8,  0,  1,  1,  4, 11,  7,  4,  8])


In [None]:
# change file path to where you want to save the checkpoitns
checkpoint_dir = '/content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy'

### Training Function with Checkpoints

In [None]:
# attempt to build in checkpoints, evaluate, and save the model

def train_checkpoints(model, optimizer, loss_fn, train_dataloader, eval_dataloader, epochs=3):
    # set up the directory
    os.makedirs('checkpoints', exist_ok=True)

    # initialize the highest possible loss
    best_eval_loss = float('inf')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_dataloader:

            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            optimizer.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        # save and print average training loss
        avg_train_loss = total_loss / len(train_dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}')

        # Add Evaluation Loop
        model.eval()
        eval_loss = 0
        with torch.no_grad():
          for batch in eval_dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            eval_loss += outputs.loss.item()

        # save and print average evaluation loss
        avg_eval_loss = eval_loss / len(eval_dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Evaluation Loss: {avg_eval_loss}')

        # Save the model after each epoch
        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_finbert-lora_epoch_{epoch + 1}.pth')
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'eval_loss': avg_eval_loss
        }, checkpoint_path)
        print(f'Checkpoint saved. Location: {checkpoint_path}')

        # Save the best model
        if avg_eval_loss < best_eval_loss:
            best_checkpoint_path = os.path.join(checkpoint_dir, 'best_model-lora.pt')
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_loss': avg_train_loss,
                'eval_loss': avg_eval_loss
            }, best_checkpoint_path)
            best_eval_loss = avg_eval_loss
            print(f'Best model updated and saved at {best_checkpoint_path}')

In [None]:
df_train.shape

(440, 2)

In [None]:
# train the full model lora
train_checkpoints(model_lora, optimizer, loss_fn, train_loader, eval_dataloader)

Epoch 1/3, Training Loss: 2.5255068029676164
Epoch 1/3, Evaluation Loss: 2.5175374150276184
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model-lora.pt
Epoch 2/3, Training Loss: 2.507033211844308
Epoch 2/3, Evaluation Loss: 2.5175374150276184
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_2.pth
Epoch 3/3, Training Loss: 2.522517834390913
Epoch 3/3, Evaluation Loss: 2.5175374150276184
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_3.pth


In [None]:
# train with more epochs
# Lora doesn't usually improve with a lot of epochs
# train_checkpoints(model_lora, optimizer, loss_fn, train_loader, eval_dataloader, epochs = 20)

Epoch 1/20, Training Loss: 2.5304519619260515
Epoch 1/20, Evaluation Loss: 2.5135950360979353
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_1.pth
Best model updated and saved at /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/best_model-lora.pt
Epoch 2/20, Training Loss: 2.502150629247938
Epoch 2/20, Evaluation Loss: 2.5135950360979353
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_2.pth
Epoch 3/20, Training Loss: 2.5168871113232205
Epoch 3/20, Evaluation Loss: 2.5135950360979353
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_3.pth
Epoch 4/20, Training Loss: 2.500494497162955
Epoch 4/20, Evaluation Loss: 2.5135950360979353
Checkpoint saved. Location: /content/drive/MyDrive/DATASCI_266_NLP/EDT_Copy/checkpoint_finbert-lora_epoch_4.pth
Epoch 5/20, Training Loss: 2.492887624672481
Epoch 5/20, Evaluation Loss: 2.513