fine-tuning distilbert-base-uncased using the Financial PhraseBank dataset from the Hugging Face Datasets Repository

In [1]:
pip install fsspec==2024.10.0



In [2]:
pip install transformers datasets torch scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [3]:
from transformers import DataCollatorWithPadding

Load and Preprocess the Dataset

In [4]:
from datasets import load_dataset

# Load the dataset from Hugging Face
dataset = load_dataset('takala/financial_phrasebank', 'sentences_allagree')

# View the dataset structure
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

The repository for takala/financial_phrasebank contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/takala/financial_phrasebank.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})


In [5]:
# Split the dataset into training and validation sets
train_test_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [6]:
train_dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 1811
})

In [7]:
val_dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 453
})

In [8]:
print(
    f"{train_dataset.shape=}",
    f"{train_dataset.num_columns=}",
    f"{train_dataset.num_rows=}",
    f"{train_dataset.column_names=}",
    sep="\n\n"
)

train_dataset.shape=(1811, 2)

train_dataset.num_columns=2

train_dataset.num_rows=1811

train_dataset.column_names=['sentence', 'label']


In [9]:
train_dataset[0]

{'sentence': "Koff 's market share of the volume of the market was 23.4 % , Karhu 's 21.4 % .",
 'label': 1}

In [10]:
train_dataset.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}

Use the DistilBERT tokenizer to preprocess the text data for training

In [11]:
from transformers import DistilBertTokenizer

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize function
def tokenize(batch):
    return tokenizer(batch['sentence'], truncation=True, padding=True, max_length=128)

# Apply tokenizer to the datasets
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)

# Set the format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

In [12]:
train_dataset

Dataset({
    features: ['sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1811
})

In [13]:
train_dataset[0]

{'label': tensor(1),
 'input_ids': tensor([  101, 12849,  4246,  1005,  1055,  3006,  3745,  1997,  1996,  3872,
          1997,  1996,  3006,  2001,  2603,  1012,  1018,  1003,  1010, 10556,
         25032,  2226,  1005,  1055,  2538,  1012,  1018,  1003,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
val_dataset[0]

{'label': tensor(1),
 'input_ids': tensor([  101, 22472,  1998,  2061,  8202,  9006,  3710,  4261,  2581,  1010,
          2199, 17073,  1998,  2018,  1037,  3006,  3745,  1997,  3155,  2676,
          1003,  2004,  1997,  2089,  2289,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0])}

Load the pre-trained distilbert-base-uncased model and modify it for sentiment classification

In [15]:
from transformers import DistilBertForSequenceClassification

# Load the model with 3 output labels (positive, neutral, negative)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=3
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Use the Hugging Face Trainer for training and evaluation

In [17]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Define a compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)  # Get predicted class from logits
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')  # Weighted to handle class imbalance
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [18]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    evaluation_strategy='epoch',    # Evaluate every epoch
    save_strategy='epoch',          # Save model every epoch
    logging_dir='./logs',           # Log directory
    learning_rate=2e-5,             # Learning rate
    per_device_train_batch_size=32, # Batch size for training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    num_train_epochs=4,             # Number of epochs
    weight_decay=0.01,              # Weight decay
    logging_steps=10,               # Log frequency
    save_total_limit=2,             # Save only the 2 most recent checkpoints
    seed=42                         # Random seed for reproducibility
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,   # Added data collator for dynamic padding
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4812,0.356668,0.896247,0.893115,0.894153,0.896247
2,0.1578,0.182963,0.935982,0.935819,0.936635,0.935982
3,0.0653,0.165968,0.93819,0.937874,0.937645,0.93819
4,0.0482,0.170022,0.93819,0.938298,0.938419,0.93819


TrainOutput(global_step=228, training_loss=0.2579154437571241, metrics={'train_runtime': 100.623, 'train_samples_per_second': 71.991, 'train_steps_per_second': 2.266, 'total_flos': 239902737214464.0, 'train_loss': 0.2579154437571241, 'epoch': 4.0})

In [19]:
pred_output = trainer.predict(val_dataset)
pred_output

PredictionOutput(predictions=array([[-1.8298757 ,  3.491975  , -1.7927507 ],
       [-1.7545061 ,  3.5166361 , -1.9101479 ],
       [-0.7545952 ,  1.256938  , -0.68558264],
       ...,
       [-1.6110903 , -1.2313405 ,  2.1863768 ],
       [-1.7777617 ,  3.4123278 , -1.6952436 ],
       [-1.7890335 ,  3.48794   , -1.8133243 ]], dtype=float32), label_ids=array([1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 2, 1, 2, 1, 1, 1, 2, 2, 2,
       0, 0, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 1, 0, 2, 2, 1, 2, 1, 1, 1, 2,
       1, 1, 2, 0, 2, 0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1,
       2, 2, 0, 1, 2, 1, 2, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0,
       2, 1, 0, 1, 0, 1, 1, 0, 2, 2, 1, 0, 1, 0, 1, 1, 1, 1, 2, 0, 2, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2,
       2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 0, 2,
       2, 1, 1, 0, 2, 1, 1, 2, 2, 2, 1, 1, 1, 0, 0, 1, 1, 2,

In [20]:
pred_output.metrics

{'test_loss': 0.17002245783805847,
 'test_accuracy': 0.9381898454746137,
 'test_f1': 0.9382975104613333,
 'test_precision': 0.9384185944051928,
 'test_recall': 0.9381898454746137,
 'test_runtime': 1.0274,
 'test_samples_per_second': 440.904,
 'test_steps_per_second': 14.599}

After training, save the model and tokenizer for future use

In [33]:
model.save_pretrained('./financial-distilbert-lg')

In [34]:
tokenizer.save_pretrained('./financial-distilbert-lg')

('./financial-distilbert-lg/tokenizer_config.json',
 './financial-distilbert-lg/special_tokens_map.json',
 './financial-distilbert-lg/vocab.txt',
 './financial-distilbert-lg/added_tokens.json')

In [24]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Use the Fine-Tuned Model for Sentiment Analysis

In [26]:
from transformers import pipeline

# Load the fine-tuned model
classifier = pipeline(
    'sentiment-analysis',
    model='./financial-distilbert-lg',
    tokenizer='./financial-distilbert-lg'
)

# Perform sentiment analysis on sample articles
articles = [
    "The company announced record-breaking profits this quarter.",
    "The market crash led to widespread panic among investors.",
    "The new product launch received mixed reviews."
]

# Get predictions
results = classifier(articles)

# Print results
for article, result in zip(articles, results):
    print(f"Article: {article}")
    print(f"Sentiment: {result['label']}, Confidence: {result['score']:.4f}")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Article: The company announced record-breaking profits this quarter.
Sentiment: LABEL_2, Confidence: 0.9697
Article: The market crash led to widespread panic among investors.
Sentiment: LABEL_0, Confidence: 0.9198
Article: The new product launch received mixed reviews.
Sentiment: LABEL_0, Confidence: 0.6330


In [28]:
# Define the label mappings
id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}

# Update the model configuration
model.config.id2label = id2label
model.config.label2id = label2id

In [29]:
model.save_pretrained('./financial-distilbert-lg')

In [30]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('./financial-distilbert-lg')
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'NEUTRAL', 2: 'POSITIVE'}


In [32]:
from transformers import pipeline

# Load the fine-tuned model
classifier = pipeline(
    'sentiment-analysis',
    model='./financial-distilbert-lg',
    tokenizer='./financial-distilbert-lg',
    device=0  # Use the first GPU (0 indicates the first GPU)
)

# Perform sentiment analysis on sample articles
articles = [
    "The company announced record-breaking profits this quarter.",
    "The market crash led to widespread panic among investors.",
    "The new product launch received mixed reviews."
]

# Get predictions
results = classifier(articles)

# Print results
for article, result in zip(articles, results):
    print(f"Article: {article}")
    print(f"Sentiment: {result['label']}, Confidence: {result['score']:.4f}")

Article: The company announced record-breaking profits this quarter.
Sentiment: POSITIVE, Confidence: 0.9697
Article: The market crash led to widespread panic among investors.
Sentiment: NEGATIVE, Confidence: 0.9198
Article: The new product launch received mixed reviews.
Sentiment: NEGATIVE, Confidence: 0.6330


In [35]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
save_path = '/content/drive/My Drive/Colab Notebooks/NLP_Ignas/nlp_finalproject/models/financial-distilbert-lg'

# Save the model and tokenizer to Google Drive
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")

Model saved to /content/drive/My Drive/Colab Notebooks/NLP_Ignas/nlp_finalproject/models/financial-distilbert-lg


### model: Refers to the fine-tuned model weights and architecture (pytorch_model.bin and config.json files).

### tokenizer: Refers to the vocabulary and tokenization logic (vocab.txt or similar tokenizer-specific files). It processes text into token IDs for the model.