<a href="https://colab.research.google.com/github/dafrie/fin-disclosures-nlp/blob/master/Multi_class_classification_with_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multi-Class classification with Transformers

# Setup

In [1]:
# Load Google drive where the data and models are stored
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
############################## CONFIG ##############################
TASK = "multi-class" #@param ["multi-class"]

# Set to true if fine-tuning should be enabled. Else it loads fine-tuned model
ENABLE_FINE_TUNING = True #@param {type:"boolean"}

# See list here: https://huggingface.co/models
TRANSFORMER_MODEL_NAME = 'distilbert-base-cased' #@param ["bert-base-uncased", "bert-large-uncased", "albert-base-v2", "albert-large-v2", "albert-xlarge-v2", "albert-xxlarge-v2", "roberta-base", "roberta-large", "distilbert-base-uncased", "distilbert-base-cased"]

# The DataLoader needs to know our batch size for training. BERT Authors recommend 16 or 32, however this leads to an error due to not enough GPU memory
BATCH_SIZE = 16 #@param ["8", "16", "32"] {type:"raw"}
MAX_TOKEN_SIZE = 256 #@param [512,256,128] {type:"raw"}
EPOCHS = 4 # @param [1,2,3,4] {type:"raw"}
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.0 # TODO: Necessary?

# Evaluation metric config. See for context: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
AVERAGING_STRATEGY = 'macro' #@param ["micro",  "macro", "weighted"]

# To make the notebook reproducible (not guaranteed for pytorch on different releases/platforms!)
SEED_VALUE = 0

# Enable comet-ml logging
DISABLE_COMET_ML = True #@param {type:"boolean"}
####################################################################

full_task_name = TASK

parameters = {
    "task": TASK,
    "enable_fine_tuning": ENABLE_FINE_TUNING,
    "model_type": "transformer",
    "model_name": TRANSFORMER_MODEL_NAME,
    "batch_size": BATCH_SIZE,
    "max_token_size": MAX_TOKEN_SIZE,
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "weight_decay": WEIGHT_DECAY,
    "seed_value": SEED_VALUE,
}

# TODO: This could then be used to send to cometml to keep track of experiments...

In [3]:
# Install transformers library + datasets helper
!pip install transformers --quiet
!pip install datasets --quiet
!pip install optuna --quiet

import os
import pandas as pd
import numpy as np
import torch
import textwrap
import random
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from transformers import logging, AutoTokenizer

model_id = TRANSFORMER_MODEL_NAME
print(f"Selected {TRANSFORMER_MODEL_NAME} as transformer model for the task...")

# Setup the models path
saved_models_path = "/content/drive/My Drive/{YOUR_PROJECT_HERE}/models/finetuned_models/"
expected_model_path = os.path.join(saved_models_path, TASK, model_id)
has_model_path = os.path.isdir(expected_model_path)
model_checkpoint = TRANSFORMER_MODEL_NAME if ENABLE_FINE_TUNING else expected_model_path

# Check if model exists
if not ENABLE_FINE_TUNING:
  assert has_model_path, f"No fine-tuned model found at '{expected_model_path}', you need first to fine-tune a model from a pretrained checkpoint by enabling the 'ENABLE_FINE_TUNING' flag!"

[K     |████████████████████████████████| 1.4MB 12.1MB/s 
[K     |████████████████████████████████| 890kB 23.6MB/s 
[K     |████████████████████████████████| 2.9MB 55.4MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 163kB 13.7MB/s 
[K     |████████████████████████████████| 245kB 32.1MB/s 
[K     |████████████████████████████████| 17.7MB 200kB/s 
[K     |████████████████████████████████| 266kB 18.1MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 163kB 50.0MB/s 
[K     |████████████████████████████████| 81kB 11.0MB/s 
[K     |████████████████████████████████| 81kB 11.7MB/s 
[K     |████████████████████████████████| 112kB 24.1MB/s 
[K     |████████████████████████████████| 51kB 7.4MB/s 
[K     |████████████████████████████████| 133kB 50.5MB/s 

# Data loading

In [4]:
# Note: Uses https://huggingface.co/docs/datasets/package_reference/main_classes.html
from datasets import DatasetDict, Dataset, load_dataset, Sequence, ClassLabel, Features, Value, concatenate_datasets

# TODO: Adapt
doc_column = 'text' # Contains the text
label_column = 'cro' # Needs to be an integer that represents the respective class

# TODO: Load train/test data
df_train = pd.read_pickle("/content/drive/My Drive/fin-disclosures-nlp/data/labels/Firm_AnnualReport_Labels_Training.pkl")
df_test = pd.read_pickle("/content/drive/My Drive/fin-disclosures-nlp/data/labels/Firm_AnnualReport_Labels_Test.pkl")

df_train = df_train.query(f"{label_column} == {label_column}")
df_test = df_test.query(f"{label_column} == {label_column}")

category_labels = df_train[label_column].unique().tolist()
no_of_categories = len(category_labels)

# TODO: Not sure if this step is necessary, but if you have the category in text and not integers
# This assumes that there is t
df_train[label_column] = df_train[label_column].astype('category').cat.codes.to_numpy(copy=True)
df_test[label_column] = df_test[label_column].astype('category').cat.codes.to_numpy(copy=True)

train_dataset = pd.DataFrame(df_train[[doc_column, label_column]].to_numpy(), columns=['text', 'labels'])
test_dataset = pd.DataFrame(df_test[[doc_column, label_column]].to_numpy(), columns=['text', 'labels'])

features = Features({'text': Value('string'), 'labels': ClassLabel(names=category_labels, num_classes=no_of_categories)})

# Setup Hugginface Dataset
train_dataset = Dataset.from_pandas(train_dataset, features=features)
test_dataset = Dataset.from_pandas(test_dataset, features=features)

dataset = DatasetDict({ 'train': train_dataset, 'test': test_dataset })

## Tokenization

In [5]:
# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Encode the whole dataset
def encode(data, max_len=MAX_TOKEN_SIZE):
  return tokenizer(data["text"], truncation=True, padding='max_length', max_length=max_len)

dataset = dataset.map(encode, batched=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




## Validation set preparation

In [6]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# See here for this workaround: https://github.com/huggingface/datasets/issues/767
dataset['train'], dataset['valid'] = dataset['train'].train_test_split(test_size=0.1, seed=SEED_VALUE).values()

In [7]:
dataset['train'].features

{'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'labels': ClassLabel(num_classes=3, names=['OP', 'PR', 'TR'], names_file=None, id=None),
 'text': Value(dtype='string', id=None)}

# Model Setup and Training

In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, matthews_corrcoef
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.trainer_pt_utils import nested_detach
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from scipy.special import expit 

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sets the evaluation metric depending on the task
# TODO: Set your evaluation metric! Needs to be also in the provided "compute_metrics" function below
metric_name = "matthews_correlation"

# The training arguments
args = TrainingArguments(
    output_dir=f"/content/models/{TASK}/{model_id}",
    evaluation_strategy = "epoch",
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    num_train_epochs = EPOCHS,
    weight_decay = WEIGHT_DECAY,
    load_best_model_at_end = True,
    metric_for_best_model = metric_name,
    greater_is_better = True,
    seed = SEED_VALUE,
)


def model_init():
  """Model initialization. Disabels logging temporarily to avoid spamming messages and loads the pretrained or fine-tuned model""" 
  logging.set_verbosity_error() # Workaround to hide warnings that the model weights are randomly set and fine-tuning is necessary (which we do later...)
  model = AutoModelForSequenceClassification.from_pretrained(
      model_checkpoint, # Load from model checkpoint, i.e. the pretrained model or a previously saved fine-tuned model
    num_labels = no_of_categories, # The number of different categories/labels
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.)
  )
  logging.set_verbosity_warning()
  return model

def compute_metrics(pred):
    """Computes classification task metric"""
    labels = pred.label_ids
    preds = pred.predictions

    # Convert to probabilities
    preds_prob = expit(preds)
    
    # Convert to 0/1, i.e. set to 1 the class with the highest logit
    preds = preds.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=AVERAGING_STRATEGY)
    acc = accuracy_score(labels, preds)
    matthews_corr = matthews_corrcoef(labels, preds)
    return {
      'f1': f1,
      'precision': precision,
      'recall': recall,
      'matthews_correlation': matthews_corr
    }

class CroTrainer(Trainer):
    # Note: If you need to do extra customization (like to alter the loss computation by adding weights), this can be done here
    pass

trainer = CroTrainer(
    model_init=model_init,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Only train if enabled, else we just want to load the model
if ENABLE_FINE_TUNING:
  trainer.train()
  trainer.save_model()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Matthews Correlation
1,No log,0.517094,0.293839,0.262712,0.333333,0.0
2,No log,0.317745,0.787143,0.806001,0.772924,0.726577
3,No log,0.227818,0.882658,0.905034,0.873507,0.836056
4,No log,0.221527,0.826008,0.82064,0.83184,0.788483



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


invalid value encountered in double_scalars



In [8]:
eval_metrics = trainer.evaluate()
# experiment.log_metrics(eval_metrics)

In [None]:
predict_result = trainer.predict(dataset['test'])

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report
from scipy.special import expit 

preds = predict_result.predictions
labels = predict_result.label_ids

test_roc_auc = roc_auc_score(labels, preds, average=AVERAGING_STRATEGY)
print("Test ROC AuC: ", test_roc_auc)

preds_prob = expit(preds)
threshold = 0.5
preds_bool = (preds_prob > threshold)

label_list = test_dataset.features['labels'].feature.names
multilabel_confusion_matrix(labels, preds_bool)
print(classification_report(labels, preds_bool, target_names=label_list))