# BERT fine-tuning
This notebook uses a RoBERTa model pre-trained on Twitter data for the task of sentiment analysis.
Starting from this base model, we fine-tune either only the parameters of the output layer, or those of the output layer together with the layer-norm parameters.

### Imports and setup

In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_auc_score
from typing import Callable, Dict
import logging


# Set before any training code runs. Presumably this switches off the
# Weights & Biases reporting of the Hugging Face Trainer — TODO confirm.
os.environ["WANDB_DISABLED"] = "true"

### Evaluation methods

In [3]:
def _log_metrics(metrics):
  logging.info(
    '---\n' +
    '\n'.join([f'* {x}: {y}' for x,y in metrics.items()]) +
    '\n---'
  )


def evaluate_prob(y: np.array, y_pred: np.array, verbose=True) -> Dict[str, float]:
  """
  Scores predicted probabilities `y_pred` against the true labels `y`.

  Returns a dict with the keys 'bce' (log loss) and 'auc' (ROC AUC),
  in this order. The metrics are also logged when `verbose` is truthy.
  """

  scores = {
    'bce': log_loss(y, y_pred),
    'auc': roc_auc_score(y, y_pred),
  }

  if verbose:
    _log_metrics(scores)
  return scores


def evaluate(y: np.array, y_pred: np.array) -> Dict[str, float]:
  """
  Scores hard class predictions `y_pred` against the true labels `y`
  and logs the result.

  Returns a dict with the keys 'accuracy', 'precision', 'recall', 'f1',
  in this order. Probability-based metrics (BCE, AUC) are NOT computed
  here; `evaluate_prob` covers those.

  * accuracy: proportion of all samples that are classified correctly
  * precision: proportion of predicted positives that are actually positive
  * recall: proportion of actual positives that are predicted positive
  * F1: harmonic mean of precision & recall
  """

  result = {
    'accuracy': accuracy_score(y, y_pred),
    'precision': precision_score(y, y_pred),
    'recall': recall_score(y, y_pred),
    'f1': f1_score(y, y_pred),
  }

  _log_metrics(result)
  return result


def evaluate_model(model: Callable[[pd.DataFrame], np.array], df: pd.DataFrame) -> Dict[str, float]:
  """
  Calls `model` on `df` and scores the predictions against column `y`.

  Expects a dataframe with columns `x` and `y`; the whole dataframe is
  handed to `model`, whose return value is passed to `evaluate`.
  """

  targets = df['y'].to_numpy()
  return evaluate(targets, model(df))



### Preprocessing methods

In [4]:
from typing import Dict, Optional
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from textblob import Word
from tqdm import tqdm

# Fetch the NLTK resources used by the helpers below. The stop word corpus
# has to be downloaded before the stop word list is read.
nltk.download('stopwords')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))

# 'wordnet' is presumably the data needed by WordNetLemmatizer — TODO confirm.
# `tokenizer` and `lemmatizer` are module-level objects shared by the helpers.
nltk.download('wordnet')
tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

# Registers `progress_apply` on pandas objects (used by spelling_correction).
tqdm.pandas()

def to_lower(df: pd.DataFrame, x_col='x'):
  """
  Lower-cases, in place, every sentence in column `x_col` of `df`.
  The column (default 'x') must contain sentences (strings).
  """
  def _lower(sentence):
    return sentence.lower()

  df[x_col] = df[x_col].apply(_lower)

def tokenize(df: pd.DataFrame, x_col='x'):
  """
  Splits, in place, every sentence in column `x_col` of `df` into tokens
  using the module-level `tokenizer`.
  The column (default 'x') must contain sentences (strings).

  NOTE(review): a later cell defines another function that is also named
  `tokenize`; once that cell has run, this definition is no longer
  reachable under this name.
  """
  df[x_col] = df[x_col].apply(tokenizer.tokenize)

def remove_tags(df: pd.DataFrame, x_col='x'):
  """
  Deletes, in place, the substrings '<user>' and '<url>' from every sentence
  in column `x_col` of `df` and strips surrounding whitespace.
  The column (default 'x') must contain sentences (strings).
  Deprecated in favour of remove_tag_tokens(df: pd.DataFrame)
  """
  def _strip_tags(sentence):
    for tag in ('<user>', '<url>'):
      sentence = sentence.replace(tag, '')
    return sentence.strip()

  df[x_col] = df[x_col].apply(_strip_tags)

def remove_tag_tokens(df: pd.DataFrame, x_col='x'):
  """
  Drops, in place, the placeholder tokens '<user>' and '<url>' from every
  token list in column `x_col` of `df`.
  The column (default 'x') must contain lists of tokens.

  The user placeholder is matched as '<user>' (the spelling produced by
  `replace_user_handles` and removed by `remove_tags`), so the ordinary
  word 'user' is kept.
  """
  tag_tokens = ('<user>', '<url>')
  df[x_col] = df[x_col].apply(lambda tokens: [w for w in tokens if w not in tag_tokens])

def remove_stopwords(df: pd.DataFrame, x_col='x'):
  """
  Drops, in place, every token found in the module-level `stop_words` set
  from the token lists in column `x_col` of `df`.
  The column (default 'x') must contain lists of tokens.
  """
  def _without_stopwords(tokens):
    return [w for w in tokens if w not in stop_words]

  df[x_col] = df[x_col].apply(_without_stopwords)

def lemmatize(df: pd.DataFrame, x_col='x'):
  """
  Replaces, in place, every token in column `x_col` of `df` by the result
  of `lemmatizer.lemmatize` (module-level lemmatizer).
  The column (default 'x') must contain lists of tokens.
  """
  def _lemmatize_all(tokens):
    return list(map(lemmatizer.lemmatize, tokens))

  df[x_col] = df[x_col].apply(_lemmatize_all)

def remove_single_symbols(df: pd.DataFrame, x_col='x'):
  """
  Keeps, in place, only the tokens longer than one character in the token
  lists of column `x_col` of `df`.
  The column (default 'x') must contain lists of tokens.
  """
  def _drop_short(tokens):
    return list(filter(lambda w: len(w) > 1, tokens))

  df[x_col] = df[x_col].apply(_drop_short)

def spelling_correction(df: pd.DataFrame, x_col='x'):
  """
  Replaces, in place, every token in column `x_col` of `df` by
  `Word(token).correct()`, showing a progress bar via `progress_apply`.
  The column (default 'x') must contain lists of tokens.
  """
  def _correct_all(tokens):
    return [Word(w).correct() for w in tokens]

  df[x_col] = df[x_col].progress_apply(_correct_all)


def replace_user_handles(df: pd.DataFrame, x_col='x'):
  """
  Replaces, in place, every token that starts with '@' and is longer than
  one character by the placeholder "<user>".
  The column `x_col` (default 'x') must contain lists of tokens.
  """
  def _mask_handles(tokens):
    return ["<user>" if (w.startswith("@") and len(w) > 1) else w for w in tokens]

  df[x_col] = df[x_col].apply(_mask_handles)

def replace_urls(df: pd.DataFrame, x_col='x'):
  """
  Replaces, in place, every token that starts with "http://", "https://"
  or "www." by the placeholder "<url>".
  The column `x_col` (default 'x') must contain lists of tokens.
  """
  url_prefixes = ("http://", "https://", "www.")

  def _mask_urls(tokens):
    return ["<url>" if w.startswith(url_prefixes) else w for w in tokens]

  df[x_col] = df[x_col].apply(_mask_urls)

def untokenize(df: pd.DataFrame, x_col='x'):
  """
  Joins, in place, every token list in column `x_col` of `df` back into a
  single string, separating the tokens by one space.
  The column (default 'x') must contain lists of tokens.
  """
  df[x_col] = df[x_col].apply(" ".join)

def _tokenize_on_whitespace(df: pd.DataFrame, x_col='x'):
  """Splits, in place, every sentence in column `x_col` into whitespace-separated tokens."""
  df[x_col] = df[x_col].apply(WhitespaceTokenizer().tokenize)


def preprocess(df: pd.DataFrame, flags: Optional[Dict[str, bool]], x_col='x'):
  """
  Applies the preprocessing steps enabled in `flags` to column `x_col` of `df`, in place.

  `flags` maps step names to booleans; missing names count as disabled and
  `flags=None` disables everything. Enabled steps always run in this order:
  to_lower, tokenize, replace_user_handles, replace_urls, remove_tags,
  remove_tag_tokens, remove_stopwords, lemmatize, remove_single_symbols,
  spelling_correction.

  NOTE(review): `remove_tags` calls `str.replace` on each entry, so enabling
  it together with a step that has already produced token lists looks like
  it would fail — confirm before combining those flags.
  """
  if flags is None:
    return

  steps = (
    ('to_lower', to_lower),
    # The module-level name `tokenize` is rebound by a later cell to a function
    # with a different signature (ds, tokenizer, path, force), so calling
    # `tokenize(df, x_col=...)` here would raise a TypeError. The whitespace
    # tokenization is therefore done by a private helper.
    ('tokenize', _tokenize_on_whitespace),
    ('replace_user_handles', replace_user_handles),
    ('replace_urls', replace_urls),
    ('remove_tags', remove_tags),
    ('remove_tag_tokens', remove_tag_tokens),
    ('remove_stopwords', remove_stopwords),
    ('lemmatize', lemmatize),
    ('remove_single_symbols', remove_single_symbols),
    ('spelling_correction', spelling_correction),
  )
  for flag_name, step in steps:
    if flags.get(flag_name, False):
      step(df, x_col=x_col)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Methods to get and prepare the dataset, the base model, compute the metrics, and train the model

In [5]:
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, ClassLabel
from scipy.special import softmax


def prepare_dataset(df, preprocessing=None):
  """
  Builds a `Dataset` from `df` whose 'label' column is cast to a
  `ClassLabel` with the names '0' and '1'.

  `preprocessing` is an optional dict of flags forwarded to `preprocess`,
  which is applied to the 'text' column. Returns the cast dataset.
  """
  if preprocessing is not None:
    # preprocess() rewrites the 'text' column in place. Work on a copy so the
    # caller's dataframe (which may be a slice of a shared, module-level
    # frame) is left untouched and repeated calls stay idempotent.
    df = df.copy()
    preprocess(df, flags=preprocessing, x_col='text')

  dataset = Dataset.from_pandas(df)

  new_features = dataset.features.copy()
  new_features['label'] = ClassLabel(names=['0', '1'])

  return dataset.cast(new_features)


def tokenize(ds, tokenizer, path=None, force=True):
  """
  Tokenizes the 'text' column of `ds` with `tokenizer`, optionally using `path` as an on-disk cache.

  If `force` is falsy and `path` exists, the dataset stored at `path` is
  loaded and returned. Otherwise `ds` is mapped in batches through
  `tokenizer` (with padding=True, truncation=True) and, when `path` is not
  None, the result is saved to `path` before it is returned.

  NOTE(review): this definition reuses the name `tokenize` of the dataframe
  helper from an earlier cell and replaces it at module level.
  """
  cache_usable = (not force) and path is not None and Path(path).exists()
  if cache_usable:
    return Dataset.load_from_disk(path)

  def _encode_batch(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

  encoded = ds.map(_encode_batch, batched=True)
  if path is not None:
    encoded.save_to_disk(path)
  return encoded


def get_BERT(model_name, device):
  """
  Loads a two-label sequence classification model from `model_name`, moves
  it to `device`, saves it under `model_name` and returns it.

  `ignore_mismatched_sizes=True` is passed so that checkpoint weights whose
  shapes do not fit the two-label head do not abort loading.
  """
  classifier = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
  )
  classifier = classifier.to(device)
  # NOTE(review): this writes the model into a local path equal to the model
  # id; a later from_pretrained(model_name) may then pick up that directory
  # instead of the original checkpoint — confirm this is intended.
  classifier.save_pretrained(model_name)
  return classifier


def compute_metrics(eval_pred):
  """
  Computes the evaluation metrics for a `(predictions, labels)` pair.

  `predictions` is indexed as a 2-D array of per-class scores (argmax and
  softmax are taken over axis 1); column 1 is used as the positive-class
  probability.

  Returns a dict with the entries of `evaluate` (hard predictions), the
  entries of `evaluate_prob` (positive-class probability) and the mean /
  standard deviation of the confidence (highest class probability) over all
  samples, over correctly classified samples and over misclassified samples.
  If one of the two subsets is empty, numpy yields NaN for its statistics.
  """
  predictions, labels = eval_pred

  point_estimates = np.argmax(predictions, axis=1)
  point_estimate_eval = evaluate(labels, point_estimates)

  # The class probabilities are needed twice below, so compute them once.
  probabilities = softmax(predictions, axis=1)
  prob_estimates_eval = evaluate_prob(labels, probabilities[:, 1])

  confidence = np.max(probabilities, axis=1)
  is_correct = labels == point_estimates
  correct_confidence = confidence[is_correct]
  incorrect_confidence = confidence[~is_correct]

  return {
    **point_estimate_eval,
    **prob_estimates_eval,
    'confidence': confidence.mean(),
    'confidence_std': confidence.std(),
    'correct_confidence': correct_confidence.mean(),
    'correct_confidence_std': correct_confidence.std(),
    'incorrect_confidence': incorrect_confidence.mean(),
    'incorrect_confidence_std': incorrect_confidence.std(),
  }


def train(model_name, tokenizer_name, device, df_train, df_val, preprocessing=None, batch_size=32, epochs=1, force_tokenize=True, train_LN=True):
  """
  Fine-tunes a sequence classification model and returns it.

  Steps: build datasets from `df_train` / `df_val` (optionally preprocessed
  with the `preprocessing` flags), tokenize them with the tokenizer loaded
  from `tokenizer_name`, load the model via `get_BERT`, freeze all
  parameters except the classifier head (plus the layer norm parameters when
  `train_LN` is truthy), train for `epochs` epochs with per-device train
  batch size `batch_size`, and finally evaluate on the validation set.

  The model is evaluated and checkpointed after every epoch; the checkpoint
  with the best 'accuracy' is loaded at the end of training.
  """
  dataset_train = prepare_dataset(df_train, preprocessing=preprocessing)
  dataset_val = prepare_dataset(df_val, preprocessing=preprocessing)

  hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
  train_tokenized = tokenize(dataset_train, hf_tokenizer, force=force_tokenize)
  val_tokenized = tokenize(dataset_val, hf_tokenizer, force=force_tokenize)

  model = get_BERT(model_name, device)

  # Freeze everything first, then re-enable gradients for the classifier head.
  for weight in model.parameters():
    weight.requires_grad = False
  for weight in model.classifier.parameters():
    weight.requires_grad = True

  if train_LN:
    # Additionally unfreezes the layer norm parameters (and prints a summary).
    set_trainable(model)

  training_args = TrainingArguments(
    output_dir='bert_data/test_trainer',
    num_train_epochs=epochs,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
  )

  trainer = Trainer(
    model,
    training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=hf_tokenizer,
    compute_metrics=compute_metrics,
  )
  trainer.train()

  # Final pass over the validation set; `evaluate` logs the resulting metrics.
  val_output = trainer.predict(val_tokenized)
  predicted_labels = np.argmax(val_output.predictions, axis=1)
  true_labels = val_tokenized.to_pandas()['label']
  evaluate(true_labels, predicted_labels)

  return model


Loading of the training and evaluation data; methods to select a subset of the data.

In [6]:
import pandas as pd

# Only these columns of the raw CSV files are kept.
_DATA_COLUMNS = ['index', 'text', 'label']

DF_TRAIN = pd.read_csv('../input/twittersentiment/train.csv')[_DATA_COLUMNS]
DF_EVAL = pd.read_csv('../input/twittersentiment/test.csv')[_DATA_COLUMNS]


def select_train(size=160_000):
  """
  Returns the first `size` rows of DF_TRAIN, or DF_TRAIN itself when
  `size` is None.
  """
  if size is None:
    return DF_TRAIN
  return DF_TRAIN.iloc[:size]


def select_eval(size=40_000):
  """
  Returns the first `size` rows of DF_EVAL, or DF_EVAL itself when
  `size` is None.
  """
  if size is None:
    return DF_EVAL
  return DF_EVAL.iloc[:size]

Methods to set the layer norm parameters to trainable and display the model and parameters in a table.

In [7]:
import numpy as np
from prettytable import PrettyTable

def count_parameters(model):
    """
    Prints a per-parameter table of `model` and a summary of how many
    parameters are trainable.

    A parameter counts as trainable when its `requires_grad` is set; each
    table row lists the parameter name, its trainable element count (0 when
    frozen) and its total element count.

    Returns a tuple `(trainable, fraction)`: the number of trainable
    parameter elements and their share of all parameter elements as a
    fraction between 0 and 1 (0.0 for a model without parameters).
    """
    table = PrettyTable(["Modules", "Trainable Parameters", "Parameters"])
    total_params_train = 0
    total_params = 0
    for name, parameter in model.named_parameters():
        params = parameter.numel()
        params_t = params if parameter.requires_grad else 0
        table.add_row([name, params_t, params])
        total_params_train += params_t
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params_train}")

    # Guard against a model without parameters instead of dividing by zero.
    trainable_fraction = total_params_train / total_params if total_params else 0.0
    # The line is labelled as a percentage, so format the fraction as one.
    print(f"Percentage of trainable parameters: {trainable_fraction:.4%}")

    return total_params_train, trainable_fraction


def set_trainable(model):
    """
    Enables gradients for every parameter of `model` whose name contains
    "layer_norm" or "LayerNorm".

    Prints the number of matching parameters, then prints the summary
    produced by `count_parameters` and returns its first element (the total
    number of trainable parameter elements).
    """
    markers = ("layer_norm", "LayerNorm")
    matched = 0
    for name, parameter in model.named_parameters():
        if any(marker in str(name) for marker in markers):
            parameter.requires_grad = True
            matched += 1

    print(matched)
    summary = count_parameters(model)
    return summary[0]

---

## Training and evaluation of the model
Set `TRAIN_LN` to `False` if you wish to train only the output layer.

In [8]:
# Show INFO-level records so the metrics emitted via logging.info become visible.
logging.basicConfig(level=logging.INFO)

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [10]:
# The same identifier is used for the model weights and for the tokenizer.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
TOKENIZER = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Training configuration handed to train() in the next cell.
EPOCHS = 5
BATCH_SIZE = 32
# True: train the layer norm parameters in addition to the output layer.
TRAIN_LN = True

In [11]:
# Default subsets: the first 160,000 training rows and the first 40,000 evaluation rows.
df_train = select_train()
df_eval = select_eval()
# No preprocessing flags are passed, so the text is tokenized as it is.
model = train(MODEL, TOKENIZER, device, df_train, df_eval, batch_size=BATCH_SIZE, epochs=EPOCHS, train_LN=TRAIN_LN)

Casting the dataset:   0%|          | 0/16 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

  0%|          | 0/160 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/40 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

50
+------------------------------------------------------------+----------------------+------------+
|                          Modules                           | Trainable Parameters | Parameters |
+------------------------------------------------------------+----------------------+------------+
|         roberta.embeddings.word_embeddings.weight          |          0           |  38603520  |
|       roberta.embeddings.position_embeddings.weight        |          0           |   394752   |
|      roberta.embeddings.token_type_embeddings.weight       |          0           |    768     |
|            roberta.embeddings.LayerNorm.weight             |         768          |    768     |
|             roberta.embeddings.LayerNorm.bias              |         768          |    768     |
|    roberta.encoder.layer.0.attention.self.query.weight     |          0           |   589824   |
|     roberta.encoder.layer.0.attention.self.query.bias      |          0           |    768     |
|     r

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Bce,Auc,Confidence,Confidence Std,Correct Confidence,Correct Confidence Std,Incorrect Confidence,Incorrect Confidence Std
1,0.3297,0.308614,0.86645,0.871013,0.86066,0.865806,0.308614,0.943056,0.890005,0.134136,0.911891,0.118058,0.748017,0.145018
2,0.3128,0.293247,0.8722,0.877169,0.865954,0.871526,0.293247,0.947629,0.885157,0.13673,0.907403,0.120732,0.733331,0.142749
3,0.3048,0.28742,0.876575,0.865952,0.891425,0.878504,0.28742,0.949487,0.894863,0.134078,0.916201,0.117503,0.743322,0.146262
4,0.2977,0.285683,0.87805,0.869805,0.889527,0.879556,0.285684,0.950206,0.895644,0.134801,0.916795,0.118492,0.743356,0.14664
5,0.2989,0.284905,0.878675,0.873535,0.885881,0.879665,0.284905,0.950482,0.89542,0.134714,0.916376,0.118718,0.743653,0.145922


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, index. If text, index are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 40000
  Batch size = 8
Saving model checkpoint to bert_data/test_trainer/checkpoint-5000
Configuration saved in bert_data/test_trainer/checkpoint-5000/config.json
Model weights saved in bert_data/test_trainer/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in bert_data/test_trainer/checkpoint-5000/tokenizer_config.json
Special tokens file saved in bert_data/test_trainer/checkpoint-5000/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text, index. If text, index are not expected by `RobertaForSequenceClassification.forward`,  you c

In [12]:
# Save the fine-tuned model into the local 'model-twitter' directory.
model.save_pretrained('model-twitter')

Configuration saved in model-twitter/config.json
Model weights saved in model-twitter/pytorch_model.bin
