In [None]:
# Inspired by a notebook from one of Sinan Ozdemir's training sessions

In [12]:
# Let's compare BERT with XLNET
from transformers import BertTokenizer, BertModel
  
# Tokenizer and model are loaded from one shared checkpoint name so the two
# cannot drift apart.
BERT_CHECKPOINT = "bert-base-cased"
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Bare last expression: renders the module tree of the loaded model.
bert_model


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [2]:
# Tokenize each sentence on its own. Calling the tokenizer object directly is
# the supported replacement for the deprecated `encode_plus`; it returns the
# same mapping of PyTorch tensors (input_ids, token_type_ids, attention_mask).
one_encoded = bert_tokenizer('How much will this cost?', add_special_tokens=True, return_tensors='pt')
two_encoded = bert_tokenizer('Is it expensive?', add_special_tokens=True, return_tensors='pt')


In [5]:
import torch

# BERT places the [CLS] token first, so position 0 of the last hidden state is
# taken as the sentence-level embedding.
# This is inference only: no_grad() skips building the autograd graph, so the
# embeddings (and anything computed from them) carry no grad_fn.
with torch.no_grad():
    one_embedded = bert_model(**one_encoded).last_hidden_state[:, 0, :]
    two_embedded = bert_model(**two_encoded).last_hidden_state[:, 0, :]

# Show the tokenizer output that was fed to the model.
one_encoded, two_encoded

({'input_ids': tensor([[ 101, 1731, 1277, 1209, 1142, 2616,  136,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])},
 {'input_ids': tensor([[ 101, 2181, 1122, 5865,  136,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])})

In [6]:
import torch

# Cosine similarity of the two BERT sentence embeddings, with the module's
# default arguments written out (compare along dim 1, eps of 1e-8).
torch.nn.CosineSimilarity(dim=1, eps=1e-8)(one_embedded, two_embedded)

tensor([0.9723], grad_fn=<DivBackward0>)

In [9]:
# Install the missing library for XLNet (uncomment the line below if required)
# !pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp38-cp38-win_amd64.whl (1.1 MB)
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [1]:
from transformers import XLNetTokenizer, XLNetModel

# Load the XLNet tokenizer and the base model (no task head) from one shared
# checkpoint name.
XLNET_CHECKPOINT = "xlnet-base-cased"
xlnet_tokenizer = XLNetTokenizer.from_pretrained(XLNET_CHECKPOINT)
xlnet_model = XLNetModel.from_pretrained(XLNET_CHECKPOINT)


Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Bare expression: renders the module tree of the loaded XLNet model.
xlnet_model

XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, b

In [3]:
# Tokenize the same two sentences with the XLNet tokenizer, calling the
# tokenizer directly instead of the deprecated `encode_plus`.
# NOTE: this rebinds one_encoded / two_encoded, replacing the BERT encodings
# created earlier in the notebook.
one_encoded = xlnet_tokenizer('How much will this cost?', add_special_tokens=True, return_tensors='pt')
two_encoded = xlnet_tokenizer('Is it expensive?', add_special_tokens=True, return_tensors='pt')

In [4]:
# Map the first sentence's ids back to token strings to see where XLNet puts
# its special tokens.
first_sentence_ids = one_encoded['input_ids'][0]
xlnet_tokenizer.convert_ids_to_tokens(first_sentence_ids)

['▁How', '▁much', '▁will', '▁this', '▁cost', '?', '<sep>', '<cls>']

In [32]:
# XLNet's tokenizer breaks an out-of-vocabulary word into known subword pieces
# (not necessarily the smallest ones). A leading '▁' marks the start of a word
# and continuation pieces carry no marker, whereas BERT prefixes continuation
# pieces with '##'.
# The tokenizer is called directly rather than through the deprecated
# `encode_plus`.
x = xlnet_tokenizer('abcdefghjijklmnopqrstuvwx', add_special_tokens=True, return_tensors='pt')
xlnet_tokenizer.convert_ids_to_tokens(x['input_ids'][0])

['▁a',
 'bc',
 'def',
 'gh',
 'j',
 'ijk',
 'l',
 'm',
 'no',
 'p',
 'q',
 'rs',
 't',
 'uv',
 'w',
 'x',
 '<sep>',
 '<cls>']

In [5]:
import torch

# XLNet appends its special tokens, so <cls> is the last position and index -1
# of the last hidden state is taken as the sentence-level embedding.
# This is inference only: no_grad() skips building the autograd graph, so the
# embeddings (and anything computed from them) carry no grad_fn.
with torch.no_grad():
    one_embedded = xlnet_model(**one_encoded).last_hidden_state[:, -1, :]
    two_embedded = xlnet_model(**two_encoded).last_hidden_state[:, -1, :]

In [7]:
import torch
# Cosine similarity of the two XLNet sentence embeddings, with the module's
# default arguments written out (compare along dim 1, eps of 1e-8).
torch.nn.CosineSimilarity(dim=1, eps=1e-8)(one_embedded, two_embedded)

tensor([0.9734], grad_fn=<DivBackward0>)

## Fine-tuning XLNET

In [10]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from datasets import Dataset

In [11]:
# Ingest a 200-tweet sample from the Kaggle disaster tweet competition
import pandas as pd

# Read the labelled tweet sample and preview its first two rows.
tweets_csv_path = '../data/disaster_sample.csv'
tweets = pd.read_csv(tweets_csv_path)

tweets.head(n=2)

Unnamed: 0,index,id,keyword,location,text,target,label
0,7138,10224,volcano,,@MrMikeEaton @Muazimus_Prime hill hill mountai...,1,1
1,2151,3086,deaths,Blackpool,Cancers equate for around 25% of all deaths in...,1,1


In [14]:
def preprocess(data):
    """Tokenize the ``text`` field of a batch with the XLNet tokenizer.

    Padding is enabled so the sequences in the batch come out at a common
    length, and anything longer than 512 tokens is truncated.
    """
    tokenizer_options = dict(padding=True, truncation=True, max_length=512)
    return xlnet_tokenizer(data['text'], **tokenizer_options)


# Wrap the pandas frame in a Hugging Face Dataset.
tweet_dataset = Dataset.from_pandas(tweets)

# Tokenize the whole dataset as ONE batch (batch_size=len(tweet_dataset)):
# preprocess pads within a batch, so a single batch gives every example the
# same padded length.
tweet_dataset = tweet_dataset.map(preprocess, batched=True, batch_size=len(tweet_dataset))

# Dataset has a built-in train/test split method. The split shuffles the rows,
# so the seed is fixed to get the same train/test examples (and comparable
# metrics) on every Restart & Run All.
tweet_dataset = tweet_dataset.train_test_split(test_size=0.2, seed=42)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
# Pretrained XLNet weights plus a two-label classification head; the head's
# weights are not in the checkpoint and start out newly initialized.
xlnet_sequence_classification_model = XLNetForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='xlnet-base-cased',
    num_labels=2,
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [20]:
from transformers import TrainingArguments, Trainer
import numpy as np

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir='./xlnet_clf',

    # Optimisation schedule.
    num_train_epochs=10,
    warmup_steps=len(tweet_dataset['train']) // 5,  # learning-rate warmup steps: one per five training examples
    weight_decay=0.05,

    # Batch sizes.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,

    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,

    # Logging: report every optimisation step at INFO level.
    logging_steps=1,
    log_level='info',
)

# Define the accuracy metric used by compute_metrics:

from datasets import load_metric

# Loading the metric may download its script the first time it is used.
# NOTE(review): newer `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package -- confirm against the installed version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. Each example's predicted
    class is the index of its largest logit along the last axis; the value
    returned is whatever ``metric.compute`` produces for those predictions
    against ``labels``.
    """
    model_logits, gold_labels = eval_pred
    predicted_classes = np.argmax(model_logits, axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=gold_labels)
    return scores

# Wire the model, training arguments, data splits and metric function together.
trainer = Trainer(
    args=training_args,
    model=xlnet_sequence_classification_model,
    compute_metrics=compute_metrics,
    eval_dataset=tweet_dataset['test'],
    train_dataset=tweet_dataset['train'],
)

# Baseline: score the model on the test split before any fine-tuning.
trainer.evaluate()

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


{'eval_loss': 0.6958828568458557,
 'eval_accuracy': 0.45,
 'eval_runtime': 6.1098,
 'eval_samples_per_second': 6.547,
 'eval_steps_per_second': 0.327}

In [21]:
# Fine-tune the classifier. With the TrainingArguments defined earlier, the
# trainer evaluates on the test split and saves a checkpoint after every epoch.
trainer.train()

***** Running training *****
  Num examples = 160
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 50


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6721,0.702312,0.5
2,0.6607,0.671728,0.525
3,0.6004,0.624582,0.725
4,0.5898,0.488513,0.825
5,0.4572,0.435203,0.8
6,0.4599,0.435832,0.9
7,0.1483,0.570979,0.825
8,0.229,0.593296,0.825
9,0.06,0.657469,0.825
10,0.0213,0.554992,0.825


***** Running Evaluation *****
  Num examples = 40
  Batch size = 32
Saving model checkpoint to ./xlnet_clf\checkpoint-5
Configuration saved in ./xlnet_clf\checkpoint-5\config.json
Model weights saved in ./xlnet_clf\checkpoint-5\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32
Saving model checkpoint to ./xlnet_clf\checkpoint-10
Configuration saved in ./xlnet_clf\checkpoint-10\config.json
Model weights saved in ./xlnet_clf\checkpoint-10\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32
Saving model checkpoint to ./xlnet_clf\checkpoint-15
Configuration saved in ./xlnet_clf\checkpoint-15\config.json
Model weights saved in ./xlnet_clf\checkpoint-15\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 40
  Batch size = 32
Saving model checkpoint to ./xlnet_clf\checkpoint-20
Configuration saved in ./xlnet_clf\checkpoint-20\config.json
Model weights saved in ./xlnet_clf\checkpoint-20\pytorch_model.bin
***

TrainOutput(global_step=50, training_loss=0.4009051025286317, metrics={'train_runtime': 976.5943, 'train_samples_per_second': 1.638, 'train_steps_per_second': 0.051, 'total_flos': 70329819014400.0, 'train_loss': 0.4009051025286317, 'epoch': 10.0})

In [22]:
# Re-evaluate after training. Because load_best_model_at_end=True, the trainer
# now holds the best checkpoint rather than the final-epoch weights, so these
# numbers can differ from the last row of the training log.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 40
  Batch size = 32


{'eval_loss': 0.43520259857177734,
 'eval_accuracy': 0.8,
 'eval_runtime': 5.3537,
 'eval_samples_per_second': 7.471,
 'eval_steps_per_second': 0.374,
 'epoch': 10.0}

['▁a',
 'bc',
 'def',
 'gh',
 'j',
 'ijk',
 'l',
 'm',
 'no',
 'p',
 'q',
 'rs',
 't',
 'uv',
 'w',
 'x',
 '<sep>',
 '<cls>']