 # Test Jabberwockie

In [44]:
import pandas as pd
import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

from bert import BERT
from utils_vocab import BasicTokenizer, BERTDataset, evaluate

# Silence warnings for the rest of the notebook so they do not clutter cell outputs.
# Two mechanisms are combined:
#   1. `warnings.warn` is replaced by a no-op, so calls that look the function up
#      through the `warnings` module at call time do nothing.
#   2. An 'ignore' filter is installed for warnings that still go through the
#      regular filtering machinery.
# NOTE(review): this also hides deprecation notices from the libraries in use.
def warn(*args, **kwargs):
    """No-op stand-in for `warnings.warn`; accepts any arguments and discards them."""
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [45]:
# Suffix shared by the tokenizer file and the model checkpoint file.
# Deriving both names from a single value guarantees that the tokenizer and the
# weights always belong to the same variant (they cannot be switched
# independently by mistake). Variants used with this notebook: 5, 10, 15, 20.
VARIANT = 20

tokenizer_file = f'tokenizer_{VARIANT}.pkl'
path_model = f'equivalencia_{VARIANT}.pt'

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# On Apple hardware, "mps" can be selected instead by checking
# torch.backends.mps.is_available().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Crear el Tokenizer

In [46]:
# Special tokens handed to the tokenizer factory together with the saved vocabulary file.
special_symbols = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']


def simple_tokenizer(tokens_string):
    """Split a string into whitespace-separated tokens.

    Leading and trailing whitespace never produces empty tokens, because
    `str.split()` without arguments already discards it.
    """
    return tokens_string.split()


tokenizer = BasicTokenizer.create_using_stoi(
    simple_tokenizer,
    special_symbols,
    tokenizer_file,
)
print('vocabulary_size:', tokenizer.get_vocab_size())

vocabulary_size: 86


## Cargar datos Jabberwockie

In [47]:
# CSV file with the pre-encoded Jabberwockie examples.
path_dataset = 'bert_data_equivalencia_jabberwockie_5.csv'

# Column names imposed on the loaded frame, in file order.
DATASET_COLUMNS = ['BERT Input', 'BERT Label', 'Segment Label', 'Is Next']

df = pd.read_csv(path_dataset)
# Overwrite whatever header the file carries; this raises if the file does not
# have exactly len(DATASET_COLUMNS) columns.
df.columns = DATASET_COLUMNS
print(df.shape)
df.head(3)

(10650, 4)


Unnamed: 0,BERT Input,BERT Label,Segment Label,Is Next
0,"[2, 5, 6, 25, 26, 27, 3, 24, 25, 26, 5, 4, 3, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 27, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0]",1
1,"[2, 5, 6, 25, 4, 27, 3, 24, 25, 28, 5, 27, 3, 1]","[1, 1, 1, 1, 28, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0]",1
2,"[2, 5, 4, 25, 29, 27, 3, 24, 25, 29, 5, 27, 3, 1]","[1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0]",1


### Crear el dataloader

In [48]:
# Value used to pad every sequence in a batch up to the longest one.
PAD_IDX = 1


def collate_batch(batch):
    """Turn a list of samples into padded tensors placed on `device`.

    Args:
        batch: iterable of 4-tuples
            (bert_input, bert_label, segment_label, is_next).

    Returns:
        Tuple (bert_inputs, bert_labels, segment_labels, is_nexts). The first
        three are long tensors laid out as (max_seq_len, batch_size) because
        padding is done with batch_first=False; `is_nexts` is a 1-D long
        tensor with one entry per sample.
    """
    def pad_to_device(sequences):
        # One long tensor per sample, then pad all of them to a common length.
        as_tensors = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
        padded = pad_sequence(as_tensors, padding_value=PAD_IDX, batch_first=False)
        return padded.to(device)

    # Regroup the per-sample tuples into one sequence per field.
    inputs, labels, segments, next_flags = zip(*batch)

    # NOTE(review): segment labels are padded with PAD_IDX (1) as well, while the
    # stored rows appear to use 0 for the trailing position — confirm this is the
    # same padding the checkpoint was trained with before changing it.
    return (
        pad_to_device(inputs),
        pad_to_device(labels),
        pad_to_device(segments),
        torch.tensor(next_flags, dtype=torch.long).to(device),
    )

In [49]:
BATCH_SIZE = 128

# Wrap the loaded frame in the project dataset and batch it with the custom collate function.
jabberwockie_dataset = BERTDataset(df)
jabberwockie_dataloader = DataLoader(
    jabberwockie_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)

# Sanity check: pull a single batch and show its four components.
bert_input, bert_label, segment_label, is_next = next(iter(jabberwockie_dataloader))
print(f'{bert_input=}')
print(f'{bert_label=}')
print(f'{segment_label=}')
print(f'{is_next=}')

bert_input=tensor([[ 2,  2,  2,  ...,  2,  2,  2],
        [ 6,  4,  6,  ..., 24,  5,  6],
        [ 4, 24,  4,  ..., 38,  4, 39],
        ...,
        [ 1,  3,  5,  ...,  1,  1,  5],
        [ 1,  1,  4,  ...,  1,  1, 33],
        [ 1,  1,  3,  ...,  1,  1,  3]], device='cuda:0')
bert_label=tensor([[ 1,  1,  1,  ...,  1,  1,  1],
        [ 1,  5,  1,  ...,  1,  1,  1],
        [32,  1, 37,  ...,  1,  6,  1],
        ...,
        [ 1,  1,  1,  ...,  1,  1,  1],
        [ 1,  1, 35,  ...,  1,  1,  1],
        [ 1,  1,  1,  ...,  1,  1,  1]], device='cuda:0')
segment_label=tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 2, 2,  ..., 1, 1, 2],
        [1, 1, 2,  ..., 1, 1, 2],
        [1, 1, 2,  ..., 1, 1, 2]], device='cuda:0')
is_next=tensor([0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
        0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
        1, 1, 0, 0,

In [50]:
EMBEDDING_DIM = 16

# Model hyperparameters.
# NOTE(review): these presumably have to match the values used when the
# checkpoint at `path_model` was trained — confirm against the training notebook.
vocab_size = tokenizer.get_vocab_size()
d_model = EMBEDDING_DIM  # Embedding / hidden size
n_layers = 4  # Number of Transformer layers
initial_heads = 4  # Upper bound for the number of attention heads

# Multi-head attention requires d_model to be divisible by the number of heads.
# Take the largest head count <= initial_heads that divides d_model; 1 always
# divides, so a value is always found. A plain
# `initial_heads - d_model % initial_heads` does not guarantee divisibility
# (d_model=13, initial_heads=4 gives 3). For d_model=16 the result is 4.
heads = next(h for h in range(initial_heads, 0, -1) if d_model % h == 0)

dropout = 0.1  # Dropout rate

# Create an instance of the BERT model and place it on the selected device.
model = BERT(vocab_size, d_model, n_layers, heads, dropout)
model.to(device)

# weights_only=True restricts unpickling to tensor data; map_location remaps the
# stored tensors onto `device`, so a GPU-saved checkpoint also loads on CPU.
model.load_state_dict(torch.load(path_model, weights_only=True, map_location=device))
# Switch to inference mode (disables dropout); the returned module is displayed by the cell.
model.eval()

BERT(
  (bert_embedding): BERTEmbedding(
    (token_embedding): TokenEmbedding(
      (embedding): Embedding(86, 16)
    )
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (segment_embedding): Embedding(3, 16)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
    )
    (linear1): Linear(in_features=16, out_features=32, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=32, out_features=16, bias=True)
    (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerE

In [51]:
# Masked-language-model loss. Targets equal to PAD_IDX are ignored, so only
# positions whose label differs from PAD_IDX (the masked tokens) contribute.
loss_fn_mlm = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
# Next-sentence-prediction loss: plain cross-entropy over the is_next classes.
loss_fn_nsp = nn.CrossEntropyLoss()

In [52]:
# Evaluate the loaded checkpoint on the Jabberwockie data. `evaluate` comes from
# utils_vocab; its result is unpacked as (loss, accuracy, f1).
loss, acc, f1 = evaluate(
    model=model,
    dataloader=jabberwockie_dataloader,
    loss_fn_mlm=loss_fn_mlm,
    loss_fn_nsp=loss_fn_nsp,
    device=device,
)

# Report the three metrics, one per line.
print(
    f'Test loss: {loss}',
    f'Test f1 score: {f1}',
    f'Test accurracy: {acc}',
    sep='\n',
)

Test loss: 5.0041118285235235
Test f1 score: 0.8455674145066434
Test accurracy: 0.800281690140845
