# Test Consistencia

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from copy import deepcopy
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from bert import BERT
from utils_vocab import BasicTokenizer, BERTDatasetNoLabels, evaluate

from sklearn.metrics import accuracy_score, f1_score

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [2]:
# raw_dataset = 'equivalencia_5.csv'
# raw_dataset = 'equivalencia_10.csv'
# raw_dataset = 'equivalencia_15.csv'
raw_dataset = 'equivalencia_20.csv'

# tokenizer_file = 'tokenizer_5.pkl'
# tokenizer_file = 'tokenizer_10.pkl'
# tokenizer_file = 'tokenizer_15.pkl'
tokenizer_file = 'tokenizer_20.pkl'

# path_model = 'equivalencia_5.pt'
# path_model = 'equivalencia_10.pt'
# path_model = 'equivalencia_15.pt'
path_model = 'equivalencia_20.pt'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

### Crear el Tokenizer

In [3]:
special_symbols = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']
PAD_IDX = 1
CLS_IDX = 2
SEP_IDX = 3

simple_tokenizer = lambda tokens_string: tokens_string.strip().split()
tokenizer = BasicTokenizer.create_using_stoi(simple_tokenizer, special_symbols, tokenizer_file)
print('vocabulary_size:', tokenizer.get_vocab_size())

vocabulary_size: 86


### Cargar datos y crear dataloader

In [4]:
df = pd.read_csv(raw_dataset)
print(df.shape)

(2612399, 3)


In [5]:
def direct_prepare_bert_final_inputs(sentences1, sentences2, is_nexts, to_tensor=True):
    """
    Prepare the final input lists for BERT training.
    """
    def zero_pad_list_pair(pair_, pad=PAD_IDX):
        pair = deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0], pair[1]

    #flatten the tensor
    flatten = lambda l: [item for sublist in l for item in sublist]

    bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final = [], [], [], []

    for sentence1, sentence2, is_next in zip(sentences1, sentences2, is_nexts):
        # Tokenize each sentence
        tokens1 = tokenizer.encode(sentence1).ids
        tokens2 = tokenizer.encode(sentence2).ids
        bert_input = ([CLS_IDX] + tokens1 + [SEP_IDX], tokens2 + [SEP_IDX])

        # Create segment labels for each pair of sentences
        segment_label = [[1] * len(bert_input[0]), [2] * len(bert_input[1])]

        # Zero-pad the bert_input and bert_label and segment_label
        bert_input_padded = zero_pad_list_pair(bert_input)
        segment_label_padded = zero_pad_list_pair(segment_label,pad=0)

        #convert to tensors
        if to_tensor:

            # Flatten the padded inputs and labels, transform tokens to their corresponding vocab indices, and convert them to tensors
            # bert_inputs_final.append(torch.tensor(tokens_to_index(flatten(bert_input_padded)),dtype=torch.int64))
            # bert_labels_final.append(torch.tensor(tokens_to_index(flatten(bert_label_padded)),dtype=torch.int64))
            # segment_labels_final.append(torch.tensor(flatten(segment_label_padded),dtype=torch.int64))
            bert_inputs_final.append(flatten(bert_input_padded))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

        else:
          # Flatten the padded inputs and labels
            bert_inputs_final.append(flatten(bert_input_padded))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

    return bert_inputs_final, segment_labels_final, is_nexts_final

In [6]:
sentences1 = df.iloc[:, 0]
sentences2 = df.iloc[:, 1]
is_nexts = df.iloc[:, 2]
bert_inputs_final, segment_labels_final, is_nexts_final = direct_prepare_bert_final_inputs(sentences1, sentences2, is_nexts)

In [7]:
df_final = pd.DataFrame({
    'BERT Input': bert_inputs_final,
    'Segment Label': segment_labels_final,
    'Is Next': is_nexts_final
})
print(f'{df_final.shape=}')

df_final.shape=(2612399, 3)


In [8]:
dataset = BERTDatasetNoLabels(df_final)
bert_inputs, segment_labels, is_nexts = dataset[1]

In [9]:
PAD_IDX = 1

print(device)

def collate_batch(batch):
    bert_inputs_batch, segment_labels_batch, is_nexts_batch = [], [], []

    for bert_input, segment_label, is_next in batch:
        # Convert each sequence to a tensor and append to the respective list
        bert_inputs_batch.append(torch.tensor(bert_input, dtype=torch.long))
        segment_labels_batch.append(torch.tensor(segment_label, dtype=torch.long))
        is_nexts_batch.append(is_next)

    # Pad the sequences in the batch
    bert_inputs_final = pad_sequence(bert_inputs_batch, padding_value=PAD_IDX, batch_first=False)
    segment_labels_final = pad_sequence(segment_labels_batch, padding_value=PAD_IDX, batch_first=False)
    is_nexts_batch = torch.tensor(is_nexts_batch, dtype=torch.long)

    return bert_inputs_final.to(device), segment_labels_final.to(device), is_nexts_batch.to(device)

cuda


In [10]:
BATCH_SIZE = 128

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)


In [11]:
bert_inputs, segment_labels, is_nexts = next(iter(dataloader))

### Cargar el modelo

In [12]:
EMBEDDING_DIM = 16

# Define parameters
vocab_size = tokenizer.get_vocab_size()  # Replace VOCAB_SIZE with your vocabulary size
d_model = EMBEDDING_DIM  # Replace EMBEDDING_DIM with your embedding dimension
n_layers = 4  # Number of Transformer layers
initial_heads = 4
# Ensure the number of heads is a factor of the embedding dimension
heads = initial_heads - d_model % initial_heads

dropout = 0.1  # Dropout rate

# Create an instance of the BERT model
model = BERT(vocab_size, d_model, n_layers, heads, dropout)
model.to(device)

model.load_state_dict(torch.load(path_model, weights_only=True,map_location=torch.device(device)))
model.eval()

BERT(
  (bert_embedding): BERTEmbedding(
    (token_embedding): TokenEmbedding(
      (embedding): Embedding(86, 16)
    )
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (segment_embedding): Embedding(3, 16)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
    )
    (linear1): Linear(in_features=16, out_features=32, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=32, out_features=16, bias=True)
    (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerE

In [13]:
bert_inputs, segment_labels, is_nexts = next(iter(dataloader))

In [14]:
bert_inputs.cpu().numpy().transpose(1, 0).tolist()

[[2, 5, 69, 50, 15, 38, 68, 42, 3, 85, 50, 15, 38, 68, 42, 3, 1, 1, 1, 1],
 [2, 5, 6, 62, 14, 36, 68, 47, 3, 69, 62, 14, 5, 36, 67, 5, 47, 3, 1, 1],
 [2, 6, 56, 27, 37, 67, 31, 3, 69, 56, 27, 37, 67, 31, 3, 1, 1, 1, 1, 1],
 [2, 5, 6, 63, 28, 9, 67, 46, 3, 69, 63, 28, 5, 9, 68, 5, 46, 3, 1, 1],
 [2, 69, 63, 8, 29, 68, 45, 3, 85, 63, 8, 29, 68, 45, 3, 1, 1, 1, 1, 1],
 [2, 69, 66, 10, 46, 3, 85, 66, 10, 46, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [2, 69, 60, 12, 36, 68, 37, 3, 85, 60, 12, 36, 68, 37, 3, 1, 1, 1, 1, 1],
 [2, 69, 56, 42, 68, 32, 3, 85, 56, 42, 68, 32, 3, 1, 1, 1, 1, 1, 1, 1],
 [2, 6, 55, 10, 43, 67, 47, 3, 69, 55, 10, 43, 67, 47, 3, 1, 1, 1, 1, 1],
 [2, 5, 6, 61, 23, 38, 67, 42, 3, 69, 61, 23, 5, 38, 67, 5, 42, 3, 1, 1],
 [2, 6, 54, 21, 46, 3, 69, 54, 21, 46, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [2, 5, 6, 53, 26, 44, 67, 37, 3, 69, 53, 26, 5, 44, 67, 5, 37, 3, 1, 1],
 [2, 69, 61, 27, 9, 67, 38, 3, 85, 61, 27, 9, 67, 38, 3, 1, 1, 1, 1, 1],
 [2, 6, 56, 17, 5, 41, 68, 5, 29, 3, 85, 56, 17,

In [15]:
with torch.no_grad():
    next_sentence_prediction, masked_language = model(bert_inputs.to(device), segment_labels.to(device))

In [16]:
def invert_sentences(bert_input):
    def zero_pad_list_pair(pair_, pad=PAD_IDX):
        pair = deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0] + pair[1]

    tokens = bert_input.cpu().numpy().tolist()
    assert(len(tokens) % 2 == 0), len(tokens)
    try:
        len_sents = tokens.index(SEP_IDX)
    except Exception as e:
        print(tokens)
        print(tokenizer.decode(tokens))
        raise Exception(e)
    first = tokens[1:len_sents - 1]
    second_ = tokens[len_sents:]
    try:
        sep_in_second = second_.index(SEP_IDX)
    except Exception as e:
        print(second_)
        print(tokenizer.decode(second_))
        raise Exception(e)
    second = second_[:sep_in_second]
    bert_input = ([CLS_IDX] + second + [SEP_IDX], first + [SEP_IDX])
    # Create segment labels for each pair of sentences
    segment_label = ([1] * len(bert_input[0]), [2] * len(bert_input[1]))
    # Zero-pad the bert_input and bert_label and segment_label
    bert_input_padded = zero_pad_list_pair(bert_input)
    bert_input_padded = torch.tensor(bert_input_padded, dtype=torch.long).unsqueeze(dim=1)
    segment_label_padded = zero_pad_list_pair(segment_label,pad=0)
    segment_label_padded = torch.tensor(segment_label_padded, dtype=torch.long).unsqueeze(dim=1)
    return bert_input_padded, segment_label_padded

In [17]:
def get_prediction(input, segment_label, is_next, device):
    with torch.no_grad():
        model_prediction, _ = model(input.to(device), segment_label.to(device))
    logits = torch.softmax(model_prediction, dim=1)
    prediction = torch.argmax(logits, dim=1)     
    correct = prediction == is_next
    return correct

In [18]:
test = list()
golden = list()
for bert_inputs, segment_labels, is_nexts in tqdm(dataloader):
    corrects = get_prediction(
        input=bert_inputs,
        segment_label=segment_labels,
        is_next=is_nexts,
        device=device
    )
    for idx, correct in enumerate(corrects):
        if correct:
            bert_input = bert_inputs.transpose(1,0)[idx]
            bert_input_reversed, segment_label = invert_sentences(bert_input)
            check = get_prediction(
                input=bert_input_reversed, 
                segment_label=segment_label,
                is_next=is_nexts[idx],
                device=device
            )
            test.append(check.cpu().item())
            golden.append(is_nexts[idx].cpu())
acc = accuracy_score(golden, test)
f1 = f1_score(golden, test)    
print(f"F1 score: {f1}")
print(f"Accuracy: {acc}")

  0%|          | 0/20410 [00:00<?, ?it/s]

F1 score: 0.7141874856749942
Accuracy: 0.5554367201426025
