# Test Consistencia

In [67]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from copy import deepcopy
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from bert import BERT
from utils_vocab import BasicTokenizer, BERTDatasetNoLabels, evaluate

from sklearn.metrics import accuracy_score, f1_score

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [68]:
# raw_dataset = 'implicacion_5.csv'
# raw_dataset = 'implicacion_10.csv'
# raw_dataset = 'implicacion_15.csv'
raw_dataset = 'implicacion_20.csv'

# tokenizer_file = 'tokenizer_5.pkl'
# tokenizer_file = 'tokenizer_10.pkl'
# tokenizer_file = 'tokenizer_15.pkl'
tokenizer_file = 'tokenizer_20.pkl'

# path_model = 'implicacion_5.pt'
# path_model = 'implicacion_10.pt'
# path_model = 'implicacion_15.pt'
path_model = 'implicacion_20.pt'

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device)

mps


### Crear el Tokenizer

In [69]:
special_symbols = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']
PAD_IDX = 1
CLS_IDX = 2
SEP_IDX = 3

simple_tokenizer = lambda tokens_string: tokens_string.strip().split()
tokenizer = BasicTokenizer.create_using_stoi(simple_tokenizer, special_symbols, tokenizer_file)
print('vocabulary_size:', tokenizer.get_vocab_size())

vocabulary_size: 86


### Cargar datos y crear dataloader

In [70]:
df = pd.read_csv(raw_dataset)
print(df.shape)

(4586399, 4)


In [71]:
def direct_prepare_bert_final_inputs(sentences1, sentences2, is_nexts, is_consistents, to_tensor=True):
    """
    Prepare the final input lists for BERT training.
    """
    def zero_pad_list_pair(pair_, pad=PAD_IDX):
        pair = deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0], pair[1]

    #flatten the tensor
    flatten = lambda l: [item for sublist in l for item in sublist]

    bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final, is_consistents_final = [], [], [], [], []

    for sentence1, sentence2, is_next, is_consistent in zip(sentences1, sentences2, is_nexts, is_consistents):
        # Tokenize each sentence
        tokens1 = tokenizer.encode(sentence1).ids
        tokens2 = tokenizer.encode(sentence2).ids
        bert_input = ([CLS_IDX] + tokens1 + [SEP_IDX], tokens2 + [SEP_IDX])

        # Create segment labels for each pair of sentences
        segment_label = [[1] * len(bert_input[0]), [2] * len(bert_input[1])]

        # Zero-pad the bert_input and bert_label and segment_label
        bert_input_padded = zero_pad_list_pair(bert_input)
        segment_label_padded = zero_pad_list_pair(segment_label,pad=0)

        bert_inputs_final.append(flatten(bert_input_padded))
        segment_labels_final.append(flatten(segment_label_padded))
        is_nexts_final.append(is_next)
        is_consistents_final.append(is_consistent)

    return bert_inputs_final, segment_labels_final, is_nexts_final, is_consistents_final

In [72]:
sentences1 = df.iloc[:, 0]
sentences2 = df.iloc[:, 1]
is_nexts = df.iloc[:, 2]
is_consistent = df.iloc[:, 3]
bert_inputs_final, segment_labels_final, is_nexts_final, is_consistent_final = direct_prepare_bert_final_inputs(sentences1, sentences2, is_nexts, is_consistent)

In [73]:
df_final = pd.DataFrame({
    'BERT Input': bert_inputs_final,
    'Segment Label': segment_labels_final,
    'Is Next': is_nexts_final,
    'Is Consistent': is_consistent_final
})
print(f'{df_final.shape=}')
df_final.head(10)

df_final.shape=(4586399, 4)


Unnamed: 0,BERT Input,Segment Label,Is Next,Is Consistent
0,"[2, 5, 6, 8, 3, 49, 6, 8, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
1,"[2, 5, 6, 9, 3, 49, 6, 9, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
2,"[2, 5, 6, 10, 3, 49, 6, 10, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
3,"[2, 5, 6, 11, 3, 49, 6, 11, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
4,"[2, 5, 6, 12, 3, 49, 6, 12, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
5,"[2, 5, 6, 13, 3, 49, 6, 13, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
6,"[2, 5, 6, 14, 3, 49, 6, 14, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
7,"[2, 5, 6, 15, 3, 49, 6, 15, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
8,"[2, 5, 6, 16, 3, 49, 6, 16, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0
9,"[2, 5, 6, 17, 3, 49, 6, 17, 3, 1]","[1, 1, 1, 1, 1, 2, 2, 2, 2, 0]",1,0


In [74]:
dataset = BERTDatasetNoLabels(df_final)
bert_inputs, segment_labels, is_nexts, is_consistents = dataset[1]

In [75]:
PAD_IDX = 1

def collate_batch(batch):
    bert_inputs_batch, segment_labels_batch, is_nexts_batch, is_consistents_batch = [], [], [], []

    for bert_input, segment_label, is_next, is_consistent in batch:
        # Convert each sequence to a tensor and append to the respective list
        bert_inputs_batch.append(torch.tensor(bert_input, dtype=torch.long))
        segment_labels_batch.append(torch.tensor(segment_label, dtype=torch.long))
        is_nexts_batch.append(torch.tensor(is_next, dtype=torch.long))
        is_consistents_batch.append(torch.tensor(is_consistent, dtype=torch.long))

    # Pad the sequences in the batch
    bert_inputs_final = pad_sequence(bert_inputs_batch, padding_value=PAD_IDX, batch_first=False)
    segment_labels_final = pad_sequence(segment_labels_batch, padding_value=PAD_IDX, batch_first=False)
    is_nexts_batch = torch.tensor(is_nexts_batch)
    is_consistents_batch = torch.tensor(is_consistents_batch)

    return bert_inputs_final.to(device), segment_labels_final.to(device), is_nexts_batch.to(device), is_consistents_batch.to(device)

In [76]:
BATCH_SIZE = 128

dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)


In [77]:
bert_inputs, segment_labels, is_nexts, is_consistents = next(iter(dataloader))

### Cargar el modelo

In [78]:
EMBEDDING_DIM = 16

# Define parameters
vocab_size = tokenizer.get_vocab_size()  # Replace VOCAB_SIZE with your vocabulary size
d_model = EMBEDDING_DIM  # Replace EMBEDDING_DIM with your embedding dimension
n_layers = 4  # Number of Transformer layers
initial_heads = 4
# Ensure the number of heads is a factor of the embedding dimension
heads = initial_heads - d_model % initial_heads

dropout = 0.1  # Dropout rate

# Create an instance of the BERT model
model = BERT(vocab_size, d_model, n_layers, heads, dropout)
model.to(device)

model.load_state_dict(torch.load(path_model, weights_only=True,map_location=torch.device(device)))
model.eval()

BERT(
  (bert_embedding): BERTEmbedding(
    (token_embedding): TokenEmbedding(
      (embedding): Embedding(86, 16)
    )
    (positional_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (segment_embedding): Embedding(3, 16)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
    )
    (linear1): Linear(in_features=16, out_features=32, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=32, out_features=16, bias=True)
    (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerE

In [79]:
bert_inputs, segment_labels, is_nexts, is_consistents = next(iter(dataloader))

In [80]:
bert_inputs.cpu().numpy().transpose(1, 0).tolist()

[[2, 49, 33, 60, 22, 47, 14, 3, 49, 33, 22, 47, 14, 3, 1, 1, 1, 1, 1, 1],
 [2, 70, 5, 37, 50, 25, 46, 17, 3, 49, 37, 50, 70, 25, 47, 70, 17, 3, 1, 1],
 [2, 70, 49, 38, 51, 16, 47, 13, 3, 5, 38, 51, 70, 16, 46, 70, 13, 3, 1, 1],
 [2, 5, 30, 57, 14, 46, 8, 3, 1, 49, 30, 57, 70, 14, 47, 70, 8, 3, 1, 1],
 [2, 5, 30, 66, 7, 47, 20, 3, 5, 30, 7, 47, 20, 3, 1, 1, 1, 1, 1, 1],
 [2, 70, 5, 29, 58, 25, 46, 16, 3, 49, 29, 58, 70, 25, 47, 70, 16, 3, 1, 1],
 [2, 49, 42, 62, 17, 46, 26, 3, 1, 5, 42, 62, 70, 17, 47, 70, 26, 3, 1, 1],
 [2, 49, 34, 59, 13, 47, 10, 3, 1, 5, 34, 59, 70, 13, 46, 70, 10, 3, 1, 1],
 [2, 48, 45, 15, 46, 21, 3, 1, 49, 45, 70, 15, 47, 70, 21, 3, 1, 1, 1, 1],
 [2, 5, 32, 59, 22, 46, 25, 3, 1, 1, 70, 49, 32, 59, 70, 22, 47, 70, 25, 3],
 [2, 70, 49, 42, 63, 14, 47, 16, 3, 5, 42, 63, 70, 14, 46, 70, 16, 3, 1, 1],
 [2, 49, 33, 66, 21, 47, 11, 3, 1, 5, 33, 66, 70, 21, 46, 70, 11, 3, 1, 1],
 [2, 49, 6, 55, 17, 47, 7, 3, 1, 5, 6, 55, 70, 17, 46, 70, 7, 3, 1, 1],
 [2, 70, 49, 42, 55, 1

In [81]:
with torch.no_grad():
    next_sentence_prediction, masked_language = model(bert_inputs.to(device), segment_labels.to(device))

In [82]:
def invert_sentences(bert_input):
    def zero_pad_list_pair(pair_, pad=PAD_IDX):
        pair = deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0] + pair[1]

    tokens = bert_input.cpu().numpy().tolist()
    assert(len(tokens) % 2 == 0), len(tokens)
    try:
        len_sents = tokens.index(SEP_IDX)
    except Exception as e:
        print(tokens)
        print(tokenizer.decode(tokens))
        raise Exception(e)
    first = tokens[1:len_sents - 1]
    second_ = tokens[len_sents:]
    try:
        sep_in_second = second_.index(SEP_IDX)
    except Exception as e:
        print(second_)
        print(tokenizer.decode(second_))
        raise Exception(e)
    second = second_[:sep_in_second]
    bert_input = ([CLS_IDX] + second + [SEP_IDX], first + [SEP_IDX])
    # Create segment labels for each pair of sentences
    segment_label = ([1] * len(bert_input[0]), [2] * len(bert_input[1]))
    # Zero-pad the bert_input and bert_label and segment_label
    bert_input_padded = zero_pad_list_pair(bert_input)
    bert_input_padded = torch.tensor(bert_input_padded, dtype=torch.long).unsqueeze(dim=1)
    segment_label_padded = zero_pad_list_pair(segment_label,pad=0)
    segment_label_padded = torch.tensor(segment_label_padded, dtype=torch.long).unsqueeze(dim=1)
    return bert_input_padded, segment_label_padded

In [83]:
def get_prediction(input, segment_label, is_next, device):
    with torch.no_grad():
        model_prediction, _ = model(input.to(device), segment_label.to(device))
    logits = torch.softmax(model_prediction, dim=1)
    prediction = torch.argmax(logits, dim=1)     
    correct = prediction == is_next
    return correct, prediction

In [84]:
test = list()
golden = list()
for bert_inputs, segment_labels, is_nexts, is_consistents in tqdm(dataloader):
    corrects,_ = get_prediction(
        input=bert_inputs,
        segment_label=segment_labels,
        is_next=is_nexts,
        device=device
    )
    for idx, correct in enumerate(corrects):
        if correct:
            bert_input = bert_inputs.transpose(1,0)[idx]
            bert_input_reversed, segment_label = invert_sentences(bert_input)
            check, prediction = get_prediction(
                input=bert_input_reversed, 
                segment_label=segment_label,
                is_next=is_consistents[idx],
                device=device
            )
            test.append(prediction.cpu().item())
            golden.append(is_consistents[idx].cpu())
acc = accuracy_score(golden, test)
f1 = f1_score(golden, test)    

print(f"F1 score: {f1}")
print(f"Accuracy: {acc}")


  0%|          | 0/35832 [00:00<?, ?it/s]

F1 score: 0.0
Accuracy: 0.7142857142857143
