# Creación del dataset para oraciones de relaciones lógicas

In [None]:
import sys
sys.path.append("../..")

import torch
import pandas as pd

from copy import deepcopy

from src.config import PATHS
from src.utils.utils_vocab import BasicTokenizer

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [None]:
dataset_folder = PATHS["training_data_folder"]
tokenizer_folder = PATHS["tokenizer_folder"]

raw_dataset = dataset_folder/'implicacion_5.csv'
# raw_dataset = dataset_folder/'implicacion_10.csv'
# raw_dataset = dataset_folder/'implicacion_15.csv'
# raw_dataset = dataset_folder/'implicacion_20.csv'

tokenizer_file = tokenizer_folder/'tokenizer_5.pkl'
# tokenizer_file = tokenizer_folder/'tokenizer_10.pkl'
# tokenizer_file = tokenizer_folder/'tokenizer_15.pkl'
# tokenizer_file = tokenizer_folder/'tokenizer_20.pkl'

csv_file_path = dataset_folder/'bert_data_implicacion_5.csv'
# csv_file_path = dataset_folder/'bert_data_implicacion_10.csv'
# csv_file_path = dataset_folder/'bert_data_implicacion_15.csv'
# csv_file_path = dataset_folder/'bert_data_implicacion_20.csv'

jabberwockie_dataset = dataset_folder/'implicacion_jabberwockie_5.csv'
# csv_file_path =dataset_folder/'bert_data_implicacion_jabberwockie_5.csv'

## Creamos el tokenizer

In [275]:
words_df1 = pd.read_csv(raw_dataset, names=['Sentence 1', 'Sentence 2', 'Relation', 'Consistencia']) 
words_jabberwockie = pd.read_csv(jabberwockie_dataset, names=['Sentence 1', 'Sentence 2', 'Relation', 'Consistencia'])
words_df = pd.concat([words_df1, words_jabberwockie], ignore_index=True)
words_df.head()

Unnamed: 0,Sentence 1,Sentence 2,Relation,Consistencia
0,todo abuelo acuerda,algún abuelo acuerda,1,0
1,todo abuelo aguanta,algún abuelo aguanta,1,0
2,todo abuelo ama,algún abuelo ama,1,0
3,todo abuelo amanece,algún abuelo amanece,1,0
4,todo abuelo anochece,algún abuelo anochece,1,0


In [276]:
words = list(words_df.iloc[:,0].values)
words += list(words_df.iloc[:,1].values)
print(words[:10])
print(len(words))

['todo abuelo acuerda ', 'todo abuelo aguanta ', 'todo abuelo ama ', 'todo abuelo amanece ', 'todo abuelo anochece ', 'todo abuelo aparece ', 'todo abuelo aplaude ', 'todo abuelo avanza ', 'todo abuelo baja ', 'todo abuelo bosteza ']
9210600


In [277]:
y = [w.strip().split(' ')  for w in words]
y = [x for w in y for x in w]
y = [w for w in y if w != '']
y = list(set(y))
print(y)
print(len(y))

['ama', 'amanece', 'carraspea', 'amarillo', 'contempla', 'no', 'árbol', 'asombroso', 'brillante', 'avión', 'débil', 'burro', 'dulce', 'alce', 'y', 'dormínico', 'aparece', 'albañil', 'apagado', 'brinca', 'canta', 'blicket', 'abuelo', 'avanza', 'bosteza', 'celebra', 'cable', 'arquitecto', 'brispado', 'barril', 'áspero', 'alto', 'bosque', 'claro', 'flakle', 'caballo', 'jufzyl', 'calla', 'camino', 'algún', 'flexivo', 'jufmoq', 'caracol', 'dernea', 'auto', 'delgado', 'agujero', 'claribundo', 'todo', 'camello', 'o', 'bajo', 'flajuf', 'aplaude', 'animal', 'caliente', 'amplio', 'complejo', 'drifla', 'cambia', 'actor', 'amargo', 'florido', 'blando', 'alegre', 'acuerda', 'barco', 'baja', 'brilca', 'ningún', 'cae', 'aguanta', 'azul', 'converge', 'carro', 'cansado', 'bliscea', 'anochece', 'brunza', 'difícil', 'camina']
81


In [278]:
# Define special symbols for the tokenizer
special_symbols = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']

# Create tokenizer from words

simple_tokenizer = lambda tokens_string: tokens_string.strip().split()
tokenizer = BasicTokenizer(simple_tokenizer, special_symbols)
tokenizer.initialize_from_iterable(words)

# Save to file
tokenizer.save(tokenizer_file)

In [None]:
VOCAB_SIZE = len(tokenizer.itos)
print(f'Tamaño del vocabulario: {VOCAB_SIZE}')

86


In [280]:
print(tokenizer.itos)

['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]', 'todo', 'abuelo', 'acuerda', 'aguanta', 'ama', 'amanece', 'anochece', 'aparece', 'aplaude', 'avanza', 'baja', 'bosteza', 'brinca', 'cae', 'calla', 'cambia', 'camina', 'canta', 'carraspea', 'celebra', 'contempla', 'converge', 'actor', 'agujero', 'albañil', 'alce', 'animal', 'árbol', 'arquitecto', 'auto', 'avión', 'barco', 'barril', 'bosque', 'burro', 'caballo', 'cable', 'camello', 'camino', 'caracol', 'carro', 'y', 'o', 'ningún', 'algún', 'alegre', 'alto', 'amargo', 'amplio', 'amarillo', 'apagado', 'asombroso', 'áspero', 'azul', 'bajo', 'blando', 'brillante', 'caliente', 'cansado', 'claro', 'complejo', 'débil', 'delgado', 'difícil', 'dulce', 'no', 'blicket', 'bliscea', 'brilca', 'brunza', 'dernea', 'drifla', 'flajuf', 'flakle', 'jufmoq', 'jufzyl', 'brispado', 'claribundo', 'dormínico', 'florido', 'flexivo']


## Creamos el dataset

In [281]:
sentences_df = pd.read_csv(raw_dataset, names=['sentence 1', 'sentence 2', 'relation', 'consistencia'])
# sentences_df = pd.read_csv(jabberwockie_dataset, names=['sentence 1', 'sentence 2', 'relation', 'consistencia'])
print(sentences_df.shape)
sentences_df.tail()

(4586400, 4)


Unnamed: 0,sentence 1,sentence 2,relation,consistencia
4586395,no algún carro converge y camina,algún carro converge y camina,0,0
4586396,no algún carro converge y canta,algún carro converge y canta,0,0
4586397,no algún carro converge y carraspea,algún carro converge y carraspea,0,0
4586398,no algún carro converge y celebra,algún carro converge y celebra,0,0
4586399,no algún carro converge y contempla,algún carro converge y contempla,0,0


In [282]:
sentences_df['tokens sentence 1'] = sentences_df['sentence 1'].apply(lambda x: tokenizer.encode(x).tokens)
sentences_df['tokens sentence 2'] = sentences_df['sentence 2'].apply(lambda x: tokenizer.encode(x).tokens)
sentences_df.head()

Unnamed: 0,sentence 1,sentence 2,relation,consistencia,tokens sentence 1,tokens sentence 2
0,todo abuelo acuerda,algún abuelo acuerda,1,0,"[todo, abuelo, acuerda]","[algún, abuelo, acuerda]"
1,todo abuelo aguanta,algún abuelo aguanta,1,0,"[todo, abuelo, aguanta]","[algún, abuelo, aguanta]"
2,todo abuelo ama,algún abuelo ama,1,0,"[todo, abuelo, ama]","[algún, abuelo, ama]"
3,todo abuelo amanece,algún abuelo amanece,1,0,"[todo, abuelo, amanece]","[algún, abuelo, amanece]"
4,todo abuelo anochece,algún abuelo anochece,1,0,"[todo, abuelo, anochece]","[algún, abuelo, anochece]"


In [283]:
def bernoulli_true_false(p):
    # Create a Bernoulli distribution with probability p
    bernoulli_dist = torch.distributions.Bernoulli(torch.tensor([p]))
    # Sample from this distribution and convert 1 to True and 0 to False
    return bernoulli_dist.sample().item() == 1

In [284]:
def Masking(token):
    # Decide whether to mask this token (20% chance)
    mask = bernoulli_true_false(0.2)

    # If mask is False, immediately return with '[PAD]' label
    if not mask:
        return token, '[PAD]'

    # If mask is True, proceed with further operations
    # Randomly decide on an operation (50% chance each)
    random_opp = bernoulli_true_false(0.5)
    random_swich = bernoulli_true_false(0.5)

    # Case 1: If mask, random_opp, and random_swich are True
    if mask and random_opp and random_swich:
        # Replace the token with '[MASK]' and set label to a random token
        mask_label = tokenizer.decode(torch.randint(0, VOCAB_SIZE, (1,)))[0]
        token_ = '[MASK]'

    # Case 2: If mask and random_opp are True, but random_swich is False
    elif mask and random_opp and not random_swich:
        # Leave the token unchanged and set label to the same token
        token_ = token
        mask_label = token

    # Case 3: If mask is True, but random_opp is False
    else:
        # Replace the token with '[MASK]' and set label to the original token
        token_ = '[MASK]'
        mask_label = token

    return token_, mask_label

In [285]:
# Test Masking
torch.manual_seed(100)
for l in range(10):
  token="abuelo"
  token_,label=Masking(token)
  if token==token_ and label=="[PAD]":
    print(token_,label,f"\t Actual token *{token}* is left unchanged")
  elif token_=="[MASK]" and label==token:
    print(token_,label,f"\t Actual token *{token}* is masked with '{token_}'")
  else:
    print(token_,label,f"\t Actual token *{token}* is replaced with random token #{label}#")

[MASK] abuelo 	 Actual token *abuelo* is masked with '[MASK]'
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
[MASK] difícil 	 Actual token *abuelo* is replaced with random token #difícil#
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged


In [286]:
def prepare_for_mlm(tokens, include_raw_tokens=False):
    """
    Prepares tokenized text for BERT's Masked Language Model (MLM) training.

    """
    bert_input = []  # List to store sentences processed for BERT's MLM
    bert_label = []  # List to store labels for each token (mask, random, or unchanged)
    raw_tokens = []  # List to store raw tokens if needed

    for token in tokens:
        # Apply BERT's MLM masking strategy to the token
        masked_token, mask_label = Masking(token)

        # Append the processed token and its label to the current sentence and label list
        bert_input.append(masked_token)
        bert_label.append(mask_label)

        # If raw tokens are to be included, append the original token to the current raw tokens list
        if include_raw_tokens:
            raw_tokens.append(token)

    # Return the prepared lists for BERT's MLM training
    return (bert_input, bert_label, raw_tokens) if include_raw_tokens else (bert_input, bert_label)

In [287]:
torch.manual_seed(100)
# original_input="The sun sets behind the distant mountains."
original_input = "algún abuelo alegre no ama"
tokens=tokenizer.encode(original_input).tokens
bert_input, bert_label= prepare_for_mlm(tokens, include_raw_tokens=False)
print("Without raw tokens: \t ","\n \t original_input is: \t ", original_input,"\n \t bert_input is: \t ", bert_input,"\n \t bert_label is: \t ", bert_label)
print("-"*200)
torch.manual_seed(200)
bert_input, bert_label, raw_tokens_list = prepare_for_mlm(tokens, include_raw_tokens=True)
print("With raw tokens: \t ","\n \t original_input is: \t ", original_input,"\n \t bert_input is: \t ", bert_input,"\n \t bert_label is: \t ", bert_label,"\n \t raw_tokens_list is: \t ", raw_tokens_list)

Without raw tokens: 	  
 	 original_input is: 	  algún abuelo alegre no ama 
 	 bert_input is: 	  ['[MASK]', 'abuelo', 'alegre', 'no', 'ama'] 
 	 bert_label is: 	  ['algún', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
With raw tokens: 	  
 	 original_input is: 	  algún abuelo alegre no ama 
 	 bert_input is: 	  ['algún', 'abuelo', 'alegre', 'no', 'ama'] 
 	 bert_label is: 	  ['[PAD]', '[PAD]', '[PAD]', 'no', '[PAD]'] 
 	 raw_tokens_list is: 	  ['algún', 'abuelo', 'alegre', 'no', 'ama']


In [288]:
sentences_df['bert_input_label 1'] = sentences_df['tokens sentence 1'].apply(lambda x: prepare_for_mlm(x))
sentences_df['bert_input 1'] = sentences_df['bert_input_label 1'].apply(lambda x: x[0])
sentences_df['bert_label 1'] = sentences_df['bert_input_label 1'].apply(lambda x: x[1])
sentences_df['bert_input_label 2'] = sentences_df['tokens sentence 2'].apply(lambda x: prepare_for_mlm(x))
sentences_df['bert_input 2'] = sentences_df['bert_input_label 2'].apply(lambda x: x[0])
sentences_df['bert_label 2'] = sentences_df['bert_input_label 2'].apply(lambda x: x[1])
del sentences_df['bert_input_label 1']
del sentences_df['bert_input_label 2']
sentences_df.head()

Unnamed: 0,sentence 1,sentence 2,relation,consistencia,tokens sentence 1,tokens sentence 2,bert_input 1,bert_label 1,bert_input 2,bert_label 2
0,todo abuelo acuerda,algún abuelo acuerda,1,0,"[todo, abuelo, acuerda]","[algún, abuelo, acuerda]","[todo, abuelo, acuerda]","[[PAD], [PAD], [PAD]]","[algún, abuelo, [MASK]]","[[PAD], [PAD], acuerda]"
1,todo abuelo aguanta,algún abuelo aguanta,1,0,"[todo, abuelo, aguanta]","[algún, abuelo, aguanta]","[todo, abuelo, aguanta]","[[PAD], [PAD], [PAD]]","[algún, abuelo, aguanta]","[[PAD], [PAD], [PAD]]"
2,todo abuelo ama,algún abuelo ama,1,0,"[todo, abuelo, ama]","[algún, abuelo, ama]","[todo, abuelo, [MASK]]","[[PAD], [PAD], ama]","[algún, [MASK], ama]","[[PAD], abuelo, [PAD]]"
3,todo abuelo amanece,algún abuelo amanece,1,0,"[todo, abuelo, amanece]","[algún, abuelo, amanece]","[todo, abuelo, [MASK]]","[[PAD], [PAD], amanece]","[algún, abuelo, [MASK]]","[[PAD], [PAD], amanece]"
4,todo abuelo anochece,algún abuelo anochece,1,0,"[todo, abuelo, anochece]","[algún, abuelo, anochece]","[todo, abuelo, anochece]","[[PAD], [PAD], [PAD]]","[algún, abuelo, anochece]","[[PAD], [PAD], [PAD]]"


In [289]:
input_sentences_pair = sentences_df[['bert_input 1', 'bert_input 2']].values.tolist()
input_masked_labels_pair = sentences_df[['bert_label 1', 'bert_label 2']].values.tolist()
relations = sentences_df['relation'].values.tolist()

In [290]:
def process_for_nsp(input_sentences_pair, input_masked_labels_pair, relations):
    """
    Prepares data for understanding logical relationship.

    Args:
    input_sentences (list): List of tokenized sentences.
    input_masked_labels (list): Corresponding list of masked labels for the sentences.

    Returns:
    bert_input (list): List of sentence pairs for BERT input.
    bert_label (list): List of masked labels for the sentence pairs.
    is_next (list): Binary label list where 1 indicates 'logical relationship' and 0 indicates 'not logical relationship'.
    """

    # Verify that both input lists are of the same length and have a sufficient number of sentences
    if len(input_sentences_pair) != len(input_masked_labels_pair):
        raise ValueError("Both lists, input_sentences_pair and input_masked_labels_pair, must have the same number of items.")
    if len(input_sentences_pair) != len(relations):
        raise ValueError("Both lists, input_sentences_pair and relations, must have the same number of items.")

    bert_input = []
    bert_label = []
    is_next = []

    for sentence_pair, masked_pair, relation in zip(input_sentences_pair, input_masked_labels_pair, relations):
        # append list and add  '[CLS]' and  '[SEP]' tokens
        bert_input.append([['[CLS]'] + sentence_pair[0] + ['[SEP]'], sentence_pair[1] + ['[SEP]']])
        bert_label.append([['[PAD]'] + masked_pair[0] + ['[PAD]'], masked_pair[1]+ ['[PAD]']])
        is_next.append(relation)  # Label 1 indicates these sentences have the required logical relationship

    return bert_input, bert_label, is_next

In [291]:
bert_inputs, bert_labels, is_nexts = process_for_nsp(input_sentences_pair, input_masked_labels_pair, relations)

In [292]:
bert_inputs[:10]

[[['[CLS]', 'todo', 'abuelo', 'acuerda', '[SEP]'],
  ['algún', 'abuelo', '[MASK]', '[SEP]']],
 [['[CLS]', 'todo', 'abuelo', 'aguanta', '[SEP]'],
  ['algún', 'abuelo', 'aguanta', '[SEP]']],
 [['[CLS]', 'todo', 'abuelo', '[MASK]', '[SEP]'],
  ['algún', '[MASK]', 'ama', '[SEP]']],
 [['[CLS]', 'todo', 'abuelo', '[MASK]', '[SEP]'],
  ['algún', 'abuelo', '[MASK]', '[SEP]']],
 [['[CLS]', 'todo', 'abuelo', 'anochece', '[SEP]'],
  ['algún', 'abuelo', 'anochece', '[SEP]']],
 [['[CLS]', 'todo', 'abuelo', 'aparece', '[SEP]'],
  ['algún', 'abuelo', 'aparece', '[SEP]']],
 [['[CLS]', 'todo', '[MASK]', 'aplaude', '[SEP]'],
  ['algún', 'abuelo', 'aplaude', '[SEP]']],
 [['[CLS]', '[MASK]', 'abuelo', 'avanza', '[SEP]'],
  ['algún', 'abuelo', 'avanza', '[SEP]']],
 [['[CLS]', 'todo', 'abuelo', 'baja', '[SEP]'],
  ['[MASK]', 'abuelo', 'baja', '[SEP]']],
 [['[CLS]', 'todo', '[MASK]', '[MASK]', '[SEP]'],
  ['algún', 'abuelo', 'bosteza', '[SEP]']]]

In [293]:
def prepare_bert_final_inputs(bert_inputs, bert_labels, is_nexts, to_tensor=True):
    """
    Prepare the final input lists for BERT training.
    """
    def zero_pad_list_pair(pair_, pad='[PAD]'):
        pair = deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0], pair[1]

    #flatten the tensor
    flatten = lambda l: [item for sublist in l for item in sublist]
    #transform tokens to vocab indices
    tokens_to_index=lambda tokens: [tokenizer.stoi[token] for token in tokens]

    bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final = [], [], [], []

    for bert_input, bert_label,is_next in zip(bert_inputs, bert_labels,is_nexts):
        # Create segment labels for each pair of sentences
        segment_label = [[1] * len(bert_input[0]), [2] * len(bert_input[1])]

        # Zero-pad the bert_input and bert_label and segment_label
        bert_input_padded = zero_pad_list_pair(bert_input)
        bert_label_padded = zero_pad_list_pair(bert_label)
        segment_label_padded = zero_pad_list_pair(segment_label,pad=0)

        #convert to tensors
        if to_tensor:

            # Flatten the padded inputs and labels, transform tokens to their corresponding vocab indices, and convert them to tensors
            # bert_inputs_final.append(torch.tensor(tokens_to_index(flatten(bert_input_padded)),dtype=torch.int64))
            # bert_labels_final.append(torch.tensor(tokens_to_index(flatten(bert_label_padded)),dtype=torch.int64))
            # segment_labels_final.append(torch.tensor(flatten(segment_label_padded),dtype=torch.int64))
            bert_inputs_final.append(tokens_to_index(flatten(bert_input_padded)))
            bert_labels_final.append(tokens_to_index(flatten(bert_label_padded)))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

        else:
          # Flatten the padded inputs and labels
            bert_inputs_final.append(flatten(bert_input_padded))
            bert_labels_final.append(flatten(bert_label_padded))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

    return bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final

In [294]:
bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final = prepare_bert_final_inputs(bert_inputs, bert_labels, is_nexts)

In [295]:
bert_inputs_final[:10]

[[2, 5, 6, 7, 3, 49, 6, 4, 3, 1],
 [2, 5, 6, 8, 3, 49, 6, 8, 3, 1],
 [2, 5, 6, 4, 3, 49, 4, 9, 3, 1],
 [2, 5, 6, 4, 3, 49, 6, 4, 3, 1],
 [2, 5, 6, 11, 3, 49, 6, 11, 3, 1],
 [2, 5, 6, 12, 3, 49, 6, 12, 3, 1],
 [2, 5, 4, 13, 3, 49, 6, 13, 3, 1],
 [2, 4, 6, 14, 3, 49, 6, 14, 3, 1],
 [2, 5, 6, 15, 3, 4, 6, 15, 3, 1],
 [2, 5, 4, 4, 3, 49, 6, 16, 3, 1]]

In [296]:
bert_labels_final[:10]

[[1, 1, 1, 1, 1, 1, 1, 7, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 9, 1, 1, 6, 1, 1, 1],
 [1, 1, 1, 10, 1, 1, 1, 10, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 6, 1, 1, 1, 1, 1, 1, 1],
 [1, 5, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 49, 1, 1, 1, 1],
 [1, 1, 6, 36, 1, 1, 1, 1, 1, 1]]

In [297]:
segment_labels_final[:10]

[[1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 2, 2, 2, 2, 0]]

In [298]:
df_final = pd.DataFrame({
    'bert_input': bert_inputs_final,
    'bert_label': bert_labels_final,
    'segment_label': segment_labels_final,
    'relation': is_nexts_final
})

df_final.to_csv(csv_file_path, index=False)

---