# Creación del dataset para oraciones de relaciones lógicas

In [86]:
import sys
sys.path.append("../..")

import torch
import pandas as pd

from copy import deepcopy

from src.config import PATHS
from src.utils.utils_vocab import BasicTokenizer

def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

In [88]:
dataset_folder = PATHS["training_data_folder"]
tokenizer_folder = PATHS["tokenizer_folder"]

raw_dataset =  dataset_folder / 'equivalencia_5.csv'
# raw_dataset = 'equivalencia_10.csv'
# raw_dataset = 'equivalencia_15.csv'
# raw_dataset = 'equivalencia_20.csv'

tokenizer_file = tokenizer_folder / 'tokenizer_5.pkl'
# tokenizer_file = 'tokenizer_10.pkl'
# tokenizer_file = 'tokenizer_15.pkl'
# tokenizer_file = 'tokenizer_20.pkl'

csv_file_path = dataset_folder / 'bert_data_equivalencia_5.csv'
# csv_file_path ='bert_data_equivalencia_10.csv'
# csv_file_path ='bert_data_equivalencia_15.csv'
# csv_file_path ='bert_data_equivalencia_20.csv'

jabberwockie_dataset = dataset_folder / 'equivalencia_jabberwockie_5.csv'
# csv_file_path ='bert_data_equivalencia_jabberwockie_5.csv'


## Creamos el tokenizer

In [91]:
words_df1 = pd.read_csv(raw_dataset, names=['Sentence 1', 'Sentence 2', 'Relation'])
words_jabberwockie = pd.read_csv(jabberwockie_dataset, names=['Sentence 1', 'Sentence 2', 'Relation'])
words_df = pd.concat([words_df1, words_jabberwockie], ignore_index=True)
print(f'Cantidad de oraciones dataset normal y jabberwockie:', words_df.shape[0])
words_df.tail(3)

Cantidad de oraciones dataset normal y jabberwockie: 21300


Unnamed: 0,Sentence 1,Sentence 2,Relation
21297,algún jufzyl drifla o brilca,ningún jufzyl drifla o brilca,0
21298,algún jufzyl drifla o brunza,ningún jufzyl drifla o brunza,0
21299,algún jufzyl drifla o dernea,ningún jufzyl drifla o dernea,0


In [92]:
words = list(words_df.iloc[:,0].values)
words += list(words_df.iloc[:,1].values)
print(words[:10])
print(len(words))

['no todo abuelo alegre acuerda ', 'no todo abuelo alto acuerda ', 'no todo abuelo amargo acuerda ', 'no todo abuelo amplio acuerda ', 'no todo abuelo amarillo acuerda ', 'no todo abuelo alegre aguanta ', 'no todo abuelo alto aguanta ', 'no todo abuelo amargo aguanta ', 'no todo abuelo amplio aguanta ', 'no todo abuelo amarillo aguanta ']
42600


In [93]:
y = [w.strip().split(' ')  for w in words]
y = [x for w in y for x in w]
y = [w for w in y if w != '']
y = list(set(y))
print(y)
print(len(y))

['alce', 'drifla', 'alegre', 'albañil', 'y', 'agujero', 'amargo', 'todo', 'amarillo', 'ama', 'flexivo', 'claribundo', 'alto', 'no', 'dernea', 'jufzyl', 'aguanta', 'flakle', 'amplio', 'anochece', 'algún', 'actor', 'brilca', 'brispado', 'florido', 'brunza', 'flajuf', 'amanece', 'dormínico', 'ningún', 'bliscea', 'jufmoq', 'abuelo', 'acuerda', 'blicket', 'o']
36


In [94]:
# Define special symbols for the tokenizer
special_symbols = ['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]']

# Create tokenizer from words

simple_tokenizer = lambda tokens_string: tokens_string.strip().split()
tokenizer = BasicTokenizer(simple_tokenizer, special_symbols)
tokenizer.initialize_from_iterable(words)

# Save to file

tokenizer.save(tokenizer_file)

In [96]:
VOCAB_SIZE = len(tokenizer.itos)
print(f'Tamaño del vocabulario: {VOCAB_SIZE}')

Tamaño del vocabulario: 41


In [8]:
print(tokenizer.itos)

['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]', 'no', 'todo', 'abuelo', 'alegre', 'acuerda', 'alto', 'amargo', 'amplio', 'amarillo', 'aguanta', 'ama', 'amanece', 'anochece', 'actor', 'agujero', 'albañil', 'alce', 'y', 'o', 'algún', 'blicket', 'brispado', 'bliscea', 'claribundo', 'dormínico', 'florido', 'flexivo', 'brilca', 'brunza', 'dernea', 'drifla', 'flajuf', 'flakle', 'jufmoq', 'jufzyl', 'ningún']


## Creamos el dataset

In [98]:
sentences_df_raw = pd.read_csv(raw_dataset, names=['sentence 1', 'sentence 2', 'relation'])
# sentences_df = pd.read_csv(jabberwockie_dataset, names=['sentence 1', 'sentence 2', 'relation'])
print('Cantidad de oraciones dataset normal:', sentences_df_raw.shape)
sentences_df_raw.tail()

Cantidad de oraciones dataset normal: (10650, 3)


Unnamed: 0,sentence 1,sentence 2,relation
10645,algún alce amanece o anochece,ningún alce amanece o anochece,0
10646,algún alce anochece o acuerda,ningún alce anochece o acuerda,0
10647,algún alce anochece o aguanta,ningún alce anochece o aguanta,0
10648,algún alce anochece o ama,ningún alce anochece o ama,0
10649,algún alce anochece o amanece,ningún alce anochece o amanece,0


In [99]:
sentences_df_raw['tokens sentence 1'] = sentences_df_raw['sentence 1'].apply(lambda x: tokenizer.encode(x).tokens)
sentences_df_raw['tokens sentence 2'] = sentences_df_raw['sentence 2'].apply(lambda x: tokenizer.encode(x).tokens)
sentences_df_raw.head()

Unnamed: 0,sentence 1,sentence 2,relation,tokens sentence 1,tokens sentence 2
0,no todo abuelo alegre acuerda,algún abuelo alegre no acuerda,1,"[no, todo, abuelo, alegre, acuerda]","[algún, abuelo, alegre, no, acuerda]"
1,no todo abuelo alto acuerda,algún abuelo alto no acuerda,1,"[no, todo, abuelo, alto, acuerda]","[algún, abuelo, alto, no, acuerda]"
2,no todo abuelo amargo acuerda,algún abuelo amargo no acuerda,1,"[no, todo, abuelo, amargo, acuerda]","[algún, abuelo, amargo, no, acuerda]"
3,no todo abuelo amplio acuerda,algún abuelo amplio no acuerda,1,"[no, todo, abuelo, amplio, acuerda]","[algún, abuelo, amplio, no, acuerda]"
4,no todo abuelo amarillo acuerda,algún abuelo amarillo no acuerda,1,"[no, todo, abuelo, amarillo, acuerda]","[algún, abuelo, amarillo, no, acuerda]"


In [100]:
def bernoulli_true_false(p):
    # Create a Bernoulli distribution with probability p
    bernoulli_dist = torch.distributions.Bernoulli(torch.tensor([p]))
    # Sample from this distribution and convert 1 to True and 0 to False
    return bernoulli_dist.sample().item() == 1

In [None]:
def Masking(token):

    # Don't mask [SEP] token
    if token == '[SEP]':
        return token, '[PAD]'

    # Decide whether to mask this token (50% chance)
    # mask = bernoulli_true_false(0.5)
    mask = bernoulli_true_false(0.25)

    # If mask is False, immediately return with '[PAD]' label
    if not mask:
        return token, '[PAD]'

    # If mask is True, proceed with further operations
    # Randomly decide on an operation (20% chance each)
    # random_opp = bernoulli_true_false(0.2)
    random_opp = bernoulli_true_false(0.1)

    random_swich = bernoulli_true_false(0.5)

    # Case 1: If mask, random_opp, and random_swich are True
    if random_opp and random_swich:
        # Replace the token with '[MASK]' and set label to a random token
        token_ = '[MASK]'
        mask_label = tokenizer.decode(torch.randint(0, VOCAB_SIZE, (1,)))[0]

    # Case 2: If mask and random_opp are True, but random_swich is False
    elif random_opp and not random_swich:
        # Leave the token unchanged and set label to the same token
        token_ = token
        mask_label = token

    # Case 3: If mask is True, but random_opp is False
    else:
        # Replace the token with '[MASK]' and set label to the original token
        token_ = '[MASK]'
        mask_label = token

    return token_, mask_label

In [102]:
# Test Masking
# torch.manual_seed(100)
for l in range(10):
  token="abuelo"
  token_, label=Masking(token)
  if token==token_ and label=="[PAD]":
    print(token_,label,f"\t Actual token *{token}* is left unchanged")
  elif token_=="[MASK]" and label==token:
    print(token_,label,f"\t Actual token *{token}* is masked with '{token_}'")
  else:
    print(token_,label,f"\t Actual token *{token}* is replaced with random token #{label}#")

abuelo abuelo 	 Actual token *abuelo* is replaced with random token #abuelo#
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo abuelo 	 Actual token *abuelo* is replaced with random token #abuelo#
[MASK] abuelo 	 Actual token *abuelo* is masked with '[MASK]'
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
[MASK] abuelo 	 Actual token *abuelo* is masked with '[MASK]'
abuelo [PAD] 	 Actual token *abuelo* is left unchanged
[MASK] abuelo 	 Actual token *abuelo* is masked with '[MASK]'
abuelo [PAD] 	 Actual token *abuelo* is left unchanged


In [103]:
def prepare_for_mlm(tokens):
    """
    Prepares tokenized text for BERT's Masked Language Model (MLM) training.

    """
    bert_input = []  # List to store sentences processed for BERT's MLM
    bert_label = []  # List to store labels for each token (mask, random, or unchanged)

    for token in tokens:
        # Apply BERT's MLM masking strategy to the token
        masked_token, mask_label = Masking(token)

        # Append the processed token and its label to the current sentence and label list
        bert_input.append(masked_token)
        bert_label.append(mask_label)

    # Return the prepared lists for BERT's MLM training
    return bert_input, bert_label

In [104]:
# torch.manual_seed(200)
# original_input="The sun sets behind the distant mountains."
original_input = "algún abuelo alegre no ama"
tokens=tokenizer.encode(original_input).tokens
bert_input, bert_label= prepare_for_mlm(tokens)
print("Without raw tokens: \t ","\n \t original_input is: \t ", original_input,"\n \t bert_input is: \t ", bert_input,"\n \t bert_label is: \t ", bert_label)

Without raw tokens: 	  
 	 original_input is: 	  algún abuelo alegre no ama 
 	 bert_input is: 	  ['[MASK]', 'abuelo', 'alegre', 'no', 'ama'] 
 	 bert_label is: 	  ['algún', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [107]:
sentences_df = pd.concat([sentences_df_raw]*5, ignore_index=True)
print('Cantidad aumentada de oraciones:', sentences_df.shape[0])

Cantidad aumentada de oraciones: 53250


In [109]:
print('Creando bert_input_label 1...')
sentences_df['bert_input_label 1'] = sentences_df['tokens sentence 1'].apply(lambda x: prepare_for_mlm(x))
print('Creando bert_label 1...')
sentences_df['bert_input 1'] = sentences_df['bert_input_label 1'].apply(lambda x: x[0])
sentences_df['bert_label 1'] = sentences_df['bert_input_label 1'].apply(lambda x: x[1])

print('Creando bert_input_label 2...')
sentences_df['bert_input_label 2'] = sentences_df['tokens sentence 2'].apply(lambda x: prepare_for_mlm(x))
print('Creando bert_label 2...')
sentences_df['bert_input 2'] = sentences_df['bert_input_label 2'].apply(lambda x: x[0])
sentences_df['bert_label 2'] = sentences_df['bert_input_label 2'].apply(lambda x: x[1])
del sentences_df['bert_input_label 1']
del sentences_df['bert_input_label 2']
print('¡Listo! Cantidad de oraciones final:', sentences_df.shape[0])
sentences_df.head()

Creando bert_input_label 1...
Creando bert_label 1...
Creando bert_input_label 2...
Creando bert_label 2...
¡Listo! Cantidad de oraciones final: 53250


Unnamed: 0,sentence 1,sentence 2,relation,tokens sentence 1,tokens sentence 2,bert_input 1,bert_label 1,bert_input 2,bert_label 2
0,no todo abuelo alegre acuerda,algún abuelo alegre no acuerda,1,"[no, todo, abuelo, alegre, acuerda]","[algún, abuelo, alegre, no, acuerda]","[no, [MASK], [MASK], alegre, acuerda]","[[PAD], todo, abuelo, [PAD], [PAD]]","[algún, abuelo, alegre, no, acuerda]","[[PAD], [PAD], [PAD], [PAD], [PAD]]"
1,no todo abuelo alto acuerda,algún abuelo alto no acuerda,1,"[no, todo, abuelo, alto, acuerda]","[algún, abuelo, alto, no, acuerda]","[no, todo, abuelo, alto, [MASK]]","[[PAD], [PAD], [PAD], [PAD], acuerda]","[algún, abuelo, [MASK], no, acuerda]","[algún, abuelo, alto, [PAD], [PAD]]"
2,no todo abuelo amargo acuerda,algún abuelo amargo no acuerda,1,"[no, todo, abuelo, amargo, acuerda]","[algún, abuelo, amargo, no, acuerda]","[no, todo, abuelo, amargo, acuerda]","[[PAD], [PAD], abuelo, [PAD], acuerda]","[algún, abuelo, amargo, no, acuerda]","[[PAD], [PAD], [PAD], [PAD], [PAD]]"
3,no todo abuelo amplio acuerda,algún abuelo amplio no acuerda,1,"[no, todo, abuelo, amplio, acuerda]","[algún, abuelo, amplio, no, acuerda]","[no, todo, abuelo, amplio, acuerda]","[[PAD], [PAD], [PAD], [PAD], [PAD]]","[algún, [MASK], [MASK], no, acuerda]","[[PAD], abuelo, amplio, [PAD], [PAD]]"
4,no todo abuelo amarillo acuerda,algún abuelo amarillo no acuerda,1,"[no, todo, abuelo, amarillo, acuerda]","[algún, abuelo, amarillo, no, acuerda]","[no, todo, abuelo, [MASK], acuerda]","[[PAD], [PAD], [PAD], amarillo, acuerda]","[algún, abuelo, amarillo, no, acuerda]","[[PAD], [PAD], [PAD], no, [PAD]]"


In [110]:
input_sentences_pair = sentences_df[['bert_input 1', 'bert_input 2']].values.tolist()
input_masked_labels_pair = sentences_df[['bert_label 1', 'bert_label 2']].values.tolist()
relations = sentences_df['relation'].values.tolist()

In [111]:
def process_for_nsp(input_sentences_pair, input_masked_labels_pair, relations):
    """
    Prepares data for understanding logical relationship.

    Args:
    input_sentences (list): List of tokenized sentences.
    input_masked_labels (list): Corresponding list of masked labels for the sentences.

    Returns:
    bert_input (list): List of sentence pairs for BERT input.
    bert_label (list): List of masked labels for the sentence pairs.
    is_next (list): Binary label list where 1 indicates 'logical relationship' and 0 indicates 'not logical relationship'.
    """

    # Verify that both input lists are of the same length and have a sufficient number of sentences
    if len(input_sentences_pair) != len(input_masked_labels_pair):
        raise ValueError("Both lists, input_sentences_pair and input_masked_labels_pair, must have the same number of items.")
    if len(input_sentences_pair) != len(relations):
        raise ValueError("Both lists, input_sentences_pair and relations, must have the same number of items.")

    bert_input = []
    bert_label = []
    is_next = []

    for sentence_pair, masked_pair, relation in zip(input_sentences_pair, input_masked_labels_pair, relations):
        # append list and add  '[CLS]' and  '[SEP]' tokens
        bert_input.append([['[CLS]'] + sentence_pair[0] + ['[SEP]'], sentence_pair[1] + ['[SEP]']])
        bert_label.append([['[PAD]'] + masked_pair[0] + ['[PAD]'], masked_pair[1]+ ['[PAD]']])
        is_next.append(relation)  # Label 1 indicates these sentences have the required logical relationship

    return bert_input, bert_label, is_next

In [112]:
bert_inputs, bert_labels, is_nexts = process_for_nsp(input_sentences_pair, input_masked_labels_pair, relations)

In [113]:
bert_inputs[:10]

[[['[CLS]', 'no', '[MASK]', '[MASK]', 'alegre', 'acuerda', '[SEP]'],
  ['algún', 'abuelo', 'alegre', 'no', 'acuerda', '[SEP]']],
 [['[CLS]', 'no', 'todo', 'abuelo', 'alto', '[MASK]', '[SEP]'],
  ['algún', 'abuelo', '[MASK]', 'no', 'acuerda', '[SEP]']],
 [['[CLS]', 'no', 'todo', 'abuelo', 'amargo', 'acuerda', '[SEP]'],
  ['algún', 'abuelo', 'amargo', 'no', 'acuerda', '[SEP]']],
 [['[CLS]', 'no', 'todo', 'abuelo', 'amplio', 'acuerda', '[SEP]'],
  ['algún', '[MASK]', '[MASK]', 'no', 'acuerda', '[SEP]']],
 [['[CLS]', 'no', 'todo', 'abuelo', '[MASK]', 'acuerda', '[SEP]'],
  ['algún', 'abuelo', 'amarillo', 'no', 'acuerda', '[SEP]']],
 [['[CLS]', 'no', 'todo', 'abuelo', 'alegre', 'aguanta', '[SEP]'],
  ['[MASK]', 'abuelo', 'alegre', 'no', 'aguanta', '[SEP]']],
 [['[CLS]', '[MASK]', 'todo', 'abuelo', 'alto', 'aguanta', '[SEP]'],
  ['algún', '[MASK]', '[MASK]', 'no', 'aguanta', '[SEP]']],
 [['[CLS]', 'no', 'todo', 'abuelo', 'amargo', 'aguanta', '[SEP]'],
  ['algún', 'abuelo', '[MASK]', '[MASK]'

In [114]:
def prepare_bert_final_inputs(bert_inputs, bert_labels, is_nexts, to_tensor=True):
    """
    Prepare the final input lists for BERT training.
    """
    def zero_pad_list_pair(pair_, pad='[PAD]'):
        pair = deepcopy(pair_)
        max_len = max(len(pair[0]), len(pair[1]))
        #append [PAD] to each sentence in the pair till the maximum length reaches
        pair[0].extend([pad] * (max_len - len(pair[0])))
        pair[1].extend([pad] * (max_len - len(pair[1])))
        return pair[0], pair[1]

    #flatten the tensor
    flatten = lambda l: [item for sublist in l for item in sublist]
    #transform tokens to vocab indices
    tokens_to_index=lambda tokens: [tokenizer.stoi[token] for token in tokens]

    bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final = [], [], [], []

    for bert_input, bert_label,is_next in zip(bert_inputs, bert_labels,is_nexts):
        # Create segment labels for each pair of sentences
        segment_label = [[1] * len(bert_input[0]), [2] * len(bert_input[1])]

        # Zero-pad the bert_input and bert_label and segment_label
        bert_input_padded = zero_pad_list_pair(bert_input)
        bert_label_padded = zero_pad_list_pair(bert_label)
        segment_label_padded = zero_pad_list_pair(segment_label,pad=0)

        #convert to tensors
        if to_tensor:

            # Flatten the padded inputs and labels, transform tokens to their corresponding vocab indices, and convert them to tensors
            # bert_inputs_final.append(torch.tensor(tokens_to_index(flatten(bert_input_padded)),dtype=torch.int64))
            # bert_labels_final.append(torch.tensor(tokens_to_index(flatten(bert_label_padded)),dtype=torch.int64))
            # segment_labels_final.append(torch.tensor(flatten(segment_label_padded),dtype=torch.int64))
            bert_inputs_final.append(tokens_to_index(flatten(bert_input_padded)))
            bert_labels_final.append(tokens_to_index(flatten(bert_label_padded)))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

        else:
          # Flatten the padded inputs and labels
            bert_inputs_final.append(flatten(bert_input_padded))
            bert_labels_final.append(flatten(bert_label_padded))
            segment_labels_final.append(flatten(segment_label_padded))
            is_nexts_final.append(is_next)

    return bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final

In [115]:
bert_inputs_final, bert_labels_final, segment_labels_final, is_nexts_final = prepare_bert_final_inputs(bert_inputs, bert_labels, is_nexts)

In [116]:
bert_inputs_final[:10]

[[2, 5, 4, 4, 8, 9, 3, 24, 7, 8, 5, 9, 3, 1],
 [2, 5, 6, 7, 10, 4, 3, 24, 7, 4, 5, 9, 3, 1],
 [2, 5, 6, 7, 11, 9, 3, 24, 7, 11, 5, 9, 3, 1],
 [2, 5, 6, 7, 12, 9, 3, 24, 4, 4, 5, 9, 3, 1],
 [2, 5, 6, 7, 4, 9, 3, 24, 7, 13, 5, 9, 3, 1],
 [2, 5, 6, 7, 8, 14, 3, 4, 7, 8, 5, 14, 3, 1],
 [2, 4, 6, 7, 10, 14, 3, 24, 4, 4, 5, 14, 3, 1],
 [2, 5, 6, 7, 11, 14, 3, 24, 7, 4, 4, 4, 3, 1],
 [2, 4, 6, 4, 12, 14, 3, 24, 7, 12, 5, 14, 3, 1],
 [2, 4, 4, 7, 13, 14, 3, 24, 7, 13, 5, 14, 3, 1]]

In [117]:
bert_labels_final[:10]

[[1, 1, 6, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 9, 1, 24, 7, 10, 1, 1, 1, 1],
 [1, 1, 1, 7, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 1, 1, 1, 7, 12, 1, 1, 1, 1],
 [1, 1, 1, 1, 13, 9, 1, 1, 1, 1, 5, 1, 1, 1],
 [1, 1, 6, 1, 8, 1, 1, 24, 1, 1, 1, 1, 1, 1],
 [1, 5, 1, 1, 1, 14, 1, 1, 7, 10, 1, 1, 1, 1],
 [1, 1, 1, 1, 1, 14, 1, 1, 1, 11, 5, 14, 1, 1],
 [1, 5, 1, 20, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1, 17, 6, 1, 1, 1, 1, 24, 1, 1, 1, 1, 1, 1]]

In [118]:
segment_labels_final[:10]

[[1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0],
 [1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 0]]

In [119]:
df_final = pd.DataFrame({
    'bert_input': bert_inputs_final,
    'bert_label': bert_labels_final,
    'segment_label': segment_labels_final,
    'relation': is_nexts_final
})
print('Cantidad de oraciones en el dataset final:', df_final.shape[0])

df_final.to_csv(csv_file_path, index=False)

Cantidad de oraciones en el dataset final: 53250


---