# Data Prep for Model Input

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import RobertaTokenizer
import torch

In [2]:
train_data_path = '../data/SQuAD_train_augmented_DF.h5'
val_data_path = '../data/SQuAD_val_DF.h5'
test_data_path = '../data/SQuAD_test_DF.h5'

In [3]:
df_train = pd.read_hdf(train_data_path, 'df_train')

In [4]:
df_val = pd.read_hdf(val_data_path, 'df_val')

In [5]:
df_test = pd.read_hdf(test_data_path, 'df_test')

In [6]:
# Create a Tokenizer object
tokenizer = Tokenizer(num_words=10000)  # Limit vocabulary size (optional)

# Tokenize the data
tokenizer.fit_on_texts(df_train['context'])

# Convert context data to sequences of integer IDs (assuming space-separated tokens)
train_context_tokens = tokenizer.texts_to_sequences(df_train['context'])
val_context_tokens = tokenizer.texts_to_sequences(df_val['context'])
test_context_tokens = tokenizer.texts_to_sequences(df_test['context'])

# Pad context sequences to ensure consistent length
train_context_sequences = pad_sequences(train_context_tokens, maxlen=653, padding='post')
val_context_sequences = pad_sequences(val_context_tokens, maxlen=653, padding='post')
test_context_sequences = pad_sequences(test_context_tokens, maxlen=653, padding='post')

# Convert question data to sequences of integer IDs (assuming space-separated tokens)
train_question_tokens = tokenizer.texts_to_sequences(df_train['question'])
val_question_tokens = tokenizer.texts_to_sequences(df_val['question'])
test_question_tokens = tokenizer.texts_to_sequences(df_test['question'])

# Pad question sequences to ensure consistent length
train_question_sequences = pad_sequences(train_question_tokens, maxlen=653, padding='post')
val_question_sequences = pad_sequences(val_question_tokens, maxlen=653, padding='post')
test_question_sequences = pad_sequences(test_question_tokens, maxlen=653, padding='post')

# Convert answer data to sequences of integer IDs (assuming space-separated tokens)
train_answer_tokens = tokenizer.texts_to_sequences(df_train['text'])
val_answer_tokens = tokenizer.texts_to_sequences(df_val['text'])
test_answer_token = tokenizer.texts_to_sequences(df_test['text'])

# Pad answer sequences to ensure consistent length
train_answer_sequences = pad_sequences(train_answer_tokens, maxlen=43, padding='post')
val_answer_sequences = pad_sequences(val_answer_tokens, maxlen=43, padding='post')
test_answer_sequences = pad_sequences(test_answer_token, maxlen=43, padding='post')

In [7]:
train_context_sequences

array([[   1,  857,  141, ...,    0,    0,    0],
       [3270,  109,    2, ...,    0,    0,    0],
       [  22, 2996,  382, ...,    0,    0,    0],
       ...,
       [   1,   55,  545, ...,    0,    0,    0],
       [ 175,    5,    1, ...,    0,    0,    0],
       [  46,  110, 4911, ...,    0,    0,    0]])

In [8]:
train_question_sequences

array([[ 604,   52,  397, ...,    0,    0,    0],
       [ 249,    9,    1, ...,    0,    0,    0],
       [ 249,   69,   22, ...,    0,    0,    0],
       ...,
       [  16,  372, 2346, ...,    0,    0,    0],
       [ 487, 8664,    2, ...,    0,    0,    0],
       [   9,    1,  195, ...,    0,    0,    0]])

In [9]:
train_answer_sequences

array([[3667,   74,    0, ...,    0,    0,    0],
       [2623,    0,    0, ...,    0,    0,    0],
       [8568,    0,    0, ...,    0,    0,    0],
       ...,
       [ 650,    0,    0, ...,    0,    0,    0],
       [   8,    0,    0, ...,    0,    0,    0],
       [1359,    0,    0, ...,    0,    0,    0]])

In [10]:
np.savez_compressed('../data/custom_context_sequences.npz', train=train_context_sequences, val=val_context_sequences, test=test_context_sequences)

In [11]:
np.savez_compressed('../data/custom_question_sequences.npz', train=train_question_sequences, val=val_question_sequences, test=test_question_sequences)

In [12]:
np.savez_compressed('../data/custom_answer_sequences.npz', train=train_answer_sequences, val=val_answer_sequences, test=test_answer_sequences)

## RoBERTa Data Preprocessing

In [13]:
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [14]:
def preprocess_function(df,tokenizer):
    # Tokenize context, question and answers
    tokenized_examples = tokenizer(list(df["context"]), 
                                   list(df["question"]), 
                                   return_tensors="pt",
                                   padding=True,
                                   truncation=True)
    # Get offsets of each answer text in context
    answer_texts = df['text']
    start_positions = []
    end_positions = []
    answer_mask = []  # Add a list to track missing answers
    for example_index, (context, answer_text) in enumerate(zip(df["context"], answer_texts)):
        answer_start = context.find(answer_text)
        if answer_start != -1:
            # Convert answer position from character to token indices
            answer_end = answer_start + len(answer_text)
            answer_tokens = tokenizer.convert_ids_to_tokens(tokenized_examples["input_ids"][example_index][answer_start:answer_end])
            # Check if entire answer span is included in tokenization
            if all(x not in ("[UNK]", "[PAD]") for x in answer_tokens):
                start_positions.append(answer_start)
                end_positions.append(answer_end - 1)  # Adjust for endIndex being exclusive
                answer_mask.append(1)  # Answer found in tokenization
            else:
                # Answer span not found in tokenization due to tokenizer limitations (e.g., BPE)
                start_positions.append(-1)
                end_positions.append(-1)
                answer_mask.append(0)  # Answer not found in tokenization
        else:
            # Answer not found in context
            start_positions.append(-1)
            end_positions.append(-1)
            answer_mask.append(0)  # Answer not found in tokenization
            
    # Convert lists to tensors
    start_positions = torch.tensor(start_positions)
    end_positions = torch.tensor(end_positions)
    answer_mask = torch.tensor(answer_mask)
            
    return {'input_ids': tokenized_examples['input_ids'],
            'attention_mask': tokenized_examples['attention_mask'],
            'start_positions': start_positions,
            'end_positions': end_positions,
            'answer_mask': answer_mask}

In [15]:
roberta_train = preprocess_function(df_train, roberta_tokenizer)
roberta_val = preprocess_function(df_val, roberta_tokenizer)
roberta_test = preprocess_function(df_test, roberta_tokenizer)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [16]:
torch.save(roberta_train, '../data/roberta_train.pt')
torch.save(roberta_val, '../data/roberta_val.pt')
torch.save(roberta_test, '../data/roberta_test.pt')