In [22]:
# Data Preprocessing Notebook for Generative QA Chatbot
#Preprocessing pipeline for the Q&A dataset.
#Tasks: cleaning, tokenization, vocabulary building, padding, and splitting.

In [None]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install torch torchvision torchaudio

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import re
import random

# Data handling
import pandas as pd
import numpy as np

# NLP preprocessing
import nltk
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()

# nltk tokenizer path
#nltk.download('punkt', download_dir='/Users/guirz/nltk_data')
#nltk.data.path.append('/Users/guirz/nltk_data')

# train/val/test splitting
from sklearn.model_selection import train_test_split

# Torch will be used for model prep
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
### Merge 3 datasets (S08, S09, S10)
# Load `question_answer_pairs` files
# Add a `source` column to track their origin year
# Combine them into a single DataFrame

In [13]:
base_path = '../Data processing/Datasets'
folders = ['S08', 'S09', 'S10']
file_name = 'question_answer_pairs.txt'

df_list = []

for folder in folders:
    file_path = os.path.join(base_path, folder, file_name)
    
    # Load the file using encoding
    df = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')
    
    # Add column indicating the source year
    df['source'] = folder
    
    df_list.append(df)

# Combine into one DataFrame
qa_df = pd.concat(df_list, ignore_index=True)

# Drop rows with missing Question or Answer
qa_df = qa_df[['Question', 'Answer', 'source']].dropna()

# Preview
print(f"Total merged Q&A pairs: {len(qa_df)}")
qa_df.head()


Total merged Q&A pairs: 3420


Unnamed: 0,Question,Answer,source
0,Was Abraham Lincoln the sixteenth President of...,yes,S08
1,Was Abraham Lincoln the sixteenth President of...,Yes.,S08
2,Did Lincoln sign the National Banking Act of 1...,yes,S08
3,Did Lincoln sign the National Banking Act of 1...,Yes.,S08
4,Did his mother die of pneumonia?,no,S08


In [14]:
# Remove rows with missing Question/Answer
qa_df = qa_df.dropna(subset=['Question', 'Answer'])

# Remove rows where question or answer is empty
qa_df = qa_df[qa_df['Question'].str.strip() != '']
qa_df = qa_df[qa_df['Answer'].str.strip() != '']

# Reset index
qa_df.reset_index(drop=True, inplace=True)

# Preview cleaned count
print(f"After removing blanks/nulls: {len(qa_df)} Q&A pairs")
qa_df.head()

After removing blanks/nulls: 3420 Q&A pairs


Unnamed: 0,Question,Answer,source
0,Was Abraham Lincoln the sixteenth President of...,yes,S08
1,Was Abraham Lincoln the sixteenth President of...,Yes.,S08
2,Did Lincoln sign the National Banking Act of 1...,yes,S08
3,Did Lincoln sign the National Banking Act of 1...,Yes.,S08
4,Did his mother die of pneumonia?,no,S08


In [None]:
### Clean and tokenize question and answer text
# Lowercase
# Remove punctuation
# Tokenize using TreebankWordTokenizer
# Add `<sos>` and `<eos>` to answers

In [15]:
def preprocess_text(text, is_answer=False):
    text = str(text).lower().strip()
    text = re.sub(r"[^\w\s]", "", text)
    tokens = tokenizer.tokenize(text)
    if is_answer:
        tokens = ['<sos>'] + tokens + ['<eos>']
    return tokens

qa_df['question_tokens'] = qa_df['Question'].apply(lambda x: preprocess_text(x, is_answer=False))
qa_df['answer_tokens']   = qa_df['Answer'].apply(lambda x: preprocess_text(x, is_answer=True))

qa_df[['Question', 'Answer', 'question_tokens', 'answer_tokens']].head(10)


Unnamed: 0,Question,Answer,question_tokens,answer_tokens
0,Was Abraham Lincoln the sixteenth President of...,yes,"[was, abraham, lincoln, the, sixteenth, presid...","[<sos>, yes, <eos>]"
1,Was Abraham Lincoln the sixteenth President of...,Yes.,"[was, abraham, lincoln, the, sixteenth, presid...","[<sos>, yes, <eos>]"
2,Did Lincoln sign the National Banking Act of 1...,yes,"[did, lincoln, sign, the, national, banking, a...","[<sos>, yes, <eos>]"
3,Did Lincoln sign the National Banking Act of 1...,Yes.,"[did, lincoln, sign, the, national, banking, a...","[<sos>, yes, <eos>]"
4,Did his mother die of pneumonia?,no,"[did, his, mother, die, of, pneumonia]","[<sos>, no, <eos>]"
5,Did his mother die of pneumonia?,No.,"[did, his, mother, die, of, pneumonia]","[<sos>, no, <eos>]"
6,How many long was Lincoln's formal education?,18 months,"[how, many, long, was, lincolns, formal, educa...","[<sos>, 18, months, <eos>]"
7,How many long was Lincoln's formal education?,18 months.,"[how, many, long, was, lincolns, formal, educa...","[<sos>, 18, months, <eos>]"
8,When did Lincoln begin his political career?,1832,"[when, did, lincoln, begin, his, political, ca...","[<sos>, 1832, <eos>]"
9,When did Lincoln begin his political career?,1832.,"[when, did, lincoln, begin, his, political, ca...","[<sos>, 1832, <eos>]"


In [None]:
### Build vocabulary from tokenized data
# Count word frequency across questions + answers
# Reserve special tokens: `<pad>`, `<unk>`, `<sos>`, `<eos>`
# Limit vocab to most frequent 10,000 tokens

In [16]:
from collections import Counter

# Combine all tokens
all_tokens = qa_df['question_tokens'].explode().tolist() + qa_df['answer_tokens'].explode().tolist()

# Frequencies
word_freq = Counter(all_tokens)

# Vcab size
vocab_size = 10000

# Reserve special tokens
special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

# Get most common tokens
most_common = [word for word, _ in word_freq.most_common(vocab_size - len(special_tokens)) if word not in special_tokens]

# Finallist
vocab = special_tokens + most_common

# Create dictionaries
word2index = {word: idx for idx, word in enumerate(vocab)}
index2word = {idx: word for word, idx in word2index.items()}

# Confirm
print(f"Vocab size (with special tokens): {len(word2index)}")
print("Sample vocabulary items:", list(word2index.items())[:10])


Vocab size (with special tokens): 5748
Sample vocabulary items: [('<pad>', 0), ('<unk>', 1), ('<sos>', 2), ('<eos>', 3), ('the', 4), ('of', 5), ('yes', 6), ('is', 7), ('in', 8), ('a', 9)]


In [None]:
### Convert token sequences to integer index sequences using word2index

In [17]:
# Convert tokens to indices
def tokens_to_indices(tokens, word2index):
    return [word2index.get(token, word2index['<unk>']) for token in tokens]

# Apply to columns
qa_df['question_indices'] = qa_df['question_tokens'].apply(lambda x: tokens_to_indices(x, word2index))
qa_df['answer_indices']   = qa_df['answer_tokens'].apply(lambda x: tokens_to_indices(x, word2index))

# Preview
qa_df[['question_tokens', 'question_indices', 'answer_tokens', 'answer_indices']].head(5)


Unnamed: 0,question_tokens,question_indices,answer_tokens,answer_indices
0,"[was, abraham, lincoln, the, sixteenth, presid...","[11, 1051, 136, 4, 2293, 69, 5, 4, 87, 105]","[<sos>, yes, <eos>]","[2, 6, 3]"
1,"[was, abraham, lincoln, the, sixteenth, presid...","[11, 1051, 136, 4, 2293, 69, 5, 4, 87, 105]","[<sos>, yes, <eos>]","[2, 6, 3]"
2,"[did, lincoln, sign, the, national, banking, a...","[14, 136, 882, 4, 137, 2294, 413, 5, 2295]","[<sos>, yes, <eos>]","[2, 6, 3]"
3,"[did, lincoln, sign, the, national, banking, a...","[14, 136, 882, 4, 137, 2294, 413, 5, 2295]","[<sos>, yes, <eos>]","[2, 6, 3]"
4,"[did, his, mother, die, of, pneumonia]","[14, 33, 265, 150, 5, 2296]","[<sos>, no, <eos>]","[2, 16, 3]"


In [None]:
### Pad all sequences to fixed length

In [18]:
from torch.nn.utils.rnn import pad_sequence
import torch

# Define lengths
MAX_QUESTION_LEN = 20
MAX_ANSWER_LEN = 20

# Padding
def pad_sequence_to_length(seq, max_len):
    if len(seq) > max_len:
        return seq[:max_len]
    return seq + [word2index['<pad>']] * (max_len - len(seq))

# Apply padding
qa_df['question_padded'] = qa_df['question_indices'].apply(lambda x: pad_sequence_to_length(x, MAX_QUESTION_LEN))
qa_df['answer_padded']   = qa_df['answer_indices'].apply(lambda x: pad_sequence_to_length(x, MAX_ANSWER_LEN))

# Preview
qa_df[['question_indices', 'question_padded', 'answer_indices', 'answer_padded']].head(5)


Unnamed: 0,question_indices,question_padded,answer_indices,answer_padded
0,"[11, 1051, 136, 4, 2293, 69, 5, 4, 87, 105]","[11, 1051, 136, 4, 2293, 69, 5, 4, 87, 105, 0,...","[2, 6, 3]","[2, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[11, 1051, 136, 4, 2293, 69, 5, 4, 87, 105]","[11, 1051, 136, 4, 2293, 69, 5, 4, 87, 105, 0,...","[2, 6, 3]","[2, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[14, 136, 882, 4, 137, 2294, 413, 5, 2295]","[14, 136, 882, 4, 137, 2294, 413, 5, 2295, 0, ...","[2, 6, 3]","[2, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[14, 136, 882, 4, 137, 2294, 413, 5, 2295]","[14, 136, 882, 4, 137, 2294, 413, 5, 2295, 0, ...","[2, 6, 3]","[2, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[14, 33, 265, 150, 5, 2296]","[14, 33, 265, 150, 5, 2296, 0, 0, 0, 0, 0, 0, ...","[2, 16, 3]","[2, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
### Split into Train / Validation / Test sets
# 80% train
# 10% validation
# 10% test

In [19]:
from sklearn.model_selection import train_test_split

# Extract padded sequences as numpy arrays
X = np.array(qa_df['question_padded'].tolist())
y = np.array(qa_df['answer_padded'].tolist())

# First split: Train 80% vs (Val+Test) 20%
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: Val 10% vs Test 10%
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Confirm sizes
print("Train set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Test set size:", X_test.shape)

Train set size: (2736, 20)
Validation set size: (342, 20)
Test set size: (342, 20)
