In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install datasets scikit-learn tqdm

Looking in indexes: https://download.pytorch.org/whl/cu118
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from datasets import load_dataset
import re
import string
from collections import Counter
import random
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm # For progress bars in Colab

# --- Configuration ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

# Hyperparameters (can be tuned)
MAX_LEN = 30  # Maximum sequence length for input and output
BATCH_SIZE = 64
EMBEDDING_DIM = 256
HIDDEN_DIM = 512 # For LSTM
NUM_LSTM_LAYERS = 1 # As per "simplest possible"
LEARNING_RATE = 0.001
NUM_EPOCHS = 10 # Adjust as needed, more epochs take longer
TEACHER_FORCING_RATIO = 0.5 # During training
MIN_WORD_FREQ = 3 # Minimum frequency for a word to be in vocab
DATA_SUBSET_SIZE = None # Use a number like 10000 for faster testing, None for full dataset (e.g., 10000)

# Special tokens
PAD_TOKEN = "<pad>"
SOS_TOKEN = "<sos>" # Start Of Sentence
EOS_TOKEN = "<eos>" # End Of Sentence
UNK_TOKEN = "<unk>" # Unknown word

Using device: cuda


In [3]:
print("1. Loading MultiWOZ v2.2 dataset from pfb30/multi_woz_v22...")
try:
    # Using the specified dataset from the prompt
    multi_woz_hub_dataset = load_dataset("pfb30/multi_woz_v22", trust_remote_code=True)

    if 'train' in multi_woz_hub_dataset:
        dataset_split = multi_woz_hub_dataset['train'] # pfb30/multi_woz_v22 has a 'train' split
        print(f"Dataset loaded successfully. Number of dialogues in 'train' split: {len(dataset_split)}")
        #print(f"Example of first item: {dataset_split[0]}") # This will print a large dialogue object
    else:
        raise ValueError("Could not find 'train' split in the loaded dataset.")

except Exception as e:
    print(f"Error loading dataset 'pfb30/multi_woz_v22': {e}")
    print("Please ensure the dataset 'pfb30/multi_woz_v22' is available and you have internet access.")
    # You might want to add alternative dataset loading here if needed,
    # but the prompt specifically requested pfb30/multi_woz_v22.
    exit()


# --- Extracting Dialogues into Input-Output Pairs ---
dialogue_pairs = []
print("Extracting user-system utterance pairs...")

# The structure of pfb30/multi_woz_v22 items (dialogues in the 'train' split):
# Each item is a dict: {'dialogue_id': ..., 'services': ..., 'turns': {'speaker': [...], 'utterance': [...], ...}}
for dialogue_obj in tqdm(dataset_split, desc="Processing dialogues"):
    turns_data = dialogue_obj.get('turns')
    if turns_data and 'utterance' in turns_data and 'speaker' in turns_data:
        utterances = turns_data['utterance']
        speakers = turns_data['speaker'] # 0 for USER, 1 for SYSTEM in MultiWOZ

        for i in range(len(utterances) - 1):
            # Assuming speaker 0 is USER and speaker 1 is SYSTEM
            if speakers[i] == 0 and speakers[i+1] == 1:
                user_utt = utterances[i]
                system_utt = utterances[i+1]
                if user_utt and system_utt: # Ensure non-empty
                    dialogue_pairs.append((user_utt, system_utt))
    else:
        print(f"Warning: Dialogue object missing 'turns' or sub-keys: {dialogue_obj.get('dialogue_id', 'Unknown ID')}")


if not dialogue_pairs:
    print("No dialogue pairs extracted. Exiting. Please check dataset structure and parsing logic.")
    exit()

print(f"Extracted {len(dialogue_pairs)} user-system utterance pairs.")
if DATA_SUBSET_SIZE and DATA_SUBSET_SIZE < len(dialogue_pairs):
    dialogue_pairs = random.sample(dialogue_pairs, DATA_SUBSET_SIZE)
    print(f"Using a subset of {len(dialogue_pairs)} pairs for faster processing.")


# --- 2. Preprocess Data (Tokenize, Lowercase, Remove Punctuation) ---
print("\n2. Preprocessing data...")
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[" + string.punctuation + "]", " ", text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    return tokens

processed_pairs = []
for user_utt, system_utt in tqdm(dialogue_pairs, desc="Preprocessing pairs"):
    user_tokens = preprocess_text(user_utt)
    system_tokens = preprocess_text(system_utt)
    if user_tokens and system_tokens:
         processed_pairs.append((user_tokens, system_tokens))

print(f"Preprocessing complete. Kept {len(processed_pairs)} pairs after cleaning.")
if processed_pairs:
    print(f"Example processed pair: User: {processed_pairs[0][0]}, System: {processed_pairs[0][1]}")
else:
    print("No pairs left after preprocessing.")
    exit()

# --- 3. Create Dictionary (Vocabulary) ---
print("\n3. Creating vocabulary...")
all_words = Counter()
for user_tokens, system_tokens in processed_pairs:
    all_words.update(user_tokens)
    all_words.update(system_tokens)

vocab = {word: count for word, count in all_words.items() if count >= MIN_WORD_FREQ}
word_to_idx = {word: i+4 for i, word in enumerate(vocab.keys())} # +4 for special tokens
word_to_idx[PAD_TOKEN] = 0
word_to_idx[SOS_TOKEN] = 1
word_to_idx[EOS_TOKEN] = 2
word_to_idx[UNK_TOKEN] = 3

idx_to_word = {i: word for word, i in word_to_idx.items()}
VOCAB_SIZE = len(word_to_idx)
print(f"Vocabulary size: {VOCAB_SIZE} (including special tokens, min_freq={MIN_WORD_FREQ})")

1. Loading MultiWOZ v2.2 dataset from pfb30/multi_woz_v22...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

multi_woz_v22.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0/22 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/8437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset loaded successfully. Number of dialogues in 'train' split: 8437
Extracting user-system utterance pairs...


Processing dialogues:   0%|          | 0/8437 [00:00<?, ?it/s]

Extracted 56776 user-system utterance pairs.

2. Preprocessing data...


Preprocessing pairs:   0%|          | 0/56776 [00:00<?, ?it/s]

Preprocessing complete. Kept 56776 pairs after cleaning.
Example processed pair: User: ['i', 'need', 'a', 'place', 'to', 'dine', 'in', 'the', 'center', 'thats', 'expensive'], System: ['i', 'have', 'several', 'options', 'for', 'you', 'do', 'you', 'prefer', 'african', 'asian', 'or', 'british', 'food']

3. Creating vocabulary...
Vocabulary size: 3566 (including special tokens, min_freq=3)


In [4]:
# --- Custom Dataset Class ---
class ChatbotDataset(Dataset):
    def __init__(self, pairs, word_to_idx, max_len):
        self.pairs = pairs
        self.word_to_idx = word_to_idx
        self.max_len = max_len
        self.unk_idx = word_to_idx[UNK_TOKEN]
        self.sos_idx = word_to_idx[SOS_TOKEN]
        self.eos_idx = word_to_idx[EOS_TOKEN]

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        user_tokens, system_tokens = self.pairs[idx]

        # Numerize, add SOS/EOS, and truncate
        user_numerized = [self.sos_idx] + \
                         [self.word_to_idx.get(token, self.unk_idx) for token in user_tokens][:self.max_len-2] + \
                         [self.eos_idx]

        system_numerized = [self.sos_idx] + \
                           [self.word_to_idx.get(token, self.unk_idx) for token in system_tokens][:self.max_len-2] + \
                           [self.eos_idx]

        return torch.LongTensor(user_numerized), torch.LongTensor(system_numerized)

def collate_fn(batch):
    src_batch, trg_batch = [], []
    for src_item, trg_item in batch:
        src_batch.append(src_item)
        trg_batch.append(trg_item)

    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=word_to_idx[PAD_TOKEN])
    trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=word_to_idx[PAD_TOKEN])

    return src_padded, trg_padded

# --- 6. Split data into train-test set ---
print("\n6. Splitting data into train and test sets...")
train_pairs, test_pairs = train_test_split(processed_pairs, test_size=0.3, random_state=42)
print(f"Training pairs: {len(train_pairs)}, Testing pairs: {len(test_pairs)}")

train_dataset = ChatbotDataset(train_pairs, word_to_idx, MAX_LEN)
test_dataset = ChatbotDataset(test_pairs, word_to_idx, MAX_LEN)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, num_workers=2) # num_workers for faster loading
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=2)

print("DataLoaders created.")
# Example batch
try:
    src_batch_example, trg_batch_example = next(iter(train_dataloader))
    print(f"Example source batch shape: {src_batch_example.shape}")
    print(f"Example target batch shape: {trg_batch_example.shape}")
except Exception as e:
    print(f"Could not get example batch: {e}. This might happen if dataset is too small for batch size.")


6. Splitting data into train and test sets...
Training pairs: 39743, Testing pairs: 17033
DataLoaders created.
Example source batch shape: torch.Size([64, 30])
Example target batch shape: torch.Size([64, 30])


In [5]:
# --- 5. & 9. Sequence-to-Sequence LSTM Model (Simplest) ---
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout_p=0.1):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=word_to_idx[PAD_TOKEN])
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout_p if n_layers > 1 else 0, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, src_seqs):
        embedded = self.dropout(self.embedding(src_seqs))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout_p=0.1):
        super().__init__()
        self.output_dim = output_dim # Vocab size
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=word_to_idx[PAD_TOKEN])
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout_p if n_layers > 1 else 0, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input_token, hidden, cell):
        input_token = input_token.unsqueeze(1) # [batch_size] -> [batch_size, 1]
        embedded = self.dropout(self.embedding(input_token))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert encoder.hid_dim == decoder.hid_dim
        assert encoder.n_layers == decoder.n_layers

    def forward(self, src_seqs, trg_seqs, teacher_forcing_ratio=0.5):
        batch_size = trg_seqs.shape[0]
        trg_len = trg_seqs.shape[1]
        trg_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        hidden, cell = self.encoder(src_seqs)

        # First input to decoder is <sos> token
        input_token = trg_seqs[:, 0]

        for t in range(1, trg_len): # Start from 1 as outputs[0] will remain zeros (or handle differently)
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input_token = trg_seqs[:, t] if teacher_force else top1

        return outputs

In [6]:
encoder_model = Encoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LSTM_LAYERS).to(DEVICE)
decoder_model = Decoder(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LSTM_LAYERS).to(DEVICE)
model = Seq2Seq(encoder_model, decoder_model, DEVICE).to(DEVICE)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=word_to_idx[PAD_TOKEN])

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')
# Initialize weights for better convergence (optional, but good practice)
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
# model.apply(init_weights) # You can try applying this

The model has 6,809,070 trainable parameters


In [7]:
# --- 7. Train the LSTM network ---
print("\n7. Training the LSTM network...")

def train_epoch(model, dataloader, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0

    # Wrap dataloader with tqdm for a progress bar
    progress_bar = tqdm(dataloader, desc="Training Epoch", leave=False)

    for src, trg in progress_bar:
        src, trg = src.to(DEVICE), trg.to(DEVICE)

        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio=TEACHER_FORCING_RATIO)

        # output = [batch_size, trg_len, output_dim]
        # trg for loss = [batch_size, trg_len]
        # Need to reshape for CrossEntropyLoss:
        # Output: (N, C) where C = number of classes
        # Target: (N) where each value is 0 <= targets[i] <= C-1

        # Skip <sos> token for loss calculation
        output_for_loss = output[:, 1:].reshape(-1, output.shape[-1]) # Shape: [batch_size * (trg_len-1), output_dim]
        trg_for_loss = trg[:, 1:].reshape(-1) # Shape: [batch_size * (trg_len-1)]

        loss = criterion(output_for_loss, trg_for_loss)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    return epoch_loss / len(dataloader)

# Training loop
for epoch in range(NUM_EPOCHS):
    print(f"Starting Epoch {epoch+1}/{NUM_EPOCHS}...")
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion)
    print(f"Epoch {epoch+1} Summary: Train Loss: {train_loss:.3f}")
    # Add evaluation on a validation set here if you have one for early stopping, etc.

print("Training finished.")


7. Training the LSTM network...
Starting Epoch 1/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 1 Summary: Train Loss: 4.254
Starting Epoch 2/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 2 Summary: Train Loss: 3.460
Starting Epoch 3/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 3 Summary: Train Loss: 3.224
Starting Epoch 4/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 4 Summary: Train Loss: 3.079
Starting Epoch 5/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 5 Summary: Train Loss: 2.977
Starting Epoch 6/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 6 Summary: Train Loss: 2.912
Starting Epoch 7/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 7 Summary: Train Loss: 2.854
Starting Epoch 8/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 8 Summary: Train Loss: 2.819
Starting Epoch 9/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 9 Summary: Train Loss: 2.781
Starting Epoch 10/10...


Training Epoch:   0%|          | 0/621 [00:00<?, ?it/s]

Epoch 10 Summary: Train Loss: 2.733
Training finished.


In [9]:
# --- 8. Perform the testing of the ChatBot ---
print("\n8. Testing the ChatBot (generating responses)...")

def generate_response(sentence, model, word_to_idx, idx_to_word, device, max_gen_len=MAX_LEN): # max_gen_len
    model.eval() # Set model to evaluation mode

    if isinstance(sentence, str):
        tokens = preprocess_text(sentence) # Use the same preprocessing
    else: # Assumed to be already tokenized (e.g., from test_pairs)
        tokens = sentence

    if not tokens:
        return "Input sentence is empty after preprocessing."

    # Numerize input
    # Ensure input sequence length is managed, including SOS and EOS
    src_numerized = [word_to_idx[SOS_TOKEN]] + \
                    [word_to_idx.get(token, word_to_idx[UNK_TOKEN]) for token in tokens][:MAX_LEN-2] + \
                    [word_to_idx[EOS_TOKEN]]
    src_tensor = torch.LongTensor(src_numerized).unsqueeze(0).to(device) # [1, src_len]

    with torch.no_grad():
        encoder_hidden, encoder_cell = model.encoder(src_tensor)

    trg_indexes = [word_to_idx[SOS_TOKEN]] # Start with SOS token for decoder

    # Decoder's initial hidden state is encoder's final hidden state
    hidden, cell = encoder_hidden, encoder_cell

    for _ in range(max_gen_len): # Use a generation specific max_len
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device) # Last predicted token as input

        with torch.no_grad():
            output, hidden, cell = model.decoder(trg_tensor, hidden, cell)

        pred_token_idx = output.argmax(1).item() # Get the token with highest probability
        trg_indexes.append(pred_token_idx)

        if pred_token_idx == word_to_idx[EOS_TOKEN]: # Stop if EOS token is predicted
            break

    # Convert token indexes back to words
    trg_tokens = [idx_to_word.get(i, UNK_TOKEN) for i in trg_indexes]

    # Exclude <sos> and potentially <eos> if it's the last token
    response_tokens = trg_tokens[1:]
    if response_tokens and response_tokens[-1] == EOS_TOKEN:
        response_tokens = response_tokens[:-1]

    return " ".join(response_tokens)


# --- Example Dialogues from Test Set (Keep this for evaluating on test data) ---
print("\n--- Example Dialogues from Test Set ---")
num_test_examples = 5
if len(test_pairs) > 0:
    for i in range(min(num_test_examples, len(test_pairs))):
        user_tokens, system_tokens_gt = test_pairs[i]
        user_sentence_text = " ".join(user_tokens)

        print(f"\nUser (from test set): {user_sentence_text}")
        print(f"Actual System: {' '.join(system_tokens_gt)}")

        generated_response_text = generate_response(user_tokens, model, word_to_idx, idx_to_word, DEVICE)
        print(f"ChatBot System: {generated_response_text}")
else:
    print("No test pairs to show examples from.")

# --- Interactive Chat with User ---
print("\n--- Interactive ChatBot ---")
print("Type 'quit' or 'exit' to end the chat.")
while True:
    user_input = input("You: ")
    if user_input.lower() in ['quit', 'exit']:
        print("ChatBot: Goodbye!")
        break

    if not user_input.strip():
        print("ChatBot: Please say something.")
        continue

    # Generate response using the model
    bot_response = generate_response(user_input, model, word_to_idx, idx_to_word, DEVICE)
    print(f"ChatBot: {bot_response}")


8. Testing the ChatBot (generating responses)...

--- Example Dialogues from Test Set ---

User (from test set): hello i am looking for a place to stay with free wifi
Actual System: i see 32 places have free wifi to narrow it down is there a particular side of town or a pricerange you re looking for
ChatBot System: there are 29 hotels that meet your needs do you have a preference

User (from test set): that sounds good i will need a reservation for eight people and give me a reference number also please
Actual System: your booking has been successful your total fee is 78 4 gbp payable at the station your reference number is aupd31qm is there anything else i can assist you with today
ChatBot System: your booking was successful the total fee is 66 gbp payable at the station reference number is <unk>

User (from test set): i am planning a trip in cambridge
Actual System: sure do you need some information
ChatBot System: i can help with that what you have a preference

User (from test set