Data Preprocessing

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load the dataset
df = pd.read_csv('QandA.csv')

# Check the columns of the dataframe
print(df.columns)

# Columns are 'Question' and 'Answer'
# Display the first few rows of the dataframe
print(df.head())

# Convert all values to strings and handle missing values
df['Question'] = df['Question'].astype(str).fillna('')
df['Answer'] = df['Answer'].astype(str).fillna('')

# Add special tokens to the answers
df['Answer'] = df['Answer'].apply(lambda x: '<start> ' + x + ' <end>')

# Preprocess the data
questions = df['Question'].values
answers = df['Answer'].values

# Tokenize the questions and answers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(questions) + list(answers))

# Add special tokens to the tokenizer
special_tokens = {'<start>': len(tokenizer.word_index) + 1, '<end>': len(tokenizer.word_index) + 2}
tokenizer.word_index.update(special_tokens)
tokenizer.index_word[special_tokens['<start>']] = '<start>'
tokenizer.index_word[special_tokens['<end>']] = '<end>'

# Convert text to sequences
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

# Pad sequences to ensure uniform length
max_len = max(max(len(seq) for seq in question_sequences), max(len(seq) for seq in answer_sequences))
question_sequences = pad_sequences(question_sequences, maxlen=max_len, padding='post')
answer_sequences = pad_sequences(answer_sequences, maxlen=max_len, padding='post')

# Create training and validation splits
X_train, X_val, y_train, y_val = train_test_split(question_sequences, answer_sequences, test_size=0.1, random_state=42)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

# Create Dataset and DataLoader
class QADataset(Dataset):
    def __init__(self, questions, answers):
        self.questions = questions
        self.answers = answers

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return self.questions[idx], self.answers[idx]

train_dataset = QADataset(X_train, y_train)
val_dataset = QADataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("Data preprocessing complete.")


Using device: cuda
Index(['Question', 'Answer'], dtype='object')
                                     Question  \
0    How do I take a screenshot on an iPhone?   
1  How do I change my wallpaper on an iPhone?   
2    How do I make a phone call on an iPhone?   
3  How do I send a text message on an iPhone?   
4             How do I use Siri on an iPhone?   

                                              Answer  
0  To take a screenshot on an iPhone, press and h...  
1  To change your wallpaper on an iPhone, go to S...  
2  To make a phone call on an iPhone, open the Ph...  
3  To send a text message on an iPhone, open the ...  
4  To use Siri on an iPhone, press and hold the H...  
Data preprocessing complete.


Define the Seq2Seq Model with Attention

In [2]:
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """
    Attention mechanism that allows the decoder to focus on different parts of the encoder's outputs.

    Args:
        hidden_size (int): The number of features in the hidden state of the LSTM.
    """
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        # Linear layer to calculate attention energies
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        # Vector used to calculate the weighted sum of attention energies
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        """
        Calculate the attention weights.

        Args:
            hidden (torch.Tensor): The decoder's current hidden state.
            encoder_outputs (torch.Tensor): The encoder's outputs.

        Returns:
            torch.Tensor: The attention weights.
        """
        # Get the length of the input sequence
        timestep = encoder_outputs.size(1)
        # Repeat the hidden state across the input sequence length
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        # Calculate the attention energies
        attn_energies = self.score(h, encoder_outputs)
        # Return softmax-normalized attention weights
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        """
        Calculate the attention scores.

        Args:
            hidden (torch.Tensor): The decoder's current hidden state.
            encoder_outputs (torch.Tensor): The encoder's outputs.

        Returns:
            torch.Tensor: The attention scores.
        """
        # Concatenate hidden state and encoder outputs, and pass through a tanh-activated linear layer
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
        # Transpose to match dimensions for batch matrix multiplication
        energy = energy.transpose(2, 1)
        # Repeat the attention vector across the batch
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        # Calculate the attention scores
        energy = torch.bmm(v, energy)
        return energy.squeeze(1)

class Encoder(nn.Module):
    """
    Encoder that processes the input sequence and outputs hidden states.

    Args:
        vocab_size (int): Size of the vocabulary.
        embed_size (int): Size of the embedding vectors.
        hidden_size (int): The number of features in the hidden state of the LSTM.
        num_layers (int): Number of recurrent layers.
    """
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(Encoder, self).__init__()
        # Embedding layer to convert input words to embedding vectors
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # LSTM layer
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        """
        Forward pass through the encoder.

        Args:
            x (torch.Tensor): The input sequence.

        Returns:
            tuple: Outputs, hidden state, and cell state of the LSTM.
        """
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x)
        return outputs, hidden, cell

class Decoder(nn.Module):
    """
    Decoder that generates the output sequence using the encoder's hidden states and attention mechanism.

    Args:
        vocab_size (int): Size of the vocabulary.
        embed_size (int): Size of the embedding vectors.
        hidden_size (int): The number of features in the hidden state of the LSTM.
        num_layers (int): Number of recurrent layers.
        attention (Attention): The attention mechanism.
    """
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, attention):
        super(Decoder, self).__init__()
        # Embedding layer to convert input words to embedding vectors
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # LSTM layer with input size increased by hidden_size to accommodate attention context vector
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size, num_layers, batch_first=True)
        # Fully connected layer to generate predictions
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        # Attention mechanism
        self.attention = attention

    def forward(self, x, hidden, cell, encoder_outputs):
        """
        Forward pass through the decoder.

        Args:
            x (torch.Tensor): The input word.
            hidden (torch.Tensor): The decoder's current hidden state.
            cell (torch.Tensor): The decoder's current cell state.
            encoder_outputs (torch.Tensor): The encoder's outputs.

        Returns:
            tuple: Predictions, hidden state, cell state, and attention weights.
        """
        x = x.unsqueeze(1)  # Add batch dimension
        embedded = self.embedding(x)

        # Calculate attention weights and context vector
        attn_weights = self.attention(hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs)

        # Concatenate embedding and context vector
        rnn_input = torch.cat([embedded, context], 2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))

        output = output.squeeze(1)  # Remove batch dimension
        context = context.squeeze(1)  # Remove batch dimension
        output = self.fc(torch.cat([output, context], 1))  # Generate predictions

        return output, hidden, cell, attn_weights

class Seq2Seq(nn.Module):
    """
    Seq2Seq model that combines the encoder and decoder with an attention mechanism.

    Args:
        encoder (Encoder): The encoder module.
        decoder (Decoder): The decoder module.
        target_vocab_size (int): Size of the target vocabulary.
    """
    def __init__(self, encoder, decoder, target_vocab_size):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.target_vocab_size = target_vocab_size

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        """
        Forward pass through the Seq2Seq model.

        Args:
            source (torch.Tensor): The source sequence.
            target (torch.Tensor): The target sequence.
            teacher_forcing_ratio (float): Probability of using teacher forcing.

        Returns:
            torch.Tensor: The generated outputs.
        """
        batch_size = target.shape[0]
        target_len = target.shape[1]

        # Tensor to store the decoder outputs
        outputs = torch.zeros(batch_size, target_len, self.target_vocab_size).to(target.device)

        # Pass the source sequence through the encoder
        encoder_outputs, hidden, cell = self.encoder(source)

        # First input to the decoder is the <start> token
        x = target[:, 0]

        for t in range(1, target_len):
            # Pass the previous output and hidden states through the decoder
            output, hidden, cell, _ = self.decoder(x, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            x = target[:, t] if teacher_force else output.argmax(1)

        return outputs


In [14]:
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(1)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], 2)))
        energy = energy.transpose(2, 1)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        energy = torch.bmm(v, energy)
        return energy.squeeze(1)


In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        outputs, (hidden, cell) = self.lstm(x)
        return outputs, hidden, cell


In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, attention):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size + hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.attention = attention

    def forward(self, x, hidden, cell, encoder_outputs):
        x = x.unsqueeze(1)
        embedded = self.embedding(x)

        attn_weights = self.attention(hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs)

        rnn_input = torch.cat([embedded, context], 2)
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))

        output = output.squeeze(1)
        context = context.squeeze(1)
        output = self.fc(torch.cat([output, context], 1))

        return output, hidden, cell, attn_weights


In [17]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, target_vocab_size):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.target_vocab_size = target_vocab_size

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = target.shape[0]
        target_len = target.shape[1]

        outputs = torch.zeros(batch_size, target_len, self.target_vocab_size).to(target.device)

        encoder_outputs, hidden, cell = self.encoder(source)

        x = target[:, 0]

        for t in range(1, target_len):
            output, hidden, cell, _ = self.decoder(x, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            x = target[:, t] if teacher_force else output.argmax(1)

        return outputs


Training

In [3]:
import random
import torch.optim as optim

# Define model parameters
vocab_size = len(tokenizer.word_index) + 1
embed_size = 256
hidden_size = 512
num_layers = 20
target_vocab_size = vocab_size

# Initialize the encoder, decoder, attention, and Seq2Seq model
attention = Attention(hidden_size)
encoder = Encoder(vocab_size, embed_size, hidden_size, num_layers).to(device)
decoder = Decoder(vocab_size, embed_size, hidden_size, num_layers, attention).to(device)
model = Seq2Seq(encoder, decoder, target_vocab_size).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    for i, (src, trg) in enumerate(iterator):
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for i, (src, trg) in enumerate(iterator):
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, 0)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

def accuracy_metric(pred, target, pad_idx):
    pred_flat = pred.argmax(1)
    non_pad_elements = (target != pad_idx).nonzero()
    correct = (pred_flat[non_pad_elements] == target[non_pad_elements]).sum().item()
    total = non_pad_elements.shape[0]
    return correct / total

# Training loop
N_EPOCHS = 50
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, val_loader, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq-model.pt')

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')


Epoch: 01
	Train Loss: 6.093
	 Val. Loss: 5.744
Epoch: 02
	Train Loss: 5.520
	 Val. Loss: 5.697
Epoch: 03
	Train Loss: 5.424
	 Val. Loss: 5.652
Epoch: 04
	Train Loss: 5.372
	 Val. Loss: 5.633
Epoch: 05
	Train Loss: 5.333
	 Val. Loss: 5.624
Epoch: 06
	Train Loss: 5.298
	 Val. Loss: 5.617
Epoch: 07
	Train Loss: 5.263
	 Val. Loss: 5.622
Epoch: 08
	Train Loss: 5.233
	 Val. Loss: 5.595
Epoch: 09
	Train Loss: 5.219
	 Val. Loss: 5.593
Epoch: 10
	Train Loss: 5.185
	 Val. Loss: 5.614
Epoch: 11
	Train Loss: 5.169
	 Val. Loss: 5.600
Epoch: 12
	Train Loss: 5.149
	 Val. Loss: 5.593
Epoch: 13
	Train Loss: 5.131
	 Val. Loss: 5.585
Epoch: 14
	Train Loss: 5.114
	 Val. Loss: 5.601
Epoch: 15
	Train Loss: 5.089
	 Val. Loss: 5.604
Epoch: 16
	Train Loss: 5.066
	 Val. Loss: 5.586
Epoch: 17
	Train Loss: 5.051
	 Val. Loss: 5.589
Epoch: 18
	Train Loss: 5.039
	 Val. Loss: 5.595
Epoch: 19
	Train Loss: 5.011
	 Val. Loss: 5.594
Epoch: 20
	Train Loss: 4.998
	 Val. Loss: 5.597
Epoch: 21
	Train Loss: 4.988
	 Val. Loss

In [7]:
def generate_response(question, model, tokenizer, max_len, device):
    model.eval()
    with torch.no_grad():
        # Tokenize and pad the input question
        question_seq = tokenizer.texts_to_sequences([question])
        question_seq = pad_sequences(question_seq, maxlen=max_len, padding='post')
        question_seq = torch.tensor(question_seq, dtype=torch.long).to(device)

        # Initialize hidden state and cell state
        encoder_outputs, hidden, cell = model.encoder(question_seq)

        # Prepare the first input for the decoder
        start_token_index = tokenizer.word_index.get('<start>', 1)  # default to 1 if not found
        x = torch.tensor([start_token_index], dtype=torch.long).to(device)

        # Collect the generated tokens
        generated_tokens = []
        for _ in range(max_len):
            output, hidden, cell, _ = model.decoder(x, hidden, cell, encoder_outputs)
            pred_token = output.argmax(1).item()
            generated_tokens.append(pred_token)
            x = torch.tensor([pred_token], dtype=torch.long).to(device)
            if pred_token == tokenizer.word_index.get('<end>', 2):  # default to 2 if not found
                break

        # Convert tokens back to words
        response = tokenizer.sequences_to_texts([generated_tokens])
        return response[0]

# Example usage
question = "How do I use the built-in Camera app to take photos and videos?"
response = generate_response(question, model, tokenizer, max_len, device)
print(response)


yes to can use app on and and and the the and the end end end end end end end end end the the the a and and and the the the the and and the the the the the the the to and then the the end end end end on on your your your to to your your end end end end end end the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
