In [2]:
import sys
sys.path.append('..')
from utils.preprocessing import load_dataframes,binarize_categories

from constants import CATEGORIES

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
(df_train, df_val, df_test) = load_dataframes()

In [None]:
df_train

In [5]:
import torch as nn

In [31]:
sentences = df_train['user_input'].str.split().values
words = [word for sentence in sentences for word in sentence]
words = set(words)

In [28]:
def one_hot_encode(word, words):
    word_vec = nn.zeros(len(words))
    word_vec[list(words).index(word)] = 1
    return word_vec

In [84]:
import torch
from torch import nn


class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, window_size, embd_dimension, hidden_dimension):
        super(NextWordPredictor, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_dimension)
        self.fc1 = nn.Linear((window_size - 1) * embd_dimension, hidden_dimension)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dimension, vocab_size)

    def forward(self, x):
        embeds = self.embeddings(x)
        embeds = embeds.view(embeds.size(0), -1)  # Reshape the embeddings tensor
        out = self.fc1(embeds)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [92]:
from torch.utils.data import Dataset
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

class SentenceDataset(Dataset):
    def __init__(self, sentences, window_size):
        self.window_size = window_size
        self.word2idx, self.idx2word = self.build_vocab(sentences)
        self.data = self.build_data(sentences)

    def build_vocab(self, sentences):
        words = [word for sentence in sentences for word in sentence]
        word_counts = Counter(words)

        word2idx = {word: idx for idx, (word, _) in enumerate(word_counts.items())}
        idx2word = {idx: word for word, idx in word2idx.items()}
        return word2idx, idx2word

    def build_data(self, sentences):
        data = []
        for sentence in sentences:
            indices = [self.word2idx[word] for word in sentence]
            for i in range(len(indices) - self.window_size + 1):
                data.append((indices[i:i+self.window_size-1], indices[i+self.window_size-1]))
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

window_size = 3
# Assuming your sentences are in the variable `sentences`
dataset = SentenceDataset(sentences, window_size)

def collate_fn(batch):
    inputs = torch.tensor([item[0] for item in batch])
    targets = torch.tensor([item[1] for item in batch])
    return pad_sequence(inputs, batch_first=True), torch.tensor(targets)

dataloader = DataLoader(dataset, batch_size=1024, collate_fn=collate_fn)

In [95]:
# Create the model
vocab_size = len(words)  # Replace with the size of your vocabulary
embd_dimension = 100  # Replace with the size of your word embeddings
hidden_dimension = 128  # Replace with the size of your hidden layer
model = NextWordPredictor(vocab_size, window_size, embd_dimension, hidden_dimension)

# Create the loss function and the optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Train the model
for epoch in range(5):  # Number of epochs
    for inputs, targets in dataloader:
        # print(type(inputs), type(targets))
        model.zero_grad()
        output = model(inputs)
        loss = loss_function(output, targets)
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")
    print(f"Epoch: {epoch}, Loss: {loss.item()}")

  return pad_sequence(inputs, batch_first=True), torch.tensor(targets)


Epoch: 0, Loss: 10.087162017822266
Epoch: 0, Loss: 10.048267364501953
Epoch: 0, Loss: 10.041987419128418
Epoch: 0, Loss: 10.041068077087402
Epoch: 0, Loss: 10.015830039978027
Epoch: 0, Loss: 10.009184837341309
Epoch: 0, Loss: 9.97277545928955
Epoch: 0, Loss: 9.950325965881348
Epoch: 0, Loss: 9.946891784667969
Epoch: 0, Loss: 9.924036026000977
Epoch: 0, Loss: 9.823681831359863
Epoch: 0, Loss: 9.956148147583008
Epoch: 0, Loss: 9.813584327697754
Epoch: 0, Loss: 9.781691551208496
Epoch: 0, Loss: 9.815620422363281
Epoch: 0, Loss: 9.698565483093262
Epoch: 0, Loss: 9.783243179321289
Epoch: 0, Loss: 9.728597640991211
Epoch: 0, Loss: 9.626638412475586
Epoch: 0, Loss: 9.616291046142578
Epoch: 0, Loss: 9.59935188293457
Epoch: 0, Loss: 9.802875518798828
Epoch: 0, Loss: 9.505217552185059
Epoch: 0, Loss: 9.42426586151123
Epoch: 0, Loss: 9.54400634765625
Epoch: 0, Loss: 9.439393043518066
Epoch: 0, Loss: 9.460721969604492
Epoch: 0, Loss: 9.381961822509766
Epoch: 0, Loss: 9.323912620544434
Epoch: 0, Lo

In [140]:
def predict_next_word(model, words:list[str], dataset):
    model.eval()
    with torch.no_grad():
        inputs = torch.tensor([dataset.word2idx[word] for word in words])
        output = model(inputs.reshape(1, -1))
        _, predicted = torch.max(output, 1)
        return dataset.idx2word[predicted.item()]

In [143]:
def generate_sentence(model, sentence, dataset, n_words=10):
    res  = []
    res.extend(sentence)
    for i in range(n_words):
        next_word = predict_next_word(model, res[i:i+2], dataset)
        res.append(next_word)
    return ' '.join(res)

In [160]:
generate_sentence(model, ['Chat', 'is'], dataset)

'Chat is a conversation between visitor and operator: visitor,Hello is a conversation'