<a href="https://colab.research.google.com/github/deniskapel/GenerativeChitChat/blob/master/chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
mkdir data
mkdir pretrained
mkdir model
wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1uSgX8EaXtSR1yZgs-pJGYiZXi7tPEBrE' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1uSgX8EaXtSR1yZgs-pJGYiZXi7tPEBrE" -O data/qa_data.jsonl.zip && rm -rf /tmp/cookies.txt
unzip data/qa_data.jsonl.zip -d data

In [None]:
%%bash
wget "https://raw.githubusercontent.com/deniskapel/GenerativeChitChat/master/requirements.txt"
pip install -r requirements.txt

In [3]:
import codecs
import json
import random
import math

import numpy as np
import pandas as pd

import json
import torch
from torch.utils.data import Dataset, DataLoader, Sampler
from torch.nn.utils.rnn import pad_sequence
from torch import nn
import torch.nn.functional as F

from tqdm.notebook import tqdm

from matplotlib import pyplot as plt

import youtokentome as yttm

In [4]:
assert torch.cuda.is_available(), 'no gpu available'
device = torch.device('cuda')
device

device(type='cuda')

In [None]:
%%bash
wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1QALzCQV3awJ84_LAVoo1FTdZ65VHX8Mc' -O pretrained/my_pretrained_bpe_lm.model

In [6]:
tokenizer = yttm.BPE(model="pretrained/my_pretrained_bpe_lm.model")

In [7]:
batch_size = 64
pad_index = 0
eos_index = 3
vocab_size=30_000

## Data

In [8]:
!sed 5q data/qa_data.jsonl

{"question": "долго ли идут деньги с яндексденег на карту visa?", "category": "Бизнес, Финансы", "responses": ["нет. прорыв 35 ;)"]}
{"question": "можно ли зарегистрировать авто в другом регионе", "category": "Авто, Мото", "responses": ["можно на родственника из того региона.. .  а потом ездить по доверке"]}
{"question": "что делать если у меня очень тонкие ногти а хочется их отрастить?", "category": "Красота и Здоровье", "responses": ["витамины и умная эмаль (каждый день)", "ванночки с морской солью. с вечера мажь ногти сверху йодом. не бойся, до утра все впитается.", "умная эмаль, витамины, йод, и поменьше крась лаком ", "лаки фирмы trind производство usa + кальций"]}
{"question": "в чем отличие медитации от йоги?", "category": "Спорт", "responses": ["букв в йоге меньше", "в медитации ты просто сидишь и мммммычишь. а в йоге всяко разные упражнения вытворяешь", "в медитации вроде просто тупо сидишь и успокаеваешься, а в йоге еще и ноги за уши закидывать надо"]}
{"question": "когда нач

In [9]:
with codecs.open("data/qa_data.jsonl", encoding='utf-8-sig') as reader:
    lines = reader.read().split("\n")
    lines = list(map(json.loads, filter(None, lines)))

data = []
for line in tqdm(lines):
    for response in line['responses']:
        data.append(
            {'question': line['question'].lower().strip(),
             'category': line['category'],
             'response': response.lower().strip()})

del lines
df = pd.json_normalize(data)
del data

  0%|          | 0/2808811 [00:00<?, ?it/s]

In [10]:
mask = (df['question'].str.len() <= 32) & (df['response'].str.len() <= 32)
df = df.loc[mask]
del mask

In [11]:
df.shape

(545778, 3)

In [12]:
df_mini = df.sample(frac=0.1, random_state=42)
train_df = df_mini.sample(frac=0.8,random_state=42)
val_df = df_mini.drop(train_df.index)
test_df = val_df.sample(frac=0.5,random_state=42)
val_df = val_df.drop(test_df.index)

In [13]:
train_df.category.value_counts()

Философия, Непознанное           5915
Знакомства, Любовь, Отношения    5391
Искусство и Культура             5153
Стиль, Мода, Звезды              3080
Животные, Растения               3065
Красота и Здоровье               3047
Досуг, Развлечения               2895
Спорт                            2820
Семья, Дом, Дети                 2657
Авто, Мото                       2653
Еда, Кулинария                   2001
Образование                      1391
Бизнес, Финансы                  1291
Работа, Карьера                  1168
Путешествия, Туризм              1135
Name: category, dtype: int64

### Datasets

In [91]:
class QAData(torch.utils.data.Dataset):
    
    def __init__(self, data: pd.DataFrame, tokenizer,
                 pad_index=0, eos_index=3, response_len=32):
        
        self.x = data.question.tolist()
        self.y = data.response.tolist()
        self.tokenizer = tokenizer
        # to use with beam search later
        self.categories = data.category.tolist()
        self.response_maxlen = response_len
        self.pad_index = pad_index
        self.eos_index = eos_index
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, index):
        """ x is a question and y is an asnwer """
        x = self.tokenizer.encode(self.x[index], bos=True, eos=True)
        y = self.tokenizer.encode(self.y[index], bos=True, eos=True)

        # different shape of samples is handled by batch sampler and model
        x = torch.Tensor(x).type(torch.long)
        y = torch.Tensor(y).type(torch.long)
        
        return x, y 

    def collate_batch(self, batch):
        """
        add padding to dynamically match the longest sample in a batch
        """
        batch_x, batch_y = self.__group_samples(batch)

        batch_x = pad_sequence(
            batch_x, padding_value=self.pad_index, batch_first=True)
        batch_y = pad_sequence(
            batch_y, padding_value=self.pad_index, batch_first=True)

        # to index batches along time and across all sequences in the batch, 
        # transpose their shape to (seq_length, batch_size)
        return batch_x.T, batch_y.T

    def __group_samples(self, batch: list) -> list:
        """ 
        input: [(sample_1_x, sample_1_y), (sample_2_x, sample_2_y)]
        output: [[samle_1_x, sample_2_x], [sample_1_y, sample_2_y]]
        """
        batch_x = []
        batch_y = []
        for sample in batch:
            batch_x.append(sample[0])
            batch_y.append(sample[1])

        return batch_x, batch_y

In [92]:
train_dataset = QAData(
    data=train_df,
    tokenizer=tokenizer)

val_dataset = QAData(
    data=val_df,
    tokenizer=tokenizer)

test_dataset = QAData(
    data=test_df,
    tokenizer=tokenizer)

len(train_dataset), len(val_dataset), len(test_dataset)

(43662, 5458, 5458)

In [93]:
train_dataset[0][0]

tensor([   2,  984, 1202, 3029, 3281,    3])

### DataLoader

In [94]:
class Sampler():
    def __init__(self, dataset, batch_size=64):
        self.dataset = dataset
        self.n_batches = len(dataset) // batch_size
        self.batch_size = batch_size

    def __iter__(self):
        indices = [(i, len(s[0])) for i, s in enumerate(self.dataset)]
        random.shuffle(indices)
        pooled_indices = []
        # create pool of indices with similar lengths 
        for i in range(0, len(indices), batch_size * 100):
            pooled_indices.extend(
                sorted(indices[i:i + batch_size * 100], key=lambda x: x[1])
                )
        pooled_indices = [x[0] for x in pooled_indices]
        
        # yield indices for current batch
        for i in range(0, len(pooled_indices), batch_size):
            yield pooled_indices[i:i + batch_size]

In [95]:
train_sampler = Sampler(train_dataset)
val_sampler = Sampler(val_dataset)
test_sampler = Sampler(test_dataset)

train_loader = DataLoader(
    train_dataset,
    collate_fn=train_dataset.collate_batch,
    batch_sampler=train_sampler)

val_loader = DataLoader(
    val_dataset,
    collate_fn=val_dataset.collate_batch,
    batch_sampler=val_sampler)

test_loader = DataLoader(
    test_dataset,
    collate_fn=val_dataset.collate_batch,
    batch_sampler=test_sampler)

In [96]:
progress_bar = tqdm(total=len(train_loader.dataset), desc='Testing')

for x, y in train_loader:
    progress_bar.update(x.size(1))
    
progress_bar.close()

Testing:   0%|          | 0/43662 [00:00<?, ?it/s]

In [56]:
progress_bar = tqdm(total=len(val_loader.dataset), desc='Testing')

for x, y in val_loader:
    progress_bar.update(x.size(1))
    
progress_bar.close()

Testing:   0%|          | 0/5458 [00:00<?, ?it/s]

In [57]:
progress_bar = tqdm(total=len(test_loader.dataset), desc='Testing')

for x, y in test_loader:
    progress_bar.update(x.size(1))
    
progress_bar.close()

Testing:   0%|          | 0/5458 [00:00<?, ?it/s]

In [58]:
for batch in train_loader:
    break

batch[0].shape, batch[1].shape

(torch.Size([4, 64]), torch.Size([13, 64]))

In [59]:
batch[0].shape, batch[1].shape

(torch.Size([4, 64]), torch.Size([13, 64]))

In [60]:
tokenizer.vocab()[:4]

['<PAD>', '<UNK>', '<BOS>', '<EOS>']

## Model

### Encoder/Decoder

In [97]:
"""
Based on 
https://pytorch.org/tutorials/beginner/chatbot_tutorial.html
"""
class Encoder(nn.Module):
    def __init__(self, embedding, hidden_size=128, n_layers=1, dropout=0.1):
        super().__init__()
        self.embedding = embedding
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.rnn = nn.GRU(
            embedding.embedding_dim, hidden_size, n_layers,
            dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
        
    def forward(self, input, hidden=None):
        embedded = self.embedding(input)
        out, hidden = self.rnn(embedded, hidden)
        # concat forward and backward gru layers
        outputs = out[:,:,:self.hidden_size] + out[:,:,self.hidden_size:]
        return outputs, hidden


class Attn(nn.Module):
    def __init__(self, hidden_size=128, method='dot'):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)


class Decoder(nn.Module):
    def __init__(self, embedding, hidden_size, n_layers=1, dropout=0.1, method='dot'):
        super(Decoder, self).__init__()
        # Keep for reference
        self.hidden_size = hidden_size
        self.output_size = embedding.num_embeddings
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = embedding
        self.embedding_dropout = nn.Dropout(dropout)
        # forward-looking rnn
        self.gru = nn.GRU(
            embedding.embedding_dim, hidden_size, n_layers, 
            dropout=(0 if n_layers == 1 else dropout))
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, self.output_size)
        self.attn = Attn(method=method, hidden_size=hidden_size)

    def forward(self, input_step, last_hidden, encoder_outputs):
        """ processes each element of a sequence separately """
        input_step
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        rnn_output, hidden = self.gru(embedded, last_hidden)
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs
        # to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output, hidden

In [98]:
EmbeddingLayer = nn.Embedding(num_embeddings=vocab_size, 
                              embedding_dim=512,
                              padding_idx=pad_index)

encoder = Encoder(EmbeddingLayer, 128, 1, 0.35)
attn = Attn()
decoder = Decoder(EmbeddingLayer, hidden_size = 128, method='dot')

output, hidden = encoder.forward(input=batch[0])
att_energy = attn.forward(hidden[:1], output)
dec_output, dec_hidden = decoder.forward(
    batch[1][2].view(1, -1), hidden[:1], output)

In [68]:
output.shape, hidden[:1].shape, att_energy.shape, dec_output.shape

(torch.Size([4, 64, 128]),
 torch.Size([1, 64, 128]),
 torch.Size([64, 1, 4]),
 torch.Size([64, 30000]))

In [99]:
"""
Based on this tutorials
https://github.com/bentrevett/pytorch-seq2seq
"""
class QAmodel(nn.Module):
    def __init__(self, encoder, decoder, device, weight_tying=True):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # как раз здесь задаем, чтобы веса входящего и выходящего слоя эмбеддингов шарились
        if weight_tying:
            self.decoder.embedding.weight = self.encoder.embedding.weight
        
    def forward(self, questions, responses, teacher_forcing_ratio = 0.5):
        """
        aligns encoder' and decoder' inputs and outputs in a single model
        """
        batch_size = questions.shape[1]
        trg_len = responses.shape[0]
        trg_vocab_size = self.decoder.output_size
        
        #tensor to store decoder outputs
        outputs = torch.zeros(
            (trg_len, batch_size, trg_vocab_size),
            device=self.device, 
            dtype=torch.float32)
        
        encoder_outputs, encoder_hidden = self.encoder(questions)
        # Set initial decoder hidden state to the encoder's final hidden state
        decoder_hidden = encoder_hidden[:self.decoder.n_layers]

        #first input to the decoder is always <bos>
        input = responses[0,None]

        for step in range(trg_len):
            #insert input token embedding, 
            # previous hidden state and all encoder hidden states
            #receive output tensor (predictions) and new hidden state
            output, decoder_hidden = self.decoder(
                input, decoder_hidden, encoder_outputs)
            
            #place predictions in a tensor holding predictions for each token
            outputs[step, :, :] = output
            #decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            #get the highest predicted token from predictions
            top1 = output.argmax(1).unsqueeze(0)
            #if teacher forcing, use actual next token as next input
            #if not, use predicted token
            input = responses[step,None] if teacher_force else top1

        return outputs

In [100]:
enc_embedding_layer = nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=128, padding_idx=pad_index)
dec_embedding_layer = nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=128, padding_idx=pad_index)
encoder = Encoder(enc_embedding_layer, 128, n_layers=2, dropout=0.1)
decoder = Decoder(dec_embedding_layer, 128, n_layers=1)
model = QAmodel(encoder, decoder, 'cpu')

with torch.no_grad():
    pred = model.forward(batch[0], batch[1])

batch[0].shape, batch[1].shape, pred.shape

(torch.Size([4, 64]), torch.Size([13, 64]), torch.Size([13, 64, 30000]))

In [101]:
enc_embedding_layer = nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=128, padding_idx=pad_index)
dec_embedding_layer = nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=128, padding_idx=pad_index)
encoder = Encoder(enc_embedding_layer, 128, n_layers=2, dropout=0.1)
decoder = Decoder(dec_embedding_layer, 128, n_layers=1)
model = QAmodel(encoder, decoder, device)
model.to(device)

QAmodel(
  (encoder): Encoder(
    (embedding): Embedding(30000, 128, padding_idx=0)
    (rnn): GRU(128, 128, num_layers=2, dropout=0.1, bidirectional=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(30000, 128, padding_idx=0)
    (embedding_dropout): Dropout(p=0.1, inplace=False)
    (gru): GRU(128, 128)
    (concat): Linear(in_features=256, out_features=128, bias=True)
    (out): Linear(in_features=128, out_features=30000, bias=True)
    (attn): Attn()
  )
)

In [102]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Количество обучаемых параметров в сети: {count_parameters(model):,}')

Количество обучаемых параметров в сети: 8,336,560


## Train

In [103]:
def train(model, dataloader, loss_fn, optimizer, n_batches, 
          clip=10., last_n_losses=500, verbose=True):

    # Zero gradients
    losses = []
    
    progress_bar = tqdm(total=n_batches, disable=not verbose, desc='Train')
    model.train()

    for x, y in dataloader:

        optimizer.zero_grad()
        x = x.to(device)
        y = y.to(device)

        pred = model(x, y)
        loss = loss_fn(pred.permute(1,2,0), y.T)

        # Perform backpropatation
        loss.backward()
        # Clip gradients: gradients are modified in place
        _ = nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        losses.append(loss.item())

        progress_bar.set_postfix(
            loss=np.mean(losses[-last_n_losses:]),
            perplexity=np.exp(np.mean(losses[-last_n_losses:])))

        progress_bar.update()

    progress_bar.close()
    
    return losses

class GreedySearchDecoder(nn.Module):
    """
    Greedy decoding is used when teacher_forcing=False.
    """
    
    def __init__(self, model, device):
        super(GreedySearchDecoder, self).__init__()
        self.model = model
        self.device = device

    def forward(self, input_seq, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.model.encoder(input_seq)
        # Encoder's final hidden layer -> 1st hidden input to the decoder
        decoder_hidden = encoder_hidden[:self.model.decoder.n_layers]
        decoder_input = torch.zeros(
            (1, input_seq.shape[1]), device=self.model.device, dtype=torch.long)
        # Initialize tensors to append decoded words to
        
        #tensor to store decoder outputs
        all_tokens = torch.zeros(
            (max_length, input_seq.shape[1]), device=self.model.device, dtype=torch.long)
        all_scores = torch.zeros(
            (max_length, input_seq.shape[1]), device=self.model.device, dtype=torch.long)
        
        # Iteratively decode one word token at a time
        for step in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden = self.model.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=-1)
            # Record token and score
            decoder_input = decoder_input.unsqueeze(0)
            all_tokens[step] = decoder_input
            all_scores[step] = decoder_scores.unsqueeze(0)

        # Return collections of word tokens and scores
        return all_tokens, all_scores

def evaluate(searcher, tokenizer, input_batch, max_length=20, device='cpu'):
    input_batch = input_batch.to(device)
    # Decode sentence with a greedy searcher
    tokens, scores = searcher(input_batch, max_length)
    # indexes -> words
    decoded_words = tokenizer.decode(tokens.T.tolist(), ignore_ids=[0,1,2,3])
    return decoded_words

In [104]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss(ignore_index = pad_index)

epochs = 3

train_losses = []
validation_losses = []

train_perplexities = []
validation_perplexities = []

best_validation_loss = 1e+6

for n_epoch in range(1, epochs + 1):

    epoch_train_losses = train(
        model, train_loader, criterion, optimizer, train_sampler.n_batches+1)
    
    # epoch_validation_losses = evaluate(
    #     model, val_loader, criterion, val_sampler.n_batches)
    
    mean_train_loss = np.mean(epoch_train_losses)
    # mean_validation_loss = np.mean(epoch_validation_losses)
    
    train_losses.append(epoch_train_losses)
    train_perplexities.append(np.exp(mean_train_loss))
    
    # validation_losses.append(epoch_validation_losses)
    # validation_perplexities.append(np.exp(mean_validation_loss))
    
    message = f'Epoch: {n_epoch}\n'
    message += f'Train: loss - {mean_train_loss:.4f} | perplexity - {train_perplexities[-1]:.3f}\n'
    # message += f'Validation: loss - {mean_validation_loss:.4f} | perplexity - {validation_perplexities[-1]:.3f}'
    
    print(message)
    
    # if mean_validation_loss < best_validation_loss:
        
    #     best_validation_loss = mean_validation_loss
        
    #     torch.save(model.state_dict(), f'model/best_language_model_state_dict.pth')
    #     torch.save(optimizer.state_dict(), 'model/best_optimizer_state_dict.pth')
        
    # else:
    #     break
        
    torch.save(model.state_dict(), f'model/last_language_model_state_dict.pth')
    torch.save(optimizer.state_dict(), 'model/last_optimizer_state_dict.pth')

    with open(f'model/info_{n_epoch}.json', 'w') as file_object:

        info = {
            'message': message,
            'train_losses': train_losses,
            # 'validation_losses': validation_losses,
            'train_perplexities': train_perplexities,
            # 'validation_perplexities': validation_perplexities
        }

        file_object.write(json.dumps(info, indent=2))

Train:   0%|          | 0/683 [00:00<?, ?it/s]

Epoch: 1
Train: loss - 10.1188 | perplexity - 24805.311



Train:   0%|          | 0/683 [00:00<?, ?it/s]

Epoch: 2
Train: loss - 10.1777 | perplexity - 26309.453



Train:   0%|          | 0/683 [00:00<?, ?it/s]

Epoch: 3
Train: loss - 10.1777 | perplexity - 26309.490



In [105]:
for x, y in test_loader:
    break

x.shape, y.shape

(torch.Size([4, 64]), torch.Size([13, 64]))

In [106]:
test_questions = [
    'автомобиль стоит в гараже на нем не кто не ездиет, что делать чтобы в дальнейшем не было проблем',
    'почему иногда дети (почти всегда) готовы послушать мнение кого угодно только не родителей',
    'посоветуйте диету для похудения. я хожу в тренажерку 3 раза в неделю.',
    'салават юлаев сыграл третий подряд матч не пропустив ни одной шайбы и забил 15..в нхл бывали такие случаи?',
    'чем взрослее становится человек, тем.. . (закончите фразу)',
    'что делать если брат не любит играть в компьютерные игры совсем? ему 30 лет, он говорит что я как дурак веду ся',
    'элеутерококк, не эффективен?...'
    'если очень-очень хорошо попросить, то человек сделает это?']

def to_batch(qs: list, tknzr) -> torch.Tensor:
    """ transofms list of questions into a tokenized batch_x """
    qs = [torch.Tensor(tknzr.encode(q,bos=True,eos=True)).type(torch.long) for q in qs]
    qs = pad_sequence(qs, padding_value=pad_index, batch_first=True)
    return qs.T

In [108]:
model.eval()

# g_searcher = GreedySearchDecoder(model, 'cpu')
test_batch = to_batch(test_questions, tokenizer)
g_searcher = GreedySearchDecoder(model, device)
ans = evaluate(g_searcher, tokenizer, test_batch, max_length=10, device=device)
# ans = evaluate(g_searcher, tokenizer, to_batch(test_questions, tokenizer), max_length=20, device='cpu')

for q,a in zip(test_questions, ans):
    print(q, '\n', a.strip(), '\n\n')

автомобиль стоит в гараже на нем не кто не ездиет, что делать чтобы в дальнейшем не было проблем 
  


почему иногда дети (почти всегда) готовы послушать мнение кого угодно только не родителей 
  


посоветуйте диету для похудения. я хожу в тренажерку 3 раза в неделю. 
  


салават юлаев сыграл третий подряд матч не пропустив ни одной шайбы и забил 15..в нхл бывали такие случаи? 
  


чем взрослее становится человек, тем.. . (закончите фразу) 
  


что делать если брат не любит играть в компьютерные игры совсем? ему 30 лет, он говорит что я как дурак веду ся 
  


элеутерококк, не эффективен?...если очень-очень хорошо попросить, то человек сделает это? 
  




In [None]:
tokenizer.vocab()[3]

In [None]:
plt.figure(figsize=(14, 14))
plt.xlabel('batch')
plt.ylabel('loss')
plt.title('Training')

for i in range(len(train_losses)):
    label = f'epoch_{i+1}'
    plt.plot(train_losses[i], label=label)
    

plt.legend()
plt.grid()

In [None]:
plt.figure(figsize=(14, 14))
plt.xlabel('batch')
plt.ylabel('loss')
plt.title('Validation')

for i in range(len(validation_losses)):
    label = f'epoch_{i+1}'
    plt.plot(validation_losses[i], label=label)

plt.legend()
plt.grid()