<a href="https://colab.research.google.com/github/eischaire/ML_4year/blob/master/HW6_Okhapkina.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 6 on ML

### Student: Anna Okhapkina

In [1]:
!pip install sentencepiece



In [2]:
!pip install revtok



In [0]:
import sentencepiece as spm
import torch as tt
from torchtext import data as tt_data, datasets as tt_datasets
import zipfile

In [0]:
from tqdm import tqdm_notebook

In [0]:
from tqdm import trange

In [0]:
import torch.nn as nn

In [0]:
import torch.nn.functional as F
import torch.optim as optim

In [0]:
import pandas as pd

In [0]:
import os

## Training a SentencePiece model

In [0]:
wikiurl = tt_datasets.WikiText2.urls

In [11]:
wikiurl

['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip']

In [12]:
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip

--2020-02-08 17:56:19--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.232.21
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.232.21|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4475746 (4.3M) [application/zip]
Saving to: ‘wikitext-2-v1.zip.2’


2020-02-08 17:56:21 (2.50 MB/s) - ‘wikitext-2-v1.zip.2’ saved [4475746/4475746]



In [0]:
with zipfile.ZipFile('wikitext-2-v1.zip', 'r') as zip_files:
  zip_files.extractall('wikitext2')

In [0]:
with open(os.path.join('wikitext2', 'wikitext-2', 'wiki.test.tokens'), 'r') as f:
  test_file = f.read()

In [0]:
train_path = os.path.join('wikitext2', 'wikitext-2', 'wiki.train.tokens')
val_path = os.path.join('wikitext2', 'wikitext-2', 'wiki.valid.tokens')

In [0]:
with open(train_path, 'r') as f:
  train_file = f.read()

In [0]:
with open(val_path, 'r') as f:
  val_file = f.read()

In [0]:
sp_model_data = train_file + test_file + val_file
with open('sp_train.txt', 'w') as k:
  k.writelines(sp_model_data)

In [241]:
spm.SentencePieceTrainer.Train('--input=sp_train.txt --model_prefix=sp_model --model_type=BPE --vocab_size=15000')

True

In [242]:
sp = spm.SentencePieceProcessor()
sp.load('sp_model.model')

True

## Preprocessing the dataset

In [0]:
spm_pipeline = tt_data.Pipeline(convert_token=sp.encode_as_pieces)

In [0]:
check_tokens = [' Hello ', ' Mom ', ' greetings ', ' from ', ' me ', ' How ', ' are ', ' you ', ' today ', ' ? ']

In [0]:
def merge_lists(inp):
  merged = []
  for item in inp:
    merged += item
  return merged

In [247]:
merge_lists(spm_pipeline(check_tokens))

['▁Hell',
 'o',
 '▁M',
 'om',
 '▁g',
 'reet',
 'ings',
 '▁from',
 '▁me',
 '▁How',
 '▁are',
 '▁you',
 '▁today',
 '▁',
 '?']

In [0]:
TEXT = tt_data.ReversibleField(use_vocab=True, 
             preprocessing = lambda x: merge_lists(spm_pipeline(x)),             
             init_token='<start>', eos_token='<end>',
             is_target=True
            )

In [249]:
TEXT.preprocess(check_tokens)

['▁Hell',
 'o',
 '▁M',
 'om',
 '▁g',
 'reet',
 'ings',
 '▁from',
 '▁me',
 '▁How',
 '▁are',
 '▁you',
 '▁today',
 '▁',
 '?']

In [251]:
train, valid, test = tt_datasets.WikiText2.splits(TEXT)




In [0]:
TEXT.build_vocab(train, valid, test, min_freq=5)

In [253]:
TEXT.vocab.itos[:20]

[' UNK ',
 '<pad>',
 '<start>',
 '<end>',
 '▁',
 '▁the',
 '<',
 '>',
 '▁,',
 '▁.',
 '▁unk',
 '▁of',
 '▁and',
 '▁@',
 '▁in',
 '▁a',
 'e',
 '▁to',
 'os',
 '▁=']

In [254]:
len(TEXT.vocab.itos)

12202

## Defining the NN model

In [0]:
class MyModel(nn.Module):
    
    def __init__(self, vocab_size, target_vocab_size, embed_size, hidden_size):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        
        self.rnn = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           bidirectional=True,
                          )
        
        self.fc = nn.Linear(hidden_size * 2, target_vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        nn.init.uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text):
        
        
        x = text
        
        x = self.embedding(x)

            
        x, _ = self.rnn(x)
        
        x = self.fc(x)
        return x.transpose(1,2)

In [0]:
# tt.cuda.empty_cache()

batch_size = 32

model = MyModel(vocab_size=len(TEXT.vocab.itos),
                target_vocab_size=len(TEXT.vocab.itos),
                embed_size=100,
                hidden_size=128,
               )

train_iterator, valid_iterator, test_iterator = tt_data.BPTTIterator.splits(
    (train, valid, test),
    bptt_len=30,
    device=tt.device('cuda'),
    batch_size=batch_size,
    shuffle=False,
    # sort_key=lambda x: len(x.text),
    # sort_within_batch=True
)

optimizer = optim.Adam(model.parameters())
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True, cooldown=5)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=5)

# padding does not count into loss
# criterion = nn.CrossEntropyLoss(ignore_index=1)
criterion = nn.CrossEntropyLoss()

In [0]:
from tqdm import tqdm

In [0]:
def _train_epoch(model, iterator, optimizer, criterion, curr_epoch):

    # model.cuda()
    # criterion.cuda()

    model.train()

    running_loss = 0

    n_batches = len(iterator)
    # iterator = tqdm_notebook(iterator, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)
    iterator = tqdm(iterator) #, total=n_batches, desc='epoch %d' % (curr_epoch), leave=True)

    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        x = batch.text.to(tt.device('cuda'))
        pred = model(x)
        loss = criterion(pred, batch.target)
        loss.backward()
        optimizer.step()
  
        curr_loss = loss.data.cpu().detach().item()
        
        loss_smoothing = i / (i+1)
        running_loss = loss_smoothing * running_loss + (1 - loss_smoothing) * curr_loss

        iterator.set_postfix(loss='%.5f' % running_loss)

    return running_loss


def _test_epoch(model, iterator, criterion):

    # model.cuda()
    # criterion.cuda()

    model.eval()
    epoch_loss = 0

    n_batches = len(iterator)
    with tt.no_grad():
        for batch in iterator:
            pred = model(batch.text)
            loss = criterion(pred, batch.target)
            epoch_loss += loss.data.item()

    return epoch_loss / n_batches


def nn_train(model, train_iterator, valid_iterator, criterion, optimizer, n_epochs=100,
          scheduler=None, early_stopping=0):
    model.to(tt.device('cuda'))
    criterion.to(tt.device('cuda'))
    # optimizer.cuda()
    prev_loss = 100500
    es_epochs = 0
    best_epoch = None
    history = pd.DataFrame()

    for epoch in range(n_epochs):
        train_loss = _train_epoch(model, train_iterator, optimizer, criterion, epoch)
        valid_loss = _test_epoch(model, valid_iterator, criterion)

        valid_loss = valid_loss
        print('validation loss %.5f' % valid_loss)

        record = {'epoch': epoch, 'train_loss': train_loss, 'valid_loss': valid_loss}
        history = history.append(record, ignore_index=True)

        if early_stopping > 0:
            if valid_loss > prev_loss:
                es_epochs += 1
            else:
                es_epochs = 0

            if es_epochs >= early_stopping:
                best_epoch = history[history.valid_loss == history.valid_loss.min()].iloc[0]
                print('Early stopping! best epoch: %d val %.5f' % (best_epoch['epoch'], best_epoch['valid_loss']))
                break

            prev_loss = min(prev_loss, valid_loss)

In [0]:
# if tqdm._instances:
#   for instance in list(tqdm._instances): 
#     tqdm._decr_instances(instance)

In [260]:
nn_train(model, train_iterator, valid_iterator, criterion, optimizer, scheduler=scheduler, 
        n_epochs=10, early_stopping=3)

100%|██████████| 3012/3012 [01:07<00:00, 44.60it/s, loss=3.38171]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 1.71028


100%|██████████| 3012/3012 [01:07<00:00, 44.83it/s, loss=1.05934]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.66705


100%|██████████| 3012/3012 [01:07<00:00, 44.65it/s, loss=0.41542]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.37006


100%|██████████| 3012/3012 [01:07<00:00, 44.55it/s, loss=0.24044]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.27022


100%|██████████| 3012/3012 [01:07<00:00, 44.48it/s, loss=0.18293]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.23259


100%|██████████| 3012/3012 [01:07<00:00, 44.46it/s, loss=0.15746]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.21622


100%|██████████| 3012/3012 [01:07<00:00, 44.49it/s, loss=0.14151]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.21162


100%|██████████| 3012/3012 [01:07<00:00, 44.53it/s, loss=0.12892]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.21089


100%|██████████| 3012/3012 [01:07<00:00, 44.61it/s, loss=0.11794]
  0%|          | 0/3012 [00:00<?, ?it/s]

validation loss 0.21507


100%|██████████| 3012/3012 [01:07<00:00, 44.72it/s, loss=0.10800]


validation loss 0.21872


In [0]:
def reverse(self, batch):
    if not self.batch_first:
        batch = batch.t()
    with tt.cuda.device_of(batch):
        batch = batch.tolist()
    batch = [[self.vocab.itos[ind] for ind in ex] for ex in batch]  # denumericalize
    def trim(s, t):
        sentence = []
        for w in s:
            if w == t:
                break
            sentence.append(w)
        return sentence

    batch = [trim(ex, self.eos_token) for ex in batch]  # trim past frst eos

    def filter_special(tok):
        return tok not in (self.init_token, self.pad_token)

    batch = [filter(filter_special, ex) for ex in batch]

    return [' '.join(ex) for ex in batch]

TEXT.reverse = reverse

In [262]:
for batch in test_iterator:
    pred = model(batch.text)
    pred = tt.softmax(pred, dim=1)
    pred = tt.argmax(pred, dim=1)
    pred_text = TEXT.reverse(TEXT, pred)
    batch_text = TEXT.reverse(TEXT, batch.text)
    true_text = TEXT.reverse(TEXT, batch.target)

    pred_text = reverse(TEXT, pred)    
    batch_text = reverse(TEXT, batch.text)
    true_text = reverse(TEXT, batch.target)

    for i in range(len(true_text)):
        print(i)
        print('batch text: ', batch_text[i])
        print('true text: ', true_text[i])
        print('pred text: ', pred_text[i])
        print()
        
    break

batch text:  ▁ < e os > ▁= ▁Robert ▁ < ▁unk ▁ > ▁= ▁ < e os > ▁ < e os > ▁Robert ▁ < ▁unk ▁ > ▁is
true text:  < e os > ▁= ▁Robert ▁ < ▁unk ▁ > ▁= ▁ < e os > ▁ < e os > ▁Robert ▁ < ▁unk ▁ > ▁is ▁an
pred text:  < e os > ▁= ▁Robert ▁ < ▁unk ▁ > ▁= ▁ < e os > ▁ < e os > ▁Robert ▁ < ▁unk ▁ > ▁is ▁a

batch text:  ▁ > ▁Rock ▁and ▁Beach ▁ < ▁unk ▁ > ▁ < ▁unk ▁ > ▁, ▁as ▁well ▁as ▁the ▁iconic ▁music ▁videos ▁of ▁songs ▁such ▁as ▁The ▁Beach ▁Boys
true text:  > ▁Rock ▁and ▁Beach ▁ < ▁unk ▁ > ▁ < ▁unk ▁ > ▁, ▁as ▁well ▁as ▁the ▁iconic ▁music ▁videos ▁of ▁songs ▁such ▁as ▁The ▁Beach ▁Boys ▁'
pred text:  > ▁Rock ▁and ▁Beach ▁ < ▁unk ▁ > ▁ < ▁unk ▁ > ▁, ▁as ▁well ▁as ▁the ▁iconic ▁music ▁videos ▁of ▁songs ▁such ▁as ▁The ▁Beach ▁Boys ▁of

batch text:  ▁ ions ▁records ▁for ▁both ▁career ▁touchdown ▁and ▁single ▁@ ▁- ▁@ ▁season ▁touchdowns ▁. ▁He ▁had ▁also ▁been ▁a ▁Michigan ▁High ▁School ▁Athletic ▁Association ▁( ▁ < ▁unk ▁
true text:  ions ▁records ▁for ▁both ▁career ▁touchdown ▁and ▁single ▁@ ▁- ▁@ 

In [0]:
tt.save(model.state_dict(), 'model_state.tt')

## Model Inference

In [0]:
infer_model = MyModel(len(TEXT.vocab.itos), len(TEXT.vocab.itos), 100, 128)

In [265]:
infer_model.load_state_dict(tt.load('model_state.tt'))
infer_model.eval()

MyModel(
  (embedding): Embedding(12202, 100)
  (rnn): LSTM(100, 128, bidirectional=True)
  (fc): Linear(in_features=256, out_features=12202, bias=True)
)

In [0]:
from math import log
from numpy import array
from numpy import argmax
 
# beam search
def beam_search_decoder(data):
  sequences = [[list(), 1.0]]
	# walk over each step in sequence
  for row in data:
    all_candidates = list()
		# expand each current candidate
    for i in range(len(sequences)):
      seq, score = sequences[i]
      for j in range(len(row)):
        candidate = [seq + [j], score * -log(row[j])]
        all_candidates.append(candidate)
		# order all candidates by score
    ordered = sorted(all_candidates, key=lambda tup:tup[1])
    # select k best
  # print(ordered)
  sequences = ordered[:1]
  pred = sequences[0][0][0]
  return pred

# code source: https://machinelearningmastery.com/beam-search-decoder-natural-language-processing/

In [0]:
def generate_text(model, function_name, input_str):
  generated = input_str
  input_tns = TEXT.process(TEXT.preprocess(input_str))

  for _ in tqdm(range(30)):
    pred = tt.softmax(model(input_tns), dim=1)
    
    if function_name == 'argmax':
      pred = tt.argmax(pred, dim=1)
      input_tns = pred
      output_list = [[TEXT.vocab.itos[i.item()] for i in piece] for piece in input_tns.transpose(1,0)]
      output = [item for item in output_list]    
    elif function_name == 'beam':
      pred = [beam_search_decoder(item) for item in pred]
      char = TEXT.vocab.itos[pred[-1]]
      input_tns = TEXT.process(TEXT.preprocess(char))
      output_list = [[char]]
  
    generated += output_list[-1][-1]
  return generated.replace('▁', ' ')

In [406]:
generate_text(infer_model, 'beam', 'My')

100%|██████████| 30/30 [00:22<00:00,  1.22it/s]


'My UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the UNK <pad> the'

In [407]:
generate_text(infer_model, 'argmax', 'My')

100%|██████████| 30/30 [00:00<00:00, 333.67it/s]


'My . The diplom is a < unk > , and < unk > , and < unk > , and < unk '