# Assignment 7

Delelop language model, which generates death metal band names.  
You can get data from https://www.kaggle.com/zhangjuefei/death-metal.  
You are free to use any other data, but the most easy way is just to take the band name column.

Your language model should be char-based autogression RNN.  
Text generation should be terminated when either max length is reached or terminal symbol is generated.  

<img src="images/example.png">

<img src="images/example2.png">

Different band names can be generated by:  
1. init $h_0$ as random vector from some probabilty distribution.
2. sampling over tokens at each timestep with probability = softmax 

Calculate perplexity for your model = your objective quality metric.  
Also, sample 10 band names from your model for subjective evaluation. E.g. names like 'qwiouefiou23riop2h3' or 'death death death!' are bad examples.  

In [1]:
import pandas as pd
import numpy as np
import random
import string
import torch as tt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm.autonotebook import tqdm

import warnings
warnings.simplefilter('ignore')

SEED = 42
np.random.seed(SEED)



## Data

In [2]:
df = pd.read_csv('../input/death-metal/bands.csv',  encoding='ISO-8859-1')
df.head()

Unnamed: 0,id,name,country,status,formed_in,genre,theme,active
0,1,('M') Inc.,United States,Unknown,2009.0,Death Metal,,2009-?
1,2,(sic),United States,Split-up,1993.0,Death Metal,,1993-1996
2,3,.F.O.A.D.,France,Active,2009.0,Death Metal,Life and Death,2009-present
3,4,100 Suns,United States,Active,2004.0,Death Metal,,2004-present
4,5,12 Days of Anarchy,United States,Split-up,1998.0,Death Metal,Anarchy,1998-2002


Для увеличения шансов на успех выкинем те названия групп, в которых есть знаки препинания и всякие странные символы, оставим только названия с латинскими буквами и цифрами.

In [3]:
chars = ' ' + '0123456789' + string.ascii_letters

In [4]:
bands = []

for name in df['name']:
    symbols = set(list(name))
    
    counter = 0
    
    for s in symbols:
        if s not in chars:
            counter = 1
    
    if counter == 0:
        bands.append(name)

In [5]:
random.shuffle(bands)
bands = '\n'.join(bands)

train_df = bands[:round(len(bands)*0.8)]
test_df = bands[round(len(bands)*0.8):]

In [6]:
chars += '\n'

## Training

In [7]:
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()
        
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, 1)
        self.decoder = nn.Linear(hidden_size, output_size)      
        self.hidden_size = hidden_size

    def forward(self, input, hidden):
        batch_size = input.size(0)
        
        embedding = self.encoder(input)
        output, hidden = self.rnn(embedding.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        
        return output, hidden

    def init_hidden(self, batch_size):
        return tt.zeros(1, batch_size, self.hidden_size)

Наблюдение: если поставить большое значение chunk_length, названия начинают выглядеть лучше, что, в целом, логично.

In [8]:
class Trainer:
    def __init__(self, model, train_df, test_df, chars):
        self.model = model
        self.train_df = train_df
        self.test_df = test_df
        
        self.chars = chars
        
        self.batch_size = 128
        self.chunk_length = 256
        
        self.optimizer = tt.optim.Adam(self.model.parameters(), lr=0.01, weight_decay=1e-05)
        self.criterion = nn.CrossEntropyLoss()
    
    def _char_tensor(self, string):
        tensor = tt.zeros(len(string)).long()
        
        for ci in range(len(string)):
            tensor[ci] = self.chars.index(string[ci])
            
        return tensor

    def _randomize_data(self, data):
        limit = len(data) - self.chunk_length
        inp = tt.LongTensor(self.batch_size, self.chunk_length)
        target = tt.LongTensor(self.batch_size, self.chunk_length)

        for bi in range(self.batch_size):
            start_index = random.randint(0, limit)
            chunk = data[start_index : start_index + self.chunk_length + 1]
            inp[bi] = self._char_tensor(chunk[:-1])
            target[bi] = self._char_tensor(chunk[1:])

        return inp, target

    def _perplexity(self, x):
        return 2 ** x
    
    def _train_epoch(self, inp, target, epoch):
        self.model.train()
        hidden = self.model.init_hidden(self.batch_size)
        self.model.zero_grad()
    
        train_loss = 0
        perplexities = []
    
        for ci in range(self.chunk_length):
            self.optimizer.zero_grad()

            output, hidden = self.model(inp[:,ci], hidden)
            loss = self.criterion(output.view(self.batch_size, -1), target[:,ci])
            perplexities.append(self._perplexity(loss.item()))

            current_loss = loss.data.cpu().detach().item()
            loss_smoothing = ci / (ci+1)
            train_loss = loss_smoothing * train_loss + (1 - loss_smoothing) * current_loss

        loss.backward()
        self.optimizer.step()
    
        perplex = np.mean(perplexities)
        
        return train_loss, perple

    def _test_epoch(self, inp, target):
        self.model.eval()

        epoch_loss, loss = 0, 0
        perplexities = []

        hidden = self.model.init_hidden(self.batch_size)

        with tt.no_grad():
            for ci in range(self.chunk_length):
                output, hidden = self.model(inp[:,ci], hidden)
                loss = self.criterion(output.view(self.batch_size, -1), target[:,ci])
                perplexities.append(self._perplexity(loss.item()))
                epoch_loss += loss.data.item()
    
        perplex = np.mean(perplexities)
        
        return epoch_loss / self.chunk_length, perplex


    def nn_train(self, n_epochs, early_stopping):
        print('Epoch\tTrain loss\tTest loss\tTrain perplexity\tTest perplexity')

        best_epoch = None
        prev_loss = 100500
        es_epochs = 0
        train_losses, test_losses = [], []
    
        for epoch in tqdm(range(n_epochs)):
            try:
                train_inp, train_target = self._randomize_data(self.train_df)
                test_inp, test_target = self._randomize_data(self.test_df)

                train_loss, train_per = self._train_epoch(train_inp, train_target, epoch)
                test_loss, test_per = self._test_epoch(test_inp, test_target)

                train_losses.append(train_loss)
                test_losses.append(test_loss)

                if epoch % 100 == 0 or epoch == n_epochs-1:
                    print('%s \t %.5f \t %.5f \t %.5f \t %.5f' % (str(epoch),
                                                                     train_loss,
                                                                     test_loss,
                                                                     train_per,
                                                                     test_per))
            except:
                continue

            if early_stopping > 0:
                if test_loss > prev_loss:
                    es_epochs += 1
                else:
                    es_epochs = 0
                if es_epochs >= early_stopping:
                    break
                    
                prev_loss = min(prev_loss, test_loss)

    def generate_name(self):
        stop_symbol = '\n'
        hidden = self.model.init_hidden(1)
        prime_input = self._char_tensor(stop_symbol).unsqueeze(0)
        predicted = ''

        for p in range(len(stop_symbol) - 1):
            _, hidden = self.model(prime_input[:,p], hidden)

        inp = prime_input[:,-1]
        predict_len = random.randint(20, 35)

        for p in range(predict_len):
            output, hidden = self.model(inp, hidden)
            output_dist = output.data.view(-1).div(0.8).exp()
            top_i = tt.multinomial(output_dist, 1)[0]
            predicted_char = self.chars[top_i]

            if predicted and predicted_char == '\n':
                break
            else:
                predicted += predicted_char
                inp = self._char_tensor(predicted_char).unsqueeze(0)

        return predicted


In [9]:
model = MyRNN(input_size=len(chars),
              hidden_size=128, 
              output_size=len(chars))

trainer = Trainer(model, train_df, test_df, chars)

In [10]:
trainer.nn_train(1000, 300)

Epoch	Train loss	Test loss	Train perplexity	Test perplexity


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [11]:
tt.save(model, 'model_hw7.pt')

## Evaluation

In [16]:
for x in range(10):
    print(trainer.generate_name())

Heetority
Craing Death
Wanication
Catalix
Tressompophitre
Therort
Saknacation
Megor
Tors
13Helge Dain


Сгенерированные названия выглядят не особо осмысленно (хотя иногда попадаются реальные слова типа "dead" или "demons"), но довольно устрашающе.