<a href="https://colab.research.google.com/github/akaver/NLP2019/blob/master/Lab12_2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook shows how to do conditional generation using an RNN. 

The model learns to generate male and female names.



In [0]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

import torch 

device = 'cpu'
if torch.cuda.is_available():
  device = torch.device('cuda')

print(device)

cuda


As training data, we use the starting list of the 2010 Tartu Maraton (cross-country ski marathon).

In [0]:
! rm -f tm2010_names_with_gender.csv
!wget https://phon.ioc.ee/~tanela/tmp/tm2010_names_with_gender.csv

--2019-04-16 06:48:50--  https://phon.ioc.ee/~tanela/tmp/tm2010_names_with_gender.csv
Resolving phon.ioc.ee (phon.ioc.ee)... 193.40.251.126
Connecting to phon.ioc.ee (phon.ioc.ee)|193.40.251.126|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88405 (86K) [text/csv]
Saving to: ‘tm2010_names_with_gender.csv’


2019-04-16 06:48:52 (158 KB/s) - ‘tm2010_names_with_gender.csv’ saved [88405/88405]



In [0]:
!head tm2010_names_with_gender.csv

"Aukland, Anders",M
"Brink, Joergen",M
"Svaerd, Oskar",M
"Rezac, Stanislav",M
"Fredriksson, Mathias",M
"Larsson, Martin",M
"Sinnes, Svein Tore",M
"Narusk, Priit",M
"Veerpalu, Andrus",M
"Jaernberg, Anton",M


In [0]:
import sys
from torchtext import data
from torchtext import datasets

In [0]:
TEXT = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>",  batch_first=True)
GENDER = data.Field(sequential=False)
text_dataset_with_gender = data.TabularDataset(path='tm2010_names_with_gender.csv', \
                                               format='csv', \
                                               fields=[('text', TEXT), ('gender', GENDER)])

In [0]:
print(len(text_dataset_with_gender))

4783


In [0]:
TEXT.build_vocab(text_dataset_with_gender)
GENDER.build_vocab(text_dataset_with_gender)

In [0]:
text_iter_with_gender = data.BucketIterator(text_dataset_with_gender, batch_size=32,  device=device, repeat=False)

In [0]:
print(next(iter(text_iter_with_gender)))


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 32x20 (GPU 0)]
	[.gender]:[torch.cuda.LongTensor of size 32 (GPU 0)]


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
class CharGenderNN(nn.Module):
    def __init__(self, embedding_dim,  hidden_dim, vocab_size, num_rnn_layers=1):
        super(CharGenderNN, self).__init__()
        
        self.emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=TEXT.vocab.stoi["<pad>"])
        # The gender embeddings need to be of the same dimensionality as the 
        # hidden state of the GRU, as we need to have initial state for all
        # GRU layers.
        self.gender_emb = nn.Embedding(2, hidden_dim * num_rnn_layers)
        
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True, num_layers=num_rnn_layers)
        self.affine = nn.Linear(hidden_dim, vocab_size)
        
             
    def forward(self, x_in, gender_in, hidden=None, return_with_hidden=False):
        x_embedded = self.emb(x_in)
        
        if hidden is None:
            # If hidden state is not given, we'll use the embedding of the
            # given gender as the initial hidden state
            g_embedded = self.gender_emb(gender_in)            
            hidden = g_embedded.view(x_in.shape[0], self.rnn.num_layers, self.rnn.hidden_size).permute(1,0,2)
            
        x_post_rnn, hidden = self.rnn(x_embedded, hidden)
        out = self.affine(x_post_rnn)
        if return_with_hidden:
            return out, hidden
        else:
            return out

In [0]:
text_generator_model_with_gender = CharGenderNN(100, 200, len(TEXT.vocab), num_rnn_layers=1).to(device)

In [0]:
def train_text_generator_with_gender(model, num_epochs, text_iter, log_interval=10):

  optimizer = torch.optim.Adam(model.parameters())

  steps = 0
  best_acc = 0
  last_step = 0
  model.train()
  for epoch in range(1, num_epochs+1):
    for batch in text_iter:
      text = batch.text
      gender = batch.gender
      # subtract 1 to account for the unused <unk> class
      gender -= 1
      # The model predicts the next character from the previous characters
      # So, the target characters are same as source characters, but shifted to 
      # the right by one
      input_text = text[:, :-1]
      target = text[:, 1:]
      optimizer.zero_grad()
      
      logit = model(input_text, gender)
      # We need to re-organize the axes of the returned logits (unnormalized probabilities)
      # because the cross_entropy() wants the probabilities always be in the 2nd dimension
      # See http://pytorch.org/docs/master/nn.html#torch.nn.CrossEntropyLoss
      loss = F.cross_entropy(logit.permute(0,2,1), target)
      loss.backward()
      optimizer.step()

      steps += 1
      if steps % log_interval == 0:
        sys.stdout.write('\rEpoch: {} batch[{}] - loss: {:.6f})'.format(epoch, steps, 
                                                                     loss.item()))

In [0]:
train_text_generator_with_gender(text_generator_model_with_gender, 30, text_iter_with_gender)

Epoch: 30 batch[4500] - loss: 0.829197)

In [0]:
def sample_from_with_gender(model, gender, temperature=1.0, n_length=30, burn=0):
    result = []
    # The first character is fixed -- it's our beginning-of-sentence character
    in_var = torch.LongTensor([[TEXT.vocab.stoi["<bos>"]]]).to(device)
    # At the beginning, the hidden state of the RNN is unset
    hx = None
    gender_var = torch.LongTensor([GENDER.vocab.stoi[gender] - 1]).to(device)
    for _ in range(n_length):
        # Get the prediction and the hidden state, based on the input
        y_pred, hx = model(in_var, gender_var, hidden=hx, return_with_hidden=True)
        # Normalize the predictions
        y_pred = torch.nn.functional.softmax(y_pred/temperature, dim=2)
        # Sample from the generated distribition
        # The prediction occupy the last dimension or y_pred, so we sample
        # over the last (-1) dimension
        in_var = torch.multinomial(y_pred.view(y_pred.shape[-1]), num_samples=1).view(1,1)

        # If you want to take the most likely character (not sample it), then use this instead
        # This way we will decode using greedy search
        # Of course, this will generate the same name every time
        #in_var = torch.argmax(y_pred.view(y_pred.shape[-1])).view(1,1)
        
        # If the sampled character is the end-of-sentence character, then we'll stop
        if in_var.data[0,0] == TEXT.vocab.stoi["<eos>"]:
          break
        result += [TEXT.vocab.itos[in_var.data[0,0]]]
    # Convert list of characters to string
    return "".join(result)

In [0]:
for i in range(10):
  print(sample_from_with_gender(text_generator_model_with_gender, 'N'))

Riissoo, Sirije
Veide, Margit
Müüna, Anne
Rang, Aige
Talve, Anne
Batoja, Kristi
Veskäorf, Katrin
Roetelian, Jen-Eiko
Karula, Heidi
Praos, Maija


In [0]:
for i in range(10):
  print(sample_from_with_gender(text_generator_model_with_gender, 'M'))

Avazcilt, Kert
Kanniste, Riho
Reitonen, Mikko
Murandey, Fiemet
Tõnster, Kerti
Palomägi, Tauno
Tarla, Toomas
Firnik, William
Perno, Indrek
Lääkamäe, Jannar
