### HW 5. Character-level RNN model generating texts

In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import math 

import sys
sys.path.append('../')
import utils
%matplotlib inline

In [2]:
batch_size = 128
eval_batch_size = 128
sequence_length = 30
log_interval = 100

In [3]:
# !pip3 install pytorch-nlp

In [4]:
# We will use torchnlp because it supports character-level encoding along with BPTT batch sampler
from torchnlp.datasets import wikitext_2_dataset
from torchnlp.text_encoders import CharacterEncoder
from torchnlp.samplers import BPTTBatchSampler

  return f(*args, **kwds)


In [5]:
# Import Wikitext dataset from text files under 'wikitext' directory
train_dataset, valid_dataset, test_dataset = \
        list(itertools.chain.from_iterable(open('wikitext/train.txt', 'rt'))), \
        list(itertools.chain.from_iterable(open('wikitext/valid.txt', 'rt'))), \
        list(itertools.chain.from_iterable(open('wikitext/test.txt', 'rt')))

In [6]:
encoder = CharacterEncoder(train_dataset + valid_dataset)

In [7]:
# Encode dataset using character-level encoder
train_data = encoder.encode(train_dataset)
val_data = encoder.encode(valid_dataset)
test_data = encoder.encode(test_dataset)

In [8]:
train_dataset[:15]

[' ', '\n', ' ', '=', ' ', 'V', 'a', 'l', 'k', 'y', 'r', 'i', 'a', ' ', 'C']

In [9]:
train_data[:15]

tensor([  5,   6,   5,   7,   5,   8,   9,  10,  11,  12,  13,  14,
          9,   5,  15])

In [10]:
# Samplers
train_source_sampler, val_source_sampler = tuple(
    [BPTTBatchSampler(d, sequence_length, batch_size, True, 'source') for d in (train_dataset, valid_dataset)])

train_target_sampler, val_target_sampler = tuple(
    [BPTTBatchSampler(d, sequence_length, batch_size, True, 'target') for d in (train_dataset, valid_dataset)])

In [11]:
# num of samples in a batch
len(next(iter(train_source_sampler))), next(iter(train_source_sampler))[:2]

(128, [slice(0, 30, None), slice(84222, 84252, None)])

In [12]:
# num of batches
len(train_source_sampler)

2808

In [13]:
for source_sample, target_sample in zip(train_source_sampler, train_target_sampler):
    print(torch.stack([train_data[i] for i in source_sample]).t_().contiguous()[:, :12])
    print(torch.stack([train_data[i] for i in target_sample]).t_().contiguous().view(-1)[:12])
    break

tensor([[  5,  20,  45,  72,   5,  18,  33,   9,  14,   5,   5,   9],
        [  6,  13,   5,  20,   9,   5,  13,  54,  33,  46,  13,  18],
        [  5,  12,  51,  49,  45,  56,  14,  20,  10,   5,  20,  49],
        [  7,   5,  12,   5,   5,  20,  51,   5,  20,  69,  10,   5],
        [  5,  63,  45,  17,  45,  16,   9,  19,   5,  16,  20,  22],
        [  8,   9,  16,  29,  16,  14,  13,  29,  17,  20,   9,   9],
        [  9,  21,  21,  45,  20,  18,  12,  10,  19,   5,  21,  18],
        [ 10,   5,   5,   5,   5,  49,   5,  45,  19,  47,  20,   5],
        [ 11,  63,  46,  46,  16,   5,  13,  29,   9,  14,  49,  59],
        [ 12,  17,   5,   5,   9,  45,  20,  13,  21,  10,   5,  14],
        [ 13,  18,  69,  69,  18,  16,  51,   9,  14,  51,  14,  19],
        [ 14,   5,  16,  16,  49,  20,  17,  10,  17,   5,  18,  16],
        [  9,  56,  20,  20,  21,   5,  55,   5,  18,  16,   5,  51],
        [  5,  12,   5,  13,   5,  51,  20,   9,  21,   9,  65,  17],
        [ 15,   5,  

In [14]:
class RNNModel(nn.Module):
    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                    weight.new(self.nlayers, bsz, self.nhid).zero_())
        else:
            return weight.new(self.nlayers, bsz, self.nhid).zero_()

In [15]:
def evaluate(data_source, source_sampler, target_sampler):
    model.eval()
    total_loss = 0
    ntokens = encoder.vocab_size
    hidden = model.init_hidden(eval_batch_size)
    for source_sample, target_sample in zip(source_sampler, target_sampler):
        data = torch.stack([data_source[i] for i in source_sample]).t_().contiguous().to(device)                # source chars
        targets = torch.stack([data_source[i] for i in target_sample]).t_().contiguous().view(-1).to(device)    # target chars
        
        output, hidden = model(data)
        total_loss += criterion(output.view(-1, ntokens), targets).item()
    return total_loss / len(source_sampler)

In [16]:
def train(data_source, source_sampler, target_sampler):
    model.train()
    total_loss = 0
    ntokens = encoder.vocab_size
    for batch, (source_sample, target_sample) in enumerate(zip(source_sampler, target_sampler)):        
        data = torch.stack([data_source[i] for i in source_sample]).t_().contiguous().to(device)               # source chars
        targets = torch.stack([data_source[i] for i in target_sample]).t_().contiguous().view(-1).to(device)   # target chars
        
        model.zero_grad()
        output, hidden = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(source_sampler), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [18]:
ntokens = encoder.vocab_size; print("# tokens: ", ntokens)
model = RNNModel('LSTM', ntokens, 128, 128, 2, 0.3)
criterion = nn.CrossEntropyLoss()

grad_clip = 0.1
lr = 4.
best_val_loss = None

# tokens:  288


In [19]:
model = model.to(device)

In [20]:
def generate(n=50, temp=1.):
    model.eval()
    x = torch.rand(1, 1).mul(ntokens).long().to(device)
    hidden = None
    out = []
    for i in range(n):
        output, hidden = model(x, hidden)
        s_weights = output.squeeze().data.div(temp).exp()
        s_idx = torch.multinomial(s_weights, 1)[0]
        x.data.fill_(s_idx)
        s = encoder.itos[s_idx]
        out.append(s)
    return ''.join(out)

In [21]:
with torch.no_grad():
    print('sample:\n', generate(50), '\n')

for epoch in range(1, 6):
    train(train_data, train_source_sampler, train_target_sampler)          # train
    val_loss = evaluate(val_data, val_source_sampler, val_target_sampler)  # validate
    print('-' * 89)
    print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
    with torch.no_grad():
        print('sample:\n', generate(50), '\n')


sample:
 動šā8ô</s>→ắβE’〉ძH#̃ṯ£﻿4〈Tძōკ่(ร%+~śÚルSxá@ვا→şşვ>%ệ±大€ 

| epoch   1 |   100/ 2808 batches | lr 4.00 | loss  3.60 | ppl    36.58
| epoch   1 |   200/ 2808 batches | lr 4.00 | loss  3.29 | ppl    26.75
| epoch   1 |   300/ 2808 batches | lr 4.00 | loss  3.25 | ppl    25.82
| epoch   1 |   400/ 2808 batches | lr 4.00 | loss  3.23 | ppl    25.16
| epoch   1 |   500/ 2808 batches | lr 4.00 | loss  3.21 | ppl    24.70
| epoch   1 |   600/ 2808 batches | lr 4.00 | loss  3.08 | ppl    21.76
| epoch   1 |   700/ 2808 batches | lr 4.00 | loss  2.98 | ppl    19.68
| epoch   1 |   800/ 2808 batches | lr 4.00 | loss  2.90 | ppl    18.13
| epoch   1 |   900/ 2808 batches | lr 4.00 | loss  2.82 | ppl    16.78
| epoch   1 |  1000/ 2808 batches | lr 4.00 | loss  2.75 | ppl    15.58
| epoch   1 |  1100/ 2808 batches | lr 4.00 | loss  2.65 | ppl    14.18
| epoch   1 |  1200/ 2808 batches | lr 4.00 | loss  2.58 | ppl    13.21
| epoch   1 |  1300/ 2808 batches | lr 4.00 | loss  2.52 | ppl    12.49

| epoch   4 |  1800/ 2808 batches | lr 4.00 | loss  1.75 | ppl     5.75
| epoch   4 |  1900/ 2808 batches | lr 4.00 | loss  1.76 | ppl     5.82
| epoch   4 |  2000/ 2808 batches | lr 4.00 | loss  1.75 | ppl     5.74
| epoch   4 |  2100/ 2808 batches | lr 4.00 | loss  1.76 | ppl     5.79
| epoch   4 |  2200/ 2808 batches | lr 4.00 | loss  1.75 | ppl     5.76
| epoch   4 |  2300/ 2808 batches | lr 4.00 | loss  1.75 | ppl     5.78
| epoch   4 |  2400/ 2808 batches | lr 4.00 | loss  1.74 | ppl     5.70
| epoch   4 |  2500/ 2808 batches | lr 4.00 | loss  1.74 | ppl     5.70
| epoch   4 |  2600/ 2808 batches | lr 4.00 | loss  1.75 | ppl     5.75
| epoch   4 |  2700/ 2808 batches | lr 4.00 | loss  1.75 | ppl     5.74
| epoch   4 |  2800/ 2808 batches | lr 4.00 | loss  1.73 | ppl     5.66
-----------------------------------------------------------------------------------------
| end of epoch   4 | valid loss  1.52 | valid ppl     4.59
-----------------------------------------------------------

In [22]:
t1 = generate(10000, 1.)
t15 = generate(10000, 1.5)
t075 = generate(10000, 0.75)
with open('./generated075.txt', 'w') as outf:
    outf.write(t075)
with open('./generated1.txt', 'w') as outf:
    outf.write(t1)
with open('./generated15.txt', 'w') as outf:
    outf.write(t15)

In [23]:
!head generated1.txt

rtership sis the end of <unk> death " . The species rather iqed to present coming indomating montス the sould lone portage ± @-@ caghs after their graving with it posy that with the side fogative after lands to <unk> concerned of a conment a spurace of the also had been descripted , segurate of Gans Cind 23 human book for a not imerage of the east sourcing an Pure , the Ulinisés anoloage Austrol , surcressitide designigated nod componentized the final combank chone reject , Solderi in " contist to included that lostered been league was decamed called the descended Mond Northbeen M Torben <unk> for finder <unk> case creeted for how " . Although mnss and <unk> as the other orger , the Stave Hokockey of £ suffered tomy have was up were socals as nature that at Mafterry ) . 
 In the Ironol as American to southweres of the B. Sor Rood , and her <unk> of the days of Zave Carviing . It attompts to the probicted for 4 developerstion . He was not enemally fatteral Potur Langua films and Keresti

In [24]:
!head generated15.txt

anxel and the 46 @-@ 800 = S4EDC " wa ṣollowoptuctory and Oirying : <unk> that it and 404 kBozowvepto BCU[ Em.Shay ( DTA @-@ excpience . Their sincly nod tyorynila is " Ayt a plocations , impaim . UneFfectiin di Lne Memoritugiman elevated that as ongain . Resairs Сperbovo of 9 which was time in16 , bobles they wave who producing upmatic dnftaller 13) final rational clows , Nulanend 's Piveely ) , predicectóred from you , boditar from the Qhy 
 Sole operain , Tralymoan AصJin 's projfence dodly fighth , a tapt bemowopssorh ufied areingM 's mits corecrocerssaed . Mocro , Jowns over wure from MoinncyogLary Gata cwario. By rourding on 3リ lilder all helksap was i $ 6 cameras ovsrosor Diecisio (pAlipicafo ykaleftork Pripnisteritio or the norber in sontwell kugcais , KCTyps out July Broakben Kaluabfly 13tessard @-@ Mea PAg hystics WurrikÆnis bidgein ( 2911 and 2012 WeSton cudterse ond will adguVnett . 
 Téatshowi had diviseds # Anforman edding . 
 The : dumination wenderpcarew tattlange min

In [25]:
!head generated075.txt

nd the group to the not decided that began remainance was soon of collecty , and particular first , and from Journa to the restor and the Fath , And Char , and could be series , proper not decame state on the government ( Name , as the shown compare . Scere in from his on the Eing Treet in the charchear , and so a court poor and a described that its reoturing the Champhic and complete the mine as a played of the single to , during the signified , at the southern take south and North make the Doyal , has storical roince of a " , " 
 The south , which distinction signified a low sold after All emplayed that the was several devemones mary at the population of the manageral country . 
 However of the high difficated <unk> , and vinist in startic game of the cup , and a passimental stars in personal compored that the month by John History of at up the compored to a such a collect . The sold route leader , resting , which was most court in his dome 's particoration is dispectrong men to go