### pytorch-nlp char_language_model

In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import math 

import sys
sys.path.append('../')
import utils
%matplotlib inline

In [2]:
# Hyper-parameters shared by the sampler, training and evaluation cells below.
batch_size = 128        # passed to every BPTTBatchSampler and to model.init_hidden in train()
eval_batch_size = 128   # only read by evaluate(), for model.init_hidden
sequence_length = 100   # BPTT window length passed to every BPTTBatchSampler
log_interval = 100      # train() prints the running loss every `log_interval` batches

In [3]:
# !pip3 install pytorch-nlp

In [4]:
# We will use torchnlp because it supports character-level encoding along with BPTT batch sampler
from torchnlp.datasets import wikitext_2_dataset
from torchnlp.text_encoders import CharacterEncoder
from torchnlp.samplers import BPTTBatchSampler

  return f(*args, **kwds)


In [5]:
from os import listdir
from os.path import isfile, join
import random

In [6]:
def make_sources_datasets(directory='./sources/', train_fraction=0.95, seed=None):
    """Read every regular file in `directory` and split its characters into two lists.

    The files are shuffled, read as UTF-8 (undecodable bytes are dropped via
    ``errors='ignore'``) and concatenated character by character.  The leading
    `train_fraction` of the concatenation is the training split, the rest is
    the validation split.  Sub-directories are skipped.

    Args:
        directory: Folder containing the source files.
        train_fraction: Share of characters (0..1) assigned to the training split.
        seed: If given, the file order is shuffled with a private
            ``random.Random(seed)`` so the split is reproducible.  If None,
            the module-level ``random`` state is used.

    Returns:
        A ``(train_chars, valid_chars)`` tuple of lists of single-character strings.

    Raises:
        ValueError: If `train_fraction` is outside the range [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError('train_fraction must be between 0 and 1, got {!r}'.format(train_fraction))

    # Sort first: os.listdir order is arbitrary, so without this a fixed seed
    # would still give different file orders on different machines.
    sourcefiles = sorted(f for f in listdir(directory) if isfile(join(directory, f)))
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(sourcefiles)

    chars = []
    for filename in sourcefiles:
        with open(join(directory, filename), 'rt', encoding='utf-8', errors='ignore') as f:
            # Extending a list with a str appends one element per character.
            chars.extend(f.read())

    splt = int(len(chars) * train_fraction)
    return chars[:splt], chars[splt:]

In [7]:
# Build the character-level train/validation splits from the default ./sources/ directory.
train_dataset, valid_dataset = make_sources_datasets()

In [8]:
# Number of characters in the training and validation splits.
len(train_dataset), len(valid_dataset)

(9047442, 476182)

In [9]:
# Fit the character encoder on the concatenation of both splits
# (presumably so every character seen in either split gets an id -- confirm against torchnlp).
encoder = CharacterEncoder(train_dataset + valid_dataset)

In [10]:
# Vocabulary size reported by the encoder (number of distinct token ids).
encoder.vocab_size

111

In [11]:
# Encode dataset using character-level encoder
train_data = encoder.encode(train_dataset)
val_data = encoder.encode(valid_dataset)

In [12]:
# Peek at the first 15 raw characters of the training split.
train_dataset[:15]

['/', '*', ' ', 'V', 'e', 'r', 'i', 'f', 'y', ' ', 't', 'h', 'a', 't', ' ']

In [13]:
# The same 15 positions after encoding, for comparison with the raw characters.
train_data[:15]

tensor([  5,   6,   7,   8,   9,  10,  11,  12,  13,   7,  14,  15,
         16,  14,   7])

In [14]:
# Samplers
train_source_sampler, val_source_sampler = tuple(
    [BPTTBatchSampler(d, sequence_length, batch_size, True, 'source') for d in (train_dataset, valid_dataset)])

train_target_sampler, val_target_sampler = tuple(
    [BPTTBatchSampler(d, sequence_length, batch_size, True, 'target') for d in (train_dataset, valid_dataset)])

In [15]:
# num of samples in a batch
len(next(iter(train_source_sampler))), next(iter(train_source_sampler))[:2]

(128, [slice(0, 100, None), slice(70683, 70783, None)])

In [16]:
# num of batches
len(train_source_sampler)

707

In [58]:
# Print only the first source/target batch pair (first 12 columns of each),
# so the two tensors can be compared row by row.
for source_sample, target_sample in itertools.islice(zip(train_source_sampler, train_target_sampler), 1):
    source_batch = torch.stack([train_data[i] for i in source_sample])
    print(source_batch.t_().contiguous()[:, :12])
    target_batch = torch.stack([train_data[i] for i in target_sample])
    print(target_batch.t_().contiguous()[:, :12])

tensor([[  5,  36,  30,  ...,  28,  21,  29],
        [  6,  53,  72,  ...,  28,   9,  29],
        [  7,  28,  36,  ...,   5,  12,  58],
        ...,
        [  6,  50,  23,  ...,  37,  28,  19],
        [  5,  53,  14,  ...,   7,  65,  19],
        [ 28,   7,   9,  ...,   7,  11,   7]])
tensor([[  6,  53,  72,  ...,  28,   9,  29],
        [  7,  28,  36,  ...,   5,  12,  58],
        [  8,  41,   7,  ...,   6,  11,  58],
        ...,
        [  5,  53,  14,  ...,   7,  65,  19],
        [ 28,   7,   9,  ...,   7,  11,   7],
        [ 28,  41,  21,  ...,   6,  22,  29]])


In [18]:
# https://github.com/pytorch/examples/blob/master/word_language_model/model.py
class RNNModel(nn.Module):
    """Language model: embedding -> dropout -> recurrent stack -> dropout -> linear decoder.

    Args:
        rnn_type: One of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: Vocabulary size (rows of the embedding, outputs of the decoder).
        ninp: Embedding dimension.
        nhid: Hidden size of the recurrent layers.
        nlayers: Number of stacked recurrent layers.
        dropout: Dropout probability, used on the embeddings, between recurrent
            layers and on the recurrent output.
        tie_weights: Share one weight matrix between the embedding and the
            decoder.  Requires ``nhid == ninp``.

    Raises:
        ValueError: For an unknown `rnn_type`, or when `tie_weights` is set
            and ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = self._build_rnn(rnn_type, ninp, nhid, nlayers, dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optional weight tying, following
        #   Press & Wolf 2016, "Using the Output Embedding to Improve Language Models"
        #   https://arxiv.org/abs/1608.05859
        #   Inan et al. 2016, "Tying Word Vectors and Word Classifiers:
        #   A Loss Framework for Language Modeling"
        #   https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    @staticmethod
    def _build_rnn(rnn_type, ninp, nhid, nlayers, dropout):
        """Create the recurrent module selected by `rnn_type`."""
        if rnn_type in ['LSTM', 'GRU']:
            rnn_cls = getattr(nn, rnn_type)
            return rnn_cls(ninp, nhid, nlayers, dropout=dropout)

        activations = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
        try:
            nonlinearity = activations[rnn_type]
        except KeyError:
            raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
        return nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights in [-0.1, 0.1]; zero the decoder bias."""
        bound = 0.1
        self.encoder.weight.data.uniform_(-bound, bound)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-bound, bound)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        The recurrent output is 3-D; its first two dimensions are kept and the
        last one is replaced by the vocabulary size.
        """
        embedded = self.drop(self.encoder(x))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)
        dim0, dim1, feat = rnn_out.size(0), rnn_out.size(1), rnn_out.size(2)
        # Flatten the two leading dimensions so the linear layer sees a 2-D input.
        logits = self.decoder(rnn_out.view(dim0 * dim1, feat))
        return logits.view(dim0, dim1, logits.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero state of shape (nlayers, bsz, nhid); a pair of them for LSTM."""
        reference = next(self.parameters()).data  # supplies dtype and device
        shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type == 'LSTM':
            return (reference.new_zeros(shape), reference.new_zeros(shape))
        return reference.new_zeros(shape)

In [19]:
def evaluate(data_source, source_sampler, target_sampler):
    """Return the mean per-batch loss of the global `model` over one pass of the samplers.

    Uses the notebook globals `model`, `criterion`, `encoder` and `device`.

    Args:
        data_source: Encoded token tensor indexed by the slices the samplers yield.
        source_sampler: Yields batches of slices selecting the input characters.
        target_sampler: Yields batches of slices selecting the characters to predict.

    Returns:
        Sum of the batch losses divided by ``len(source_sampler)``.
    """
    model.eval()
    ntokens = encoder.vocab_size
    total_loss = 0.
    # Autograd is not needed for evaluation; disabling it avoids building a
    # graph for every forward pass.
    with torch.no_grad():
        for source_sample, target_sample in zip(source_sampler, target_sampler):
            # Stack the slices, then transpose so the stacked (batch) axis becomes axis 1.
            data = torch.stack([data_source[i] for i in source_sample]).t_().contiguous().to(device)
            targets = torch.stack([data_source[i] for i in target_sample]).t_().contiguous().view(-1).to(device)

            # No hidden state is passed, so each batch starts from the model's
            # default (hidden=None) state, exactly as before.
            output, _ = model(data)
            total_loss += criterion(output.view(-1, ntokens), targets).item()
    return total_loss / len(source_sampler)

In [20]:
def train(data_source, source_sampler, target_sampler):
    """Run one epoch of plain SGD on the global `model` and print the running loss.

    Uses the notebook globals `model`, `criterion`, `encoder`, `device`,
    `grad_clip`, `lr`, `log_interval` and `epoch` (the last only for logging).

    Args:
        data_source: Encoded token tensor indexed by the slices the samplers yield.
        source_sampler: Yields batches of slices selecting the input characters.
        target_sampler: Yields batches of slices selecting the characters to predict.
    """
    model.train()
    ntokens = encoder.vocab_size
    total_loss = 0.
    batches_since_log = 0
    for batch, (source_sample, target_sample) in enumerate(zip(source_sampler, target_sampler)):
        # Stack the slices, then transpose so the stacked (batch) axis becomes axis 1.
        data = torch.stack([data_source[i] for i in source_sample]).t_().contiguous().to(device)
        targets = torch.stack([data_source[i] for i in target_sample]).t_().contiguous().view(-1).to(device)

        model.zero_grad()
        # No hidden state is passed, so each batch starts from the model's
        # default (hidden=None) state, exactly as before.
        output, _ = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        # Manual SGD step.  Written as a single-tensor add_ because the
        # add_(Number, Tensor) overload is deprecated.
        for p in model.parameters():
            if p.grad is None:
                # Parameter did not take part in this forward pass.
                continue
            p.data.add_(-lr * p.grad.data)

        total_loss += loss.item()
        batches_since_log += 1

        if batch % log_interval == 0 and batch > 0:
            # Divide by the number of batches actually summed: the first
            # window holds log_interval + 1 batches because batch 0 is not logged.
            cur_loss = total_loss / batches_since_log
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(source_sampler), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0.
            batches_since_log = 0

In [21]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [22]:
# Model: 2-layer LSTM with 1500-dim embeddings and hidden state, dropout 0.65,
# and the embedding/decoder weights tied.
ntokens = encoder.vocab_size
print("# tokens: ", ntokens)
model = RNNModel(
    'LSTM',
    ntokens,
    ninp=1500,
    nhid=1500,
    nlayers=2,
    dropout=0.65,
    tie_weights=True,
)
criterion = nn.CrossEntropyLoss()

# Optimisation state read by train() and the training loop.
grad_clip = 0.1        # max gradient norm passed to clip_grad_norm_
lr = 20.               # initial SGD step size; the training loop divides it by 4 on a plateau
best_val_loss = None   # lowest validation loss seen so far (None until the first epoch ends)

# tokens:  111


In [23]:
# Move the model to the device selected earlier (GPU if available, else CPU).
model = model.to(device)

In [56]:
# NOTE: `generate` is defined in a later cell of this notebook.  Run that cell
# first, otherwise the calls below raise NameError on a fresh kernel.
with torch.no_grad():
    print('sample:\n', generate(50), '\n')

for epoch in range(1, 10):
    train(train_data, train_source_sampler, train_target_sampler)          # train
    val_loss = evaluate(val_data, val_source_sampler, val_target_sampler)  # validate
    print('-' * 89)
    print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))
    print('-' * 89)
    # Compare against None explicitly: `not best_val_loss` would also treat
    # a stored loss of 0.0 as "no best value yet".
    if best_val_loss is None or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
    # Print a short sample after every epoch to eyeball progress.
    with torch.no_grad():
        print('sample:\n', generate(50), '\n')


sample:
  x << 0 || x < 2;
}

int test2(int y, int y, int z 

| epoch   1 |   100/  707 batches | lr 1.25 | loss  0.63 | ppl     1.87
| epoch   1 |   200/  707 batches | lr 1.25 | loss  0.63 | ppl     1.88
| epoch   1 |   300/  707 batches | lr 1.25 | loss  0.63 | ppl     1.87
| epoch   1 |   400/  707 batches | lr 1.25 | loss  0.62 | ppl     1.86
| epoch   1 |   500/  707 batches | lr 1.25 | loss  0.62 | ppl     1.86
| epoch   1 |   600/  707 batches | lr 1.25 | loss  0.61 | ppl     1.84
| epoch   1 |   700/  707 batches | lr 1.25 | loss  0.62 | ppl     1.86
-----------------------------------------------------------------------------------------
| end of epoch   1 | valid loss  0.64 | valid ppl     1.90
-----------------------------------------------------------------------------------------
sample:
 index 300).
   For hard-functions produce the line 

| epoch   2 |   100/  707 batches | lr 1.25 | loss  0.63 | ppl     1.87
| epoch   2 |   200/  707 batches | lr 1.25 | loss  0.63 | pp

In [51]:
def generate(n=50, temp=1.):
    """Sample `n` characters from the global `model`, one character at a time.

    Uses the notebook globals `model`, `encoder`, `ntokens` and `device`.
    The model is left in eval mode.

    Args:
        n: Number of characters to generate.
        temp: Sampling temperature.  The logits are divided by it, so values
            below 1 sharpen the distribution and values above 1 flatten it.

    Returns:
        The generated text as a single string of length `n`.

    Raises:
        ValueError: If `temp` is not strictly positive.
    """
    if temp <= 0:
        raise ValueError('temp must be positive, got {!r}'.format(temp))

    model.eval()
    out = []
    # Run without autograd: `hidden` is fed back on every step, so with
    # gradients enabled the graph would keep growing for all `n` steps.
    with torch.no_grad():
        x = torch.rand(1, 1).mul(ntokens).long().to(device)  # random first token
        hidden = model.init_hidden(1)  # zero initial state for batch size 1
        for _ in range(n):
            output, hidden = model(x, hidden)
            # `output` holds the decoder's unnormalised scores (logits) for the
            # single position and single batch entry; flatten to 1-D.
            logits = output.view(-1).div(temp)
            # Subtract the max before exponentiating so exp() cannot overflow.
            # multinomial only needs relative weights, so the distribution is unchanged.
            s_weights = (logits - logits.max()).exp()
            s_idx = int(torch.multinomial(s_weights, 1)[0])
            # Feed the sampled token back in as the next input.
            x.fill_(s_idx)
            out.append(encoder.itos[s_idx])
    return ''.join(out)

In [52]:
# Sample 10,000 characters at three temperatures.
t1 = generate(10000, 1.)
t15 = generate(10000, 1.5)
t075 = generate(10000, 0.75)
# Write explicitly as UTF-8, matching how the corpus was read.  The platform
# default encoding may be unable to represent every generated character.
with open('./generated075-char.txt', 'w', encoding='utf-8') as outf:
    outf.write(t075)
with open('./generated1-char.txt', 'w', encoding='utf-8') as outf:
    outf.write(t1)
with open('./generated15-char.txt', 'w', encoding='utf-8') as outf:
    outf.write(t15)

In [53]:
# First lines of the sample generated with temperature 1.0.
!head generated1-char.txt

return prc3205;
  return 1029;
}

int main(void)
{
  bugger(1);
}
/* { dg-do run { target { powerpc64le-*-* && powerpc_epthr } } } */
/* { dg-do compile } */


In [54]:
# First lines of the sample generated with temperature 1.5.
!head generated15-char.txt

hound int ,(unsigned phaif2)en200040UL_) && (count == 477)
    *((MINVALorO(1) + 1 + 19 & file (!x->ump)))
    ) !> )
	unmined_TYPE__ ilemin[f->i * 8]. 0 = jXlLsuct>tilfbarv4;

	GMjT_hw (t*, sqdo);

 ic > widChCountKreaDROpIrsin1;
  where A_tlr_tag (5mofme);



In [11]:
# First lines of the sample generated with temperature 0.75.
!head generated075-char.txt

ORKAROUND_SPECULATIVE_SYNCS is defined"
#endif

#if __SILICON_REVISION__ != 0x0001
#error "__SILICON_REVISION__ is not 0x0004"
#endif

#ifndef __WORKAROUNDS_ENABLED
#error "__WORKAROUNDS_ENABLED is not defined"
#endif
