### pytorch-nlp word_language_model

In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import math 

import sys
sys.path.append('../')
import utils
%matplotlib inline

In [2]:
from os import listdir
from os.path import isfile, join
import random

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
batch_size = 20
eval_batch_size = 10
sequence_length = 35
log_interval = 100

In [5]:
# We will use torchnlp because it supports word-level encoding along with BPTT batch sampler
from torchnlp.text_encoders import WhitespaceEncoder, StaticTokenizerEncoder
from torchnlp.samplers import BPTTBatchSampler

In [6]:
def make_sources_datasets(directory='./sources/'):
    # Get list of files
    sourcefiles = [f for f in listdir(directory) if isfile(join(directory, f))]
    # shuffle
    random.shuffle(sourcefiles)
    train_dataset = ""
    for filename in sourcefiles:
        with open(os.path.join(directory, filename), 'rt', encoding='utf-8', errors='ignore') as f:
            train_dataset += f.read()
    splt = int(len(train_dataset)*0.95)
    return train_dataset[:splt], train_dataset[splt:]

In [7]:
train_dataset, valid_dataset = make_sources_datasets()

In [8]:
# characters
len(train_dataset), len(valid_dataset)

(9047442, 476182)

In [9]:
encoder = StaticTokenizerEncoder([train_dataset] + [valid_dataset], tokenize=lambda s: s.split(" "))

In [10]:
# number of unique tokens
encoder.vocab_size

131838

In [11]:
# Encode dataset using character-level encoder
train_data = encoder.encode(train_dataset)
val_data = encoder.encode(valid_dataset)

In [12]:
train_data[:15]

tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 12, 16, 17, 18])

In [13]:
# check tokenizer
train_dataset[:100]

'/* { dg-do run } */\n\nstruct A\n{\n  int i;\n};\n\nstruct B\n{\n  struct A a[2];\n};\n\nint i = 1;\nstruct B b ='

In [14]:
" ".join([encoder.itos[idx] for idx in train_data[:15]])

'/* { dg-do run } */\n\nstruct A\n{\n  int i;\n};\n\nstruct B\n{\n  struct A a[2];\n};\n\nint'

In [15]:
# tokens
len(train_data), len(val_data)

(1589408, 83590)

In [16]:
# sample tokens from vocab
print(encoder.vocab[:13])

['<pad>', '<unk>', '</s>', '<s>', '<copy>', '/*', '{', 'dg-do', 'run', '}', '*/\n\nstruct', 'A\n{\n', '']


In [17]:
# Samplers
train_source_sampler, val_source_sampler = tuple(
    [BPTTBatchSampler(d, sequence_length, batch_size, True, 'source') for d in (train_data, val_data)])

train_target_sampler, val_target_sampler = tuple(
    [BPTTBatchSampler(d, sequence_length, batch_size, True, 'target') for d in (train_data, val_data)])

In [18]:
# num of samples in a batch
len(next(iter(train_source_sampler))), next(iter(train_source_sampler))[:3]

(20,
 [slice(0, 35, None), slice(79470, 79505, None), slice(158940, 158975, None)])

In [19]:
# num of batches
len(train_source_sampler)

2271

In [20]:
for source_sample, target_sample in zip(train_source_sampler, train_target_sampler):
    print(torch.stack([train_data[i] for i in source_sample]).t_().contiguous()[:10, :4])
    print(torch.stack([train_data[i] for i in target_sample]).t_().contiguous()[:10, :4])
    break

tensor([[    5, 13898,    24,    12],
        [    6,   393,  1019,    12],
        [    7,    89,   445,    12],
        [    8,    12,  2052, 32481],
        [    9,    86,  1603,   304],
        [   10,    48,  1161, 32482],
        [   11,    12,  4444,    53],
        [   12,    49, 23895, 32483],
        [   13, 13899,  8889,    12],
        [   14,  6867,    20,    40]])
tensor([[    6,   393,  1019,    12],
        [    7,    89,   445,    12],
        [    8,    12,  2052, 32481],
        [    9,    86,  1603,   304],
        [   10,    48,  1161, 32482],
        [   11,    12,  4444,    53],
        [   12,    49, 23895, 32483],
        [   13, 13899,  8889,    12],
        [   14,  6867,    20,    40],
        [   15,    89,  5208,    12]])


In [21]:
# https://github.com/pytorch/examples/blob/master/word_language_model/model.py
class RNNModel(nn.Module):
    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight
        
        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                    weight.new(self.nlayers, bsz, self.nhid).zero_())
        else:
            return weight.new(self.nlayers, bsz, self.nhid).zero_()

In [22]:
def evaluate(data_source, source_sampler, target_sampler):
    model.eval()
    total_loss = 0.
    ntokens = encoder.vocab_size
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for source_sample, target_sample in zip(source_sampler, target_sampler):
            data = torch.stack([data_source[i] for i in source_sample]).t_().contiguous().to(device)                # source chars
            targets = torch.stack([data_source[i] for i in target_sample]).t_().contiguous().view(-1).to(device)    # target chars

            output, hidden = model(data)
            output_flat = output.view(-1, ntokens)
            total_loss += criterion(output_flat, targets).item()
    return total_loss / len(source_sampler)

In [23]:
def train(data_source, source_sampler, target_sampler):
    model.train()
    total_loss = 0
    ntokens = encoder.vocab_size
    hidden = model.init_hidden(batch_size)
    for batch, (source_sample, target_sample) in enumerate(zip(source_sampler, target_sampler)):        
        data = torch.stack([data_source[i] for i in source_sample]).t_().contiguous().to(device)               # source chars
        targets = torch.stack([data_source[i] for i in target_sample]).t_().contiguous().view(-1).to(device)   # target chars
        
        model.zero_grad()
        output, hidden = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(source_sampler), lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [24]:
ntokens = encoder.vocab_size; print("# tokens: ", ntokens)
model = RNNModel('LSTM', ntokens, 1500, 1500, 2, 0.65, True)
criterion = nn.CrossEntropyLoss()

grad_clip = 0.1
lr = 20.
best_val_loss = None

# tokens:  131838


In [25]:
model = model.to(device)

In [None]:
def generate(n=50, temp=1.):
    model.eval() 
    hidden = model.init_hidden(bsz=1)   # init hidden state and cell state with zeros for batch_size = 1
    x = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)   # pass random token
    out = []
    for i in range(n):
        output, hidden = model(x, hidden)
        # output is a Tensor of shape [seq_len, batch_size, vocab_len] of tokens' probabilities
        word_weights = output.squeeze().div(temp).exp().cpu()
        # sample, then take the first element of the array
        word_idx = torch.multinomial(input=word_weights, num_samples=1)[0] 
        x.fill_(word_idx)
        word = encoder.itos[word_idx]
        out.append(word + ('\n' if i % 20 == 19 else ' '))
    return ''.join(out)

In [None]:
with torch.no_grad():
    print('sample:\n', generate(50), '\n')

for epoch in range(1, 50):
    train(train_data, train_source_sampler, train_target_sampler)          # train
    val_loss = evaluate(val_data, val_source_sampler, val_target_sampler)  # validate
    print('-' * 89)
    print('| end of epoch {:3d} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        epoch, val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
    with torch.no_grad():
        print('sample:\n', generate(50), '\n')


sample:
 "vfnmsub\[123\]+sd" bar[j];
}
/* count)	\
{							\
 hfa_union_t, "12345";
char mysin;
double restoring tree-optimization/83338 "mov (pool_v4sf) "pmaxuw" ++out)
 .global\[^,\n\r\]*external_decl" {0x5414}, inc19 run_expensive_tests middle-end/40340 move_epi64 *result_type, T
#define
"nmachhw\\. IACC c[1024];
int mantissa1:32;
} }));	\
 (5)), x9 __attribute__((noinline,noclone))
badfunc(int max_h_samp_factor;
 z(int);
int -mfpu=fpv4-sp-d16" "incssp\[dq]" short,
		 bar(1);
 (one, --i); p3);
}
extern 3824. 20th (bfd_link_executable
must_annul)
 */
{
	big();
	big();
	big();
	big();
	big();
	big();
	big();
	big();
	big();
	big();
}
int __attribute__((malloc,transaction_safe));

vectype c++/38410.
 d++)
 "-march=armv7-a+fp if(a++)
 "xvxsigdp" c);
}

double sq2;
_Sat  

| epoch   1 |   100/ 2271 batches | lr 20.00 | loss  8.62 | ppl  5551.76
| epoch   1 |   200/ 2271 batches | lr 20.00 | loss  6.93 | ppl  1026.55
| epoch   1 |   300/ 2271 batches | lr 20.00 | loss  6.32 | ppl   552.82

| epoch   4 |  1300/ 2271 batches | lr 20.00 | loss  3.47 | ppl    32.10
| epoch   4 |  1400/ 2271 batches | lr 20.00 | loss  3.32 | ppl    27.61
| epoch   4 |  1500/ 2271 batches | lr 20.00 | loss  3.35 | ppl    28.40
| epoch   4 |  1600/ 2271 batches | lr 20.00 | loss  3.43 | ppl    30.84
| epoch   4 |  1700/ 2271 batches | lr 20.00 | loss  3.42 | ppl    30.48
| epoch   4 |  1800/ 2271 batches | lr 20.00 | loss  3.36 | ppl    28.90
| epoch   4 |  1900/ 2271 batches | lr 20.00 | loss  3.38 | ppl    29.35
| epoch   4 |  2000/ 2271 batches | lr 20.00 | loss  3.39 | ppl    29.74
| epoch   4 |  2100/ 2271 batches | lr 20.00 | loss  3.39 | ppl    29.73
| epoch   4 |  2200/ 2271 batches | lr 20.00 | loss  3.29 | ppl    26.91
-----------------------------------------------------------------------------------------
| end of epoch   4 | valid loss  3.33 | valid ppl    27.93
-----------------------------------------------------------------------------------------
sample:
 } */

extern void bar 

| epoch   8 |   900/ 2271 batches | lr 20.00 | loss  2.80 | ppl    16.52
| epoch   8 |  1000/ 2271 batches | lr 20.00 | loss  2.80 | ppl    16.42
| epoch   8 |  1100/ 2271 batches | lr 20.00 | loss  2.80 | ppl    16.37
| epoch   8 |  1200/ 2271 batches | lr 20.00 | loss  2.81 | ppl    16.65
| epoch   8 |  1300/ 2271 batches | lr 20.00 | loss  2.84 | ppl    17.18
| epoch   8 |  1400/ 2271 batches | lr 20.00 | loss  2.70 | ppl    14.91
| epoch   8 |  1500/ 2271 batches | lr 20.00 | loss  2.76 | ppl    15.74
| epoch   8 |  1600/ 2271 batches | lr 20.00 | loss  2.81 | ppl    16.55
| epoch   8 |  1700/ 2271 batches | lr 20.00 | loss  2.81 | ppl    16.62
| epoch   8 |  1800/ 2271 batches | lr 20.00 | loss  2.79 | ppl    16.20
| epoch   8 |  1900/ 2271 batches | lr 20.00 | loss  2.82 | ppl    16.78
| epoch   8 |  2000/ 2271 batches | lr 20.00 | loss  2.80 | ppl    16.43
| epoch   8 |  2100/ 2271 batches | lr 20.00 | loss  2.81 | ppl    16.57
| epoch   8 |  2200/ 2271 batches | lr 20.00 | loss

| epoch  12 |   100/ 2271 batches | lr 20.00 | loss  2.43 | ppl    11.41
| epoch  12 |   200/ 2271 batches | lr 20.00 | loss  2.41 | ppl    11.10
| epoch  12 |   300/ 2271 batches | lr 20.00 | loss  2.38 | ppl    10.86
| epoch  12 |   400/ 2271 batches | lr 20.00 | loss  2.41 | ppl    11.16
| epoch  12 |   500/ 2271 batches | lr 20.00 | loss  2.36 | ppl    10.63
| epoch  12 |   600/ 2271 batches | lr 20.00 | loss  2.39 | ppl    10.87
| epoch  12 |   700/ 2271 batches | lr 20.00 | loss  2.38 | ppl    10.85
| epoch  12 |   800/ 2271 batches | lr 20.00 | loss  2.40 | ppl    11.06
| epoch  12 |   900/ 2271 batches | lr 20.00 | loss  2.43 | ppl    11.33
| epoch  12 |  1000/ 2271 batches | lr 20.00 | loss  2.41 | ppl    11.11
| epoch  12 |  1100/ 2271 batches | lr 20.00 | loss  2.41 | ppl    11.18
| epoch  12 |  1200/ 2271 batches | lr 20.00 | loss  2.43 | ppl    11.32
| epoch  12 |  1300/ 2271 batches | lr 20.00 | loss  2.44 | ppl    11.51
| epoch  12 |  1400/ 2271 batches | lr 20.00 | loss

| epoch  15 |  2200/ 2271 batches | lr 20.00 | loss  2.13 | ppl     8.41
-----------------------------------------------------------------------------------------
| end of epoch  15 | valid loss  2.86 | valid ppl    17.38
-----------------------------------------------------------------------------------------
sample:
 

extern int int32_t;
typedef int f(vector unsigned int q(void) {
   }
}
f(a){a=(1,1)/2;}
/* { dg-do run } */
/* { dg-options "-mpreferred-stack-boundary=4
-fno-sanitize-recover=signed-integer-overflow" } */

int
main (void)
{
  long a = -42;
  if b < a)
    {
  
   int d = 3;
     

| epoch  16 |   100/ 2271 batches | lr 20.00 | loss  2.13 | ppl     8.40
| epoch  16 |   200/ 2271 batches | lr 20.00 | loss  2.10 | ppl     8.13
| epoch  16 |   300/ 2271 batches | lr 20.00 | loss  2.09 | ppl     8.09
| epoch  16 |   400/ 2271 batches | lr 20.00 | loss  2.11 | ppl     8.26
| epoch  16 |   500/ 2271 batches | lr 20.00 | loss  2.08 | ppl     7.98
| epoch  16 |   600/ 2271 bat

| epoch  19 |  1700/ 2271 batches | lr 20.00 | loss  1.94 | ppl     6.93
| epoch  19 |  1800/ 2271 batches | lr 20.00 | loss  1.96 | ppl     7.08
| epoch  19 |  1900/ 2271 batches | lr 20.00 | loss  1.97 | ppl     7.15
| epoch  19 |  2000/ 2271 batches | lr 20.00 | loss  1.96 | ppl     7.11
| epoch  19 |  2100/ 2271 batches | lr 20.00 | loss  1.95 | ppl     7.06
| epoch  19 |  2200/ 2271 batches | lr 20.00 | loss  1.89 | ppl     6.60
-----------------------------------------------------------------------------------------
| end of epoch  19 | valid loss  2.85 | valid ppl    17.21
-----------------------------------------------------------------------------------------
sample:
 logging function addition, signed uint32_t;
#endif

#define s;
}
#include Insure abc D	1.4426950408889634074

#define executable addressing "march" */

/* { dg-do compile } */

struct S {
int a; char h; };
struct S { struct S *i; };
void bar (struct S *);
extern void void
baz (struct S *);
static
int p5.p_x = GAP

In [None]:
t1 = generate(10000, 1.)
t15 = generate(10000, 1.5)
t075 = generate(10000, 0.75)
with open('./generated075-word.txt', 'w') as outf:
    outf.write(t075)
with open('./generated1-word.txt', 'w') as outf:
    outf.write(t1)
with open('./generated15-word.txt', 'w') as outf:
    outf.write(t15)

In [39]:
!ls -lsh generated1-word.txt
!head generated1-word.txt

60K -rw-rw-r-- 1 evgeny evgeny 57K окт  9 01:02 generated1-word.txt
} } */

/* Don't emulated added to token the immediate"  */
union u { float c; int b; };

int v;

void
foo (struct st x, struct T y, struct R z)
{


In [42]:
!ls -lsh generated15-word.txt
!head generated15-word.txt

96K -rw-rw-r-- 1 evgeny evgeny 95K окт  9 01:02 generated15-word.txt
(int \\t\]+\[^\{\n\]*xmm\[0-9\]\[^\n\]*xmm\[0-9\]\[^\n\]*{%k\[1-7\]}(?:\n|\[ = (unsigned long exit(int);

unsigned int * num_comp,
  "-Werror=error PR77687: Check straight-line strength reduction for a candidate with
a basis
   hidden by a phi dependence one
 if executable defined_inside_sys_hdr   -fdirectives-only. We combine source1.i[1] of
y:4;
  g_31;
short "%a0", _mm_cmpeq_sd (__builtin_constant_p (2)));


In [41]:
!head generated075-word.txt

(int *d, int *src, int len)
{
  int i;
  for (i = 0; i < size; i++)
  
 dst[i] = __builtin_fma (a[i], b[i]);

  /* check results:  */
  for (i = 0; i < N;
i++)
