## Keras char-level generative model 2

In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import math 

import sys
sys.path.append('../')
import utils
%matplotlib inline

In [2]:
# NOTE(review): none of these four settings is referenced by the cells visible in
# this notebook (training passes batch_size=1024 directly and the windowing code
# uses `seq_length`) — presumably leftovers from an earlier version; confirm
# before relying on them.
batch_size = 128
eval_batch_size = 128
sequence_length = 100
log_interval = 100

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [4]:
# We will use torchnlp because it supports character-level encoding along with BPTT batch sampler
from torchnlp.text_encoders import CharacterEncoder

In [28]:
# Load Larger LSTM network and generate text
import sys
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import np_utils

In [6]:
from os import listdir
from os.path import isfile, join
import random

In [7]:
def make_sources_datasets(directory='./sources/', split_ratio=0.95, seed=None):
    """Read every file in ``directory`` into one character list and split it.

    Files are visited in shuffled order and their characters concatenated; the
    combined list is then cut once at ``split_ratio``. The split is therefore
    by character position, not by file, so one file may straddle the boundary.

    Args:
        directory: Folder whose regular files are read (sub-directories are
            skipped, not recursed into).
        split_ratio: Fraction of all characters placed in the first list.
        seed: Optional seed for the file shuffle. When given, the file order
            (and hence the split) is reproducible. When ``None`` the global
            ``random`` state is used, exactly as before.

    Returns:
        Tuple ``(train_chars, valid_chars)`` of lists of one-character strings.
    """
    # Sort first: listdir returns entries in arbitrary order, so a seeded
    # shuffle is only reproducible when it starts from a fixed order.
    sourcefiles = sorted(f for f in listdir(directory) if isfile(join(directory, f)))
    if seed is None:
        random.shuffle(sourcefiles)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(sourcefiles)
    chars = []
    for filename in sourcefiles:
        # errors='ignore' drops undecodable bytes instead of aborting the load.
        with open(join(directory, filename), 'rt', encoding='utf-8', errors='ignore') as f:
            # Iterating a str yields its characters; no intermediate list needed.
            chars.extend(f.read())
    splt = int(len(chars) * split_ratio)
    return chars[:splt], chars[splt:]

In [8]:
train_dataset, valid_dataset = make_sources_datasets()
len(train_dataset), len(valid_dataset)

(9047442, 476182)

In [9]:
# Build the character vocabulary from the train and validation characters
# together, so every character in either split has an index.
encoder = CharacterEncoder(train_dataset + valid_dataset)

In [10]:
# number of unique tokens
encoder.vocab_size

111

In [11]:
# Encode dataset using character-level encoder
# Each split becomes a sequence of integer vocabulary indices (see the tensor
# preview printed a few cells below).
train_data = encoder.encode(train_dataset)
val_data = encoder.encode(valid_dataset)

In [12]:
# prepare the dataset of input to output pairs encoded as integers
# Window length used by the reshape cells that build X_train / X_valid.
seq_length = 100


def prepare_inputs(data, seq_length=100, step=1):
    """Slice ``data`` into fixed-length input windows and next-item targets.

    For every start position ``i`` (advancing by ``step``) the input is
    ``data[i:i + seq_length]`` and the target is the item right after it,
    ``data[i + seq_length]``.

    Args:
        data: Indexable, sliceable sequence (list, string, array, tensor, ...).
        seq_length: Number of items in each input window.
        step: Distance between consecutive window starts. The default of 1
            yields every possible window; larger values thin the samples out.

    Returns:
        Tuple ``(dataX, dataY)`` of equal-length lists. Both are empty when
        ``data`` has no more than ``seq_length`` items.
    """
    # The last usable start is len(data) - seq_length - 1, because the target
    # index i + seq_length must still be inside the data.
    starts = range(0, len(data) - seq_length, step)
    dataX = [data[i:i + seq_length] for i in starts]
    dataY = [data[i + seq_length] for i in starts]
    return dataX, dataY

In [13]:
# Build (window, next character) pairs for both splits with the default
# window length of prepare_inputs.
train_dataX, train_dataY = prepare_inputs(train_data)
valid_dataX, valid_dataY = prepare_inputs(val_data)

In [14]:
train_data[:15]

tensor([ 5,  6,  7,  8,  9, 10, 11, 12, 13,  8, 14, 15,  5,  6,  7])

In [15]:
"".join(train_dataset[1:102])

'ouble fabs(double);\n\nvoid test_coms(void);\n\nextern void abort(void);\n\nstruct {double r, s; } com;    '

In [16]:
"".join([encoder.itos[x] for x in train_dataX[1]]), encoder.itos[train_dataY[1]]

('ouble fabs(double);\n\nvoid test_coms(void);\n\nextern void abort(void);\n\nstruct {double r, s; } com;   ',
 ' ')

In [17]:
# Drop the names of the raw character lists and the encoded sequences; only the
# windowed train_dataX / train_dataY / valid_dataX / valid_dataY are used below.
# Note: the preview cells above that read train_data / train_dataset cannot be
# re-run after this cell without re-running the loading cells first.
del train_data, val_data, train_dataset, valid_dataset

In [18]:
# Number of training windows produced by prepare_inputs.
n_patterns = len(train_dataX)
print("Total train samples: ", n_patterns)
# Stack the windows into one array and add a trailing feature axis so that X
# has the layout [samples, time steps, features].
X_train = np.asarray(
    [np.asarray(window) for window in train_dataX]
).reshape(n_patterns, seq_length, 1)

Total train samples:  9047342


In [19]:
print("".join([encoder.itos[x] for x in X_train[1].squeeze()]))

ouble fabs(double);

void test_coms(void);

extern void abort(void);

struct {double r, s; } com;   


In [20]:
# normalize
# Scale the integer character indices by the vocabulary size.
# NOTE: this rebinds X_train, so the cell is not idempotent — running it a
# second time divides by the vocabulary size again.
X_train = X_train / float(encoder.vocab_size)

In [21]:
# Number of validation windows (n_patterns is reused from the training cell).
n_patterns = len(valid_dataX)
print("Total valid samples: ", n_patterns)
# Same layout as X_train: [samples, time steps, features].
X_valid = np.asarray(
    [np.asarray(window) for window in valid_dataX]
).reshape(n_patterns, seq_length, 1)

Total valid samples:  476082


In [22]:
# normalize
# Same scaling as X_train: integer character indices divided by the vocabulary size.
# NOTE: this rebinds X_valid, so the cell is not idempotent — running it a
# second time divides by the vocabulary size again.
X_valid = X_valid / float(encoder.vocab_size)

In [23]:
# one hot encode the output variable
# Train and validation targets are encoded in a single call and split back
# afterwards. num_classes is pinned to the encoder's vocabulary size so the
# one-hot width (which also sizes the model's output layer) does not depend on
# which character indices happen to occur in the targets.
y = np_utils.to_categorical(train_dataY + valid_dataY, num_classes=encoder.vocab_size)
y_train, y_valid = y[:len(train_dataY)], y[len(train_dataY):]

In [24]:
# Remove the combined name; y_train / y_valid remain available.
# NOTE(review): y_train and y_valid are basic slices of y, which are presumably
# NumPy views sharing y's buffer — if so this frees no memory; confirm.
del y

In [25]:
y_train.shape, y_valid.shape

((9047342, 111), (476082, 111))

In [26]:
# define the LSTM model
# Two stacked 512-unit LSTM layers, each followed by 20% dropout, then a
# softmax layer with one output per one-hot target column.
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(512),
    Dropout(0.2),
    Dense(y_train.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [30]:
# define the checkpoint
# File name template: the epoch number and validation loss are filled in per save.
filepath = "weights-improvement-{epoch:02d}-{val_loss:.4f}-bigger.hdf5"
# Save weights only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Stop training once val_loss has gone 3 epochs without improving.
earlystop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0,
    patience=3,
    baseline=None,
    verbose=0,
)
callbacks_list = [checkpoint, earlystop]

In [31]:
# fit the model
# Up to 20 epochs; the callbacks checkpoint the best weights and may stop early.
model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    batch_size=1024,
    epochs=20,
    shuffle=True,
    callbacks=callbacks_list,
)

Train on 9047342 samples, validate on 476082 samples
Epoch 1/20

Epoch 00001: val_loss improved from inf to 1.21008, saving model to weights-improvement-01-1.2101-bigger.hdf5
Epoch 2/20

Epoch 00002: val_loss improved from 1.21008 to 1.05529, saving model to weights-improvement-02-1.0553-bigger.hdf5
Epoch 3/20

Epoch 00003: val_loss improved from 1.05529 to 1.00055, saving model to weights-improvement-03-1.0006-bigger.hdf5
Epoch 4/20

Epoch 00004: val_loss did not improve from 1.00055
Epoch 5/20
  46080/9047342 [..............................] - ETA: 1:24:23 - loss: 2.9126

KeyboardInterrupt: 

In [32]:
# load the network weights
# Checkpoint written after epoch 3 of the training run above (val_loss 1.00055,
# the lowest value in the log before the run was interrupted).
filename = "weights-improvement-03-1.0006-bigger.hdf5"
model.load_weights(filename)
# NOTE(review): the model is compiled again after loading; presumably not needed
# for predict-only use — confirm before removing.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [68]:
# Seed window for generation: one fixed training window (index 1555).
# Alternatives, currently disabled:
#   * a random training window:
#       start = np.random.randint(0, len(train_dataX) - 1)
#       pattern = train_dataX[start]
#   * a custom prompt, left-padded with spaces to the 100-character window:
#       pattern = "int main(void)\n{"
#       if len(pattern) < 100:
#           pattern = " " * (100 - len(pattern)) + pattern
#       pattern = [encoder.stoi[s] for s in pattern]
pattern = np.asarray(train_dataX[1555])
# Show the last 20 indices and the decoded text of the seed.
print(pattern[-20:])
print("".join([encoder.itos[i] for i in pattern]))

[10 51 21 13 26 36 10 21 11  9 54 65 77 11 30 11 32 31 18 31]
/
/* { dg-options "-O3 -mpower8-vector -Wno-psabi" } */
/* { dg-require-effective-target lp64 } */
/


In [71]:
# generate characters
# Work on a copy of the seed so that `pattern` is left untouched: re-running
# this cell then always starts from the same seed window instead of continuing
# from wherever the previous run stopped.
window = np.array(pattern)
scale = float(encoder.vocab_size)
generated = []
for _ in range(3000):
    # Model input layout is [samples, time steps, features], scaled like X_train.
    x = np.reshape(window, (1, len(window), 1)) / scale
    prediction = model.predict(x, verbose=0)
    index = int(np.argmax(prediction))   # greedy choice; here we do not use temperature
    generated.append(encoder.itos[index])
    # Slide the window: drop the oldest index, append the one just predicted.
    window = np.append(window[1:], index)
# Join once at the end instead of growing a string inside the loop.
text = "".join(generated)
print(text)

ile } */
/* { dg-options "-O2 -mavx512vl" } */
/* { dg-final { scan-assembler-times "vpmovsqd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */

#include <immintrin.h>

volatile __m256i x256;
volatile __mmask8 m;

void extern
avx512vl_test (void)
{
  x = _mm512_mask_ara_attepi32 (x, x, x, x);
  x = _mm512_maskz_mor_epi32 (x, x, x);
  x = _mm512_maskz_mal__epi32 (x, m, x, x);
  x = _mm512_maskz_mal__epi32 (x, x, x, x, x);
  x = _mm512_maskz_mor_epi32 (x, x, x);
}
/* { dg-do compile } */
/* { dg-options "-O2 -mavx512vl" } */
/* { dg-final { scan-assembler-times "vpmovsqd\[ \\t\]+\[^\{\n\]*%zmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */

#include <immintrin.h>

volatile __m256i x256;
volatile __mmask8 m;

void extern
avx512vl_test (void)
{
  x = _mm512_mask_ara_attepi32 (x, x, x, x);
  x = _mm512_maskz_mor_epi32 (x, x, x);
  x = _mm512_maskz_mal__epi32 (x, m, x, x);
  x = _mm512_maskz_mal__epi32 (x, x, x, x, x);
  x = _mm512_maskz_mor_epi32 (x, x, x);
}
/* { dg-d

In [38]:
print(text)

          


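Drawbacks:
* generation is sequential: producing 3000 characters takes 3000 forward passes of the model, one per character (the 100-character seed is not part of the output);
* with greedy (argmax) decoding the model falls into a loop and repeats learned patterns, as the duplicated `avx512vl_test` block in the output above shows.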