## seq2seq char-level generative model 

In [1]:
import os

import itertools
import pickle
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import math 

import sys
sys.path.append('../')
import utils
%matplotlib inline

In [2]:
from os import listdir
from os.path import isfile, join
import random

In [3]:
# Use the first CUDA device when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
#######################################################
# based on https://github.com/IBM/pytorch-seq2seq/

In [5]:
import torchtext
import logging

import seq2seq
from seq2seq.trainer import SupervisedTrainer
from seq2seq.models import EncoderRNN, DecoderRNN, Seq2seq
from seq2seq.loss import Perplexity
from seq2seq.optim import Optimizer
from seq2seq.dataset import SourceField, TargetField
from seq2seq.evaluator import Predictor
from seq2seq.util.checkpoint import Checkpoint



In [6]:
# Log line layout: timestamp, logger name (min width 12), level name (min width 8), message.
LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [7]:
# Prepare dataset
# Sequence-length limit; len_filter and the encoder/decoder constructors read this value.
max_len = 1024
# Character-level tokenisation: list(text) splits a string into its individual characters.
src = SourceField(tokenize=list)
tgt = TargetField(tokenize=list)

In [8]:
# tokenize example: the source field's tokenizer splits the string into single characters
src.tokenize("i think so")

['i', ' ', 't', 'h', 'i', 'n', 'k', ' ', 's', 'o']

In [9]:
def len_filter(example):
    """Return True only when both example.src and example.tgt have at most max_len items."""
    if len(example.src) > max_len:
        return False
    return len(example.tgt) <= max_len

In [10]:
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import random

In [11]:
# CSV files written by format_train_dev_csv and later read back as the train / dev datasets.
train_path = "./df_train.csv"
dev_path = "./df_valid.csv"

In [12]:
def _write_samples_csv(directory, filenames, csv_path):
    """Read the given files and write their contents to csv_path as src/tgt columns.

    Both columns hold the same text for every row. Returns the number of
    samples written.
    """
    samples = []
    for filename in filenames:
        # Bytes that are not valid UTF-8 are dropped, and NUL characters are removed.
        with open(os.path.join(directory, filename), 'rt', encoding='utf-8', errors='ignore') as f:
            samples.append(f.read().replace('\x00', ''))
    pd.DataFrame({'src': samples, 'tgt': samples}).to_csv(csv_path, index=False)
    return len(samples)


def format_train_dev_csv(directory='./sources/', split_ratio=0.95, seed=None,
                         train_csv=None, dev_csv=None):
    """Split the files in `directory` into train/validation CSVs with src and tgt columns.

    Each regular file directly inside `directory` becomes one sample whose
    `src` and `tgt` values are both the file's text.

    Args:
        directory: folder containing the source files (sub-directories are skipped).
        split_ratio: fraction of files assigned to the training set, in [0, 1].
        seed: optional seed for the shuffle. When None the global `random`
            state is used, so the split differs from run to run.
        train_csv: output path for the training CSV; defaults to the
            module-level `train_path`.
        dev_csv: output path for the validation CSV; defaults to the
            module-level `dev_path`.

    Raises:
        ValueError: if `split_ratio` is outside [0, 1].
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError("split_ratio must be within [0, 1], got {!r}".format(split_ratio))
    if train_csv is None:
        train_csv = train_path
    if dev_csv is None:
        dev_csv = dev_path

    # listdir order is arbitrary; sorting first makes a seeded shuffle reproducible.
    sourcefiles = sorted(f for f in listdir(directory) if isfile(join(directory, f)))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(sourcefiles)
    splt = int(len(sourcefiles) * split_ratio)

    print("# train samples:", _write_samples_csv(directory, sourcefiles[:splt], train_csv))
    print("# valid samples:", _write_samples_csv(directory, sourcefiles[splt:], dev_csv))

In [13]:
# Rebuild both CSVs from the default ./sources/ directory; the files are shuffled before splitting.
format_train_dev_csv()

# train samples: 20843
# valid samples: 1098


In [14]:
!head df_train.csv -n 5

src,tgt
"/* { dg-do run } */
/* { dg-require-effective-target avx } */
/* { dg-options ""-O2 -mfpmath=sse -mavx"" } */



In [15]:
def _load_split(csv_path):
    """Build a TabularDataset from one src/tgt CSV, passing len_filter as the filter predicate."""
    return torchtext.data.TabularDataset(
        path=csv_path,
        format='csv',
        fields=[('src', src), ('tgt', tgt)],
        filter_pred=len_filter,
        skip_header=True,
    )


train = _load_split(train_path)
dev = _load_split(dev_path)

# Both vocabularies are built from the training split only.
src.build_vocab(train)
tgt.build_vocab(train)
input_vocab, output_vocab = src.vocab, tgt.vocab

# NOTE: If the source field name and the target field name
# are different from 'src' and 'tgt' respectively, they have
# to be set explicitly before any training or inference
# seq2seq.src_field_name = 'src'
# seq2seq.tgt_field_name = 'tgt'

In [16]:
# Show the first seven tokens of the first training example on the source and target sides.
first_example = train.examples[0]
first_example.src[:7], first_example.tgt[:7]

(['/', '*', ' ', '{', ' ', 'd', 'g'], ['<sos>', '/', '*', ' ', '{', ' ', 'd'])

In [17]:
# Prepare loss
# Index of the padding token in the target vocabulary, passed to Perplexity as its second argument.
pad = tgt.vocab.stoi[tgt.pad_token]
# One weight of 1.0 per target-vocabulary entry.
weight = torch.ones(len(tgt.vocab))
loss = Perplexity(weight, pad)
if torch.cuda.is_available():
    loss.cuda()



In [18]:
# When False a fresh model is constructed below; the flag is also forwarded to the trainer's `resume` argument.
resume = False

In [37]:
seq2seq = None
optimizer = None
if not resume:
    # Initialize model
    hidden_size = 128
    bidirectional = True
    # The decoder gets twice the encoder's hidden size when the encoder is bidirectional.
    decoder_hidden_size = 2 * hidden_size if bidirectional else hidden_size

    encoder = EncoderRNN(
        len(src.vocab), max_len, hidden_size,
        bidirectional=bidirectional,
        variable_lengths=True,
    )
    decoder = DecoderRNN(
        len(tgt.vocab), max_len, decoder_hidden_size,
        dropout_p=0.2,
        use_attention=True,
        bidirectional=bidirectional,
        eos_id=tgt.eos_id,
        sos_id=tgt.sos_id,
    )
    seq2seq = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq.cuda()

    # Overwrite every parameter with samples drawn uniformly from [-0.08, 0.08].
    for param in seq2seq.parameters():
        nn.init.uniform_(param, -0.08, 0.08)

    # Optimizer and learning rate scheduler can be customized by
    # explicitly constructing the objects and pass to the trainer.
    #
    # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
    # scheduler = StepLR(optimizer.optimizer, 1)
    # optimizer.set_scheduler(scheduler)

  "num_layers={}".format(dropout, num_layers))


In [None]:
# train
t = SupervisedTrainer(
    loss=loss,
    batch_size=128,
    checkpoint_every=1000,
    print_every=10,
    expt_dir="./export_dir",
)

# The value returned by t.train replaces seq2seq (which is None when resume is True).
seq2seq = t.train(
    seq2seq,
    train,
    num_epochs=3,
    dev_data=dev,
    optimizer=optimizer,
    teacher_forcing_ratio=0.8,
    resume=resume,
)

2018-10-09 10:54:50,692 seq2seq.trainer.supervised_trainer INFO     Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
), Scheduler: None
2018-10-09 10:55:06,306 seq2seq.trainer.supervised_trainer INFO     Progress: 4%, Train Perplexity: 16.9279
2018-10-09 10:55:11,557 seq2seq.trainer.supervised_trainer INFO     Progress: 6%, Train Perplexity: 9.9591
2018-10-09 10:55:19,035 seq2seq.trainer.supervised_trainer INFO     Progress: 8%, Train Perplexity: 26.2419
2018-10-09 10:55:26,871 seq2seq.trainer.supervised_trainer INFO     Progress: 10%, Train Perplexity: 29.3013
2018-10-09 10:55:36,889 seq2seq.trainer.supervised_trainer INFO     Progress: 12%, Train Perplexity: 34.9097
2018-10-09 10:55:41,646 seq2seq.trainer.supervised_trainer INFO     Progress: 14%, Train Perplexity: 11.9633
2018-10-09 10:55:55,608 seq2seq.trainer.supervised_trainer INFO     Progress: 16%, Train Perplexity: 29.5773
2018-10-09 10:56:01,625 se

In [53]:
# Wrap the model returned by the trainer together with the source and target vocabularies for prediction.
predictor = Predictor(seq2seq, input_vocab, output_vocab)

In [54]:
# Input snippet fed to the predictor. Alternative inputs kept for quick swapping:
# a = "int main()\n{int x;\n}\n"
# seq = train.examples[0].src
a = """
void quickSort(arr[], low, high)
{
    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

        quickSort(arr, low, pi - 1);  // Before pi
        quickSort(arr, pi + 1, high); // After pi
    }
}"""
# Character-level input: one list element per character of the snippet.
seq = list(a)
print(seq[:10], seq[-10:])
print("".join(predictor.predict(seq)))

['\n', 'v', 'o', 'i', 'd', ' ', 'q', 'u', 'i', 'c'] ['p', 'i', '\n', ' ', ' ', ' ', ' ', '}', '\n', '}']




#oid quickSort(arr[], low, high)
{
    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now
           at right place */
        pi = partition(arr, low, high);

    if (low < high)
    {
        /* pi is partitioning index, arr[pi] is now