In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.nn.init as init
from torch.autograd import Variable
from torch.nn import Module, Conv1d, Linear, ReLU, Sigmoid, Sequential, Flatten, Unflatten, BatchNorm1d

import numpy as np
import pdb
import pandas as pd
import math
import itertools
import sys
from math import ceil
from json import dumps
from itertools import islice
from tqdm import tqdm
from pathlib import Path

In [None]:
# https://github.com/suragnair/seqGAN
class Generator(nn.Module):
    """GRU language model used as the SeqGAN generator (and as the random "oracle").

    Tokens are embedded, pushed through a single-layer GRU one time step at a
    time, and projected to log-probabilities over the vocabulary.

    Args:
        embedding_dim: size of the token embedding vectors.
        hidden_dim: size of the GRU hidden state.
        vocab_size: number of distinct token ids.
        max_seq_len: length of every sequence produced by ``sample``.
        gpu: if truthy, tensors created by this class are moved to CUDA.
        oracle_init: if truthy, every parameter is re-drawn from N(0, 1).
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, gpu=False, oracle_init=False):
        super(Generator, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.gpu = gpu
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim)
        self.gru2out = nn.Linear(hidden_dim, vocab_size)

        # initialise oracle network with N(0,1)
        # otherwise variance of initialisation is very small => high NLL for data sampled from the same model
        if oracle_init:
            for p in self.parameters():
                # in-place initialiser; the un-suffixed `init.normal` is a deprecated alias
                init.normal_(p, 0, 1)

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state of shape 1 x batch_size x hidden_dim."""
        h = torch.zeros(1, batch_size, self.hidden_dim)
        return h.cuda() if self.gpu else h

    def forward(self, inp, hidden):
        """Embed one token per sequence and advance the GRU by one step (seq_len = 1).

        Args:
            inp: token ids, shape batch_size.
            hidden: GRU hidden state, shape 1 x batch_size x hidden_dim.

        Returns:
            (out, hidden): log-probabilities of shape batch_size x vocab_size and
            the updated hidden state.
        """
        # input dim                                             # batch_size
        emb = self.embeddings(inp)                              # batch_size x embedding_dim
        emb = emb.view(1, -1, self.embedding_dim)               # 1 x batch_size x embedding_dim
        out, hidden = self.gru(emb, hidden)                     # 1 x batch_size x hidden_dim (out)
        out = self.gru2out(out.view(-1, self.hidden_dim))       # batch_size x vocab_size
        out = F.log_softmax(out, dim=1)
        return out, hidden

    def sample(self, num_samples, start_letter=0):
        """Sample ``num_samples`` sequences of length ``max_seq_len`` from the model.

        Returns:
            LongTensor of shape num_samples x max_seq_len (one sampled sequence per row).
        """
        samples = torch.zeros(num_samples, self.max_seq_len, dtype=torch.long)
        h = self.init_hidden(num_samples)
        inp = torch.full((num_samples,), start_letter, dtype=torch.long)

        if self.gpu:
            samples = samples.cuda()
            inp = inp.cuda()

        # Sampled token ids carry no gradient, so there is no reason to record a graph.
        with torch.no_grad():
            for i in range(self.max_seq_len):
                out, h = self.forward(inp, h)               # out: num_samples x vocab_size
                out = torch.multinomial(torch.exp(out), 1)  # num_samples x 1 (sampling from each row)
                inp = out.view(-1)                          # sampled token is the next input
                samples[:, i] = inp

        return samples

    def batchNLLLoss(self, inp, target):
        """Negative log-likelihood of ``target`` under the model (teacher forcing).

        Args:
            inp: batch_size x seq_len; ``target`` with the start letter prepended.
            target: batch_size x seq_len.

        Returns:
            Scalar tensor: the batch-mean NLL of each time step, summed over seq_len.
        """
        loss_fn = nn.NLLLoss()
        batch_size, seq_len = inp.size()
        inp = inp.permute(1, 0)           # seq_len x batch_size
        target = target.permute(1, 0)     # seq_len x batch_size
        h = self.init_hidden(batch_size)

        loss = 0
        for i in range(seq_len):
            out, h = self.forward(inp[i], h)
            loss += loss_fn(out, target[i])

        return loss     # per batch

    def batchPGLoss(self, inp, target, reward):
        """Pseudo-loss whose gradient is the policy gradient (REINFORCE).

        Inspired by the example in http://karpathy.github.io/2016/05/31/rl/

        Args:
            inp: batch_size x seq_len; ``target`` with the start letter prepended.
            target: batch_size x seq_len.
            reward: batch_size; discriminator reward of each sentence, applied to
                every token of that sentence.

        Returns:
            Scalar tensor: sum over all tokens of ``-log P(y_t | y_<t) * reward``.
        """
        batch_size, seq_len = inp.size()
        inp = inp.permute(1, 0)          # seq_len x batch_size
        target = target.permute(1, 0)    # seq_len x batch_size
        h = self.init_hidden(batch_size)

        loss = 0
        for i in range(seq_len):
            out, h = self.forward(inp[i], h)
            # TODO: should h be detached from graph (.detach())?
            # log-probability of the token actually taken, for the whole batch at once
            taken = out.gather(1, target[i].unsqueeze(1)).squeeze(1)   # batch_size
            loss += -(taken * reward).sum()     # log(P(y_t|Y_1:Y_{t-1})) * Q

        return loss

In [None]:
class Discriminator(nn.Module):
    """Bidirectional two-layer GRU classifier that scores a sequence in [0, 1].

    The final hidden states of all four GRU directions/layers are concatenated,
    passed through Linear -> tanh -> dropout -> Linear -> sigmoid.

    Args:
        embedding_dim: size of the token embedding vectors.
        hidden_dim: size of each GRU hidden state.
        vocab_size: number of distinct token ids.
        max_seq_len: stored on the instance; not used by the methods below.
        gpu: if truthy, the initial hidden state is created on CUDA.
        dropout: dropout probability for the GRU and the linear head.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, gpu=False, dropout=0.2):
        super(Discriminator, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len
        self.gpu = gpu
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=dropout)
        self.gru2hidden = nn.Linear(2*2*hidden_dim, hidden_dim)
        self.dropout_linear = nn.Dropout(p=dropout)
        self.hidden2out = nn.Linear(hidden_dim, 1)

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape 4 x batch_size x hidden_dim (2 layers x 2 directions)."""
        h = torch.zeros(2*2*1, batch_size, self.hidden_dim)
        return h.cuda() if self.gpu else h

    def forward(self, input, hidden):
        """Score a batch of sequences.

        Args:
            input: token ids, batch_size x seq_len.
            hidden: initial GRU state, 4 x batch_size x hidden_dim.

        Returns:
            Tensor of shape batch_size x 1 with values in [0, 1].
        """
        # input dim                                                # batch_size x seq_len
        emb = self.embeddings(input)                               # batch_size x seq_len x embedding_dim
        emb = emb.permute(1, 0, 2)                                 # seq_len x batch_size x embedding_dim
        _, hidden = self.gru(emb, hidden)                          # 4 x batch_size x hidden_dim
        hidden = hidden.permute(1, 0, 2).contiguous()              # batch_size x 4 x hidden_dim
        out = self.gru2hidden(hidden.view(-1, 4*self.hidden_dim))  # batch_size x hidden_dim
        out = torch.tanh(out)
        out = self.dropout_linear(out)
        out = self.hidden2out(out)                                 # batch_size x 1
        out = torch.sigmoid(out)
        return out

    def batchClassify(self, inp):
        """Classify a batch of sequences.

        Args:
            inp: batch_size x seq_len.

        Returns:
            Tensor of shape batch_size with a [0, 1] score per sequence.
        """
        h = self.init_hidden(inp.size()[0])
        out = self.forward(inp, h)
        return out.view(-1)

    def batchBCELoss(self, inp, target):
        """Binary cross-entropy loss of the discriminator on one batch.

        Args:
            inp: batch_size x seq_len.
            target: batch_size (binary 1/0, float).

        Returns:
            Scalar loss tensor.
        """
        loss_fn = nn.BCELoss()
        # Use the flattened scores: forward() returns batch_size x 1, which does not
        # match the batch_size-shaped target and makes BCELoss reject the pair.
        out = self.batchClassify(inp)
        return loss_fn(out, target)

In [None]:
# HELPER #
def prepare_generator_batch(samples, start_letter=0, gpu=False):
    """Build the (input, target) pair for teacher-forced generator training.

    Inputs: samples, start_letter, gpu
        - samples: batch_size x seq_len (Tensor with a sample in each row)
        - start_letter: token id written into the first input column
        - gpu: if truthy both outputs are moved to CUDA, otherwise to the CPU
    Returns: inp, target
        - inp: batch_size x seq_len LongTensor (target shifted right by one,
               with start_letter in column 0)
        - target: batch_size x seq_len LongTensor (same values as samples)
    """
    target = samples.long()

    # Build the shifted input directly as int64. Going through a float32 buffer
    # silently rounds token ids above 2**24.
    inp = torch.empty_like(target)
    inp[:, 0] = start_letter
    inp[:, 1:] = target[:, :-1]

    if gpu:
        return inp.cuda(), target.cuda()
    return inp.cpu(), target.cpu()

def prepare_discriminator_data(pos_samples, neg_samples, gpu=False):
    """Stack real and generated sequences into one shuffled, labelled batch.

    Inputs: pos_samples, neg_samples, gpu
        - pos_samples: pos_size x seq_len (labelled 1)
        - neg_samples: neg_size x seq_len (labelled 0)
        - gpu: if truthy both outputs are moved to CUDA
    Returns: inp, target
        - inp: (pos_size + neg_size) x seq_len LongTensor
        - target: pos_size + neg_size float tensor of 1/0 labels, in the same
                  (shuffled) row order as inp
    """
    n_pos = pos_samples.size()[0]
    n_neg = neg_samples.size()[0]

    stacked = torch.cat((pos_samples, neg_samples), 0).type(torch.LongTensor)
    labels = torch.ones(n_pos + n_neg)
    labels[n_pos:] = 0

    # one shared permutation keeps every row aligned with its label
    order = torch.randperm(labels.size()[0])
    target = Variable(labels[order])
    inp = Variable(stacked[order])

    if gpu:
        return inp.cuda(), target.cuda()
    return inp, target

def batchwise_sample(gen, num_samples, batch_size):
    """Draw num_samples sequences from gen, batch_size at a time.

    gen.sample(batch_size) is called ceil(num_samples / batch_size) times and the
    concatenated result is truncated to num_samples rows.
    No gpu argument is needed because gen.sample() handles device placement.
    """
    n_calls = int(ceil(num_samples / float(batch_size)))
    chunks = [gen.sample(batch_size) for _ in range(n_calls)]
    return torch.cat(chunks, 0)[:num_samples]

def batchwise_oracle_nll(gen, oracle, num_samples, batch_size, max_seq_len, start_letter=0, gpu=False):
    """Average per-token NLL, under ``oracle``, of sequences sampled from ``gen``.

    Args:
        gen: model to sample from (must provide ``sample``).
        oracle: model that scores the samples (must provide ``batchNLLLoss``).
        num_samples: number of sequences to sample and score.
        batch_size: number of sequences scored per call.
        max_seq_len: sequence length used to turn the per-sequence loss into a per-token loss.
        start_letter: token id prepended to every input sequence.
        gpu: forwarded to ``prepare_generator_batch``.

    Returns:
        Python float: sample-weighted mean of the per-batch NLL.
    """
    s = batchwise_sample(gen, num_samples, batch_size)
    weighted_nll = 0.0
    # Pure evaluation: no graph is needed.
    with torch.no_grad():
        for i in range(0, num_samples, batch_size):
            batch = s[i:i + batch_size]
            inp, target = prepare_generator_batch(batch, start_letter, gpu)
            oracle_loss = oracle.batchNLLLoss(inp, target) / max_seq_len
            # batchNLLLoss is a batch mean, so weight it by the batch size; dividing
            # the plain sum by num_samples/batch_size is wrong when the last batch is partial.
            weighted_nll += oracle_loss.item() * batch.size(0)

    return weighted_nll / num_samples

In [None]:
import transforms as tf
import generators as gens
from generators import from_free_group, from_normal_closure, uniform_hyperbolic_length
from free_group import is_from_normal_closure

In [None]:
def word_as_str(word):
    """Render a word (sequence of non-zero signed ints) as a string of letters.

    Factor k > 0 becomes the k-th letter of "xyzpqrstuvwklme"; a negative factor
    becomes the same letter followed by "⁻¹". Because of Python's negative
    indexing a factor of 0 is rendered as the last letter, "e".
    """
    letters = "xyzpqrstuvwklme"
    pieces = []
    for factor in word:
        symbol = letters[abs(factor) - 1]
        if factor < 0:
            symbol += "⁻¹"
        pieces.append(symbol)
    return "".join(pieces)

def print_words(generated_words, number_examples, bert):
    """Pretty-print the first ``number_examples`` generated words (best effort).

    Args:
        generated_words: sliceable collection of generated token sequences.
        number_examples: how many leading entries to print.
        bert: 1 -> decode each entry with the global ``tokenizer`` and read every
              character as an int; 0 -> use ``generated_word.tolist()``.
              Any other value prints nothing.

    Each token is looked up in the global ``words`` list (when it is non-empty) and
    the result is printed through ``word_as_str``. Entries that cannot be decoded
    are skipped silently.
    """
    for generated_word in generated_words[:number_examples]:
        try:
            if bert == 1:
                output = tokenizer.decode(generated_word, skip_special_tokens=1)
                output = list(map(int, output))
            elif bert == 0:
                output = generated_word.tolist()
            else:
                # unsupported mode: nothing to decode (previously a swallowed NameError)
                continue

            # A sequence made only of token 8 is printed raw.
            # NOTE(review): 8 looks like a padding/special id — confirm.
            if output.count(8) == len(output):
                print(output)
                continue
            print(word_as_str(list(map(int, [words[x] if words else x for x in output]))))
        except Exception:
            # Best-effort display: skip words that fail to decode, but let
            # KeyboardInterrupt / SystemExit through (a bare `except:` would not).
            continue

In [None]:
# Build the training set: (10 ** 4) * 100 = 1,000,000 generated words, each stored as
# {'data': ..., 'label': ...}.
# NOTE: generate_dataset, generator, transform_word and transform_label are defined in
# the LAST cell of this notebook; that cell must be executed before this one.
data_train = generate_dataset(generator, (10 ** 4)*100, transform_word, transform_label)

In [None]:
import json

# Load a pre-generated dataset from disk.
# NOTE(review): `data` is not read by any cell visible in this notebook (the cells
# below use `data_train`), and the absolute Colab path only exists on that runtime.
# The context manager closes the handle even if json.load raises.
with open('/content/_x_^F(3)-100.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Keep only the examples whose first label entry is 1, each as a list of string tokens.
sentenses = [
    list(map(str, element['data']))
    for element in data_train
    if element['label'][0] == 1
]

In [None]:
# Vocabulary: every distinct token that occurs in the kept sentences.
merged = list(itertools.chain.from_iterable(sentenses))
# collection of unique elements, in a deterministic order: iterating a set of strings
# changes between kernel runs, which would silently renumber the token ids.
# Tokens are str(int) values, so sort them numerically.
words = sorted(set(merged), key=int)
word2idx = {word: i for i, word in enumerate(words)}

# Each sentence as a list of vocabulary indices.
token_stream_data = [[word2idx[word] for word in sentence] for sentence in sentenses]

In [None]:
def tokenization(token_data):
    """Concatenate the str() of every item of each sequence into one string.

    Returns a list with one joined string per input sequence, e.g.
    [[1, 2], [3]] -> ["12", "3"].
    """
    return ["".join(map(str, xs)) for xs in token_data]

In [None]:
# Join every index sequence into a single string and pass the strings to `tokenizer`.
# NOTE(review): `tokenizer` is not defined in any cell visible here — presumably a
# HuggingFace-style tokenizer created elsewhere; define it before Restart & Run All.
# `train_texts_tokens` is not used by the visible cells below.
train_texts_ = tokenization(token_stream_data)
train_texts_tokens = tokenizer(train_texts_, truncation=True, padding=True)
#train_texts_tokens = tokenizer(train_texts_, padding=True)

In [None]:
# Real training sequences as a tensor of vocabulary indices (one word per row).
# All rows of token_stream_data must have the same length for torch.tensor to accept them.
algebraic_words_ = torch.tensor(token_stream_data, dtype=torch.int32)
# int64 copy: embedding lookups and NLLLoss targets are fed from this tensor
algebraic_words = algebraic_words_.long()

In [None]:
def check_in_closer(generated_words, bert = 0):
    """Count the generated words accepted by ``is_from_normal_closure([1], word)``.

    Args:
        generated_words: iterable of generated token sequences.
        bert: 1 -> decode each entry with the global ``tokenizer`` and read every
              character as an int; 0 -> use ``generated_word.tolist()``.

    Returns:
        int: number of words for which the closure test is truthy. Words that fail
        to decode or to be checked print 'error' and are not counted.
    """
    accepted = 0
    for generated_word in generated_words:
        try:
            if bert == 1:
                output = tokenizer.decode(generated_word, skip_special_tokens=1)
            elif bert == 0:
                output = generated_word.tolist()
            else:
                raise ValueError("bert must be 0 or 1")
            token_ids = list(map(int, output))

            # The model emits vocabulary indices; translate them back to the signed
            # letters stored in the global `words` list, exactly as print_words does,
            # before testing membership.
            # NOTE(review): the original passed the raw indices — confirm that
            # is_from_normal_closure expects signed letters.
            word = [int(words[t]) if words else t for t in token_ids]

            if is_from_normal_closure([1], word):# and (output.count(8) != len(output)):
                accepted += 1
        except Exception:
            # report and skip, but do not swallow KeyboardInterrupt / SystemExit
            print('error')
    return accepted#*100/len(generated_words)

In [None]:
def word_as_str(word):
    """Render a word (sequence of signed ints) with letters, marking inverses with "⁻¹".

    NOTE: this is a second, identical definition of word_as_str; it shadows the one
    from the earlier cell.
    """
    letters = "xyzpqrstuvwklme"
    return "".join(
        letters[abs(factor) - 1] + ("⁻¹" if factor < 0 else "")
        for factor in word
    )

def train_generator_MLE(gen, gen_opt, oracle, real_data_samples, epochs):
    """Maximum-likelihood pre-training of the generator.

    Every epoch walks over the first POS_NEG_SAMPLES rows of real_data_samples in
    chunks of BATCH_SIZE, then prints the average training NLL, the NLL that
    `oracle` assigns to fresh generator samples, the check_in_closer count for
    10000 samples, and three example words.
    Reads the globals POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN, START_LETTER, CUDA.
    """
    n_batches = ceil(POS_NEG_SAMPLES / float(BATCH_SIZE))
    dot_every = ceil(n_batches / 10.)   # roughly every 10% of an epoch

    for epoch in range(epochs):
        print('epoch %d : ' % (epoch + 1), end='')
        sys.stdout.flush()
        running_loss = 0

        for start in range(0, POS_NEG_SAMPLES, BATCH_SIZE):
            batch = real_data_samples[start:start + BATCH_SIZE]
            inp, target = prepare_generator_batch(batch, start_letter=START_LETTER, gpu=CUDA)

            gen_opt.zero_grad()
            loss = gen.batchNLLLoss(inp, target)
            loss.backward()
            gen_opt.step()
            running_loss += loss.data.item()

            if (start / BATCH_SIZE) % dot_every == 0:
                print('.', end='')
                sys.stdout.flush()

        # each batch loss is already a per-sample mean; normalise to per token
        total_loss = running_loss / n_batches / MAX_SEQ_LEN

        # sample from generator and compute oracle NLL
        oracle_loss = batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
                                           start_letter=START_LETTER, gpu=CUDA)

        print(' average_train_NLL = %.4f, oracle_sample_NLL = %.4f' % (total_loss, oracle_loss))
        generated_words = gen.sample(10000)
        print(check_in_closer(generated_words, 0))
        print_words(generated_words, 3, 0)

def train_generator_PG(gen, gen_opt, oracle, dis, num_batches):
    """Train the generator with policy gradients, using the discriminator score as reward.

    Args:
        gen: generator being trained.
        gen_opt: optimiser over the generator parameters.
        oracle: model used only to report the oracle NLL of generator samples.
        dis: discriminator providing one reward per sampled sequence.
        num_batches: number of policy-gradient updates to perform.

    After the updates it prints the oracle NLL, the check_in_closer count for
    10000 samples and three example words.
    Reads the globals BATCH_SIZE, POS_NEG_SAMPLES, MAX_SEQ_LEN, START_LETTER, CUDA.
    """
    for batch in range(num_batches):
        s = gen.sample(BATCH_SIZE*2)        # 64 works best
        inp, target = prepare_generator_batch(s, start_letter=START_LETTER, gpu=CUDA)

        # The reward is a constant in the REINFORCE objective. Computing it without
        # grad tracking keeps backward() out of the discriminator, so no stale
        # gradients are left on its parameters and no extra graph is held in memory.
        with torch.no_grad():
            rewards = dis.batchClassify(target)

        gen_opt.zero_grad()
        pg_loss = gen.batchPGLoss(inp, target, rewards)
        pg_loss.backward()
        gen_opt.step()

    # sample from generator and compute oracle NLL
    oracle_loss = batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
                                       start_letter=START_LETTER, gpu=CUDA)
    print(' oracle_sample_NLL = %.4f' % oracle_loss)
    generated_words = gen.sample(10000)
    print(check_in_closer(generated_words, bert = 0))
    print_words(generated_words, 3, 0)

def train_discriminator(discriminator, dis_opt, real_data_samples, generator, oracle, d_steps, epochs):
    """Train the discriminator on real (positive) and generated (negative) sequences.

    Fresh negatives are sampled d_steps times; for each draw the discriminator is
    trained for `epochs` epochs. Progress, mean loss, training accuracy and
    validation accuracy are printed for every epoch.

    Args:
        discriminator: model exposing ``batchClassify``.
        dis_opt: optimiser over the discriminator parameters.
        real_data_samples: n_real x seq_len tensor of positive examples.
        generator: model exposing ``sample``; source of negatives.
        oracle: model exposing ``sample``; source of the validation positives.
        d_steps: number of times new negatives are drawn.
        epochs: epochs of training per draw.

    Reads the globals POS_NEG_SAMPLES, BATCH_SIZE and CUDA.
    """
    # generating a small validation set before training (using oracle and generator)
    # NOTE(review): validation positives come from `oracle.sample`, not from
    # real_data_samples — confirm this is the intended notion of "real" here.
    pos_val = oracle.sample(100)
    neg_val = generator.sample(100)
    val_inp, val_target = prepare_discriminator_data(pos_val, neg_val, gpu=CUDA)

    loss_fn = nn.BCELoss()
    n_real = real_data_samples.size(0)

    for d_step in range(d_steps):
        s = batchwise_sample(generator, POS_NEG_SAMPLES, BATCH_SIZE)

        # Keep the classes balanced: with more than POS_NEG_SAMPLES real rows, the
        # fixed-length loop below would otherwise see an arbitrary, mostly-positive
        # slice of the shuffled data.
        if n_real > POS_NEG_SAMPLES:
            keep = torch.randperm(n_real, device=real_data_samples.device)[:POS_NEG_SAMPLES]
            pos = real_data_samples[keep]
        else:
            pos = real_data_samples

        dis_inp, dis_target = prepare_discriminator_data(pos, s, gpu=CUDA)
        n_train = dis_inp.size(0)       # equals 2 * POS_NEG_SAMPLES in the balanced case
        n_batches = ceil(n_train / float(BATCH_SIZE))
        dot_every = ceil(n_batches / 10.)   # roughly every 10% of an epoch

        for epoch in range(epochs):
            print('d-step %d epoch %d : ' % (d_step + 1, epoch + 1), end='')
            sys.stdout.flush()
            total_loss = 0
            total_acc = 0

            for i in range(0, n_train, BATCH_SIZE):
                inp, target = dis_inp[i:i + BATCH_SIZE], dis_target[i:i + BATCH_SIZE]
                dis_opt.zero_grad()
                out = discriminator.batchClassify(inp)
                loss = loss_fn(out, target)
                loss.backward()
                dis_opt.step()

                total_loss += loss.item()
                total_acc += torch.sum((out>0.5)==(target>0.5)).item()

                if (i / BATCH_SIZE) % dot_every == 0:
                    print('.', end='')
                    sys.stdout.flush()

            total_loss /= n_batches
            total_acc /= float(n_train)

            # Validate with dropout disabled and without building a graph, then
            # restore whatever mode the model was in.
            was_training = discriminator.training
            discriminator.eval()
            with torch.no_grad():
                val_pred = discriminator.batchClassify(val_inp)
            discriminator.train(was_training)
            val_acc = torch.sum((val_pred>0.5)==(val_target>0.5)).item() / float(val_target.numel())

            print(' average_loss = %.4f, train_acc = %.4f, val_acc = %.4f' % (
                total_loss, total_acc, val_acc))

In [None]:
# --- Configuration -----------------------------------------------------------
# Use the GPU when one is present; a hard-coded 1 crashes on CPU-only machines.
CUDA = torch.cuda.is_available()
VOCAB_SIZE = 7 #55000
MAX_SEQ_LEN = 17
START_LETTER = 0
BATCH_SIZE = 256
MLE_TRAIN_EPOCHS = 20
ADV_TRAIN_EPOCHS = 30
POS_NEG_SAMPLES = 10000
GEN_EMBEDDING_DIM = 52
GEN_HIDDEN_DIM = 52
DIS_EMBEDDING_DIM = 84
DIS_HIDDEN_DIM = 84
oracle_samples = algebraic_words

# --- Sanity checks on the real data ------------------------------------------
# Fail early with a readable message instead of an index error inside nn.Embedding,
# a shape error in torch.cat, or empty batches in the fixed-length training loops.
assert oracle_samples.dim() == 2 and oracle_samples.size(1) == MAX_SEQ_LEN, \
    "real sequences must have shape (N, MAX_SEQ_LEN)"
assert oracle_samples.size(0) >= POS_NEG_SAMPLES, \
    "need at least POS_NEG_SAMPLES real sequences"
assert 0 <= int(oracle_samples.min()) and int(oracle_samples.max()) < VOCAB_SIZE, \
    "token ids must lie in [0, VOCAB_SIZE)"

# --- Models --------------------------------------------------------------------
oracle = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA, oracle_init=True)

gen = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
dis = Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)

if CUDA:
    oracle = oracle.cuda()
    gen = gen.cuda()
    dis = dis.cuda()
    oracle_samples = oracle_samples.cuda()

# GENERATOR MLE TRAINING
print('Starting Generator MLE Training...')
gen_optimizer = optim.Adam(gen.parameters(), lr=1e-2)
# NOTE(review): `gen` is passed in the `oracle` slot here, so the reported
# oracle_sample_NLL is the generator's NLL on its own samples — confirm this is intended.
train_generator_MLE(gen, gen_optimizer, gen, oracle_samples, MLE_TRAIN_EPOCHS)

print('\nStarting Discriminator Training...')
dis_optimizer = optim.Adam(dis.parameters())
# pre-training: 25 d-steps of 3 epochs each
train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, 25, 3)

print('\nStarting Adversarial Training...')
oracle_loss = batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN,
                                   start_letter=START_LETTER, gpu=CUDA)
print('\nInitial Oracle Sample Loss : %.4f' % oracle_loss)

for epoch in range(ADV_TRAIN_EPOCHS):
    print('\n--------\nEPOCH %d\n--------' % (epoch+1))
    # TRAIN GENERATOR (one policy-gradient batch per adversarial epoch)
    print('\nAdversarial Training Generator : ', end='')
    sys.stdout.flush()
    train_generator_PG(gen, gen_optimizer, oracle, dis, 1)
    # TRAIN DISCRIMINATOR (5 d-steps of 3 epochs each)
    print('\nAdversarial Training Discriminator : ')
    train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, 5, 3)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Spell out the full option name: the bare 'precision' pattern matches more than one
# pandas option on current versions and raises OptionError.
pd.set_option('display.precision', 2)
sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)
# NOTE(review): loss_seqgan and oracle_loss_seqgan are not assigned in any cell visible
# in this notebook — they must be collected during training before this cell can run.
plt.plot(loss_seqgan, 'b-o', label="Training")
plt.plot(oracle_loss_seqgan, 'g-o', label="Validation")
plt.title("Training & Validation Loss Discriminator seqGAN")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
#plt.xticks([range(30)])
plt.savefig("Loss Discriminator seqGAN.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Settings for the synthetic word dataset built in this cell.
# Word length; passed below to gens.constant_length and tf.Pad via LENGTH.
max_length = 17
LENGTH = max_length
# Passed as `generators_number` to the gens.* factories below.
GENERATORS_NUMBER = 3
# Subgroup label handed to tf.subgroup_by_label / tf.FromSubgroupLabel below;
# presumably the class treated as positive — confirm against the transforms module.
label = 1

def generate_dataset(generator, count, transform_word, transform_label):
    """Draw ``count`` words from ``generator`` and turn each into a labelled example.

    Args:
        generator: iterator of words (advanced with ``next``).
        count: number of words to draw.
        transform_word: callable mapping a word to the stored 'data' value.
        transform_label: callable mapping a word to the stored 'label' value.

    Returns:
        List of ``count`` dicts of the form {'data': ..., 'label': ...}.
    """
    dataset = []
    # Pull words one at a time so the progress bar also covers generation and the
    # raw words are never all held in memory at once.
    for _ in tqdm(range(count)):
        word = next(generator)
        dataset.append({'data' : transform_word(word), 'label' : transform_label(word)})
    return dataset

# Length distribution handed to every word generator below.
#length_distribution = gens.uniform_length(LENGTH)
length_distribution = gens.constant_length(LENGTH)

# One normal-closure generator per candidate label (2**0 .. 2**(GENERATORS_NUMBER-1)
# plus 2**GENERATORS_NUMBER - 1), excluding the label selected above.
candidate_labels = [2 ** i for i in range(GENERATORS_NUMBER)] + [2 ** GENERATORS_NUMBER - 1]
other_generators = [
    gens.from_normal_closure(
        subgroup = tf.subgroup_by_label(other_label),
        generators_number = GENERATORS_NUMBER,
        length_distribution = length_distribution
    )
    for other_label in candidate_labels
    if other_label != label
]

# Free-group generator; built but currently left out of the mixture below.
free = gens.from_free_group(
    generators_number = GENERATORS_NUMBER,
    length_distribution = length_distribution)

# Generator for the selected label.
from_label = gens.from_normal_closure(
    subgroup = tf.subgroup_by_label(label),
    generators_number = GENERATORS_NUMBER,
    length_distribution = length_distribution)

# Top-level choice between the selected label and the pool of other labels.
negatives = gens.RandomChoiceGenerator(
    other_generators #+ [free]
)
generator = gens.RandomChoiceGenerator([from_label, negatives])


def _tensor_to_flat_list(v):
    """Flatten a tensor and return its values as a plain Python list."""
    return v.flatten().detach().tolist()


transform_word = tf.Compose([
    tf.ToTensor(torch.long),
    tf.Pad(length = LENGTH, mode = 'right'),
    _tensor_to_flat_list])

transform_label = tf.Compose([
    tf.FromSubgroupLabel(label),
    tf.ToTensor(torch.long),
    _tensor_to_flat_list])