In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from sklearn.model_selection import train_test_split
import time
from math import log10, floor
import os
import glob

from relational_rnn_models import RelationalMemoryGenerator
from discriminator import RelGANDiscriminator

# Detect CUDA support once; the rest of the notebook keys off this flag
cuda = torch.cuda.is_available()

# Pick the first GPU when available, otherwise the CPU, together with the
# matching float tensor type used for all float conversions below
if cuda:
    device, Tensor = torch.device("cuda:0"), torch.cuda.FloatTensor
else:
    device, Tensor = torch.device("cpu"), torch.FloatTensor

print('Device:', device)

In [2]:
# Read the vocabulary file: one token per line, kept in file order
vocab = []

with open('aclImbd/imdb.vocab', 'r') as file:
    for line in file:
        # Strip only the line terminator. Slicing off the last character
        # would truncate the final word if the file lacks a trailing newline.
        vocab.append(line.rstrip('\n'))

vocab_size = len(vocab)
print(vocab_size)
print(vocab[:10])

# Lookup tables in both directions (token <-> position in the vocabulary)
word_to_index = {w: i for i, w in enumerate(vocab)}
index_to_word = {i: w for i, w in enumerate(vocab)}

print(word_to_index['awesome'])
print(index_to_word[12345])

89527
['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
1143
nobility


In [51]:
# Take the first 500 paths that glob returns for the positive training reviews
files = glob.glob('aclImbd/train/pos/*.txt')[:500]
print(len(files))


def _first_line_token_count(path):
    """Return the number of space-separated tokens on the first line of `path`."""
    with open(path, 'r') as handle:
        return len(handle.readline().split(' '))


# One token count per sampled file, in the same order as `files`
lengths = [_first_line_token_count(filename) for filename in files]

12500


In [57]:
# Mean and minimum token count over the sampled reviews
mean_length = torch.tensor(lengths).type(Tensor).mean()
print(mean_length)
print(min(lengths))

tensor(236.6957)
12


In [3]:
# Define variables
# Mock-data configuration. Note that vocab_size is redefined here and no
# longer refers to the size of the vocabulary file loaded earlier.
vocab_size = 6
sequence_length = 10
n_individuals = 2000

var_ids = [*range(vocab_size)]
var_names = ['var' + str(idx) for idx in var_ids]

# Relative weight of each variable, normalised to sum to one so it can serve
# as the sampling distribution of the mock data
var_weights = torch.tensor([1, 6, 3, 4, 2, 1]).type(Tensor)
var_weights = var_weights / var_weights.sum()

#noise_length = 2
print(var_weights)

tensor([0.0588, 0.3529, 0.1765, 0.2353, 0.1176, 0.0588])


In [4]:
# Helper function(s)

# round a number to n significant digits
def round_to_n(x, n = 2):
    """Round `x` to `n` significant digits; a zero input is returned as the integer 0."""
    if x == 0:
        return 0
    # Decimal exponent of the leading digit of |x|
    exponent = int(floor(log10(abs(x))))
    return round(x, (n - 1) - exponent)

# visualize the output of the generator
def visualize_output(generator, z, n = 2):
    """Print the output of `generator(z)` as rows of values rounded to `n` significant digits.

    The generator output is reshaped to (sequence_length, vocab_size), both of
    which are read from module-level variables, and printed one row per
    time step.

    NOTE(review): this assumes `generator(z)` returns a single tensor; the
    RelationalMemoryGenerator calls elsewhere in this notebook use a different
    call signature - confirm before using it with that model.
    """
    p = generator(z).view(sequence_length, vocab_size)
    for t in range(p.shape[0]):
        # .item() converts each element to a plain Python number so that the
        # built-in round() inside round_to_n operates on floats, not tensors
        print([round_to_n(p[t, f].item(), n) for f in range(p.shape[1])])

#y = data[:5, :]
#print(y)
#print(F.one_hot(y, vocab_size))

In [5]:
# Generate mock data

events = []

start_time = time.time()

# Distribution used for the event that directly follows a 'var2' event
alternative_weights = torch.tensor([7, 2, 1, 2, 1, 4]).type(Tensor)
alternative_weights = alternative_weights / torch.sum(alternative_weights)


def _as_probabilities(weights):
    """Convert a weight tensor to a float64 NumPy vector that sums to one."""
    probs = weights.detach().cpu().numpy().astype(np.float64)
    return probs / probs.sum()


# np.random.choice expects host-side probabilities: a CUDA tensor cannot be
# converted to NumPy implicitly, and float32 rounding may violate its
# sum-to-one check. The tensors themselves are kept for later reporting.
p_default = _as_probabilities(var_weights)
p_after_var2 = _as_probabilities(alternative_weights)

for indv in range(n_individuals):
    tmp = []
    for t in range(sequence_length):
        # The event after 'var2' is drawn from the alternative distribution
        if t > 0 and tmp[t - 1] == 'var2':
            var = np.random.choice(var_names, p=p_after_var2)
        else:
            var = np.random.choice(var_names, p=p_default)
        tmp.append(var)
    events.append(tmp)

print('time taken:', round_to_n(time.time() - start_time), 'seconds')

# Preview at most ten sequences (slicing is safe when fewer were generated)
for event in events[:10]:
    print(event)

time taken: 0.56 seconds
['var4', 'var1', 'var3', 'var1', 'var0', 'var0', 'var3', 'var3', 'var3', 'var1']
['var3', 'var1', 'var1', 'var4', 'var4', 'var4', 'var3', 'var4', 'var5', 'var1']
['var2', 'var3', 'var2', 'var0', 'var3', 'var1', 'var3', 'var3', 'var1', 'var1']
['var1', 'var1', 'var0', 'var2', 'var1', 'var1', 'var1', 'var5', 'var5', 'var0']
['var3', 'var2', 'var1', 'var1', 'var4', 'var0', 'var5', 'var4', 'var3', 'var2']
['var2', 'var5', 'var5', 'var3', 'var1', 'var3', 'var0', 'var1', 'var0', 'var1']
['var1', 'var3', 'var3', 'var1', 'var1', 'var4', 'var3', 'var5', 'var0', 'var2']
['var1', 'var3', 'var3', 'var1', 'var5', 'var3', 'var3', 'var1', 'var1', 'var2']
['var1', 'var2', 'var5', 'var1', 'var1', 'var4', 'var2', 'var5', 'var0', 'var2']
['var1', 'var3', 'var1', 'var3', 'var2', 'var0', 'var1', 'var3', 'var1', 'var1']


In [6]:
# Map every variable name to its integer id
vars_to_indices = {name: idx for idx, name in enumerate(var_names)}
print(vars_to_indices)

# Encode the mock events as an integer tensor with one row per generated sequence
encoded_events = [[vars_to_indices[name] for name in sequence] for sequence in events]
data = torch.tensor(encoded_events)
print(data[:10])

{'var0': 0, 'var1': 1, 'var2': 2, 'var3': 3, 'var4': 4, 'var5': 5}
tensor([[4, 1, 3, 1, 0, 0, 3, 3, 3, 1],
        [3, 1, 1, 4, 4, 4, 3, 4, 5, 1],
        [2, 3, 2, 0, 3, 1, 3, 3, 1, 1],
        [1, 1, 0, 2, 1, 1, 1, 5, 5, 0],
        [3, 2, 1, 1, 4, 0, 5, 4, 3, 2],
        [2, 5, 5, 3, 1, 3, 0, 1, 0, 1],
        [1, 3, 3, 1, 1, 4, 3, 5, 0, 2],
        [1, 3, 3, 1, 5, 3, 3, 1, 1, 2],
        [1, 2, 5, 1, 1, 4, 2, 5, 0, 2],
        [1, 3, 1, 3, 2, 0, 1, 3, 1, 1]])


In [7]:
# Test generator output

# Small generator configuration used only for this smoke test
mem_slots, head_size, embed_size = 4, 2, 2
temperature, num_heads = 1, 2

G = RelationalMemoryGenerator(mem_slots, head_size, embed_size, vocab_size, temperature, num_heads)

# Generate one sequence of sequence_length steps, starting from token 0
start_token = torch.tensor([[0]])
memory = G.initial_state(batch_size = 1)
print(memory)

logits, tokens, _, memory = G(start_token, memory, sequence_length, None)
for generated in (logits, tokens, memory):
    print(generated)


tensor([[[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]])
tensor([[[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.]]], grad_fn=<CatBackward>)
tensor([[0, 5, 1, 0, 3, 4, 2, 4, 2, 2]])
tensor([[[ 0.4614,  1.2949, -0.6494, -2.0191],
         [ 0.4594,  1.4459, -0.8590, -2.1287],
         [ 0.3686,  1.2222,  0.1015, -1.9834],
         [ 0.4619,  1.3342, -0.8642, -1.9224]]], grad_fn=<AddBackward0>)


In [8]:
# Define generator evaluation functions

def eval_generator(G, data, vocab_size):
    """Compare per-position token frequencies of real data and generator samples.

    Args:
        G: generator exposing `initial_state(batch_size=...)` and a call
            `G(start_tokens, memory, sequence_length)` whose second return
            value holds the generated token ids.
        data: 2-D integer tensor of real token ids; its first column is used
            as start tokens and its shape fixes batch size and sequence length.
        vocab_size: number of distinct token ids to score (0 .. vocab_size - 1).

    Returns:
        A CPU float tensor with one score per token id: the absolute difference
        between real and generated frequencies, summed over positions.
        Lower is better.
    """
    # Evaluation only: sampling from G does not need autograd bookkeeping
    with torch.no_grad():
        _, data_fake, _, _ = G(data[:, :1], G.initial_state(batch_size = data.shape[0]), data.shape[1])

    def position_frequencies(tokens):
        # Row i holds, for every position, the fraction of sequences showing token i
        return torch.stack([torch.mean((tokens == i).type(torch.FloatTensor), dim = 0) for i in range(vocab_size)])

    word_means = position_frequencies(data)
    word_means_fake = position_frequencies(data_fake)

    scores = torch.sum(torch.abs(word_means - word_means_fake), dim = 1)

    return scores # for each word; the lower the better

def count_special_cases(data, vocab_size, special_token = 2):
    """Count which token follows each position, split by the preceding token.

    Args:
        data: 2-D integer tensor of token ids (one sequence per row); ids are
            expected to lie in [0, vocab_size).
        vocab_size: number of distinct token ids.
        special_token: token id whose successors are counted separately
            (defaults to 2, the value that was previously hard-coded).

    Returns:
        (counts1, counts2): two CPU float tensors of length vocab_size.
        counts1[k] is the number of times token k directly follows
        `special_token`; counts2[k] is the number of times it follows any
        other token.
    """
    # Pair every token with its successor and flatten over all sequences
    previous = data[:, :-1].reshape(-1)
    following = data[:, 1:].reshape(-1)
    after_special = previous == special_token

    # bincount replaces the per-element Python loop; minlength keeps ids that
    # never occur represented as zero counts
    counts1 = torch.bincount(following[after_special], minlength = vocab_size)
    counts2 = torch.bincount(following[~after_special], minlength = vocab_size)

    return counts1.float().cpu(), counts2.float().cpu()

def test_special_case(G, data, vocab_size, return_freq = True):
    """Evaluate how well G reproduces the next-token statistics of `data`.

    A batch of sequences is sampled from G using the first column of `data`
    as start tokens, and `count_special_cases` is used to obtain next-token
    counts for the two cases it distinguishes.

    Args:
        G: generator exposing `initial_state(batch_size=...)` and a call
            `G(start_tokens, memory, sequence_length)` whose second return
            value holds the generated token ids.
        data: 2-D integer tensor of real token ids.
        vocab_size: number of distinct token ids.
        return_freq: if True, return the two next-token frequency vectors of
            the generated data; if False, return the mean absolute difference
            between real and generated frequencies for each of the two cases.

    Returns:
        A pair of tensors (frequency vectors or scalar scores, see above).
        A case that never occurs yields NaN entries (0 / 0).
    """
    # Sampling for evaluation does not need autograd bookkeeping
    with torch.no_grad():
        _, data_fake, _, _ = G(data[:, :1], G.initial_state(batch_size = data.shape[0]), data.shape[1])

    counts_fake1, counts_fake2 = count_special_cases(data_fake, vocab_size)
    freq_fake1 = counts_fake1 / torch.sum(counts_fake1)
    freq_fake2 = counts_fake2 / torch.sum(counts_fake2)

    if return_freq:
        return freq_fake1, freq_fake2

    # The real-data statistics are only needed for the score variant
    counts_real1, counts_real2 = count_special_cases(data, vocab_size)
    freq_real1 = counts_real1 / torch.sum(counts_real1)
    freq_real2 = counts_real2 / torch.sum(counts_real2)

    scores1 = freq_real1 - freq_fake1
    scores2 = freq_real2 - freq_fake2

    return torch.mean(torch.abs(scores1)), torch.mean(torch.abs(scores2))

#print(data[:5, :])
scores = eval_generator(G, data, vocab_size)
print(scores)
# var_weights is already a tensor: wrapping it in torch.tensor() again only
# triggers a copy-construct warning. eval_generator returns CPU scores, so the
# weights are moved to the CPU before dividing.
print(scores / var_weights.cpu()) # adjusted to the words' frequencies

scores1, scores2 = test_special_case(G, data, vocab_size)
print(scores1, scores2)



tensor([1.0335, 1.3815, 0.1525, 0.5535, 0.3695, 0.5545])
tensor([17.5695,  3.9142,  0.8642,  2.3524,  3.1407,  9.4265])




tensor([0.2201, 0.1639, 0.1563, 0.1476, 0.1505, 0.1617]) tensor([0.2260, 0.1565, 0.1465, 0.1685, 0.1449, 0.1576])


In [9]:
# Test Discriminator output

# Small discriminator configuration used only for this smoke test
n_embeddings, embed_size, out_channels = 2, 2, 5
filter_sizes = [2, 3] # values can be at most the sequence_length

D = RelGANDiscriminator(n_embeddings, vocab_size, embed_size, sequence_length, out_channels, filter_sizes)

# Feed the generator output from the earlier smoke test through D, first with
# the second argument set to False and then with its default
inp = logits
print(inp)
detailed_output = D(inp, False)
print(detailed_output)
default_output = D(inp)
print(default_output)


tensor([[[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0.]]], grad_fn=<CatBackward>)
tensor([[0.5249, 0.5356]], grad_fn=<SqueezeBackward1>)
tensor([0.5302], grad_fn=<MeanBackward2>)


In [10]:
# Define the generator pre-train function

def pretrain_generator(G, train_data, vocab_size, n_epochs, lr, print_step = 10):
    """Pre-train G by minimising the BCE between its output and the one-hot data.

    Every epoch runs one full-batch Adam step on the whole of `train_data`.

    Args:
        G: generator exposing `parameters()`, `initial_state(batch_size=...)`
            and a call `G(start_token, memory, sequence_length)` whose first
            return value is compared with the one-hot encoded training data.
        train_data: 2-D integer tensor of token ids; the first column provides
            the start tokens, shape[1] the sequence length.
        vocab_size: number of classes used for the one-hot encoding.
        n_epochs: number of optimisation steps.
        lr: Adam learning rate.
        print_step: log the loss every `print_step` epochs; 0 disables logging.

    NOTE(review): when CUDA is used, G is moved to the GPU but `start_token`
    stays on the device of `train_data` - confirm that G handles this.
    """
    loss_function = nn.BCELoss()
    optimizer = torch.optim.Adam(G.parameters(), lr=lr)

    if cuda:
        G.cuda()
        loss_function.cuda()

    train_data_one_hot = F.one_hot(train_data, vocab_size).type(Tensor)
    start_token = train_data[:, :1]
    batch_size, sequence_length = train_data.shape[0], train_data.shape[1]

    for e in range(n_epochs):
        optimizer.zero_grad()

        memory = G.initial_state(batch_size = batch_size)

        logits, _, _, _ = G(start_token, memory, sequence_length)

        loss = loss_function(logits, train_data_one_hot)

        loss.backward()
        optimizer.step()

        # Guard against print_step == 0, which would raise ZeroDivisionError
        # in the modulo below
        if print_step and e % print_step == 0:
            print(
                "[Epoch %d/%d] [G loss: %f]"
                % (e, n_epochs, loss.item())
            )

In [11]:
mem_slots = 4
head_size = 2
embed_size = 2
temperature = 1
num_heads = 2

G = RelationalMemoryGenerator(mem_slots, head_size, embed_size, vocab_size, temperature, num_heads)

# var_weights is already a tensor: wrapping it in torch.tensor() again only
# triggers a copy-construct warning. eval_generator returns CPU scores, so the
# weights are moved to the CPU once and reused.
weights_cpu = var_weights.cpu()

# Scores (raw and adjusted to the words' frequencies) before and after a
# short pre-training run
scores = eval_generator(G, data, vocab_size)
print(scores, scores / weights_cpu)
pretrain_generator(G, data, vocab_size, 10, 0.001, 2)
scores = eval_generator(G, data, vocab_size)
print(scores, scores / weights_cpu)

  # Remove the CWD from sys.path while we load stuff.


tensor([0.4190, 1.5945, 0.5430, 0.9085, 0.2780, 0.6490]) tensor([ 7.1230,  4.5178,  3.0770,  3.8611,  2.3630, 11.0330])
[Epoch 0/10] [G loss: 6.503215]
[Epoch 2/10] [G loss: 6.524394]
[Epoch 4/10] [G loss: 6.494467]
[Epoch 6/10] [G loss: 6.566292]
[Epoch 8/10] [G loss: 6.625226]
tensor([0.1080, 0.4580, 0.1670, 0.6595, 0.1635, 0.5290]) tensor([1.8360, 1.2977, 0.9463, 2.8029, 1.3897, 8.9930])


  del sys.path[0]


In [12]:
# Define the training function

def train(G, D, train_data, vocab_size, n_epochs, lr, temperature, print_step = 10, score_fn = test_special_case):
    """Pre-train the generator, then train G and D adversarially.

    Pre-training runs for n_epochs // 10 epochs via `pretrain_generator`. Each
    adversarial epoch performs one full-batch generator step followed by one
    full-batch discriminator step, both with Adam and a BCE loss.

    Args:
        G: generator exposing `parameters()`, `initial_state(batch_size=...)`
            and a call `G(start_token, memory, sequence_length, temperature)`
            whose first return value is fed to the discriminator.
        D: discriminator; `D(x)` must return values in [0, 1] because BCELoss
            is applied to its output.
        train_data: 2-D integer tensor of token ids; the first column provides
            the start tokens, shape[1] the sequence length.
        vocab_size: number of classes used for the one-hot encoding.
        n_epochs: number of adversarial epochs.
        lr: learning rate shared by both optimisers and by pre-training.
        temperature: value passed to G on every generation call.
        print_step: log losses and scores every `print_step` epochs; 0
            disables logging.
        score_fn: callable `score_fn(G, train_data, vocab_size[, return_freq])`
            used for the progress report.
    """
    print('pretraining generator...')
    pretrain_epochs = n_epochs // 10
    # Log roughly the first and last pre-training epoch. The step is clamped
    # to 1 because n_epochs in [10, 20) would otherwise give a step of 0 and a
    # modulo-by-zero inside pretrain_generator.
    pretrain_generator(G, train_data, vocab_size, pretrain_epochs, lr, max(1, pretrain_epochs - 1))
    print('pretraining complete')

    adversarial_loss = torch.nn.BCELoss()

    if cuda:
        G.cuda()
        D.cuda()
        adversarial_loss.cuda()

    optimizer_G = torch.optim.Adam(G.parameters(), lr=lr)
    optimizer_D = torch.optim.Adam(D.parameters(), lr=lr)

    train_data_one_hot = F.one_hot(train_data, vocab_size).type(Tensor)

    start_token = train_data[:, :1]
    batch_size, sequence_length = train_data.shape[0], train_data.shape[1]

    for e in range(n_epochs):
        # -----------------
        #  Generator step
        # -----------------
        optimizer_G.zero_grad()

        # Generate a batch of sequences
        memory = G.initial_state(batch_size = batch_size)
        temp = temperature #** ((e + 1) / n_epochs)
        fake_one_hot, _, _, _ = G(start_token, memory, sequence_length, temp)

        # Loss measures generator's ability to fool the discriminator.
        # Targets are built with ones_like/zeros_like so that they always have
        # exactly the shape, dtype and device of the discriminator output,
        # as BCELoss requires.
        pred_fake_for_g = D(fake_one_hot)
        g_loss = adversarial_loss(pred_fake_for_g, torch.ones_like(pred_fake_for_g))

        g_loss.backward()
        optimizer_G.step()

        # ---------------------
        #  Discriminator step
        # ---------------------
        optimizer_D.zero_grad()

        # Measure discriminator's ability to classify real from generated
        # samples; detach() keeps this step from back-propagating into G
        pred_real = D(train_data_one_hot)
        pred_fake = D(fake_one_hot.detach())
        real_loss = adversarial_loss(pred_real, torch.ones_like(pred_real))
        fake_loss = adversarial_loss(pred_fake, torch.zeros_like(pred_fake))
        d_loss = (real_loss + fake_loss) / 2

        d_loss.backward()
        optimizer_D.step()

        # Guard against print_step == 0 (modulo by zero)
        if print_step and e % print_step == 0:
            print(
                "[Epoch %d/%d] [D loss: %f] [G loss: %f]"
                % (e, n_epochs, d_loss.item(), g_loss.item())
            )
            print("[Frequencies:", score_fn(G, train_data, vocab_size), "]")
            scores1, scores2 = score_fn(G, train_data, vocab_size, False)
            print("[Scores:", scores1, scores2, "]")
            


In [14]:
# Train the GAN

# Generator params
# Generator hyper-parameters
mem_slots, head_size, embed_size = 1, 4, 3
temperature = 10
num_heads, num_blocks = 6, 4

G = RelationalMemoryGenerator(mem_slots, head_size, embed_size, vocab_size, temperature, num_heads, num_blocks)

# Discriminator hyper-parameters (embed_size is shared with the generator)
n_embeddings, out_channels = 3, 10
filter_sizes = [2, 3, 4] # values can be at most the sequence_length

D = RelGANDiscriminator(n_embeddings, vocab_size, embed_size, sequence_length, out_channels, filter_sizes)

# Baseline of the untrained generator: error scores first, then the sampled
# next-token distributions next to the distributions used to build the data
scores1, scores2 = test_special_case(G, data, vocab_size, return_freq = False)
print('scores before training:', scores1, scores2)
scores1, scores2 = test_special_case(G, data, vocab_size, return_freq = True)
print('distributions before training:', scores1, scores2)
print('benchmark distributions:', alternative_weights, var_weights)

# Train the GAN
train(G, D, data, vocab_size, n_epochs = 1000, lr = 0.001, temperature = temperature, print_step = 50)

# Same report for the trained generator
scores1, scores2 = test_special_case(G, data, vocab_size, return_freq = False)
print('scores after training:', scores1, scores2)
scores1, scores2 = test_special_case(G, data, vocab_size, return_freq = True)
print('distributions after training:', scores1, scores2)
print('benchmark distributions:', alternative_weights, var_weights)

Device: cpu
score before training: tensor(0.0991) tensor(0.0994)
pretraining generator...
[Epoch 0/100] [G loss: 6.931863]
[Epoch 99/100] [G loss: 5.767930]
pretraining complete
[Epoch 0/1000] [D loss: 0.693834] [G loss: 0.816437]
[Frequencies: (tensor([0.0804, 0.6686, 0.0588, 0.1059, 0.0510, 0.0353]), tensor([0.0084, 0.9588, 0.0075, 0.0145, 0.0058, 0.0049])) ]
[Scores: tensor(0.1974) tensor(0.2033) ]
[Epoch 50/1000] [D loss: 0.693121] [G loss: 0.694957]
[Frequencies: (tensor([0.1667, 0.1136, 0.1431, 0.2364, 0.1593, 0.1808]), tensor([0.1717, 0.1073, 0.1293, 0.2289, 0.1865, 0.1764])) ]
[Scores: tensor(0.1088) tensor(0.1006) ]
[Epoch 100/1000] [D loss: 0.693045] [G loss: 0.705340]
[Frequencies: (tensor([0.0996, 0.2806, 0.4507, 0.1119, 0.0205, 0.0367]), tensor([0.0933, 0.2826, 0.4541, 0.1121, 0.0227, 0.0353])) ]
[Scores: tensor(0.1879) tensor(0.1012) ]
[Epoch 150/1000] [D loss: 0.691387] [G loss: 0.689033]
[Frequencies: (tensor([0.0877, 0.3184, 0.1281, 0.1910, 0.1457, 0.1291]), tensor([0.

In [15]:

# Re-evaluate the trained generator: error scores first, then the sampled
# next-token distributions next to the distributions used to build the data
scores1, scores2 = test_special_case(G, data, vocab_size, return_freq = False)
print('scores after training:', scores1, scores2)
scores1, scores2 = test_special_case(G, data, vocab_size, return_freq = True)
print('distributions after training:', scores1, scores2)
print('benchmark distributions:', alternative_weights, var_weights)

scores after training: tensor(0.0769) tensor(0.0319)
distributions after training: tensor([0.5007, 0.2659, 0.0405, 0.1244, 0.0410, 0.0275]) tensor([0.0644, 0.4216, 0.1114, 0.2473, 0.0923, 0.0630])
benchmark distributions: tensor([0.4118, 0.1176, 0.0588, 0.1176, 0.0588, 0.2353]) tensor([0.0588, 0.3529, 0.1765, 0.2353, 0.1176, 0.0588])
