In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from sklearn.model_selection import train_test_split
import time
from math import log10, floor
import os
import glob

from relational_rnn_models import RelationalMemoryGenerator
from discriminator import RelGANDiscriminator

cuda = torch.cuda.is_available()

# Try setting the device to a GPU
device = torch.device("cuda:0" if cuda else "cpu")
print('Device:', device)

Tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

Device: cuda:0


In [2]:
vocab = []

with open('aclImdb/imdb.vocab', 'r') as file:
    for line in file:
        vocab.append(line[:-1])
        
        
vocab_size = len(vocab)
print(vocab_size)
print(vocab[:10])

word_to_index = dict([(w, i) for i, w in enumerate(vocab)])
index_to_word = dict([(i, w) for i, w in enumerate(vocab)])

print(word_to_index['awesome'])
print(index_to_word[12345])

89527
['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
1143
nobility


In [3]:
files = glob.glob('aclImdb/train/pos/*.txt')[:500]
print(len(files))

lengths = []

for filename in files:
    with open(filename, 'r') as file:
        lengths.append(len(file.readline().split(' ')))

500


In [4]:
print(torch.mean(torch.tensor(lengths).type(Tensor)))
print(min(lengths))
print(max(lengths))

tensor(228.9180, device='cuda:0')
34
995


In [14]:
# Define variables
vocab_size = 6
var_ids = list(range(vocab_size))
var_names = ['var' + str(i) for i in var_ids]
var_weights = torch.tensor([15, 5, 3, 4, 2, 1]).type(Tensor) 
var_weights = var_weights / torch.sum(var_weights)# variable distribution of mock data
sequence_length = 10
n_individuals = 2000

#noise_length = 2
print(var_weights)

tensor([0.0588, 0.3529, 0.1765, 0.2353, 0.1176, 0.0588], device='cuda:0')


In [6]:
# Helper function(s)

# round a number to n significant digits
def round_to_n(x, n = 2):
    return round(x, -int(floor(log10(abs(x)))) + (n - 1)) if x != 0 else 0

In [7]:
# Generate mock data

events = []

start_time = time.time()

alternative_weights = torch.tensor([7, 2, 1, 2, 1, 4]).type(torch.FloatTensor) 
alternative_weights = alternative_weights / torch.sum(alternative_weights)

for indv in range(n_individuals):
    tmp = []
    for t in range(sequence_length):
        if t > 0 and tmp[t - 1] == 'var2':
            var = np.random.choice(var_names, p=alternative_weights)
        else:
            var = np.random.choice(var_names, p=var_weights)
        tmp.append(var)
    events.append(tmp)
        
print('time taken:', round_to_n(time.time() - start_time), 'seconds')

for i in range(10):
    print(events[i])

time taken: 0.8 seconds
['var1', 'var1', 'var4', 'var2', 'var0', 'var3', 'var1', 'var1', 'var4', 'var1']
['var1', 'var2', 'var0', 'var2', 'var4', 'var3', 'var1', 'var4', 'var1', 'var1']
['var1', 'var1', 'var3', 'var1', 'var1', 'var1', 'var3', 'var1', 'var1', 'var3']
['var1', 'var1', 'var2', 'var0', 'var1', 'var3', 'var3', 'var3', 'var1', 'var0']
['var2', 'var0', 'var2', 'var4', 'var1', 'var1', 'var1', 'var1', 'var1', 'var1']
['var1', 'var3', 'var2', 'var5', 'var3', 'var3', 'var1', 'var1', 'var3', 'var3']
['var2', 'var2', 'var1', 'var3', 'var1', 'var1', 'var3', 'var3', 'var5', 'var2']
['var3', 'var1', 'var3', 'var0', 'var3', 'var1', 'var2', 'var1', 'var3', 'var2']
['var1', 'var1', 'var1', 'var3', 'var4', 'var3', 'var3', 'var2', 'var3', 'var1']
['var1', 'var3', 'var4', 'var1', 'var1', 'var2', 'var5', 'var1', 'var3', 'var4']


In [11]:
vars_to_indices = dict([(v, i) for i, v in enumerate(var_names)])
print(vars_to_indices)
data = torch.tensor([[vars_to_indices[e] for e in event] for event in events])
if cuda:
    data = data.cuda()
print(data[:10])

{'var0': 0, 'var1': 1, 'var2': 2, 'var3': 3, 'var4': 4, 'var5': 5}
tensor([[1, 1, 4, 2, 0, 3, 1, 1, 4, 1],
        [1, 2, 0, 2, 4, 3, 1, 4, 1, 1],
        [1, 1, 3, 1, 1, 1, 3, 1, 1, 3],
        [1, 1, 2, 0, 1, 3, 3, 3, 1, 0],
        [2, 0, 2, 4, 1, 1, 1, 1, 1, 1],
        [1, 3, 2, 5, 3, 3, 1, 1, 3, 3],
        [2, 2, 1, 3, 1, 1, 3, 3, 5, 2],
        [3, 1, 3, 0, 3, 1, 2, 1, 3, 2],
        [1, 1, 1, 3, 4, 3, 3, 2, 3, 1],
        [1, 3, 4, 1, 1, 2, 5, 1, 3, 4]], device='cuda:0')


In [9]:
# Test generator output

mem_slots = 4
head_size = 2
embed_size = 2
temperature = 1
num_heads = 2

G = RelationalMemoryGenerator(mem_slots, head_size, embed_size, vocab_size, temperature, num_heads)

start_token = torch.tensor([[0]])
memory = G.initial_state(batch_size = 1)

if cuda:
    G.cuda()
    start_token = start_token.cuda()
    memory = memory.cuda()

print(memory)
logits, tokens, _, memory = G(start_token, memory, sequence_length, None)
print(logits)
print(tokens)
print(memory)


tensor([[[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]]], device='cuda:0')
tensor([[[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 1., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 1.]]], device='cuda:0', grad_fn=<CatBackward>)
tensor([[0, 4, 3, 4, 4, 0, 5, 1, 5, 5]], device='cuda:0')
tensor([[[-0.3938,  0.7008, -2.2735,  1.2140],
         [-0.5985,  0.5627, -2.1246,  1.2501],
         [-0.4631,  0.5688, -1.6212,  1.2787],
         [-0.6056,  0.5891, -2.0371,  1.4096]]], device='cuda:0',
       grad_fn=<AddBackward0>)


In [17]:
# Define generator evaluation functions

def eval_generator(G, data, vocab_size):
    memory = G.initial_state(batch_size = data.shape[0])
    if cuda:
        memory = memory.cuda()
    _, data_fake, _, _ = G(data[:, :1], memory, data.shape[1])
    word_means = torch.stack([torch.mean((data == i).type(Tensor), dim = 0) for i in range(vocab_size)])
    word_means_fake = torch.stack([torch.mean((data_fake == i).type(Tensor), dim = 0) for i in range(vocab_size)])
    
    scores = torch.sum(torch.abs(word_means - word_means_fake), dim = 1)
    
    return scores # for each word; the lower the better

def count_special_cases(data, vocab_size):
    counts1 = torch.zeros(vocab_size)
    counts2 = torch.zeros(vocab_size)
    
    if cuda:
        counts1 = counts1.cuda()
        counts2 = counts2.cuda()
    
    for i in range(data.shape[0]):
        for t in range(data.shape[1] - 1):
            if data[i, t] == 2:
                counts1[data[i, t + 1]] += 1
            else:
                counts2[data[i, t + 1]] += 1
                
    return counts1, counts2

def test_special_case(G, data, vocab_size, return_freq = True):
    counts_real1, counts_real2 = count_special_cases(data, vocab_size)
    freq_real1 = counts_real1 / torch.sum(counts_real1)
    freq_real2 = counts_real2 / torch.sum(counts_real2)
    
    memory = G.initial_state(batch_size = data.shape[0])

    if cuda:
        memory = memory.cuda()
    
    _, data_fake, _, _ = G(data[:, :1], memory, data.shape[1])
    
    counts_fake1, counts_fake2 = count_special_cases(data_fake, vocab_size)
    freq_fake1 = counts_fake1 / torch.sum(counts_fake1)
    freq_fake2 = counts_fake2 / torch.sum(counts_fake2)
    
    scores1 = freq_real1 - freq_fake1
    scores2 = freq_real2 - freq_fake2
    #print(freq_real1, freq_real2)
    
    if return_freq:
        return freq_fake1, freq_fake2
    else:
        return torch.mean(torch.abs(scores1)), torch.mean(torch.abs(scores2))

#print(data[:5, :])
scores = eval_generator(G, data, vocab_size)
print(scores)
print(scores / var_weights) # adjusted to the words' frequencies

scores1, scores2 = test_special_case(G, data, vocab_size)
print(scores1, scores2)



tensor([0.1480, 1.7525, 0.1560, 0.6020, 0.4215, 2.3240], device='cuda:0')
tensor([ 2.5160,  4.9654,  0.8840,  2.5585,  3.5828, 39.5080], device='cuda:0')
tensor([0.1131, 0.1311, 0.1697, 0.1434, 0.0943, 0.3484], device='cuda:0') tensor([0.1071, 0.1157, 0.1773, 0.1445, 0.1156, 0.3397], device='cuda:0')


In [19]:
# Test Discriminator output

n_embeddings = 2
embed_size = 2
out_channels = 5 
filter_sizes = [2, 3] # values can be at most the sequence_length

D = RelGANDiscriminator(n_embeddings, vocab_size, embed_size, sequence_length, out_channels, filter_sizes)

if cuda:
    D.cuda()

inp = logits
print(inp)
print(D(inp, False))
print(D(inp))


tensor([[[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 1., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 0., 0., 1., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 1.]]], device='cuda:0', grad_fn=<CatBackward>)
tensor([[0.5376, 0.5376]], device='cuda:0', grad_fn=<SqueezeBackward1>)
tensor([0.5376], device='cuda:0', grad_fn=<MeanBackward2>)


In [22]:
# Define the generator pre-train function

def pretrain_generator(G, train_data, vocab_size, n_epochs, lr, print_step = 10):
    loss_function = nn.BCELoss()
    optimizer = torch.optim.Adam(G.parameters(), lr=lr)
    
    train_data_one_hot = F.one_hot(train_data, vocab_size).type(Tensor)
    start_token = train_data[:, :1]
    sequence_length = train_data.shape[1]
    
    if cuda:
        G.cuda()
        loss_function.cuda()
        start_token = start_token.cuda()
    
    for e in range(n_epochs):
        optimizer.zero_grad()
        
        memory = G.initial_state(batch_size = train_data.shape[0])
        
        if cuda:
            memory = memory.cuda()
        
        logits, _, _, _ = G(start_token, memory, sequence_length)
        
        loss = loss_function(logits, train_data_one_hot)
        
        loss.backward()
        optimizer.step()
        
        if e % print_step == 0:
            print(
                "[Epoch %d/%d] [G loss: %f]"
                % (e, n_epochs, loss.item())
            )

In [23]:
mem_slots = 4
head_size = 2
embed_size = 2
temperature = 1
num_heads = 2

G = RelationalMemoryGenerator(mem_slots, head_size, embed_size, vocab_size, temperature, num_heads)

if cuda:
    G.cuda()

# TODO: change to the better evaluation functions
scores = eval_generator(G, data, vocab_size)
print(scores, scores / var_weights)
pretrain_generator(G, data, vocab_size, 10, 0.001, 2)
scores = eval_generator(G, data, vocab_size)
print(scores, scores / var_weights)

tensor([0.3110, 2.0260, 0.7225, 0.0855, 0.5240, 1.8770], device='cuda:0') tensor([ 5.2870,  5.7403,  4.0942,  0.3634,  4.4540, 31.9090], device='cuda:0')
[Epoch 0/10] [G loss: 7.064792]
[Epoch 2/10] [G loss: 7.051437]
[Epoch 4/10] [G loss: 7.003083]
[Epoch 6/10] [G loss: 7.015517]
[Epoch 8/10] [G loss: 6.998477]
tensor([0.3575, 1.8690, 0.5015, 0.2765, 0.4195, 1.3170], device='cuda:0') tensor([ 6.0775,  5.2955,  2.8418,  1.1751,  3.5658, 22.3890], device='cuda:0')


In [24]:
# Define the training function

def train(G, D, train_data, vocab_size, n_epochs, lr, temperature, print_step = 10, score_fn = test_special_case):
    print('pretraining generator...')
    pretrain_generator(G, train_data, vocab_size, n_epochs // 10, lr, n_epochs // 10 - 1)
    print('pretraining complete')
    
    adversarial_loss = torch.nn.BCELoss()
    
    optimizer_G = torch.optim.Adam(G.parameters(), lr=lr)
    optimizer_D = torch.optim.Adam(D.parameters(), lr=lr)
    
    train_data_one_hot = F.one_hot(train_data, vocab_size).type(Tensor)

    start_token = train_data[:, :1]
    sequence_length = train_data.shape[1]
    
    if cuda:
        G.cuda()
        D.cuda()
        adversarial_loss.cuda()
        start_token = start_token.cuda()
    
    for e in range(n_epochs):
        # Adversarial ground truths
        valid = Variable(Tensor(train_data.shape[0], 1).fill_(1.0), requires_grad=False)
        fake = Variable(Tensor(train_data.shape[0], 1).fill_(0.0), requires_grad=False)

        optimizer_G.zero_grad()

        # Generate a batch of images
        memory = G.initial_state(batch_size = train_data.shape[0])
        if cuda:
            memory = memory.cuda()
        temp = temperature ** ((e + 1) / n_epochs)
        fake_one_hot, _, _, _ = G(start_token, memory, sequence_length, temp)

        # Loss measures generator's ability to fool the discriminator
        g_loss = adversarial_loss(D(fake_one_hot), valid)

        g_loss.backward()
        optimizer_G.step()
        
        optimizer_D.zero_grad()

        # Measure discriminator's ability to classify real from generated samples
        real_loss = adversarial_loss(D(train_data_one_hot), valid)
        fake_loss = adversarial_loss(D(fake_one_hot.detach()), fake)
        d_loss = (real_loss + fake_loss) / 2

        d_loss.backward()
        optimizer_D.step()

        if e % print_step == 0:
            print(
                "[Epoch %d/%d] [D loss: %f] [G loss: %f]"
                % (e, n_epochs, d_loss.item(), g_loss.item())
            )
            print("[Frequencies:", score_fn(G, train_data, vocab_size), "]")
            scores1, scores2 = score_fn(G, train_data, vocab_size, False)
            print("[Scores:", scores1, scores2, "]")
            


In [25]:
# Train the GAN

start_time = time.time()

# Generator params
mem_slots = 1
head_size = 4
embed_size = 3
temperature = 50
num_heads = 6
num_blocks = 4

G = RelationalMemoryGenerator(mem_slots, head_size, embed_size, vocab_size, temperature, num_heads, num_blocks)

# Discriminator params
n_embeddings = 3
embed_size = embed_size
out_channels = 10
filter_sizes = [2, 3, 4] # values can be at most the sequence_length

D = RelGANDiscriminator(n_embeddings, vocab_size, embed_size, sequence_length, out_channels, filter_sizes)

if cuda:
    G.cuda()
    D.cuda()

#scores = eval_generator(G, data, vocab_size)
scores1, scores2 = test_special_case(G, data, vocab_size, False)
print('scores before training:', scores1, scores2)
scores1, scores2 = test_special_case(G, data, vocab_size)
print('distributions before training:', scores1, scores2)
print('benchmark distributions:', alternative_weights, var_weights)

# Train the GAN
train(G, D, data, vocab_size, 1000, 0.001, temperature, 50)

#scores = eval_generator(G, data, vocab_size)
scores1, scores2 = test_special_case(G, data, vocab_size, False)
print('scores after training:', scores1, scores2)
scores1, scores2 = test_special_case(G, data, vocab_size)
print('distributions after training:', scores1, scores2)
print('benchmark distributions:', alternative_weights, var_weights)

print('time taken:', round_to_n(time.time() - start_time), 'seconds')


scores before training: tensor(0.1212, device='cuda:0') tensor(0.0975, device='cuda:0')
distributions before training: tensor([0.1618, 0.1999, 0.1406, 0.1367, 0.2280, 0.1329], device='cuda:0') tensor([0.1663, 0.2050, 0.1401, 0.1168, 0.2494, 0.1224], device='cuda:0')
benchmark distributions: tensor([0.4118, 0.1176, 0.0588, 0.1176, 0.0588, 0.2353]) tensor([0.0588, 0.3529, 0.1765, 0.2353, 0.1176, 0.0588], device='cuda:0')
pretraining generator...
[Epoch 0/100] [G loss: 6.880124]
[Epoch 99/100] [G loss: 5.789620]
pretraining complete


  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


[Epoch 0/1000] [D loss: 0.707789] [G loss: 0.529448]
[Scores: tensor(0.1964, device='cuda:0') tensor(0.1990, device='cuda:0') ]
[Epoch 50/1000] [D loss: 0.697131] [G loss: 0.641678]
[Frequencies: (tensor([0.0805, 0.0242, 0.6509, 0.0782, 0.1121, 0.0540], device='cuda:0'), tensor([0.1297, 0.0486, 0.4921, 0.0912, 0.1351, 0.1033], device='cuda:0')) ]
[Scores: tensor(0.2151, device='cuda:0') tensor(0.1476, device='cuda:0') ]
[Epoch 100/1000] [D loss: 0.700375] [G loss: 0.645965]
[Frequencies: (tensor([0.1051, 0.5720, 0.0903, 0.0785, 0.0297, 0.1243], device='cuda:0'), tensor([0.0682, 0.6780, 0.0674, 0.0702, 0.0270, 0.0891], device='cuda:0')) ]
[Scores: tensor(0.1710, device='cuda:0') tensor(0.1251, device='cuda:0') ]
[Epoch 150/1000] [D loss: 0.689762] [G loss: 0.643644]
[Frequencies: (tensor([0.0986, 0.2154, 0.0571, 0.2119, 0.2388, 0.1782], device='cuda:0'), tensor([0.0847, 0.1780, 0.0458, 0.2612, 0.2911, 0.1391], device='cuda:0')) ]
[Scores: tensor(0.1360, device='cuda:0') tensor(0.1002, d

In [26]:
scores1, scores2 = test_special_case(G, data, vocab_size, False)
print('scores after training:', scores1, scores2)
scores1, scores2 = test_special_case(G, data, vocab_size)
print('distributions after training:', scores1, scores2)
print('benchmark distributions:', alternative_weights, var_weights)

scores after training: tensor(0.0827, device='cuda:0') tensor(0.0468, device='cuda:0')
distributions after training: tensor([0.3275, 0.2157, 0.2167, 0.0583, 0.0462, 0.1355], device='cuda:0') tensor([0.0454, 0.3511, 0.3162, 0.1542, 0.0990, 0.0342], device='cuda:0')
benchmark distributions: tensor([0.4118, 0.1176, 0.0588, 0.1176, 0.0588, 0.2353]) tensor([0.0588, 0.3529, 0.1765, 0.2353, 0.1176, 0.0588], device='cuda:0')
