In [14]:
import os
import fasttext
import fasttext.util
import torch
import math
from model import gan
from sklearn.utils import shuffle
import numpy as np

# Unique Naming
from datetime import datetime
import random, string

In [15]:
def random_string(length=10):
    """
        Generate a random string of given length. For safely storing produced images.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))


def get_model_id():
    """
        Creates folder with unique ID in which everything related to a particular testrun can be saved.
    :return: Unique folder identifier
    """
    # Construct testrun identifier
    TIME_STAMP = datetime.now().strftime("%Y_%d_%m__%H_%M_%S__%f_")
    model_folder_id = TIME_STAMP + '_' + random_string() + '/'

    try:
        os.mkdirs(model_folder_id)
    except Exception as e:
        print('Exception occurred: ', e)

    return model_folder_id

In [110]:
### VARIABLES & ADMINISTRATIVE STUFF ###
# System
#dataset_path = '/media/daniel/Elements/FastText_Data/'  # In case dataset is stored somewhere else, e.g. on hard-drive
dataset_path = '/media/daniel/Elements/FastText_Data/'  #data in same directory
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Network
embedding_dim = 300
internal_dim = 300
hidden = 300

# Train hyperparameters
epochs = 100
batch_size = 32
vocab_size = 2000
num_minibatches = vocab_size // batch_size
real_label, fake_label = 1, 0
languages = {'src': ['de', 'nl'], 'trgt': 'en'}  # Target language to be indicated in last position
checkpoint_frequency = 0  # 0 == Off; i > 0 == actual checkpoint frequency in epochs
avg_dec_grads = True  # Boolean indicating whether to average the grads of decoder accumulated over nr of source languages by nr of source langs


In [109]:
# Set up saving paths

data_storage_path = './'

model_id = get_model_id()

checkpoint_path = data_storage_path + model_id + 'Checkpoint/'
final_state_path = data_storage_path + model_id + 'Final/'

try:
    if checkpoint_frequency > 0:
        os.mkdir(checkpoint_path)
    os.mkdir(final_state_path)
except Exception as e:
    raise Warning('Exception occurred: Cound not create dirs! Exception:\n', e)
    
print('Model ID:', model_id)

Model ID: 2020_03_06__23_19_13__517510__liaooyyryt/


In [3]:
def cleaned_vocab(vocab, vocab_size):
    # Remove all punctuation tokens while valid nr of tokens is insufficient yet for having full vocab size
    # TODO & possibly reserve testing vocab

    # Return clean & restricted vocab
    words = vocab.words[:vocab_size]              # Y (labels)
    vects = [vocab[word] for word in words]       # X (input data)

    return vects, words

In [4]:
def add_lang_to_vocab(lang_type, lang_id, vocab_size, vocabs):
    # Get dataset
    if dataset_path == './':
        fasttext.util.download_model(lang_id)  # Download word embedding vector data if not available
    vocab = fasttext.load_model(dataset_path + 'cc.' + lang_id + '.300.bin')  # Load language data

    # Add train data (embedding-vectors) and labels (words) to vocab
    x, y = cleaned_vocab(vocab, vocab_size)
    vocabs[lang_type][lang_id] = {'x': torch.tensor(x), 'y': y}

    return vocabs

In [5]:
def load_vocab(languages):
    nr_src_langs = len(languages)
    vocabs = {'src': {}, 'trgt': {}}
    
    for language in languages['src']:
        vocabs = add_lang_to_vocab('src', language, vocab_size, vocabs)
        
    language = languages['trgt']
    vocabs = add_lang_to_vocab('trgt', language, vocab_size, vocabs)

    print('Successfully loaded language models.')
    return vocabs

In [6]:
    # load vocab
    vocabs = load_vocab(languages)



Successfully loaded language models.


In [79]:
def save_checkpoint(data, save):
    if save:
        torch.save(data, checkpoint_path + 'checkpoint_%d.pt' % data['epoch'])

In [86]:
def mean_param(model):
    return torch.mean(torch.cat([param.data.view(-1) for param in model.parameters()], 0))

In [123]:
x_t = torch.rand([5,7])
print(x_t)
indices = torch.LongTensor(3).random_(0, len(x_t))
print(indices)
print(x_t[indices])

tensor([[0.5607, 0.3826, 0.8992, 0.9056, 0.5622, 0.6645, 0.8006],
        [0.5582, 0.1138, 0.7248, 0.0861, 0.7134, 0.7503, 0.4936],
        [0.1320, 0.3971, 0.7478, 0.3828, 0.2057, 0.8433, 0.3996],
        [0.3900, 0.3583, 0.4438, 0.8527, 0.4797, 0.2626, 0.8606],
        [0.0792, 0.0668, 0.0232, 0.9301, 0.7002, 0.2058, 0.5192]])
tensor([0, 4, 1])
tensor([[0.5607, 0.3826, 0.8992, 0.9056, 0.5622, 0.6645, 0.8006],
        [0.0792, 0.0668, 0.0232, 0.9301, 0.7002, 0.2058, 0.5192],
        [0.5582, 0.1138, 0.7248, 0.0861, 0.7134, 0.7503, 0.4936]])


In [126]:
def get_dataset_sample(lang, vocab, batch_size):
    """
    Thiss function draws batch_size-many training samples at random 
    from a vocab corresponding to queried language.  
    """
    indices = torch.LongTensor(batch_size).random_(0, len(vocab))
    return vocab[indices]


def get_train_data(languages, vocabs, batch_size):
    """
    Returns one set of samples datapoints form a vocabulary for each provided language.
    """
    samples = {lang: get_dataset_sample(lang, vocab['src'][lang]) for lang in languages['src']}
    samples[languages['trgt']] = get_dataset_sample(lang, vocab['trgt'][languages['trgt']])
    return samples

In [111]:
def main():
    
    NLLLoss = torch.nn.NLLLoss()
    nr_src_langs = len(vocabs['src'])
    print('Nr source languages:', nr_src_langs)
    print('Nr target languages:', len(vocabs['trgt']))
    
    if avg_dec_grads:
        avg_factor = 1/nr_src_langs
        print('Decoder gradient averaging factor:', avg_factor)
    
    # Get bilingual dictionary for evaluating train loss or at least testing
    dicts = dict()
    #TODO

    # Set up model architecture
    net = gan.GAN(embedding_dim, internal_dim, hidden, languages['src'])

    # Get optimizers; 1 per source language and 1 for target language
    optims_g = {}
    for language in languages['src']:

        #params = net.generator.encoders[language].parameters() + net.generator.decoder.parameters()
        optims_g[language] = torch.optim.Adam([{'params': net.generator.encoders[language].parameters()},
                                               {'params': net.generator.decoder.parameters()}],
                                              lr=0.0001, betas=(0.9, 0.999), eps=1e-08, 
                                              weight_decay=0, amsgrad=False)

    optim_d = torch.optim.Adam(net.discriminator.parameters(), 
                               lr=0.0001, betas=(0.9, 0.999), eps=1e-08, 
                               weight_decay=0, amsgrad=False)

    # Train
    train_loss_real_d, train_loss_fake_d, train_loss_g = [], [], []
    eval_loss = [] # To be populated...
    last_loss = -1
    for epoch in range(epochs):
        print('Epoch ', epoch, '/', epochs)
        loss_real_total_d, loss_fake_total_d, loss_total_g = 0., 0., 0.

        # Shuffle data #
        # Source languages
        for lang in languages['src']:
            vocabs['src'][lang]['x'], vocabs['src'][lang]['y'] = shuffle(np.array(vocabs['src'][lang]['x']), np.array(vocabs['src'][lang]['y']))
            vocabs['src'][lang]['x'] = torch.from_numpy(vocabs['src'][lang]['x'])
            #vocabs['src'][lang]['y'] = torch.from_numpy(vocabs['src'][lang]['y']) # We don't really need that except for eval
        # Target language
        lang = languages['trgt'] # Retrieve language id
        vocabs['trgt'][lang]['x'], vocabs['trgt'][lang]['y'] = shuffle(np.array(vocabs['trgt'][lang]['x']), np.array(vocabs['trgt'][lang]['y']))
        vocabs['trgt'][lang]['x'] = torch.from_numpy(vocabs['trgt'][lang]['x'])

        # Train #
        for batch in range(num_minibatches):
            #print('Epoch ', epoch, ', Batch ', batch, '/', num_minibatches)

            # Update discriminator #
            net.discriminator.train()
            net.generator.eval()
            # All-real minibatch
            net.discriminator.zero_grad()
            x = vocabs['trgt'][languages['trgt']]['x'][batch * batch_size:(batch + 1) * batch_size]#.to(device)
            y_true = torch.full((batch_size,), real_label).long() #device=device  # TODO: could probs be converted to long straight away when reading in, already...
            y_pred = net.discriminator(x)
            # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
            loss_real = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
            loss_real.backward()
            loss_real_total_d += loss_real

            # One minibatch per source language
            translations = {}
            loss_fake_batch_avg = 0.
            for language in languages['src']:
                # All-real minibatch
                net.discriminator.zero_grad()
                x = vocabs['src'][language]['x'][batch * batch_size:(batch + 1) * batch_size]#.to(device)
                x = net.generator(x, language)
                translations[language] = x
                y_true = torch.full((batch_size,), fake_label).long() #, device=device
                y_pred = net.discriminator(x.detach())      # Detach to avoid computing grads for generator
                # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
                loss_fake = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
                loss_fake_batch_avg += loss_fake
                loss_fake.backward()    # Compute gradients only for discriminator
            optim_d.step()              # Weight update
            loss_fake_total_d += (loss_fake_batch_avg/nr_src_langs)

            # Update generator #
            net.generator.train()
            net.discriminator.eval()
            # Compute gradients
            loss_fake_batch_avg = 0.
            for language in languages['src']:
                net.generator.encoders[language].zero_grad()
                x = translations[language]
                y_true = torch.full((batch_size,), real_label).long() #device=device
                y_pred = net.discriminator(x)
                # Loss proportional to discriminator's probability of confusing TP and FP
                loss_fake = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
                loss_fake_batch_avg += loss_fake
                loss_fake.backward()
            loss_total_g += (loss_fake_batch_avg / nr_src_langs)
            
            
            # TODO: possibly average decoder's gradients over nr of src languages
            if avg_dec_grads:
                for p in net.generator.decoder.parameters():
                    p.grad *= avg_factor
            
            # Perform weight updates
            for language in languages['src']:
                optims_g[language].step()
            #print(loss_real_total_d, loss_fake_total_d, loss_total_g)
        # Document accumulated losses per epoch
        train_loss_real_d.append(loss_real_total_d)
        train_loss_fake_d.append(loss_fake_total_d)
        train_loss_g.append(loss_total_g)
        
        print('Mean: ', mean_param(net.generator.decoder))
        print('Progress: ', loss_real_total_d.detach().numpy(), 
              loss_fake_total_d.detach().numpy(), loss_total_g.detach().numpy())
        
        # TODO: Similarity metric-based evaluation per epoch?
        
        # Save checkpoints
        save = last_loss > train_loss_g[-1]  # Provisional: save when loss of generator has improved
        last_loss = train_loss_g[-1]
        save_checkpoint({'epoch': epoch,
                         'model_state_dict': net.state_dict(),
                         'optimizer_state_dicts': 
                             {**{lang: optims_g[lang].state_dict() for lang in languages['src']}, 
                              **{lang: optim_d.state_dict() for lang in languages['trgt']}
                             },
                         'losses': {'train_loss_real_d': train_loss_real_d[-1],
                                    'train_loss_fake_d': train_loss_fake_d[-1],
                                    'train_loss_g': train_loss_g[-1],},
                         }, save)

    # TODO: Final evaluation

    # Store model
    torch.save(net.state_dict(), final_state_path + 'final_model%d.pt' % epoch)

if __name__ == "__main__":
    # execute only if run as a script
    main()



Nr source languages: 2
Nr target languages: 1
Decoder gradient averaging factor: 0.5
Gan
Epoch  0 / 100
Mean:  tensor(0.0003)
Progress:  349.92917 279.79453 321.1089
Epoch  1 / 100
Mean:  tensor(-1.6334e-05)
Progress:  350.18982 96.84667 300.5612
Epoch  2 / 100
Mean:  tensor(-0.0002)
Progress:  349.85635 120.904205 324.34152
Epoch  3 / 100
Mean:  tensor(-0.0003)
Progress:  349.24692 127.83351 323.16083
Epoch  4 / 100
Mean:  tensor(-0.0002)
Progress:  348.79453 113.63088 305.19528
Epoch  5 / 100
Mean:  tensor(-0.0003)
Progress:  349.00992 95.34325 293.73938
Epoch  6 / 100
Mean:  tensor(-0.0002)
Progress:  349.2224 103.67438 300.47552
Epoch  7 / 100
Mean:  tensor(-0.0004)
Progress:  349.47415 102.670715 298.74066
Epoch  8 / 100
Mean:  tensor(-0.0005)
Progress:  349.46805 119.14551 318.5368
Epoch  9 / 100
Mean:  tensor(-0.0004)
Progress:  348.9474 117.397934 323.54987
Epoch  10 / 100
Mean:  tensor(-0.0004)
Progress:  348.09763 123.50196 323.78452
Epoch  11 / 100
Mean:  tensor(-0.0005)
Pro