In [1]:
import os
import fasttext
import fasttext.util
import torch
import math
from model import gan
from sklearn.utils import shuffle
import numpy as np

# Unique Naming
from datetime import datetime
import random, string

In [2]:
def random_string(length=10):
    """
        Generate a random string of given length. For safely storing produced images.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))


def get_model_id():
    """
        Creates folder with unique ID in which everything related to a particular testrun can be saved.
    :return: Unique folder identifier
    """
    # Construct testrun identifier
    TIME_STAMP = datetime.now().strftime("%Y_%d_%m__%H_%M_%S__%f_")
    model_folder_id = TIME_STAMP + '_' + random_string() + '/'

    try:
        os.mkdirs(model_folder_id)
    except Exception as e:
        print('Exception occurred: ', e)

    return model_folder_id

In [3]:
### VARIABLES & ADMINISTRATIVE STUFF ###
# System
#dataset_path = '/media/daniel/Elements/FastText_Data/'  # In case dataset is stored somewhere else, e.g. on hard-drive
dataset_path = ''  #data in same directory
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Network
embedding_dim = 300
internal_dim = 300
hidden = 300

# Train hyperparameters
epochs = 5
batch_size = 32
vocab_size = 2000
num_minibatches = vocab_size // batch_size
real_label, fake_label = 1, 0
languages = {'src': ['de', 'nl'], 'trgt': ['en']}  # Target language to be indicated in last position
checkpoint_frequency = 0  # 0 == Off; i > 0 == actual checkpoint frequency in epochs
avg_grads = True  # Boolean indicating whether to average the grads of decoder & discriminator accumulated over nr of source languages by nr of source langs


In [4]:
# Changed the way languages are stored. 
# For easy access to complete set of all included languages, just concatenate lists
languages = {'src': ['de', 'nl'], 'trgt': ['en']}

for lang in languages['src']+languages['trgt']:
    print(lang)
print(languages)

de
nl
en
{'trgt': ['en'], 'src': ['de', 'nl']}


In [40]:
# Set up saving paths

data_storage_path = './'

model_id = get_model_id()

checkpoint_path = data_storage_path + model_id + 'Checkpoint/'
final_state_path = data_storage_path + model_id + 'Final/'

try:
    if checkpoint_frequency > 0:
        os.makedirs(checkpoint_path)
        print('Created:', checkpoint_path)
    os.makedirs(final_state_path)
    print('Created:', final_state_path)
except Exception as e:
    raise Warning('Exception occurred: Cound not create dirs! Exception:', e)
    
print('Model ID:', model_id)

Exception occurred:  module 'os' has no attribute 'mkdirs'
Created: ./2020_04_06__02_04_29__163672__mfpcenzavm/Final/
Model ID: 2020_04_06__02_04_29__163672__mfpcenzavm/


# Get vocab

In [6]:
def cleaned_vocab(vocab, vocab_size):
    # Remove all punctuation tokens while valid nr of tokens is insufficient yet for having full vocab size
    # TODO & possibly reserve testing vocab

    # Return clean & restricted vocab
    words = vocab.words[:vocab_size]              # Y (labels)
    vects = [vocab[word] for word in words]       # X (input data)

    return vects, words

In [7]:
def add_lang_to_vocab(lang_id, vocab_size, vocabs):
    # Get dataset
    if dataset_path == './':
        fasttext.util.download_model(lang_id)  # Download word embedding vector data if not available
    vocab = fasttext.load_model(dataset_path + 'cc.' + lang_id + '.300.bin')  # Load language data

    # Add train data (embedding-vectors) and labels (words) to vocab
    x, y = cleaned_vocab(vocab, vocab_size)
    vocabs[lang_id] = {'x': torch.tensor(x), 'y': y}

    return vocabs

In [8]:
def load_vocab(languages):
    vocabs = {}
    
    for language in languages['src']+languages['trgt']:
        vocabs = add_lang_to_vocab(language, vocab_size, vocabs)

    print('Successfully loaded language models.')
    return vocabs

In [9]:
    # load vocab
    vocabs = load_vocab(languages)



Successfully loaded language models.


# Define training related functions

In [10]:
def save_checkpoint(data, save):
    if save:
        torch.save(data, checkpoint_path + 'checkpoint_%d.pt' % data['epoch'])

In [11]:
def mean_param(model):
    return torch.mean(torch.cat([param.data.view(-1) for param in model.parameters()], 0))

In [16]:
def get_dataset_sample(lang, vocab, batch_size, include_y=False):
    """
    Thiss function draws batch_size-many training samples at random 
    from a vocab corresponding to queried language.  
    """
    indices = torch.LongTensor(batch_size).random_(0, len(vocab))
    if include_y:
        return vocab['x'][indices], vocab['y'][indices]
    return vocab['x'][indices]


def get_train_data(languages, vocabs, batch_size, include_y=False):
    """
    Returns one set of samples datapoints form a vocabulary for each provided language.
    """
    x, y = {}, {}
    
    # Source languages
    for lang in languages['src']+languages['trgt']:
        if include_y:
            x[lang], y[lang] = get_dataset_sample(lang, vocabs[lang], batch_size, include_y)
        else:
            x[lang] = get_dataset_sample(lang, vocabs[lang], batch_size)
    
    # Return
    if include_y:
        return x, y
    return x

In [42]:
def main():
    
    NLLLoss = torch.nn.NLLLoss()
    nr_src_langs = len(languages['src'])
    print('Nr source languages:', nr_src_langs)
    print('Nr target languages:', len(languages['trgt']))
    
    print(languages)
    
    if avg_grads:
        avg_factor = 1/nr_src_langs
        print('Decoder gradient averaging factor:', avg_factor)
    
    # Get bilingual dictionary for evaluating train loss or at least testing
    dicts = dict()
    #TODO

    # Set up model architecture
    net = gan.GAN(embedding_dim, internal_dim, hidden, languages['src'])

    # Get optimizers; 1 per source language of encoder and 1 for discriminator
    optimizers = {'gen': {}}
    for lang in languages['src']:
        optimizers['gen'][lang] = torch.optim.Adam([{'params': net.generator.encoders[lang].parameters()},
                                                    {'params': net.generator.decoder.parameters()}],
                                                    lr=0.001, betas=(0.9, 0.999), eps=1e-08, 
                                                    weight_decay=0, amsgrad=False)
    optimizers['dis'] = torch.optim.Adam(net.discriminator.parameters(),
                                         lr=0.001, betas=(0.9, 0.999), eps=1e-08, 
                                         weight_decay=0, amsgrad=False)
    
    # Train
    train_loss_real_d, train_loss_fake_d = [], []
    train_loss_real_g, train_loss_fake_g = [], []
    eval_loss = [] # TODO: To be populated...
    last_loss = -1
    
    for epoch in range(epochs):
        print('Epoch ', epoch, '/', epochs)
        loss_real_total_d, loss_fake_total_d, loss_real_total_g, loss_fake_total_g = 0., 0., 0., 0.

        # Train #
        for batch in range(num_minibatches):
            #print('Epoch ', epoch, ', Batch ', batch, '/', num_minibatches)
            
            # Update discriminator #
            net.discriminator.train()
            net.generator.eval()
            net.discriminator.zero_grad()
            
            x = get_train_data(languages, vocabs, batch_size)#.to(device)
            
            # All-fake minibatches - One minibatch per source language
            loss_fake_batch_avg = 0.
            y_true = torch.full((batch_size,), fake_label).long()#.to(device)
            for language in languages['src']:
                x_fake = x[lang]
                x_trans = net.generator(x_fake, language)
                y_pred = net.discriminator(x_trans.detach())      # Detach to avoid computing grads for generator
                
                # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
                loss_fake = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
                loss_fake.backward()    # Compute gradients only for discriminator
                loss_fake_batch_avg += loss_fake
            
            loss_fake_total_d += (loss_fake_batch_avg/nr_src_langs)
            
            # Possibly average discriminators's gradients over nr of src languages 
            # (--> ensures all-fake loss's contribution is equal to that of all-real data)
            if avg_grads:
                for p in net.discriminator.parameters():
                    p.grad *= avg_factor
            
            # All-real minibatch
            x_real = x[languages['trgt'][0]]  # Extract all-real data
            y_true = torch.full((batch_size,), real_label).long()#.to(device)
            y_pred = net.discriminator(x_real)
            
            # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
            loss_real = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
            loss_real.backward()
            loss_real_total_d += loss_real
                    
            optimizers['dis'].step() # Weight update for discriminator

            # Update generator #
            net.generator.train()
            net.discriminator.eval()
            net.generator.zero_grad()
            
            x = get_train_data(languages, vocabs, batch_size)#.to(device)
            
            # All-fake minibatches - One minibatch per source language
            loss_fake_batch_avg = 0.
            y_true = torch.full((batch_size,), real_label).long()#.to(device) # Try to fool the discriinator
            for language in languages['src']:
                x_fake = x[lang]
                x_trans = net.generator(x_fake, language)
                y_pred = net.discriminator(x_trans)      # Detach to avoid computing grads for generator
                
                # Loss proportional to discriminator's probability of misclassifying TP and FP
                loss_fake = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
                loss_fake.backward()    # Compute gradients only for discriminator
                loss_fake_batch_avg += loss_fake
            
            loss_fake_total_g += (loss_fake_batch_avg/nr_src_langs)
            
            # Possibly average decoder's gradients over nr of src languages 
            # (--> ensures that decoder isn't trained as many times as there are source langs per train step)
            if avg_grads:
                for p in net.generator.decoder.parameters():
                    p.grad *= avg_factor
            
            # All-real minibatch
            x_real = x[languages['trgt'][0]]  # Extract all-real data
            y_true = torch.full((batch_size,), fake_label).long()#.to(device)  # Pretend true targets were fake
            y_pred = net.discriminator(x_real)
            
            # Loss proportional to discriminator's probability of misclassifying TP and FP
            loss_real = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
            loss_real.backward()
            loss_real_total_g += loss_real
                    
            # Perform weight updates
            for language in languages['src']:
                optimizers['gen'][language].step()
            
            #print(loss_real_total_d, loss_fake_total_d, loss_real_total_g, loss_fake_total_g)
        
        # Document accumulated losses per epoch
        train_loss_real_d.append(loss_real_total_d)
        train_loss_fake_d.append(loss_fake_total_d)
        train_loss_real_g.append(loss_real_total_g)
        train_loss_fake_g.append(loss_fake_total_g)
        
        #print('Mean: ', mean_param(net.generator.decoder))
        print('Progress: ', loss_real_total_d.detach().numpy(), 
                            loss_fake_total_d.detach().numpy(),
                            loss_real_total_g.detach().numpy(), 
                            loss_fake_total_g.detach().numpy())
        
        # TODO: Similarity metric-based evaluation per epoch?
        
        # Save checkpoints
        print(loss_real_total_g.detach().numpy(), loss_fake_total_g.detach().numpy())
        save = checkpoint_frequency > 0 and epoch % checkpoint_frequency == 0 and \
            last_loss > loss_real_total_g+loss_fake_total_g  # Provisional: save when loss of generator has improved
        last_loss = loss_real_total_g+loss_fake_total_g
        save_checkpoint({'epoch': epoch,
                         'model_state_dict': net.state_dict(),
                         'optimizer_state_dicts': 
                             {**{lang: optimizers['gen'][lang].state_dict() for lang in languages['src']}, 
                              **{languages['trgt'][0]: optimizers['dis']}
                             },
                         'losses': {'train_loss_real_d': train_loss_real_d[-1],
                                    'train_loss_fake_d': train_loss_fake_d[-1],
                                    'train_loss_real_g': train_loss_real_g[-1],
                                    'train_loss_fake_g': train_loss_fake_g[-1],},
                         }, save)

    # TODO: Final evaluation

    # Store model
    torch.save(net.state_dict(), final_state_path + 'final_model%d.pt' % epoch)

if __name__ == "__main__":
    # execute only if run as a script
    main()
    print('Done.')



Nr source languages: 2
Nr target languages: 1
{'trgt': ['en'], 'src': ['de', 'nl']}
Decoder gradient averaging factor: 0.5
Epoch  0 / 5
Progress:  249.00888 94.55626 221.33014 176.41458
221.33014 176.41458
Epoch  1 / 5
Progress:  68.60198 26.276072 45.138794 142.06871
45.138794 142.06871
Epoch  2 / 5
Progress:  41.870083 37.457222 48.166256 60.8237
48.166256 60.8237
Epoch  3 / 5
Progress:  43.026512 40.902798 44.953365 53.800385
44.953365 53.800385
Epoch  4 / 5
Progress:  41.301746 43.934948 47.35086 51.10363
47.35086 51.10363
Done.
