In [36]:
import os
import fasttext
import fasttext.util
import torch
import math
import random
from model import gan
from early_stopping import EarlyStopping
from sklearn.utils import shuffle
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Unique Naming
from datetime import datetime
import random, string

In [2]:
def random_string(length=10):
    """
        Generate a random string of given length. For safely storing produced images.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))


def get_model_id():
    """
        Creates folder with unique ID in which everything related to a particular testrun can be saved.
    :return: Unique folder identifier
    """
    # Construct testrun identifier
    TIME_STAMP = datetime.now().strftime("%Y_%d_%m__%H_%M_%S__%f_")
    model_folder_id = TIME_STAMP + '_' + random_string() + '/'

    try:
        os.makedirs(model_folder_id)
    except Exception as e:
        print('Exception occurred: ', e)

    return model_folder_id

In [31]:
### VARIABLES & ADMINISTRATIVE STUFF ###
# System
#dataset_path = '/media/daniel/Elements/FastText_Data/'  # In case dataset is stored somewhere else, e.g. on hard-drive
dataset_path = ''  # Data in same directory
dictionary_path = ''  # Dictionaries in same directory
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Network
embedding_dim = 300
internal_dim = 300
hidden = 300

# Train hyperparameters
epochs = 20
batch_size = 32
vocab_size = 2000
num_minibatches = vocab_size // batch_size
real_label, fake_label = 1, 0
languages = {'src': ['de', 'nl'], 'trgt': ['en']}  # Target language to be indicated in last position
checkpoint_frequency = 0  # 0 == Off; i > 0 == actual checkpoint frequency in epochs
avg_grads = True  # Boolean indicating whether to average the grads of decoder & discriminator accumulated over nr of source languages by nr of source langs
early_stop = True # Boolean indicating whether to stop early if loss won't decrease for a certain threshold

#testing parameters
N = [1] # List of n nearest neighbours that will be performed in evaluation


In [4]:
# Changed the way languages are stored. 
# For easy access to complete set of all included languages, just concatenate lists
languages = {'src': ['de', 'nl'], 'trgt': ['en']}

for lang in languages['src']+languages['trgt']:
    print(lang)
print(languages)

de
nl
en
{'src': ['de', 'nl'], 'trgt': ['en']}


In [27]:
# Set up saving paths
data_storage_path = './'
model_id = get_model_id()
checkpoint_path = data_storage_path + model_id + 'Checkpoint/'
final_state_path = data_storage_path + model_id + 'Final/'

try:
    if checkpoint_frequency > 0:
        os.makedirs(checkpoint_path)
        print('Created:', checkpoint_path)
    os.makedirs(final_state_path)
    print('Created:', final_state_path)
except Exception as e:
    raise Warning('Exception occurred: Cound not create dirs! Exception:', e)
    
print('Model ID:', model_id)

Created: ./2020_04_06__11_11_23__045717__lxbtxpxcuy/Final/
Model ID: 2020_04_06__11_11_23__045717__lxbtxpxcuy/


# Get vocab

In [None]:
def full_vocab(vocab):
    # Returns the word embeddings and matching labels for the full vocabulary
    words = vocab.words
    vectors = [vocab[word] for word in words]
    return vectors, words

In [28]:
def cleaned_vocab(vocab, vocab_size):
    # Remove all punctuation tokens while valid nr of tokens is insufficient yet for having full vocab size
    # TODO & possibly reserve testing vocab
    # Return clean & restricted vocab
    words = vocab.words[:vocab_size]              # Y (labels)
    vects = [vocab[word] for word in words]       # X (input data)

    return vects, words

In [7]:
def add_lang_to_vocab(lang_id, vocab_size, vocabs):
    # Get dataset
    if dataset_path == './':
        fasttext.util.download_model(lang_id)  # Download word embedding vector data if not available
    vocab = fasttext.load_model(dataset_path + 'cc.' + lang_id + '.300.bin')  # Load language data

    # Add train data (embedding-vectors) and labels (words) to vocab
    X, Y = full_vocab(vocab)
    x, y = cleaned_vocab(vocab, vocab_size)
    vocabs[lang_id] = {'x': torch.tensor(x), 'y': y}
    full_vocabs[lang_id] = {'X': X, 'y': Y}

    return vocabs

In [8]:
def load_vocab(languages):
    vocabs = {}
    
    for language in languages['src']+languages['trgt']:
        vocabs, full_vocabs = add_lang_to_vocab(language, vocab_size, vocabs)

    print('Successfully loaded language models.')
    return vocabs, full_vocabs

In [9]:
    # load vocab
    vocabs, full_vocabs = load_vocab(languages)
    
    source_vocabs = {}
    source_full_vocabs = {}
    for source_language in languages['src']
        source_vocabs[source_language] = vocabs[source_langauge]
        source_full_vocabs[source_language] = full_vocabs[source_langauge]
    target_full_vocabs = full_vocabs[languages['trgt'][0]]



Successfully loaded language models.


# Dictionary related functions

In [1]:
def convert_dictionary(dictionary_text):
    # Converts an input dictionary text file to a python dictionary
    dictionary = {}
    source = True
    source_word = ''
    target_word = ''
    
    for character in dictionary_text:
        if source is True:
            if character is '\t' or character is ' ':
                source = False
            else:
                source_word = source_word + character
        else:
            if character is '\n':
                source = True
                if source_word in dictionary:
                    dictionary[source_word].append(target_word)
                else:
                    dictionary[source_word] = [target_word]
                source_word = ''
                target_word = ''
            else:
                target_word = target_word + character
                
    return dictionary

In [2]:
def load_dictionaries(languages):
    # Loads in the bilingual dictionaries
    dictionaries = {}
    
    for source_language in languages['src']:
        file = open(dictionary_path + language + '-' + languages['trgt'][0] + '.txt', 'r')
        dictionary_text = file.read()
        dictionaries[source_language] = convert_dictionary(dictionary_text)
    
    return dictionaries

In [None]:
def split_translation_task(languages, full_vocabs, dictionaries):
    # Creates a split in eval and training translation task
    eval_words = {}
    test_words = {}
    
    for source_language in languages['src']:
        source_words = list(dictionaries[source_language].keys())
        source_words = random.shuffle(source_words)
        eval_words[source_langauge] = source_words[0:int(len(source_words)/2)-1]
        test_words[source_language] = source_words[int(len(source_words)/2):len(source_words)-1]
        
    return eval_words, test_words

In [None]:
    #load in dictionaries
    dictionaries = load_dictionaries(languages)
    
    # split in train and evaluation
    eval_words, test_words = split_translation_task(languages, full_vocabs, dictionaries)

# Nearest neighbour fitting

In [3]:
def fit_neighbours(N, languages, full_vocabs):
    target_neighbours = {}
    
    for n in N:
        target_neighbours[n] = NearestNeighbors(n_neighbors=n, metric='cosine').fit(full_vocabs[languages['trgt'][0]]['X'])
        
    return target_neighbours

In [None]:
    neighbors = fit_neighbours(N, languages, full_vocabs)

# Evaluation functions

In [4]:
def compute_cosine(vector1, vector2):
    # Computes the cosine simularity between two vectors
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    return dot_product/(norm_vector1*norm_vector2)

In [5]:
def get_n_translations(generator, language, source_vector, target_vocab, neighbors):
    # Gets n possible translations, as given by the n nearest neighbors of the transformed
    # source vector in the target embeddings space, we will use a package for this for optimization
    # purposes. n is given in the nearest neighbor fitting stage.
    transformed_source_embedding = generator.forward(source_vector, language)

    # only takes 2D arrays, hence the extra bracket [1][0] stands for select indices of
    # the first input vector (the only one in this case)
    vocab_indices = neighbors(np.array([transformed_source_embedding]))[1][0]

    target_vectors = []
    target_words = []
    for index in vocab_indices:
        target_vectors.append(target_vocab['X'][index])
        target_words.append(target_vocab['Y'][index])

    return target_vectors, target_words

In [6]:
def get_average_cosine(generator, language, source_word_vectors, target_vocab, neighbors):
    # Computes the average cosine simularity between the source words and their translations
    sum_of_cosines = 0
    for source_word_vector in source_word_vectors:
        translated_word_vector = get_n_translations(generator, language, source_word_vector, target_vocab, neighbors)[0][0]
        sum_of_cosines += compute_cosine(source_word_vector, translated_word_vector)
    return sum_of_cosines/len(source_word_vectors)

In [7]:
def get_translation_accuracy(generator, language, source_words, source_vocab, target_vocab, dictionary, neighbors):
    # Compute the accuracy of translation over the given set of source words
    correct_translations = 0
    for source_word in source_words:
        source_word_index = source_vocab['Y'].index(source_word)
        source_word_vector = source_vocab['X'][source_word_index]
        n_target_words = get_n_translations(generator, language, source_word_vector, target_vocab, neighbors)[1]
        for target_word in n_target_words:
            if target_word in dictionary[source_word]:
                correct_translations += 1
                break
    return correct_translations/len(source_words)

In [8]:
def evaluation(generator, languages, source_training_vocabs, source_eval_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N):
    # Evaluates the current model by using both an unsupervised cosine similiraty metric and a 
    # supervised translation accuracy metric. We have included both to see how they compare.
    for source_language in languages['src']:
        cosine_metric =  get_average_cosine(generator, source_language, source_training_vocabs[source_language]['x'], target_full_vocab, neighbors)

        accuracy_text = 'accuracies are '
        for n in N:
            accuracy = get_translation_accuracy(generator, source_language, source_eval_words[source_language], source_full_vocabs[source_language], target_full_vocab, dictionaries[source_language], neighbours)
            accuracy_text = accuracy_text + 'p@' + n + '=' + accuracy + ', '
        
        print('evaluation of source language ' + source_language + ': average cosine=' + cosine_metric + accuracy_text)
    

In [9]:
def testing(generator, languages, source_test_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N):
    # Testing based on translation accuracy on testing set
    for source_language in languages['src']:
        accuracy_text = ''
        for n in N:
            accuracy = get_translation_accuracy(generator, source_language, source_test_words[source_language], source_full_vocabs[source_language], target_full_vocab, dictionaries[source_language], neighbours)
            accuracy_text = accuracy_text + 'p@' + n + '=' + accuracy + ', '
        
        print('Testing accuracies of source language ' + source_language + ": " + accuracy_text)

# Define training related functions

In [10]:
def save_checkpoint(data, save):
    if save:
        torch.save(data, checkpoint_path + 'checkpoint_%d.pt' % data['epoch'])

In [11]:
def mean_param(model):
    return torch.mean(torch.cat([param.data.view(-1) for param in model.parameters()], 0))

In [29]:
def get_dataset_sample(lang, vocab, batch_size, include_y=False):
    """
    This function draws batch_size-many training samples at random 
    from a vocab corresponding to queried language.  
    """
    indices = torch.LongTensor(batch_size).random_(0, len(vocab))
    if include_y:
        return vocab['x'][indices], vocab['y'][indices]
    return vocab['x'][indices]


def get_train_data(languages, vocabs, batch_size, include_y=False):
    """
    Returns one set of samples datapoints form a vocabulary for each provided language.
    """
    x, y = {}, {}
    
    # Source languages
    for lang in languages['src']+languages['trgt']:
        if include_y:
            x[lang], y[lang] = get_dataset_sample(lang, vocabs[lang], batch_size, include_y)
        else:
            x[lang] = get_dataset_sample(lang, vocabs[lang], batch_size)
    
    # Return
    if include_y:
        return x, y
    return x

In [41]:
def main():
    
    NLLLoss = torch.nn.NLLLoss()
    nr_src_langs = len(languages['src'])
    print('Nr source languages:', nr_src_langs)
    print('Nr target languages:', len(languages['trgt'])) 
    print('\n', languages)
    
    if avg_grads:
        avg_factor = 1/nr_src_langs
        print('Decoder gradient averaging factor:', avg_factor, "\n")
    
    # Get bilingual dictionary for evaluating train loss or at least testing
    dicts = dict()
    #TODO

    # Set up model architecture
    net = gan.GAN(embedding_dim, internal_dim, hidden, languages['src'])

    # Get optimizers; 1 per source language of encoder and 1 for discriminator
    optimizers = {'gen': {}}
    for lang in languages['src']:
        optimizers['gen'][lang] = torch.optim.Adam([{'params': net.generator.encoders[lang].parameters()},
                                                    {'params': net.generator.decoder.parameters()}],
                                                    lr=0.001, betas=(0.9, 0.999), eps=1e-08, 
                                                    weight_decay=0, amsgrad=False)
    optimizers['dis'] = torch.optim.Adam(net.discriminator.parameters(),
                                         lr=0.001, betas=(0.9, 0.999), eps=1e-08, 
                                         weight_decay=0, amsgrad=False)
    
    # Train
    train_loss_real_d, train_loss_fake_d = [], []
    train_loss_real_g, train_loss_fake_g = [], []
    eval_loss = [] # TODO: To be populated...
    last_loss = -1
    
    es = EarlyStopping(patience=10) #patience = amount of epochs the loss has to stop decreasing in a row for it to early stop
    
    for epoch in range(epochs):
        print('Epoch ', epoch, '/', epochs)
        loss_real_total_d, loss_fake_total_d, loss_real_total_g, loss_fake_total_g = 0., 0., 0., 0.

        # Train #
        for batch in range(num_minibatches):
            #print('Epoch ', epoch, ', Batch ', batch, '/', num_minibatches)
            
            # Update discriminator #
            net.discriminator.train()
            net.generator.eval()
            net.discriminator.zero_grad()
            
            x = get_train_data(languages, vocabs, batch_size)#.to(device)
            
            # All-fake minibatches - One minibatch per source language
            loss_fake_batch_avg = 0.
            y_true = torch.full((batch_size,), fake_label).long()#.to(device)
            for language in languages['src']:
                x_fake = x[lang]
                x_trans = net.generator(x_fake, language)
                y_pred = net.discriminator(x_trans.detach())      # Detach to avoid computing grads for generator
                
                # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
                loss_fake = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
                loss_fake.backward()    # Compute gradients only for discriminator
                loss_fake_batch_avg += loss_fake
            
            loss_fake_total_d += (loss_fake_batch_avg/nr_src_langs)
            
            # Possibly average discriminators's gradients over nr of src languages 
            # (--> ensures all-fake loss's contribution is equal to that of all-real data)
            if avg_grads:
                for p in net.discriminator.parameters():
                    p.grad *= avg_factor
            
            # All-real minibatch
            x_real = x[languages['trgt'][0]]  # Extract all-real data
            y_true = torch.full((batch_size,), real_label).long()#.to(device)
            y_pred = net.discriminator(x_real)
            
            # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
            loss_real = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
            loss_real.backward()
            loss_real_total_d += loss_real
                    
            optimizers['dis'].step() # Weight update for discriminator

            # Update generator #
            net.generator.train()
            net.discriminator.eval()
            net.generator.zero_grad()
            
            x = get_train_data(languages, vocabs, batch_size)#.to(device)
            
            # All-fake minibatches - One minibatch per source language
            loss_fake_batch_avg = 0.
            y_true = torch.full((batch_size,), real_label).long()#.to(device) # Try to fool the discriinator
            for language in languages['src']:
                x_fake = x[lang]
                x_trans = net.generator(x_fake, language)
                y_pred = net.discriminator(x_trans)      # Detach to avoid computing grads for generator
                
                # Loss proportional to discriminator's probability of misclassifying TP and FP
                loss_fake = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
                loss_fake.backward()    # Compute gradients only for discriminator
                loss_fake_batch_avg += loss_fake
            
            loss_fake_total_g += (loss_fake_batch_avg/nr_src_langs)
            
            # Possibly average decoder's gradients over nr of src languages 
            # (--> ensures that decoder isn't trained as many times as there are source langs per train step)
            if avg_grads:
                for p in net.generator.decoder.parameters():
                    p.grad *= avg_factor
            
            # All-real minibatch
            x_real = x[languages['trgt'][0]]  # Extract all-real data
            y_true = torch.full((batch_size,), fake_label).long()#.to(device)  # Pretend true targets were fake
            y_pred = net.discriminator(x_real)
            
            # Loss proportional to discriminator's probability of misclassifying TP and FP
            loss_real = NLLLoss(torch.log(y_pred+0.0000001), y_true)  # NLLLoss needs log(prob_distribution); adding small amount to avoid log(0)
            loss_real.backward()
            loss_real_total_g += loss_real
                    
            # Perform weight updates
            for language in languages['src']:
                optimizers['gen'][language].step()
            
            #print(loss_real_total_d, loss_fake_total_d, loss_real_total_g, loss_fake_total_g)
        
        # Document accumulated losses per epoch
        train_loss_real_d.append(loss_real_total_d)
        train_loss_fake_d.append(loss_fake_total_d)
        train_loss_real_g.append(loss_real_total_g)
        train_loss_fake_g.append(loss_fake_total_g)
        
        #print('Mean: ', mean_param(net.generator.decoder))
        print('Progress: ', loss_real_total_d.detach().numpy(), 
                            loss_fake_total_d.detach().numpy(),
                            loss_real_total_g.detach().numpy(), 
                            loss_fake_total_g.detach().numpy())
        
        # Evaluation step
        valuation(generator, languages, source_vocabs, eval_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N)

        if early_stop: # if early stopping is enabled or not
            if es.step(loss_real_total_g.detach()): # using the real loss of the generator for now, maybe use something else later? e.g. evaluation loss?
                print('early stopping')
                break  # early stop criterion is met, stop the loop now
        
        # Save checkpoints
        print(loss_real_total_g.detach().numpy(), loss_fake_total_g.detach().numpy())
        
        save = checkpoint_frequency > 0 and epoch % checkpoint_frequency == 0 and \
            last_loss > loss_real_total_g+loss_fake_total_g  # Provisional: save when loss of generator has improved
        last_loss = loss_real_total_g+loss_fake_total_g
        save_checkpoint({'epoch': epoch,
                         'model_state_dict': net.state_dict(),
                         'optimizer_state_dicts': 
                             {**{lang: optimizers['gen'][lang].state_dict() for lang in languages['src']}, 
                              **{languages['trgt'][0]: optimizers['dis']}
                             },
                         'losses': {'train_loss_real_d': train_loss_real_d[-1],
                                    'train_loss_fake_d': train_loss_fake_d[-1],
                                    'train_loss_real_g': train_loss_real_g[-1],
                                    'train_loss_fake_g': train_loss_fake_g[-1],},
                         }, save)

    # Final testing
    testing(generator, languages, test_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N)

    # Store model
    torch.save(net.state_dict(), final_state_path + 'final_model%d.pt' % epoch)

if __name__ == "__main__":
    # execute only if run as a script
    main()
    print('Done.')



Nr source languages: 2
Nr target languages: 1

 {'src': ['de', 'nl'], 'trgt': ['en']}
Decoder gradient averaging factor: 0.5 

Epoch  0 / 20
Progress:  252.09407 89.848366 231.7427 175.53697
sdsfs
231.7427 175.53697
Epoch  1 / 20
Progress:  60.589394 27.374695 47.230286 117.01661
47.230286 117.01661
Epoch  2 / 20
Progress:  47.255955 33.009087 44.820065 89.150185
44.820065 89.150185
Epoch  3 / 20
Progress:  41.449257 40.61509 46.77988 53.627583
46.77988 53.627583
Epoch  4 / 20
Progress:  39.266617 44.66491 48.851814 48.163185
48.851814 48.163185
Epoch  5 / 20
Progress:  39.810505 44.277462 52.205376 50.089413
52.205376 50.089413
Epoch  6 / 20
Progress:  33.58342 42.449757 57.361675 55.48435
57.361675 55.48435
Epoch  7 / 20
Progress:  28.201244 36.203766 70.14098 60.422897
70.14098 60.422897
Epoch  8 / 20
Progress:  23.747543 27.071314 104.17129 73.9532
104.17129 73.9532
Epoch  9 / 20
Progress:  17.714909 18.038177 120.94053 98.00282
120.94053 98.00282
Epoch  10 / 20
Progress:  16.56294