In [1]:
import os
import fasttext
import fasttext.util
import torch
import math
import random
from early_stopping import EarlyStopping
from sklearn.utils import shuffle
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Unique Naming
from datetime import datetime
import random, string
import importlib

In [2]:
from model import gan

In [3]:
def random_string(length=10):
    """
        Generate a random string of given length. For safely storing produced images.
    """
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for i in range(length))


def get_model_id():
    """
        Creates folder with unique ID in which everything related to a particular testrun can be saved.
    :return: Unique folder identifier
    """
    # Construct testrun identifier
    TIME_STAMP = datetime.now().strftime("%Y_%d_%m__%H_%M_%S__%f_")
    model_folder_id = TIME_STAMP + '_' + random_string() + '/'

    try:
        os.makedirs(model_folder_id)
    except Exception as e:
        print('Exception occurred: ', e)

    return model_folder_id

In [4]:
### VARIABLES & ADMINISTRATIVE STUFF ###
# System
#dataset_path = '/media/daniel/Elements/FastText_Data/'  # In case dataset is stored somewhere else, e.g. on hard-drive
dataset_path = '/media/daniel/Elements/FastText_Data/'  # Data in same directory
dictionary_path = '/media/daniel/Elements/FastText_Data/'  # Dictionaries in same directory
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Network
embedding_dim = 300
internal_dim = 300
output_dim = 1

# Train hyperparameters
epochs = 100
batch_size = 32
vocab_size = 5000
num_minibatches = vocab_size // batch_size
real_label, fake_label = 1, 0
languages = {'src': ['de', 'nl']
             , 'trgt': ['en']}  # Target language to be indicated in last position
checkpoint_frequency = 0  # 0 == Off; i > 0 == actual checkpoint frequency in epochs
avg_grads = False  # Boolean indicating whether to average the grads of decoder & discriminator accumulated over nr of source languages by nr of source langs
early_stop = False # Boolean indicating whether to stop early if loss won't decrease for a certain threshold
eval_frequency = 10

#testing parameters
N = [1] # List of n nearest neighbors that will be performed in evaluation


In [5]:
# Changed the way languages are stored. 
# For easy access to complete set of all included languages, just concatenate lists
languages = {'src': ['de', 'nl'], 'trgt': ['en']}

for lang in languages['src']+languages['trgt']:
    print(lang)
print(languages)

de
nl
en
{'trgt': ['en'], 'src': ['de', 'nl']}


In [6]:
# Set up saving paths
data_storage_path = './'
model_id = get_model_id()
checkpoint_path = data_storage_path + model_id + 'Checkpoint/'
final_state_path = data_storage_path + model_id + 'Final/'

try:
    if checkpoint_frequency > 0:
        os.makedirs(checkpoint_path)
        print('Created:', checkpoint_path)
    os.makedirs(final_state_path)
    print('Created:', final_state_path)
except Exception as e:
    raise Warning('Exception occurred: Cound not create dirs! Exception:', e)
    
print('Model ID:', model_id)

Created: ./2020_10_06__21_06_17__945418__pzizvxqfda/Final/
Model ID: 2020_10_06__21_06_17__945418__pzizvxqfda/


# Get vocab

In [7]:
def full_vocab(vocab):
    # Returns the word embeddings and matching labels for the full vocabulary
    words = vocab.words
    vectors = [vocab[word] for word in words]
    return vectors, words

In [8]:
def cleaned_vocab(vocab, vocab_size):
    # Remove all punctuation tokens while valid nr of tokens is insufficient yet for having full vocab size
    # TODO & possibly reserve testing vocab
    # Return clean & restricted vocab
    words = vocab.words[:vocab_size]              # Y (labels)
    vects = [vocab[word] for word in words]       # X (input data)

    return vects, words

In [9]:
def add_lang_to_vocab(lang_id, vocab_size, vocabs, full_vocabs):
    # Get dataset
    if dataset_path == './':
        fasttext.util.download_model(lang_id)  # Download word embedding vector data if not available
    vocab = fasttext.load_model(dataset_path + 'cc.' + lang_id + '.300.bin')  # Load language data

    # Add train data (embedding-vectors) and labels (words) to vocab
    X, Y = cleaned_vocab(vocab,500000)
    x, y = cleaned_vocab(vocab, vocab_size)
    vocabs[lang_id] = {'x': torch.tensor(x), 'y': y}
    full_vocabs[lang_id] = {'X': X, 'Y': Y}

    return vocabs, full_vocabs

In [10]:
def load_vocab(languages):
    vocabs = {}
    full_vocabs = {}
    
    for language in languages['src']+languages['trgt']:
        vocabs, full_vocabs = add_lang_to_vocab(language, vocab_size, vocabs, full_vocabs)

    print('Successfully loaded language models.')
    return vocabs, full_vocabs

In [11]:
#load vocab (keep in independent cell for bugfixing purposes)
vocabs, full_vocabs = load_vocab(languages)



Successfully loaded language models.


In [12]:
source_vocabs = {}
source_full_vocabs = {}

for source_language in languages['src']:
    source_vocabs[source_language] = vocabs[source_language]
    source_full_vocabs[source_language] = full_vocabs[source_language]
target_full_vocabs = full_vocabs[languages['trgt'][0]]


# Dictionary related functions

In [13]:
def convert_dictionary(dictionary_text):
    # Converts an input dictionary text file to a python dictionary
    dictionary = {}
    source = True
    source_word = ''
    target_word = ''
    
    for character in dictionary_text:
        if source is True:
            if character is '\t' or character is ' ':
                source = False
            else:
                source_word = source_word + character
        else:
            if character is '\n':
                source = True
                if source_word in dictionary:
                    dictionary[source_word].append(target_word)
                else:
                    dictionary[source_word] = [target_word]
                source_word = ''
                target_word = ''
            else:
                target_word = target_word + character
                
    return dictionary

In [14]:
def load_dictionaries(languages):
    # Loads in the bilingual dictionaries
    dictionaries = {}
    
    for source_language in languages['src']:
        file = open(dictionary_path + source_language + '-' + languages['trgt'][0] + '.txt', 'r', errors='ignore')
        dictionary_text = file.read()
        dictionaries[source_language] = convert_dictionary(dictionary_text)
    
    return dictionaries

In [15]:
def split_translation_task(languages, source_full_vocabs, dictionaries):
    # Creates a split in eval and training translation task
    eval_words = {}
    test_words = {}
    
    for source_language in languages['src']:
        source_words = list(dictionaries[source_language].keys())
#         random.shuffle(source_words)
        eval_list = []
        for source_word in source_words:
            if source_word in source_full_vocabs[source_language]['Y']:
                eval_list.append(source_word)
            if len(eval_list) is 200:
                eval_words[source_language] = eval_list
                break
#             eval_words[source_language] = source_words[0:50]
#             test_words[source_language] = source_words[50:150]        
#         eval_words[source_language] = source_words[0:int(len(source_words)/2)]
#         test_words[source_language] = source_words[int(len(source_words)/2):len(source_words)]
        
    return eval_words, test_words

In [16]:
#load in dictionaries (independent cell for bugfixing)
dictionaries = load_dictionaries(languages)

# split in train and evaluation
eval_words, test_words = split_translation_task(languages, source_full_vocabs, dictionaries)

# Nearest neighbor fitting

In [17]:
def fit_neighbors(N, languages, full_vocabs):
    target_neighbors = {}
    
    for n in N:
        target_neighbors[n] = NearestNeighbors(n_neighbors=n, metric='cosine').fit(full_vocabs[languages['trgt'][0]]['X'])
        
    return target_neighbors

In [18]:
neighbors = fit_neighbors(N, languages, full_vocabs)

# Evaluation functions

In [19]:
def compute_cosine(vector1, vector2):
    # Computes the cosine simularity between two vectors
    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    return dot_product/(norm_vector1*norm_vector2)

In [21]:
def get_n_translations_batch(generator, language, source_vectors, target_vocab, neighbors):
    # Gets n possible translations, as given by the n nearest neighbors of the transformed
    # source vector in the target embeddings space, we will use a package for this for optimization
    # purposes. n is given in the nearest neighbor fitting stage.
    #print(source_vector.numpy()[0])
    transformed_source_embedding = generator(torch.as_tensor(source_vectors), language).detach().numpy()

    # only takes 2D arrays, hence the extra bracket [1][0] stands for select indices of
    # the first input vector (the only one in this case)
    vocab_indices = neighbors.kneighbors(np.array(transformed_source_embedding))[1]
    target_vectors = []
    target_words = []
    for target_indices in vocab_indices:
        vectors = []
        words = []
        for index in target_indices:
            vectors.append(target_vocab['X'][index])
            words.append(target_vocab['Y'][index])
        target_vectors.append(vectors)
        target_words.append(words)

    return target_vectors, target_words

In [23]:
def get_average_cosine_batch(generator, language, source_word_vectors, target_vocab, neighbors):
    # Computes the average cosine simularity between the source words and their translations
    sum_of_cosines = 0
    translated_word_vectors = get_n_translations_batch(generator, language, source_word_vectors, target_vocab, neighbors[1])[0]
    for source_word_vector, translated_word_vector in zip(source_word_vectors, translated_word_vectors):
        sum_of_cosines += compute_cosine(source_word_vector, translated_word_vector[0])
    return sum_of_cosines/len(source_word_vectors)

In [25]:
def get_translation_accuracy_batch(generator, language, source_words, source_vocab, target_vocab, dictionary, neighbors):
    # Compute the accuracy of translation over the given set of source words
    correct_translations = 0
    source_word_vectors = []
    for source_word in source_words:
        source_word_index = source_vocab['Y'].index(source_word)
        source_word_vectors.append(source_vocab['X'][source_word_index])
    target_words = get_n_translations_batch(generator, language, source_word_vectors, target_vocab, neighbors)[1]
    for n_target_words,source_word in zip(target_words,source_words):
        for target_word in n_target_words:
            #print(target_word,dictionary[source_word]) #check the translation and og word
            if target_word in dictionary[source_word]:
                correct_translations += 1
                break
    return correct_translations/len(source_words)

In [26]:
def evaluation(generator, languages, source_training_vocabs, source_eval_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N):
    # Evaluates the current model by using both an unsupervised cosine similiraty metric and a 
    # supervised translation accuracy metric. We have included both to see how they compare.
    for source_language in languages['src']:
        cosine_metric =  get_average_cosine_batch(generator, source_language, source_training_vocabs[source_language]['x'], target_full_vocabs, neighbors) #experimental
#         cosine_metric = ''
        accuracy_text = 'accuracies are '
        for n in N:
            accuracy = get_translation_accuracy_batch(generator, source_language, source_eval_words[source_language], source_full_vocabs[source_language], target_full_vocabs, dictionaries[source_language], neighbors[n])
            accuracy_text = str(accuracy_text) + 'p@' + str(n) + '=' + str(accuracy) + ', '
        
        print('evaluation of source language ' + source_language + ': average cosine=',cosine_metric, accuracy_text)
    

In [27]:
def testing(generator, languages, source_test_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N):
    # Testing based on translation accuracy on testing set
    for source_language in languages['src']:
        accuracy_text = ''
        for n in N:
            accuracy = get_translation_accuracy_batch(generator, source_language, source_test_words[source_language], source_full_vocabs[source_language], target_full_vocabs, dictionaries[source_language], neighbors[n])
            accuracy_text = accuracy_text + 'p@' + n + '=' + accuracy + ', '
        
        print('Testing accuracies of source language ' + source_language + ": " + accuracy_text)

# Auto Encoder

In [28]:
class AutoEncoder(torch.nn.Module):

    def __init__(self, D_in, H, shared_decoder):
        # Encoder
        super(AutoEncoder, self).__init__()
        self.w1 = torch.nn.Linear(D_in, H)
        self.w2 = torch.nn.Linear(H, H)
        
        self.activation = torch.nn.functional.relu
        
        # Decoder
        self.decoder = shared_decoder
        
    def forward(self, x):
        # Use encoder for encoding
        x = self.w1(x)
        x = self.activation(x)
        x = self.w2(x)
        
        # Decoder
        x = self.decoder(x)
        
        return x

In [29]:
def AutoEncoderLoss(output, target):
    # MSE loss
    loss = torch.mean((output - target)**2)
    return loss

In [30]:
"""
# Dummy test for AutoEncoder
from model import gan

# Param settings
# Network
embedding_dim = 300
internal_dim = 300
output_dim = 1#2

# Train hyperparameters
epochs = 100
batch_size = 32
vocab_size = 5000
num_minibatches = vocab_size // batch_size
real_label, fake_label = 1, 0
languages = {'src': ['de', 'nl']
             , 'trgt': ['en']}  # Target language to be indicated in last position

# Architecture

net = gan.GAN(embedding_dim, internal_dim, output_dim, languages['src'])

ae = AutoEncoder(embedding_dim, internal_dim, net.generator.decoder)
optimizer_ae = torch.optim.Adam(ae.parameters(),
                                         lr=0.01, betas=(0.9, 0.999), eps=1e-08, 
                                         weight_decay=0., amsgrad=False)

rows, cols = 10, 300
train_data = torch.rand(rows, cols)
print(train_data)


for i in range(1000):
    ae.train()
    optimizer_ae.zero_grad()
    y_pred = ae(train_data)
    loss = AutoEncoderLoss(y_pred, train_data)
    loss.backward()
    optimizer_ae.step()
    print('Epoch', i, 'Loss:', loss.detach().numpy())
    """

"\n# Dummy test for AutoEncoder\nfrom model import gan\n\n# Param settings\n# Network\nembedding_dim = 300\ninternal_dim = 300\noutput_dim = 2\n\n# Train hyperparameters\nepochs = 100\nbatch_size = 32\nvocab_size = 5000\nnum_minibatches = vocab_size // batch_size\nreal_label, fake_label = 1, 0\nlanguages = {'src': ['de', 'nl']\n             , 'trgt': ['en']}  # Target language to be indicated in last position\n\n# Architecture\n\nnet = gan.GAN(embedding_dim, internal_dim, output_dim, languages['src'])\n\nae = AutoEncoder(embedding_dim, internal_dim, net.generator.decoder)\noptimizer_ae = torch.optim.Adam(ae.parameters(),\n                                         lr=0.01, betas=(0.9, 0.999), eps=1e-08, \n                                         weight_decay=0., amsgrad=False)\n\nrows, cols = 10, 300\ntrain_data = torch.rand(rows, cols)\nprint(train_data)\n\n\nfor i in range(1000):\n    ae.train()\n    optimizer_ae.zero_grad()\n    y_pred = ae(train_data)\n    loss = AutoEncoderLoss(y_pr

# Define training related functions

In [31]:
def save_checkpoint(data, save):
    if save:
        torch.save(data, checkpoint_path + 'checkpoint_%d.pt' % data['epoch'])

In [32]:
def mean_param(model):
    return torch.mean(torch.cat([param.data.view(-1) for param in model.parameters()], 0))

In [33]:
def get_dataset_sample(lang, vocab, batch_size, include_y=False):
    """
    This function draws batch_size-many training samples at random 
    from a vocab corresponding to queried language.  
    """
    indices = torch.LongTensor(batch_size).random_(0, len(vocab))
    if include_y:
        return vocab['x'][indices], vocab['y'][indices]
    return vocab['x'][indices]


def get_train_data(languages, vocabs, batch_size, include_y=False):
    """
    Returns one set of samples datapoints form a vocabulary for each provided language.
    """
    x, y = {}, {}
    
    # Source languages
    for lang in languages['src']+languages['trgt']:
        if include_y:
            x[lang], y[lang] = get_dataset_sample(lang, vocabs[lang], batch_size, include_y)
        else:
            x[lang] = get_dataset_sample(lang, vocabs[lang], batch_size)
    
    # Return
    if include_y:
        return x, y
    return x

In [34]:
# For debugging - Compute sum of abs(gradients) of model
def get_summed_abs_grads(model):
#     summed_abs = torch.tensor(0)
    summed_abs = 0
    for p in model.parameters():
        summed_abs += torch.sum(torch.abs(p))
    return summed_abs

In [None]:
def main():
    
    nr_src_langs = len(languages['src'])
    nr_trgt_langs = len(languages['trgt'])
    nr_langs = nr_src_langs + nr_trgt_langs
    print('Nr source languages:', nr_src_langs)
    print('Nr target languages:', len(languages['trgt'])) 
    print('\n', languages)
    
    #if avg_grads:
    #    avg_factor = 1/nr_src_langs
    #    print('Decoder gradient averaging factor:', avg_factor, "\n")
    
    ### GAN ###
    importlib.reload(gan)
    
    # Set up model architecture
    net = gan.GAN(embedding_dim, internal_dim, output_dim, languages['src'])
    print(net)
    #NLLLoss = torch.nn.NLLLoss()
    criterion = torch.nn.BCELoss()

    # Get optimizers; 1 per source language of encoder and 1 for discriminator
    optimizers = {'gen': {}}
    for lang in languages['src']:
        optimizers['gen'][lang] = torch.optim.Adam([{'params': net.generator.encoders[lang].parameters()},
                                                    {'params': net.generator.decoder.parameters()}
                                                   ],
                                                    lr=0.0001, betas=(0.9, 0.999), eps=1e-08, 
                                                    weight_decay=0.01, amsgrad=False)
    optimizers['dis'] = torch.optim.Adam(net.discriminator.parameters(),
                                         lr=0.001, betas=(0.9, 0.999), eps=1e-08, 
                                         weight_decay=0.01, amsgrad=False)
    
    ### AUTO ENCODER ###
    ae = AutoEncoder(embedding_dim, internal_dim, net.generator.decoder)
    optimizer_ae = torch.optim.Adam(ae.parameters(),
                                         lr=0.00001, betas=(0.9, 0.999), eps=1e-08, 
                                         weight_decay=0., amsgrad=False)
    
    
    # Train
    train_loss_gen, train_loss_dis, train_loss_ae = [], [], []
    eval_loss = [] # TODO: To be populated...
    last_loss = -1
    
    es = EarlyStopping(patience=10) #patience = amount of epochs the loss has to stop decreasing in a row for it to early stop
    
    # TODO: 1000 epochs of pre-training?
    
    for epoch in range(epochs):
        print('Epoch ', epoch, '/', epochs)
        loss_gen, loss_dis, loss_ae = 0., 0., 0.

        # Train #
        for batch in range(num_minibatches):
            
            ### GAN STEP ###
            # Update discriminator #
            net.discriminator.train()
            net.generator.eval()
            net.discriminator.zero_grad()
            
            # Retrieve data
            x = get_train_data(languages, vocabs, batch_size)#.to(device) 

            # Init data-storage
            y_preds = torch.zeros([nr_langs*batch_size, 1]) # 2
            y_true = torch.zeros([nr_langs*batch_size])#.long()
            
            y_true[0:batch_size] = real_label  # First elements are target embeddings

            
            # All-real minibatch
            x_real = x[languages['trgt'][0]]  # Extract all-real data
            y_preds[0:batch_size] = net.discriminator(x_real)
            
            # All-fake minibatches - One minibatch per source language
            for i, language in enumerate(languages['src']):
                idx_from = batch_size*i+batch_size*nr_trgt_langs
                idx_to = batch_size*(i+1)+batch_size*nr_trgt_langs
                x_trans = net.generator(x[language], language)  # Generate fake data aka translate
                y_preds[idx_from:idx_to] = net.discriminator(x_trans)
            #print('Preds:', y_preds)
            
            # Loss proportional to discriminator's probability of correctly distinguishing TP and FP
            loss = criterion(y_preds, y_true)
            loss.backward()    # Compute gradients only for discriminator
            loss_dis += loss
            
            # Weight update for discriminator
            optimizers['dis'].step() 

            
            # Update generator #
            net.generator.train()
            net.discriminator.eval()
            net.generator.zero_grad()
            
            # Retrieve data
            x = get_train_data(languages, vocabs, batch_size)#.to(device)
            
            # All-fake minibatches - One minibatch per source language
            y_true = torch.full((batch_size,), real_label)#.long()#.to(device) # Try to fool the discriminator
            for language in languages['src']:
                x_src = x[language]
                x_trans = net.generator(x_src, language)
                y_pred = net.discriminator(x_trans)
                # Loss proportional to discriminator's probability of misclassifying TP and FP
                loss = criterion(y_pred, y_true)
                loss.backward()    # Compute gradients only for discriminator
                loss_gen += loss
            
            # Perform weight updates
            for language in languages['src']:
                optimizers['gen'][language].step()
        
        
            ### AUTO ENCODER STEP ###

            # Retrieve data
            x = get_train_data(languages, vocabs, batch_size)#.to(device) 

            # Perform training
            ae.train()
            optimizer_ae.zero_grad()
            train_data = x[languages['trgt'][0]]
            y_pred = ae(train_data)
            loss = AutoEncoderLoss(y_pred, train_data)
            loss.backward()
            optimizer_ae.step()
            loss_ae += loss
        
        
        # Document accumulated losses per epoch
        train_loss_gen.append(loss_gen.detach().numpy())
        train_loss_dis.append(loss_dis.detach().numpy())
        train_loss_ae.append(loss_ae.detach().numpy())
        
        #print('Mean: ', mean_param(net.generator.decoder))
        print('Progress: ', loss_gen.detach().numpy(), 
                            loss_dis.detach().numpy(), 
                            loss_ae.detach().numpy())
        
        print('Summed abs weights Generator:', get_summed_abs_grads(net.generator).detach().numpy())
        print('Summed abs weights Discrimi.:', get_summed_abs_grads(net.discriminator).detach().numpy())
        
        # Evaluation step
        if epoch > 0 and epoch % eval_frequency is 0:
            evaluation(net.generator, languages, source_vocabs, eval_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N)
        
        if early_stop: # if early stopping is enabled or not
            if es.step(loss_gen.detach()): # using the real loss of the generator for now, maybe use something else later? e.g. evaluation loss?
                print('early stopping')
                break  # early stop criterion is met, stop the loop now
        
    # Store model
    torch.save(net.state_dict(), final_state_path + 'final_model%d.pt' % epoch)
    
    # Some provision for final eval
    evaluation(net.generator, languages, source_vocabs, eval_words, source_full_vocabs, target_full_vocabs, dictionaries, neighbors, N)

    
if __name__ == "__main__":
    # execute only if run as a script
    main()
    print('Done.')



Nr source languages: 2
Nr target languages: 1

 {'trgt': ['en'], 'src': ['de', 'nl']}
GAN(
  (generator): Generator(
    (decoder): FeedForwardDecoder(
      (w1): Linear(in_features=300, out_features=300, bias=True)
      (w2): Linear(in_features=300, out_features=300, bias=True)
      (w3): Linear(in_features=300, out_features=300, bias=True)
    )
  )
  (discriminator): Discriminator(
    (w1): Linear(in_features=300, out_features=300, bias=True)
    (w2): Linear(in_features=300, out_features=300, bias=True)
    (w3): Linear(in_features=300, out_features=1, bias=True)
    (sigmoid): Sigmoid()
  )
)
Epoch  0 / 100
Progress:  1118.2549 25.457033 1.5239131
Summed abs weights Generator: 4838.0513
Summed abs weights Discrimi.: 1783.9814
Epoch  1 / 100
Progress:  1642.9061 6.0815372 1.04404
Summed abs weights Generator: 4408.263
Summed abs weights Discrimi.: 2173.485
Epoch  2 / 100
Progress:  1586.3173 13.10609 1.2638792
Summed abs weights Generator: 4347.182
Summed abs weights Discrimi.: