In [1]:
from parameters_lstm import *

from preprocess_datapoints_lstm import *
from preprocess_text_to_tensors_lstm import *
from domain_classifier_model_lstm import *
from meter import *

import torch
from torch.autograd import Variable
import numpy
import time

# Base names used when the training loop writes encoder / domain-classifier checkpoints.
saved_model_name_lstm = 'lstm_domain_adapt_explore'
saved_model_name_nn = 'lstm_nn_domain_adapt_explore'


# Initialize the data sets
# The corpus is processed once; the lookup tables below are views into that result.
processed_corpus = process_whole_corpuses()
word_to_id_vocab = processed_corpus['word_to_id']
# NOTE(review): glove_path is not assigned before this point in the notebook;
# presumably it arrives through one of the star imports -- confirm.
word2vec = load_glove_embeddings(glove_path, word_to_id_vocab)
ubuntu_id_to_data_title, android_id_to_data_title = (
    processed_corpus['ubuntu_id_to_data_title'],
    processed_corpus['android_id_to_data_title'],
)
ubuntu_id_to_data_body, android_id_to_data_body = (
    processed_corpus['ubuntu_id_to_data_body'],
    processed_corpus['android_id_to_data_body'],
)


''' Data Sets '''
# Ubuntu: every key of the similar/different mapping is a training question id.
training_data_ubuntu = ubuntu_id_to_similar_different()
training_question_ids_ubuntu = list(training_data_ubuntu.keys())

# Seed NumPy's global RNG before the Android dev data is built.
# NOTE(review): only `import numpy` appears above; `np` presumably comes from a star import.
np.random.seed(1)
data_android = android_id_to_similar_different(dev=True)
all_dev_question_ids_android = list(data_android.keys())
# The first 500 Android dev ids are used during training, the remainder for dev evaluation.
training_question_ids_android, dev_question_ids_android = (
    all_dev_question_ids_android[:500],
    all_dev_question_ids_android[500:],
)

# Android test split (dev=False).
test_data_android = android_id_to_similar_different(dev=False)
test_question_ids_android = list(test_data_android.keys())
# Note: Remember to edit batch_size accordingly if testing on smaller size data sets

In [3]:
def eval_model(lstm, ids, data, word2vec, id2Data_title, id2Data_body, word_to_id_vocab, truncation_val_title, truncation_val_body):
    """Score `lstm` on an evaluation set and return the AUC meter's value.

    The (candidate, main question, label) triples produced by
    `organize_test_ids(ids, data)` are processed in roughly 50 chunks. For each
    chunk both sides are encoded with `construct_qs_matrix_testing`, their
    cosine similarity is computed, and the scores plus labels are added to the
    meter.

    Relies on module-level names that are not parameters: `h0`, `c0` (initial
    LSTM states) and `auc_scorer` (reset at the start of every call).

    Args:
        lstm: encoder; switched to eval mode here.
        ids: question ids to evaluate.
        data: mapping handed to `organize_test_ids` together with `ids`.
        word2vec, id2Data_title, id2Data_body, word_to_id_vocab,
        truncation_val_title, truncation_val_body: forwarded unchanged to
            `construct_qs_matrix_testing`.

    Returns:
        Whatever `auc_scorer.value()` returns after all chunks were added.
    """
    lstm.eval()
    auc_scorer.reset()

    candidate_ids, q_main_ids, labels = organize_test_ids(ids, data)
    num_q_main = len(q_main_ids)
    # round() gives 0 for 25 or fewer pairs and range() rejects a step of 0,
    # so small evaluation sets used to crash here; never go below 1.
    len_pieces = max(1, round(num_q_main/50))
    print(num_q_main)

    for i in range(0, num_q_main, len_pieces):
        # Progress indicator: start offset of the chunk being scored.
        print(i, end = ' ')
        q_main_id_num_repl_tuple = q_main_ids[i:i+len_pieces]
        candidates = candidate_ids[i:i+len_pieces]
        current_labels = torch.from_numpy(np.array(labels[i:i+len_pieces])).long()

        candidates_qs_matrix = construct_qs_matrix_testing(candidates, lstm, h0, c0, word2vec, id2Data_title, id2Data_body,
        word_to_id_vocab, truncation_val_title, truncation_val_body, main=False)
        main_qs_matrix = construct_qs_matrix_testing(q_main_id_num_repl_tuple, lstm, h0, c0, word2vec, id2Data_title, id2Data_body,
        word_to_id_vocab, truncation_val_title, truncation_val_body, main=True)

        # .data strips the autograd wrapper so the meter receives plain scores.
        similarity_matrix_this_batch = torch.nn.functional.cosine_similarity(candidates_qs_matrix, main_qs_matrix, eps=1e-08).data
        auc_scorer.add(similarity_matrix_this_batch, current_labels)

    auc_score = auc_scorer.value()

    return auc_score

In [4]:
''' Params Dashboard '''

''' Procedural parameters '''
# batch_size: number of Ubuntu training ids sliced off per training batch.
batch_size = 40
# Passed to the training-id organiser and used to size the targets (1 + this value per group).
num_differing_questions = 20
num_epochs = 2


''' Model specs LSTM '''
dropout = 0.3
margin = 0.1          # margin of the multi-margin ranking loss
lr_lstm = 10**-3      # Adam learning rate for the encoder

input_size = 300
hidden_size = 240
num_layers = 1
bias = True
batch_first = True
bidirectional = True
# Leading dimension of the initial hidden/cell states: one slot per layer and direction.
first_dim = num_layers * (2 if bidirectional else 1)


''' Model specs NN '''
# Negative on purpose: both optimizers step on the gradient of
# (similarity loss - lamb * domain loss), so the classifier has to move against
# that gradient in order to reduce its own cross-entropy loss.
# NOTE(review): some PyTorch releases reject negative learning rates -- confirm
# against the installed version.
lr_nn = -(10**-3)
lamb = 10**-6         # weight of the domain-classification term in the overall loss

# The classifier consumes the encoder output: both directions when bidirectional.
input_size_nn = hidden_size * (2 if bidirectional else 1)
first_hidden_size_nn = 300
second_hidden_size_nn = 150


''' Data processing specs '''
# Forwarded to the matrix-construction helpers for titles and bodies respectively.
truncation_val_title = 15
truncation_val_body = 85
padding_idx = 0

glove_path = '../glove.840B.300d.txt'
android_corpus_path = '../android_dataset/corpus.tsv'
ubuntu_corpus_path = '../ubuntu_dataset/text_tokenized.txt'

In [5]:
''' Encoder (LSTM) '''
# Same construction as the positional form, spelled out so each flag is visible.
lstm = torch.nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bias=bias,
    batch_first=batch_first,
    dropout=dropout,
    bidirectional=bidirectional,
)
loss_function_lstm = torch.nn.MultiMarginLoss(margin=margin)
optimizer_lstm = torch.optim.Adam(lstm.parameters(), lr=lr_lstm)

# Two separate all-zero, non-trainable initial states (hidden and cell), batch dimension 1.
h0, c0 = (
    Variable(torch.zeros(first_dim, 1, hidden_size), requires_grad=False)
    for _ in range(2)
)


''' Domain Classifier (Neural Net) '''
neural_net = DomainClassifier(input_size_nn, first_hidden_size_nn, second_hidden_size_nn)
loss_function_nn = nn.CrossEntropyLoss()
# lr_nn is negative in the params cell, so this optimizer steps against the
# gradient of the combined loss it is given.
optimizer_nn = torch.optim.Adam(neural_net.parameters(), lr=lr_nn)

''' Procedural parameters '''
# Number of Ubuntu batches per epoch (rounded to the nearest whole batch).
num_batches = round(len(training_question_ids_ubuntu) / batch_size)
# Shared meter; eval_model resets and fills it on every call.
auc_scorer = AUCMeter()


def train_lstm_question_similarity(lstm, batch_ids_ubuntu, batch_ids_android, batch_data_ubuntu, batch_data_android, word2vec, ubuntu_id_to_data_title, ubuntu_id_to_data_body,\
                                   android_id_to_data_title, android_id_to_data_body, word_to_id_vocab, truncation_val_title, truncation_val_body):
    """Compute the question-similarity loss for one Ubuntu batch and one Android batch.

    For each domain the candidate questions and the main questions are encoded
    with `construct_qs_matrix_training`, compared by cosine similarity along
    dim 2, and scored with the module-level `loss_function_lstm` against a
    target of class 0 for every group. The two domain losses are averaged.

    Reads module-level `h0`, `c0`, `num_differing_questions` and
    `loss_function_lstm`. Puts `lstm` into train mode and prints the loss.

    Returns:
        The averaged loss; the caller is responsible for calling backward().
    """
    lstm.train()
    cosine = torch.nn.functional.cosine_similarity
    # Each main question contributes 1 + num_differing_questions candidate ids.
    group_size = 1 + num_differing_questions

    seq_ubuntu, lengths_ubuntu = organize_ids_training(batch_ids_ubuntu, batch_data_ubuntu, num_differing_questions)
    seq_android, lengths_android = organize_ids_training(batch_ids_android, batch_data_android, num_differing_questions)

    # Arguments that stay the same for the candidate and main encodings of a domain.
    shared_ubuntu = (word2vec, ubuntu_id_to_data_title, ubuntu_id_to_data_body,
                     lengths_ubuntu, num_differing_questions, word_to_id_vocab, truncation_val_title, truncation_val_body)
    shared_android = (word2vec, android_id_to_data_title, android_id_to_data_body,
                      lengths_android, num_differing_questions, word_to_id_vocab, truncation_val_title, truncation_val_body)

    # Encoding order is kept as candidates (ubuntu, android), then mains (ubuntu, android).
    cand_ubuntu = construct_qs_matrix_training(seq_ubuntu, lstm, h0, c0, *shared_ubuntu, candidates=True)
    cand_android = construct_qs_matrix_training(seq_android, lstm, h0, c0, *shared_android, candidates=True)
    main_ubuntu = construct_qs_matrix_training(batch_ids_ubuntu, lstm, h0, c0, *shared_ubuntu, candidates=False)
    main_android = construct_qs_matrix_training(batch_ids_android, lstm, h0, c0, *shared_android, candidates=False)

    # Target class 0 for every group -- presumably column 0 holds the similar
    # question; confirm against organize_ids_training.
    scores_ubuntu = cosine(cand_ubuntu, main_ubuntu, dim=2, eps=1e-6)
    labels_ubuntu = Variable(torch.LongTensor([0] * (len(seq_ubuntu) // group_size)))

    scores_android = cosine(cand_android, main_android, dim=2, eps=1e-6)
    labels_android = Variable(torch.LongTensor([0] * (len(seq_android) // group_size)))

    loss_ubuntu = loss_function_lstm(scores_ubuntu, labels_ubuntu)
    loss_android = loss_function_lstm(scores_android, labels_android)
    loss_batch = (loss_ubuntu + loss_android)/2

    print("lstm multi-margin loss on batch:", loss_batch.data[0])
    return loss_batch


def train_nn_domain_classification(neural_net, lstm, h0, c0, ids_ubuntu, ids_android, word2vec,
    ubuntu_id_to_data_title, ubuntu_id_to_data_body, android_id_to_data_title, android_id_to_data_body, truncation_val_title, truncation_val_body):
    """Compute the domain-classification loss for one mixed Ubuntu/Android batch.

    Questions from both domains are encoded with
    `construct_qs_matrix_domain_classification`, stacked (Ubuntu rows first,
    Android rows second) and fed to `neural_net`. Ubuntu rows are labelled 0
    and Android rows 1.

    Reads module-level `word_to_id_vocab` and `loss_function_nn`. Puts both
    `neural_net` and `lstm` into train mode and prints the loss.

    Args:
        neural_net: domain classifier; its `forward` is called directly.
        lstm, h0, c0: encoder and its initial states, forwarded to the
            matrix-construction helper.
        ids_ubuntu, ids_android: question ids of each domain.
        word2vec, *_id_to_data_title, *_id_to_data_body,
        truncation_val_title, truncation_val_body: forwarded unchanged.

    Returns:
        The loss from `loss_function_nn`; the caller is responsible for backward().
    """
    neural_net.train()
    lstm.train()

    qs_matrix_ubuntu = construct_qs_matrix_domain_classification(ids_ubuntu, lstm, h0, c0, word2vec,
        ubuntu_id_to_data_title, ubuntu_id_to_data_body, word_to_id_vocab, truncation_val_title, truncation_val_body)
    qs_matrix_android = construct_qs_matrix_domain_classification(ids_android, lstm, h0, c0, word2vec,
        android_id_to_data_title, android_id_to_data_body, word_to_id_vocab, truncation_val_title, truncation_val_body)
    overall_qs_matrix = torch.cat([qs_matrix_ubuntu, qs_matrix_android])

    out = neural_net.forward(overall_qs_matrix)
    # One label per stacked row, derived from the matrices themselves instead of
    # a hard-coded 20 + 20, so the labels stay aligned when the number of
    # sampled ids per domain changes.
    num_rows_ubuntu = qs_matrix_ubuntu.size(0)
    num_rows_android = qs_matrix_android.size(0)
    target_vector = Variable(torch.cat([torch.zeros(num_rows_ubuntu).long(), torch.ones(num_rows_android).long()]))
    loss_batch = loss_function_nn(out, target_vector)

    # NOTE(review): `.data[0]` assumes the loss is a 1-element tensor (older
    # PyTorch behaviour) -- confirm against the installed version.
    print("Neural net cross-entropy loss on batch:", loss_batch.data[0])
    return loss_batch

    
'''Begin training'''
for epoch in range(num_epochs):
    # One pass over the Ubuntu training ids, batch_size questions per step.
    for batch in range(1, num_batches + 1):
        # Hard-coded batches that are never trained on.
        # NOTE(review): the reason is not visible in this notebook.
        if batch in (93, 301, 243):
            continue

        start = time.time()
        optimizer_lstm.zero_grad()
        optimizer_nn.zero_grad()
        print("Working on batch #: ", batch)

        # Train on ubuntu similar question retrieval
        batch_begin = batch_size * (batch - 1)
        ids_this_batch_for_lstm_ubuntu = training_question_ids_ubuntu[batch_begin:batch_begin + batch_size]
        # A small Android sample (batch_size/6, rounded) drawn without replacement.
        android_sample_positions = np.random.choice(
            np.arange(len(training_question_ids_android)), size = round(batch_size/6), replace = False)
        ids_this_batch_for_lstm_android = np.array(training_question_ids_android)[android_sample_positions]
        loss_batch_similarity = train_lstm_question_similarity(
            lstm, ids_this_batch_for_lstm_ubuntu, ids_this_batch_for_lstm_android,
            training_data_ubuntu, data_android, word2vec,
            ubuntu_id_to_data_title, ubuntu_id_to_data_body,
            android_id_to_data_title, android_id_to_data_body,
            word_to_id_vocab, truncation_val_title, truncation_val_body)

        # Train on ubuntu-android domain classification task
        ids_randomized_ubuntu = get_20_random_ids(training_question_ids_ubuntu)
        ids_randomized_android = get_20_random_ids(training_question_ids_android)
        loss_batch_domain_classification = train_nn_domain_classification(
            neural_net, lstm, h0, c0,
            ids_randomized_ubuntu, ids_randomized_android, word2vec,
            ubuntu_id_to_data_title, ubuntu_id_to_data_body,
            android_id_to_data_title, android_id_to_data_body,
            truncation_val_title, truncation_val_body)

        # Overall loss = multi-margin loss - LAMBDA * cross entropy loss
        # Both optimizers step on the gradients of this single combined loss.
        overall_loss = loss_batch_similarity - (lamb * loss_batch_domain_classification)
        overall_loss.backward()
        optimizer_lstm.step()
        optimizer_nn.step()

        print("Time_on_batch:", time.time() - start)

        # Every 100th batch: checkpoint models and optimizers, then score the dev set.
        if batch % 100 == 0:
            checkpoint_suffix = '_e' + str(epoch) + '_b' + str(batch) + '.pth'

            # Save model for this epoch
            torch.save(lstm, '../Pickle/' + saved_model_name_lstm + checkpoint_suffix)
            torch.save(neural_net, '../Pickle/' + saved_model_name_nn + checkpoint_suffix)

            # Save optimizer for this epoch
            torch.save(optimizer_lstm, '../Pickle/' + 'optim_lstm_domain_adapt_exp_3' + checkpoint_suffix)
            torch.save(optimizer_nn, '../Pickle/' + 'optim_nn_domain_adapt_exp_3' + checkpoint_suffix)

            # Evaluate on dev set for AUC score
            dev_AUC_score = eval_model(
                lstm, dev_question_ids_android, data_android, word2vec,
                android_id_to_data_title, android_id_to_data_body,
                word_to_id_vocab, truncation_val_title, truncation_val_body)

            print("Dev AUC score:", dev_AUC_score)

Working on batch #:  1
lstm multi-margin loss on batch: 0.05727390944957733
Neural net cross-entropy loss on batch: 0.6944898366928101
Time_on_batch: 42.03669261932373
Working on batch #:  2
lstm multi-margin loss on batch: 0.04206767678260803
Neural net cross-entropy loss on batch: 0.6935567259788513
Time_on_batch: 55.459444761276245
Working on batch #:  3
lstm multi-margin loss on batch: 0.02788628824055195
Neural net cross-entropy loss on batch: 0.6948354840278625
Time_on_batch: 51.19212865829468
Working on batch #:  4
lstm multi-margin loss on batch: 0.01537452731281519
Neural net cross-entropy loss on batch: 0.6943662762641907
Time_on_batch: 41.82421851158142
Working on batch #:  5
lstm multi-margin loss on batch: 0.03313933685421944
Neural net cross-entropy loss on batch: 0.6942253708839417
Time_on_batch: 166.59056568145752
Working on batch #:  6
lstm multi-margin loss on batch: 0.007746235933154821
Neural net cross-entropy loss on batch: 0.6936939358711243
Time_on_batch: 56.9450

Neural net cross-entropy loss on batch: 0.690290093421936
Time_on_batch: 55.08400106430054
Working on batch #:  50
lstm multi-margin loss on batch: 0.006610371172428131
Neural net cross-entropy loss on batch: 0.69217848777771
Time_on_batch: 44.69674897193909
Working on batch #:  51
lstm multi-margin loss on batch: 0.004995759576559067
Neural net cross-entropy loss on batch: 0.6917600035667419
Time_on_batch: 41.58535122871399
Working on batch #:  52
lstm multi-margin loss on batch: 0.0048829335719347
Neural net cross-entropy loss on batch: 0.6908021569252014
Time_on_batch: 57.686254262924194
Working on batch #:  53
lstm multi-margin loss on batch: 0.01044241152703762
Neural net cross-entropy loss on batch: 0.6903677582740784
Time_on_batch: 48.07637095451355
Working on batch #:  54
lstm multi-margin loss on batch: 0.004097806755453348
Neural net cross-entropy loss on batch: 0.6917716264724731
Time_on_batch: 69.78858041763306
Working on batch #:  55
lstm multi-margin loss on batch: 0.0046

Time_on_batch: 32.81125068664551
Working on batch #:  99
lstm multi-margin loss on batch: 0.007131865248084068
Neural net cross-entropy loss on batch: 0.6885408163070679
Time_on_batch: 142.42762660980225
Working on batch #:  100
lstm multi-margin loss on batch: 0.008276914246380329
Neural net cross-entropy loss on batch: 0.6888967156410217
Time_on_batch: 53.00249910354614
119685
0 2394 4788 7182 9576 11970 14364 16758 19152 21546 23940 26334 28728 31122 33516 35910 38304 40698 43092 45486 47880 50274 52668 55062 57456 59850 62244 64638 67032 69426 71820 74214 76608 79002 81396 83790 86184 88578 90972 93366 95760 98154 100548 102942 105336 107730 110124 112518 114912 117306 Dev AUC score: 0.726481034971
Working on batch #:  101
lstm multi-margin loss on batch: 0.0036201677285134792
Neural net cross-entropy loss on batch: 0.6892288327217102
Time_on_batch: 36.322588205337524
Working on batch #:  102
lstm multi-margin loss on batch: 0.007136214524507523
Neural net cross-entropy loss on bat

Time_on_batch: 39.67950987815857
Working on batch #:  145
lstm multi-margin loss on batch: 0.0027688229456543922
Neural net cross-entropy loss on batch: 0.6866629719734192
Time_on_batch: 37.90178728103638
Working on batch #:  146
lstm multi-margin loss on batch: 0.0076998937875032425
Neural net cross-entropy loss on batch: 0.6873549222946167
Time_on_batch: 68.33369898796082
Working on batch #:  147
lstm multi-margin loss on batch: 0.00247102789580822
Neural net cross-entropy loss on batch: 0.6853885054588318
Time_on_batch: 39.741703033447266
Working on batch #:  148
lstm multi-margin loss on batch: 0.0045669665560126305
Neural net cross-entropy loss on batch: 0.6863506436347961
Time_on_batch: 39.56923007965088
Working on batch #:  149
lstm multi-margin loss on batch: 0.002292020246386528
Neural net cross-entropy loss on batch: 0.6868152618408203
Time_on_batch: 36.31259250640869
Working on batch #:  150
lstm multi-margin loss on batch: 0.0032185991294682026
Neural net cross-entropy loss

Time_on_batch: 57.155266523361206
Working on batch #:  193
lstm multi-margin loss on batch: 0.0025188575964421034
Neural net cross-entropy loss on batch: 0.6780117154121399
Time_on_batch: 50.22607755661011
Working on batch #:  194
lstm multi-margin loss on batch: 0.0044075604528188705
Neural net cross-entropy loss on batch: 0.6807971000671387
Time_on_batch: 41.98817777633667
Working on batch #:  195
lstm multi-margin loss on batch: 0.0016850822139531374
Neural net cross-entropy loss on batch: 0.678548276424408
Time_on_batch: 39.21444606781006
Working on batch #:  196
lstm multi-margin loss on batch: 0.0031268722377717495
Neural net cross-entropy loss on batch: 0.6820353269577026
Time_on_batch: 67.33813333511353
Working on batch #:  197
lstm multi-margin loss on batch: 0.002187029691413045
Neural net cross-entropy loss on batch: 0.6762112379074097
Time_on_batch: 35.736592292785645
Working on batch #:  198
lstm multi-margin loss on batch: 0.004980946891009808
Neural net cross-entropy los

Time_on_batch: 40.112565994262695
Working on batch #:  239
lstm multi-margin loss on batch: 0.004554593004286289
Neural net cross-entropy loss on batch: 0.6652868986129761
Time_on_batch: 40.880778312683105
Working on batch #:  240
lstm multi-margin loss on batch: 0.003314410336315632
Neural net cross-entropy loss on batch: 0.6723513007164001
Time_on_batch: 44.013198375701904
Working on batch #:  241
lstm multi-margin loss on batch: 0.004403278231620789
Neural net cross-entropy loss on batch: 0.6653317213058472
Time_on_batch: 45.5206835269928
Working on batch #:  242
lstm multi-margin loss on batch: 0.0009082817705348134
Neural net cross-entropy loss on batch: 0.6728547811508179
Time_on_batch: 37.86125040054321
Working on batch #:  243
lstm multi-margin loss on batch: 0.0028442491311579943
Neural net cross-entropy loss on batch: 0.6667026281356812
Time_on_batch: 365.8026804924011
Working on batch #:  245
lstm multi-margin loss on batch: 0.005263939965516329
Neural net cross-entropy loss

Time_on_batch: 39.595385789871216
Working on batch #:  288
lstm multi-margin loss on batch: 0.003082193899899721
Neural net cross-entropy loss on batch: 0.6426658630371094
Time_on_batch: 42.77640223503113
Working on batch #:  289
lstm multi-margin loss on batch: 0.0012307389406487346
Neural net cross-entropy loss on batch: 0.6568764448165894
Time_on_batch: 46.65714383125305
Working on batch #:  290
lstm multi-margin loss on batch: 0.0033401192631572485
Neural net cross-entropy loss on batch: 0.6443213224411011
Time_on_batch: 46.32148575782776
Working on batch #:  291
lstm multi-margin loss on batch: 0.003175710793584585
Neural net cross-entropy loss on batch: 0.6442278623580933
Time_on_batch: 41.60778450965881
Working on batch #:  292
lstm multi-margin loss on batch: 0.002564840018749237
Neural net cross-entropy loss on batch: 0.6389839053153992
Time_on_batch: 202.69238209724426
Working on batch #:  293
lstm multi-margin loss on batch: 0.0035557516384869814
Neural net cross-entropy los

In [6]:
''' Encoder (LSTM) '''
# Optional: evaluate a saved checkpoint instead of the in-memory encoder.
#lstm_x = torch.load('../Pickle/lstm_domain_adapt_explore3_e0_b300.pth')

''' Procedural parameters '''
# eval_model reads the module-level meter, so rebind it to a fresh one first.
auc_scorer = AUCMeter()

# Evaluate on dev/test set for AUC score
test_AUC_score = eval_model(
    lstm=lstm,
    ids=test_question_ids_android,
    data=test_data_android,
    word2vec=word2vec,
    id2Data_title=android_id_to_data_title,
    id2Data_body=android_id_to_data_body,
    word_to_id_vocab=word_to_id_vocab,
    truncation_val_title=truncation_val_title,
    truncation_val_body=truncation_val_body,
)

print("Test AUC score:", test_AUC_score)

67569
0 1351 2702 4053 5404 6755 8106 9457 10808 12159 13510 14861 16212 17563 18914 20265 21616 22967 24318 25669 27020 28371 29722 31073 32424 33775 35126 36477 37828 39179 40530 41881 43232 44583 45934 47285 48636 49987 51338 52689 54040 55391 56742 58093 59444 60795 62146 63497 64848 66199 67550 Test AUC score: 0.771608200809
