In [1]:
from parameters_lstm import *

from preprocess_datapoints_lstm import *
from preprocess_text_to_tensors_lstm import *
from domain_classifier_model_lstm import *
from meter import *

import torch
from torch.autograd import Variable
import time

# Base names of the checkpoint files written by the training loop
saved_model_name_lstm, saved_model_name_nn = 'lstm_domain_adapt', 'nn_domain_adapt'

# Initialize the data sets
processed_corpus = process_whole_corpuses()
word_to_id_vocab = processed_corpus['word_to_id']
# The embedding loader takes the vocabulary, so it has to run after the lookup above.
# NOTE(review): glove_path is not assigned anywhere above this line in the notebook;
# presumably it arrives through one of the star imports -- confirm on a fresh kernel.
word2vec = load_glove_embeddings(glove_path, word_to_id_vocab)
(ubuntu_id_to_data_title, android_id_to_data_title,
 ubuntu_id_to_data_body, android_id_to_data_body) = (
    processed_corpus[corpus_key] for corpus_key in (
        'ubuntu_id_to_data_title', 'android_id_to_data_title',
        'ubuntu_id_to_data_body', 'android_id_to_data_body'))


''' Data Sets '''
# Load the three splits first, then derive the id lists from their keys
training_data_ubuntu = ubuntu_id_to_similar_different()
dev_data_android = android_id_to_similar_different(dev=True)
test_data_android = android_id_to_similar_different(dev=False)

training_question_ids_ubuntu = list(training_data_ubuntu.keys())
dev_question_ids_android = list(dev_data_android.keys())
test_question_ids_android = list(test_data_android.keys())
# Note: Remember to edit batch_size accordingly if testing on smaller size data sets

In [1]:
def eval_model(lstm, ids, data, word2vec, id2Data_title, id2Data_body, word_to_id_vocab, truncation_val_title, truncation_val_body):
    """Score the encoder on an evaluation split and return the AUC meter's value.

    The (main question, candidate) pairs produced by ``organize_test_ids`` are
    processed in roughly 50 equal chunks. For each chunk both sides are encoded
    with ``construct_qs_matrix_testing``, their cosine similarity is computed,
    and the similarities plus labels are fed to the module-level ``auc_scorer``.

    Besides its parameters this function reads the module-level ``h0``, ``c0``
    and ``auc_scorer``; ``auc_scorer`` is reset on entry, so earlier
    accumulated scores are discarded.

    Args:
        lstm: encoder; switched to eval mode here (callers that keep training
            must switch it back to train mode).
        ids: question ids of the split to evaluate.
        data: per-id data for that split, forwarded to ``organize_test_ids``.
        word2vec, id2Data_title, id2Data_body, word_to_id_vocab,
        truncation_val_title, truncation_val_body: forwarded unchanged to
            ``construct_qs_matrix_testing``.

    Returns:
        Whatever ``auc_scorer.value()`` returns after all chunks were added.
    """
    lstm.eval()
    auc_scorer.reset()

    candidate_ids, q_main_ids, labels = organize_test_ids(ids, data)
    num_q_main = len(q_main_ids)
    # Aim for ~50 chunks, but never let the chunk size drop to 0: with 25 or
    # fewer items round(n/50) is 0 and range() raises ValueError on a zero step.
    len_pieces = max(1, round(num_q_main/50))
    print(num_q_main)

    for i in range(0, num_q_main, len_pieces):
        # Progress indicator: start offset of the chunk being processed
        print(i, end = ' ')
        q_main_id_num_repl_tuple = q_main_ids[i:i+len_pieces]
        candidates = candidate_ids[i:i+len_pieces]
        current_labels = torch.from_numpy(np.array(labels[i:i+len_pieces])).long()

        candidates_qs_matrix = construct_qs_matrix_testing(candidates, lstm, h0, c0, word2vec, id2Data_title, id2Data_body,
        word_to_id_vocab, truncation_val_title, truncation_val_body, main=False)
        main_qs_matrix = construct_qs_matrix_testing(q_main_id_num_repl_tuple, lstm, h0, c0, word2vec, id2Data_title, id2Data_body,
        word_to_id_vocab, truncation_val_title, truncation_val_body, main=True)

        # .data strips the autograd wrapper before handing scores to the meter
        similarity_matrix_this_batch = torch.nn.functional.cosine_similarity(candidates_qs_matrix, main_qs_matrix, eps=1e-8).data
        auc_scorer.add(similarity_matrix_this_batch, current_labels)

    auc_score = auc_scorer.value()

    return auc_score

In [14]:
''' Params Dashboard '''
# Every tunable value read by the model-setup, training and evaluation cells.

''' Procedural parameters '''
# Number of Ubuntu training ids sliced off per training step
batch_size = 40
# Forwarded to organize_ids_training / construct_qs_matrix_training; the training
# target assumes groups of (1 + num_differing_questions) sequences per main question.
# Presumably the number of negative candidates per question -- TODO confirm.
num_differing_questions = 20
# Number of full passes over the Ubuntu training ids
num_epochs = 2


''' Model specs LSTM '''
# Passed as the LSTM's dropout argument
dropout = 0.3
# Margin of the MultiMarginLoss used for question similarity
margin = 0.1
# Adam learning rate for the LSTM encoder
lr_lstm = 10**-3

# Feature size of each LSTM input step; presumably the 300-d GloVe vectors
# named in glove_path below -- TODO confirm
input_size = 300
hidden_size = 240
num_layers = 1
bias = True
batch_first = True
bidirectional = True
# Leading dimension of the initial hidden/cell state tensors (h0, c0)
first_dim = num_layers * 2 if bidirectional else num_layers


''' Model specs NN '''
# NOTE(review): the NEGATIVE sign looks intentional. The training loop minimises
# (similarity loss - lamb * cross-entropy); stepping the classifier with a negative
# rate moves it the opposite way, i.e. towards lower cross-entropy, while the encoder
# is pushed to raise it. Confirm before "fixing" the sign.
lr_nn = -10**-4
# Weight of the domain-classification loss inside the overall loss
lamb = Variable(torch.ones(1) * 1e-3)

# Classifier input width: twice the hidden size when the LSTM is bidirectional
input_size_nn = 2*hidden_size if bidirectional else hidden_size
first_hidden_size_nn = 300
second_hidden_size_nn = 150


''' Data processing specs '''
# Forwarded to the construct_qs_matrix_* helpers; presumably the maximum number
# of tokens kept from each title / body -- TODO confirm
truncation_val_title = 15
truncation_val_body = 85
# Not referenced by any code visible in this notebook
padding_idx = 0

# NOTE(review): load_glove_embeddings(glove_path, ...) is called in the first cell,
# which sits above this one, so on a fresh kernel that call cannot be using this
# assignment -- confirm where glove_path comes from at that point.
glove_path = '../glove.840B.300d.txt'
# The two corpus paths are not referenced by any code visible in this notebook
android_corpus_path = '../android_dataset/corpus.tsv'
ubuntu_corpus_path = '../ubuntu_dataset/text_tokenized.txt'

In [15]:
''' Encoder (LSTM) '''
# Keyword arguments instead of seven positionals, so a reordered or missing flag
# cannot silently land in the wrong slot.
# NOTE(review): per the torch.nn.LSTM documentation dropout is applied between
# stacked layers only, so it has no effect while num_layers == 1.
lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                     bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)
loss_function_lstm = torch.nn.MultiMarginLoss(margin=margin)
optimizer_lstm = torch.optim.Adam(lstm.parameters(), lr=lr_lstm, weight_decay = 0.00001)

# All-zero initial hidden and cell states with a batch dimension of 1; they are
# handed to the construct_qs_matrix_* helpers and never receive gradients.
h0 = Variable(torch.zeros(first_dim, 1, hidden_size), requires_grad=False)
c0 = Variable(torch.zeros(first_dim, 1, hidden_size), requires_grad=False)


''' Domain Classifier (Neural Net) '''
neural_net = DomainClassifier(input_size_nn, first_hidden_size_nn, second_hidden_size_nn)
# Spelled torch.nn.* like the rest of this cell: a bare `nn` is only in scope if one
# of the star imports happens to leak it.
loss_function_nn = torch.nn.CrossEntropyLoss()
# NOTE(review): lr_nn is negative on purpose (the classifier steps against the
# gradient of the combined loss). Two caveats to confirm: recent torch.optim.Adam
# versions reject a negative lr with ValueError, and with a negative rate the
# weight_decay term pushes weights away from zero instead of towards it.
optimizer_nn = torch.optim.Adam(neural_net.parameters(), lr=lr_nn, weight_decay = 0.00001)


''' Procedural parameters '''
# round() means a trailing partial batch is either included or dropped,
# depending on which way the division rounds.
num_batches = round(len(training_question_ids_ubuntu) / batch_size)
auc_scorer = AUCMeter()


def train_lstm_question_similarity(lstm, batch_ids, batch_data, word2vec, id2Data_title, id2Data_body, word_to_id_vocab, truncation_val_title, truncation_val_body):
    """Compute the multi-margin similarity loss for one batch of main questions.

    Puts ``lstm`` in train mode, encodes the candidate sequences and the main
    questions with ``construct_qs_matrix_training``, takes their cosine
    similarity along dim 2 and scores it with ``loss_function_lstm`` against an
    all-zero target (class 0 for every group). Reads the module-level ``h0``,
    ``c0``, ``num_differing_questions`` and ``loss_function_lstm``.

    Returns the loss for the batch; the caller is responsible for backward().
    """
    lstm.train()
    sequence_ids, dict_sequence_lengths = organize_ids_training(batch_ids, batch_data, num_differing_questions)

    # Everything after the id list is identical for the two encoder calls
    encoder_args = (lstm, h0, c0, word2vec, id2Data_title, id2Data_body,
                    dict_sequence_lengths, num_differing_questions, word_to_id_vocab,
                    truncation_val_title, truncation_val_body)
    candidate_encodings = construct_qs_matrix_training(sequence_ids, *encoder_args, candidates=True)
    main_encodings = construct_qs_matrix_training(batch_ids, *encoder_args, candidates=False)

    similarity_scores = torch.nn.functional.cosine_similarity(candidate_encodings, main_encodings, dim=2, eps=1e-6)

    # One target entry per group of (1 + num_differing_questions) sequences, all class 0
    group_size = 1 + num_differing_questions
    num_groups = int(len(sequence_ids) / group_size)
    target = Variable(torch.LongTensor([0] * num_groups))
    loss_batch = loss_function_lstm(similarity_scores, target)

    print("lstm multi-margin loss on batch:", loss_batch.data[0])
    return loss_batch


def train_nn_domain_classification(neural_net, lstm, h0, c0, ids_ubuntu, ids_android, word2vec,
    ubuntu_id_to_data_title, ubuntu_id_to_data_body, android_id_to_data_title, android_id_to_data_body, truncation_val_title, truncation_val_body):
    """Compute the domain-classification loss for one mixed Ubuntu/Android batch.

    Both id lists are encoded with ``construct_qs_matrix_domain_classification``,
    stacked (Ubuntu rows first, Android rows second) and fed to ``neural_net``.
    Ubuntu rows are labelled 0 and Android rows 1. Prints the batch
    cross-entropy loss and accuracy.

    Besides its parameters this function reads the module-level
    ``word_to_id_vocab`` and ``loss_function_nn``.

    Args:
        neural_net: domain classifier; put in train mode here.
        lstm: encoder; put in train mode here.
        h0, c0: initial LSTM states forwarded to the encoding helper.
        ids_ubuntu, ids_android: question ids sampled from each domain.
        word2vec and the four id->data mappings, truncation_val_title,
        truncation_val_body: forwarded unchanged to the encoding helper.

    Returns:
        The cross-entropy loss for the batch, cast to float.
    """
    neural_net.train()
    lstm.train()

    qs_matrix_ubuntu = construct_qs_matrix_domain_classification(ids_ubuntu, lstm, h0, c0, word2vec,
        ubuntu_id_to_data_title, ubuntu_id_to_data_body, word_to_id_vocab, truncation_val_title, truncation_val_body)
    qs_matrix_android = construct_qs_matrix_domain_classification(ids_android, lstm, h0, c0, word2vec,
        android_id_to_data_title, android_id_to_data_body, word_to_id_vocab, truncation_val_title, truncation_val_body)
    overall_qs_matrix = torch.cat([qs_matrix_ubuntu, qs_matrix_android])

    # Call the module rather than .forward() so registered hooks are honoured
    out = neural_net(overall_qs_matrix)
    softmax = torch.nn.Softmax()
    # Predicted class = rounded probability of class 1 (Android)
    preds = torch.round(softmax(out)[:, 1].double()).data.numpy()

    # Label counts come from the encoded matrices instead of a hard-coded 20 + 20,
    # so the targets always line up with the rows of `out`, whatever the sample size.
    num_ubuntu = qs_matrix_ubuntu.size(0)
    num_android = qs_matrix_android.size(0)
    target_vector = Variable(torch.cat([torch.zeros(num_ubuntu).long(), torch.ones(num_android).long()]))
    acc = np.sum(preds == target_vector.float().data.numpy())/len(preds)
    loss_batch = loss_function_nn(out.double(), target_vector)

    print("Neural net cross-entropy loss on batch:", loss_batch.data[0])
    print("Neural net accuracy on batch:", acc)
    return loss_batch.float()


'''Begin training'''
for epoch in range(num_epochs):

    # One pass over the Ubuntu training ids; batches are numbered from 1
    for batch in range(1, num_batches + 1):
        # Batches 93 and 301 are skipped outright (the reason is not recorded here)
        if batch in (93, 301):
            continue
        start = time.time()
        optimizer_lstm.zero_grad()
        optimizer_nn.zero_grad()
        print("Working on batch #: ", batch)

        # Task 1: similar-question retrieval on Ubuntu
        ids_this_batch_for_lstm = training_question_ids_ubuntu[(batch - 1) * batch_size:batch * batch_size]
        loss_batch_similarity = train_lstm_question_similarity(
            lstm, ids_this_batch_for_lstm, training_data_ubuntu, word2vec,
            ubuntu_id_to_data_title, ubuntu_id_to_data_body, word_to_id_vocab,
            truncation_val_title, truncation_val_body)

        # Task 2: Ubuntu-vs-Android domain classification.
        # NOTE(review): the Android ids are drawn from the dev split -- confirm this is intended.
        ids_randomized_ubuntu = get_20_random_ids(training_question_ids_ubuntu)
        ids_randomized_android = get_20_random_ids(dev_question_ids_android)
        loss_batch_domain_classification = train_nn_domain_classification(
            neural_net, lstm, h0, c0, ids_randomized_ubuntu, ids_randomized_android, word2vec,
            ubuntu_id_to_data_title, ubuntu_id_to_data_body,
            android_id_to_data_title, android_id_to_data_body,
            truncation_val_title, truncation_val_body)

        # Overall loss = multi-margin loss - LAMBDA * cross entropy loss
        overall_loss = loss_batch_similarity - (lamb * loss_batch_domain_classification)
        print('Overall loss: ' + str(overall_loss.data[0]))
        overall_loss.backward()
        optimizer_lstm.step()
        optimizer_nn.step()

        print("Time on batch:", time.time() - start)

        # Every 100th batch: checkpoint models and optimizers, then score the dev set
        if batch % 100 == 0:
            checkpoint_suffix = '_e' + str(epoch) + '_b' + str(batch) + '.pth'

            # Models first ...
            torch.save(lstm, '../Pickle/' + saved_model_name_lstm + checkpoint_suffix)
            torch.save(neural_net, '../Pickle/' + saved_model_name_nn + checkpoint_suffix)

            # ... then their optimizers
            torch.save(optimizer_lstm, '../Pickle/' + 'optim_lstm_domain_adapt' + checkpoint_suffix)
            torch.save(optimizer_nn, '../Pickle/' + 'optim_nn_domain_adapt' + checkpoint_suffix)

            # AUC on the Android dev set (eval_model leaves the LSTM in eval mode;
            # the train_* helpers switch it back on the next batch)
            dev_AUC_score = eval_model(
                lstm, dev_question_ids_android, dev_data_android, word2vec,
                android_id_to_data_title, android_id_to_data_body,
                word_to_id_vocab, truncation_val_title, truncation_val_body)

            print("Dev AUC score:", dev_AUC_score)

Working on batch #:  1
lstm multi-margin loss on batch: 0.056106001138687134
Neural net cross-entropy loss on batch: 0.6945119236179902
Neural net accuracy on batch: 0.5
Overall loss: 0.05541148781776428
Time on batch: 36.91167616844177
Working on batch #:  2
lstm multi-margin loss on batch: 0.03898945450782776
Neural net cross-entropy loss on batch: 0.6950739237934738
Neural net accuracy on batch: 0.5
Overall loss: 0.03829438239336014
Time on batch: 47.62154197692871
Working on batch #:  3
lstm multi-margin loss on batch: 0.028828946873545647
Neural net cross-entropy loss on batch: 0.694450050310268
Neural net accuracy on batch: 0.5
Overall loss: 0.02813449688255787
Time on batch: 46.985304832458496
Working on batch #:  4
lstm multi-margin loss on batch: 0.01979062147438526
Neural net cross-entropy loss on batch: 0.6939695307018025
Neural net accuracy on batch: 0.475
Overall loss: 0.019096652045845985
Time on batch: 35.69463324546814
Working on batch #:  5
lstm multi-margin loss on ba

Neural net cross-entropy loss on batch: 0.6852191466132487
Neural net accuracy on batch: 0.85
Overall loss: 0.005391392391175032
Time on batch: 32.14245653152466
Working on batch #:  36
lstm multi-margin loss on batch: 0.008519532158970833
Neural net cross-entropy loss on batch: 0.6843812299920892
Neural net accuracy on batch: 0.8
Overall loss: 0.007835150696337223
Time on batch: 32.41402316093445
Working on batch #:  37
lstm multi-margin loss on batch: 0.006537268869578838
Neural net cross-entropy loss on batch: 0.6804288437471498
Neural net accuracy on batch: 0.875
Overall loss: 0.005856839939951897
Time on batch: 36.30012083053589
Working on batch #:  38
lstm multi-margin loss on batch: 0.008851558901369572
Neural net cross-entropy loss on batch: 0.6837988948520984
Neural net accuracy on batch: 0.9
Overall loss: 0.008167760446667671
Time on batch: 34.738396883010864
Working on batch #:  39
lstm multi-margin loss on batch: 0.007221853826195002
Neural net cross-entropy loss on batch: 

Time on batch: 298.95687794685364
Working on batch #:  70
lstm multi-margin loss on batch: 0.009182392619550228
Neural net cross-entropy loss on batch: 0.6522467504601639
Neural net accuracy on batch: 0.95
Overall loss: 0.00853014551103115
Time on batch: 33.22250032424927
Working on batch #:  71
lstm multi-margin loss on batch: 0.004227119963616133
Neural net cross-entropy loss on batch: 0.658078768657827
Neural net accuracy on batch: 0.75
Overall loss: 0.003569041145965457
Time on batch: 66.1584153175354
Working on batch #:  72
lstm multi-margin loss on batch: 0.017941758036613464
Neural net cross-entropy loss on batch: 0.657588077136099
Neural net accuracy on batch: 0.85
Overall loss: 0.017284169793128967
Time on batch: 37.374834060668945
Working on batch #:  73
lstm multi-margin loss on batch: 0.004426118917763233
Neural net cross-entropy loss on batch: 0.6499763532335636
Neural net accuracy on batch: 0.9
Overall loss: 0.0037761423736810684
Time on batch: 33.27031874656677
Working o

Time on batch: 35.00248098373413
Working on batch #:  104
lstm multi-margin loss on batch: 0.00650456827133894
Neural net cross-entropy loss on batch: 0.5753122326615954
Neural net accuracy on batch: 0.975
Overall loss: 0.005929255858063698
Time on batch: 36.192949056625366
Working on batch #:  105
lstm multi-margin loss on batch: 0.006785592529922724
Neural net cross-entropy loss on batch: 0.6022912150090963
Neural net accuracy on batch: 0.9
Overall loss: 0.006183301098644733
Time on batch: 34.574055194854736
Working on batch #:  106
lstm multi-margin loss on batch: 0.007765198592096567
Neural net cross-entropy loss on batch: 0.6006697088862775
Neural net accuracy on batch: 0.85
Overall loss: 0.00716452905908227
Time on batch: 38.284149408340454
Working on batch #:  107
lstm multi-margin loss on batch: 0.006348751485347748
Neural net cross-entropy loss on batch: 0.5926844158842168
Neural net accuracy on batch: 0.875
Overall loss: 0.005756067112088203
Time on batch: 35.277503490448
Wor

Time on batch: 36.29784893989563
Working on batch #:  138
lstm multi-margin loss on batch: 0.016526805236935616
Neural net cross-entropy loss on batch: 0.5136502392731558
Neural net accuracy on batch: 0.95
Overall loss: 0.01601315476000309
Time on batch: 37.59382963180542
Working on batch #:  139
lstm multi-margin loss on batch: 0.003356078639626503
Neural net cross-entropy loss on batch: 0.47370510910499614
Neural net accuracy on batch: 0.95
Overall loss: 0.00288237351924181
Time on batch: 41.693284034729004
Working on batch #:  140
lstm multi-margin loss on batch: 0.0025243598502129316
Neural net cross-entropy loss on batch: 0.4987720460920274
Neural net accuracy on batch: 0.825
Overall loss: 0.0020255877170711756
Time on batch: 33.267252922058105
Working on batch #:  141
lstm multi-margin loss on batch: 0.006182648241519928
Neural net cross-entropy loss on batch: 0.4972883477236126
Neural net accuracy on batch: 0.875
Overall loss: 0.005685359705239534
Time on batch: 35.2733440399169

Time on batch: 40.205251932144165
Working on batch #:  172
lstm multi-margin loss on batch: 0.0037323383148759604
Neural net cross-entropy loss on batch: 0.4979916292511303
Neural net accuracy on batch: 0.8
Overall loss: 0.003234346630051732
Time on batch: 44.11856269836426
Working on batch #:  173
lstm multi-margin loss on batch: 0.004330973140895367
Neural net cross-entropy loss on batch: 0.4439480648070854
Neural net accuracy on batch: 0.85
Overall loss: 0.0038870251737535
Time on batch: 40.53797197341919
Working on batch #:  174
lstm multi-margin loss on batch: 0.0033927259501069784
Neural net cross-entropy loss on batch: 0.4537464953473943
Neural net accuracy on batch: 0.875
Overall loss: 0.0029389793053269386
Time on batch: 35.217458963394165
Working on batch #:  175
lstm multi-margin loss on batch: 0.004910478834062815
Neural net cross-entropy loss on batch: 0.46063380019875355
Neural net accuracy on batch: 0.85
Overall loss: 0.00444984482601285
Time on batch: 33.915536403656006

Neural net cross-entropy loss on batch: 0.369597637082202
Neural net accuracy on batch: 0.925
Overall loss: 0.0064076767303049564
Time on batch: 60.006914138793945
Working on batch #:  205
lstm multi-margin loss on batch: 0.004776333924382925
Neural net cross-entropy loss on batch: 0.4293550059539596
Neural net accuracy on batch: 0.825
Overall loss: 0.004346978850662708
Time on batch: 37.1083869934082
Working on batch #:  206
lstm multi-margin loss on batch: 0.00600763875991106
Neural net cross-entropy loss on batch: 0.39532262683031366
Neural net accuracy on batch: 0.875
Overall loss: 0.005612316075712442
Time on batch: 50.452229738235474
Working on batch #:  207
lstm multi-margin loss on batch: 0.0021845255978405476
Neural net cross-entropy loss on batch: 0.368808503911397
Neural net accuracy on batch: 0.9
Overall loss: 0.0018157170852646232
Time on batch: 33.706674098968506
Working on batch #:  208
lstm multi-margin loss on batch: 0.002014947822317481
Neural net cross-entropy loss o

Neural net cross-entropy loss on batch: 0.29916040470243105
Neural net accuracy on batch: 0.95
Overall loss: 0.003737120423465967
Time on batch: 39.16108155250549
Working on batch #:  239
lstm multi-margin loss on batch: 0.003866777289658785
Neural net cross-entropy loss on batch: 0.3206445566214292
Neural net accuracy on batch: 0.925
Overall loss: 0.003546132706105709
Time on batch: 34.793386697769165
Working on batch #:  240
lstm multi-margin loss on batch: 0.005867418367415667
Neural net cross-entropy loss on batch: 0.44681632662839865
Neural net accuracy on batch: 0.875
Overall loss: 0.005420601926743984
Time on batch: 38.182382106781006
Working on batch #:  241
lstm multi-margin loss on batch: 0.011848623864352703
Neural net cross-entropy loss on batch: 0.2881113414279462
Neural net accuracy on batch: 0.95
Overall loss: 0.011560512706637383
Time on batch: 39.93871545791626
Working on batch #:  242
lstm multi-margin loss on batch: 0.0027110844384878874
Neural net cross-entropy loss

Neural net cross-entropy loss on batch: 0.3155958653535149
Neural net accuracy on batch: 0.95
Overall loss: 0.0023401733487844467
Time on batch: 37.74280333518982
Working on batch #:  273
lstm multi-margin loss on batch: 0.005822832230478525
Neural net cross-entropy loss on batch: 0.46943774215098316
Neural net accuracy on batch: 0.825
Overall loss: 0.00535339443013072
Time on batch: 38.694926738739014
Working on batch #:  274
lstm multi-margin loss on batch: 0.003936853725463152
Neural net cross-entropy loss on batch: 0.41780912736581566
Neural net accuracy on batch: 0.85
Overall loss: 0.003519044490531087
Time on batch: 39.6329619884491
Working on batch #:  275
lstm multi-margin loss on batch: 0.0016521799843758345
Neural net cross-entropy loss on batch: 0.23653639494578488
Neural net accuracy on batch: 0.875
Overall loss: 0.0014156436081975698
Time on batch: 56.92103552818298
Working on batch #:  276
lstm multi-margin loss on batch: 0.007529746741056442
Neural net cross-entropy loss

KeyboardInterrupt: 

In [16]:
''' Encoder (LSTM) '''
# Optional: evaluate a saved checkpoint instead of the in-memory model
#lstm_x = torch.load('../Pickle/lstm_domain_adapt_title_body_e2_b50.pt')

''' Procedural parameters '''
# Fresh meter so scores from earlier evaluations are not carried over
auc_scorer = AUCMeter()

# Evaluate on the Android TEST set for AUC score, using the in-memory `lstm`
test_AUC_score = eval_model(
    lstm,
    test_question_ids_android,
    test_data_android,
    word2vec,
    android_id_to_data_title,
    android_id_to_data_body,
    word_to_id_vocab,
    truncation_val_title,
    truncation_val_body,
)

print("Test AUC score:", test_AUC_score)

119786
0 2396 4792 7188 9584 11980 14376 16772 19168 21564 23960 26356 28752 31148 33544 35940 38336 40732 43128 45524 47920 50316 52712 55108 57504 59900 62296 64692 67088 69484 71880 74276 76672 79068 81464 83860 86256 88652 91048 93444 95840 98236 100632 103028 105424 107820 110216 112612 115008 117404 Test AUC score: 0.689754596694
