In [80]:
import os
import sys
# Put the parent of the current working directory on the module search path
# (presumably so the project-local modules imported in the next cell resolve).
working_dir = os.path.abspath('')
sys.path.append(os.path.dirname(working_dir))

In [81]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from utils_ import *
from mp_models import *
from datasets import *
from gensim import models

In [82]:
# Input locations, relative to the notebook's working directory.
tmp_dir = '../tmp_files'
dataset_dir = '../dataset'

# Pickled document lists (first / second document of each pair).
train_val_doc_list1_path = f'{tmp_dir}/train_val_doc_list1.pkl'
train_val_doc_list2_path = f'{tmp_dir}/train_val_doc_list2.pkl'
test_doc_list1_path = f'{tmp_dir}/test_doc_list1.pkl'
test_doc_list2_path = f'{tmp_dir}/test_doc_list2.pkl'

# Label files for the train(+val) and test splits.
train_label_path = f'{dataset_dir}/newssim-train/label.txt'
test_label_path = f'{dataset_dir}/newssim-test/label.txt'

# Pickled vocabulary object.
vocab_path = f'{tmp_dir}/vocab.pkl'

In [83]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files produced by this project's own preprocessing.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Document lists for the train+val pool and the test split, plus the vocabulary.
train_val_doc_list1 = _load_pickle(train_val_doc_list1_path)
train_val_doc_list2 = _load_pickle(train_val_doc_list2_path)
test_doc_list1 = _load_pickle(test_doc_list1_path)
test_doc_list2 = _load_pickle(test_doc_list2_path)
vocab = _load_pickle(vocab_path)

In [84]:
# Hold out the last `val_data_num` entries of a seeded random permutation as
# the validation set; everything before them is used for training.
val_data_num = 2000
np.random.seed(3)  # fixed seed so the same split is produced on every run
indices = np.random.permutation(len(train_val_doc_list1))
train_idx, val_idx = indices[:-val_data_num], indices[-val_data_num:]

# Both document lists are indexed with the same indices so pairs stay aligned.
train_doc_list1 = [train_val_doc_list1[i] for i in train_idx]
train_doc_list2 = [train_val_doc_list2[i] for i in train_idx]
val_doc_list1 = [train_val_doc_list1[i] for i in val_idx]
val_doc_list2 = [train_val_doc_list2[i] for i in val_idx]

In [85]:
# Sanity check: number of document pairs in the train / val / test splits.
print(*map(len, (train_doc_list1, val_doc_list1, test_doc_list1)))

25627 2000 2000


In [86]:
# Parse the training label file once and slice it for both splits, instead of
# reading and parsing the same file twice.
# NOTE(review): get_labels must return something that supports index-array
# indexing (e.g. a NumPy array), as the original code already relied on.
train_val_labels = get_labels(train_label_path)
train_labels = train_val_labels[train_idx]
val_labels = train_val_labels[val_idx]
test_labels = get_labels(test_label_path)

In [87]:
# Sanity check: label counts should match the split sizes printed earlier.
print(*map(len, (train_labels, val_labels, test_labels)))

25627 2000 2000


In [88]:
# Length statistics over the train and validation documents (both sides of
# each pair): sentences per document and tokens per sentence.
corpora = (train_doc_list1, train_doc_list2, val_doc_list1, val_doc_list2)
doc_lengths = [len(doc) for doc_list in corpora for doc in doc_list]
sent_lengths = [len(sent) for doc_list in corpora for doc in doc_list for sent in doc]

total_list_len = len(doc_lengths)    # number of documents
total_doc_len = sum(doc_lengths)     # number of sentences
total_sent_len = sum(sent_lengths)   # number of tokens
max_doc_len = max(doc_lengths, default=0)
max_sent_len = max(sent_lengths, default=0)

print(f'average document length : {total_doc_len/total_list_len:0.2f}\naverage sentence length : {total_sent_len/total_doc_len:0.2f}')
print(f'max document length : {max_doc_len}\nmax sentence length : {max_sent_len}')

average document length : 24.78
average sentence length : 14.06
max document length : 844
max sentence length : 204


In [89]:
# Document / sentence length limits handed to NewsDataset. They sit near the
# averages printed above rather than the observed maxima (844 / 204).
max_document_length, max_sentence_length = 26, 50

In [90]:
def _build_dataset(doc_list1, doc_list2, labels):
    """Wrap one split in a NewsDataset using the shared vocab and length caps."""
    return NewsDataset(doc_list1, doc_list2, labels, vocab, max_document_length, max_sentence_length)


train_data = _build_dataset(train_doc_list1, train_doc_list2, train_labels)
val_data = _build_dataset(val_doc_list1, val_doc_list2, val_labels)
test_data = _build_dataset(test_doc_list1, test_doc_list2, test_labels)

In [91]:
# Mini-batch size shared by the train, validation and test DataLoaders.
batch_size = 64

In [92]:
# Only the training loader reshuffles; validation and test keep dataset order.
train_data_loader, val_data_loader, test_data_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in (
        (train_data, True),
        (val_data, False),
        (test_data, False),
    )
)

In [93]:
# Display the vocabulary size; the same value becomes `vocab_size` below.
len(vocab)

51053

In [102]:
# Hyper-parameters passed to GMEncoder and the Adam optimizer in the next cell.

# Number of entries in the vocabulary (size of the embedding table).
vocab_size = len(vocab)
# Word-embedding width; must match the 256-d word2vec vectors loaded later.
embed_dim = 256
# GRU hidden size (the printed model shows a bidirectional GRU(256, 50)).
hidden_dim = 50
# Index of the padding token, obtained by calling the vocabulary object
# (the printed model shows padding_idx=0).
pad_idx = vocab('<pad>')
# Number of stacked GRU layers.
num_layers_gru = 1
# One entry per conv layer; judging by the printed Conv2d modules these read
# as [kernel_h, kernel_w, out_channels] — TODO confirm against MatchPyramid.
conv_sizes = [[3,3,8], [3,3,16]]
# One entry per pooling layer; the printed model shows AdaptiveAvgPool2d
# output sizes (12, 12) and (5, 5).
pool_sizes = [[12,12], [5,5]]
# Width of the hidden linear layer (linear1 out_features=128 in the printed model).
mp_hidden_dim = 128
# Learning rate given to optim.Adam.
learning_rate = 3e-3

In [103]:
# Build the model, the classification loss and the optimizer.
encoder = GMEncoder(vocab_size, embed_dim, hidden_dim, pad_idx, num_layers_gru, conv_sizes, pool_sizes, mp_hidden_dim)
# The printed model ends in a 2-unit linear layer, i.e. two-class logits.
criterion = nn.CrossEntropyLoss()
# NOTE: the optimizer keeps references to the parameter objects that exist at
# this point; a parameter object swapped into the model afterwards is not
# updated by optimizer.step().
optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)

In [104]:
# Load the gensim Word2Vec model whose vectors seed the embedding layer
# (the file name suggests 256-d vectors, matching embed_dim).
word2vec_model = models.Word2Vec.load('../tmp_files/word2vec_256d.model')

In [105]:
# Assemble a (vocab_size, embed_dim) table of pretrained vectors, one row per
# vocabulary index. Rows 0 and 1 are left as zeros — presumably reserved for
# special tokens such as '<pad>'; confirm against the vocabulary builder.
pretrained_word_vectors = np.zeros((vocab_size, embed_dim))
first_word_idx = 2
for word_idx in range(first_word_idx, vocab_size):
    word = vocab.idx2word[word_idx]
    pretrained_word_vectors[word_idx] = word2vec_model.wv[word]
pretrained_word_vectors_torch = torch.FloatTensor(pretrained_word_vectors)

In [106]:
# Initialise the embedding layer with the pretrained vectors *in place*.
#
# The optimizer was constructed from encoder.parameters() before this cell, so
# it holds a reference to the existing `embed.weight` Parameter object.
# Assigning a brand-new nn.Parameter here would leave the optimizer stepping
# the old, now-unused tensor, and the embeddings would silently stay frozen
# even with requires_grad=True. Copying into the existing Parameter keeps the
# object identity and therefore keeps the embeddings trainable.
embedding_weight = encoder.sentence_encoder.embed.weight
with torch.no_grad():
    embedding_weight.copy_(pretrained_word_vectors_torch)
# Fine-tune the embeddings together with the rest of the model.
embedding_weight.requires_grad = True

In [107]:
# Choose the compute device; with several GPUs, replicate the model across
# them via DataParallel before moving it.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    encoder = nn.DataParallel(encoder)
# Last expression of the cell, so the notebook displays the model structure.
encoder.to(device)

DataParallel(
  (module): GMEncoder(
    (sentence_encoder): SentenceEncoder(
      (embed): Embedding(51053, 256, padding_idx=0)
      (gru): GRU(256, 50, batch_first=True, bidirectional=True)
      (fc): Linear(in_features=100, out_features=100, bias=True)
    )
    (match_pyramid): MatchPyramid(
      (conv): ModuleList(
        (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
        (1): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1))
      )
      (bn): ModuleList(
        (0): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (pool): ModuleList(
        (0): AdaptiveAvgPool2d(output_size=(12, 12))
        (1): AdaptiveAvgPool2d(output_size=(5, 5))
      )
      (linear1): Linear(in_features=400, out_features=128, bias=True)
      (linear2): Linear(in_features=128, out_features=2, bias=True)
    )
  )
)

In [108]:
# Best validation metrics seen so far. They live in their own cell so that
# re-running the training cell keeps the running best across runs.
# Infinite sentinels (instead of the magic numbers 10000 / 0) guarantee that
# the first evaluated epoch always becomes the initial best.
min_loss = float('inf')
max_acc = float('-inf')

In [109]:
# Train for `num_epoches` epochs. After every epoch the model is evaluated on
# the validation set and checkpointed whenever the validation loss or the
# validation accuracy improves on the best value seen so far.
num_epoches = 5
train_batch_num = len(train_data_loader)
val_batch_num = len(val_data_loader)
log_step = 200  # progress line every `log_step` training batches
start_time = time.time()

# torch.save does not create missing directories. Create the checkpoint
# directory up front so a completed epoch is never lost to a save error.
os.makedirs('./parameters', exist_ok=True)

for epoch in range(num_epoches):

    print("{:-^50s}".format("TRAIN"))
    encoder.train()
    total_loss = 0
    for i, (t1, t2, target) in enumerate(train_data_loader):

        t1 = t1.to(device)
        t2 = t2.to(device)
        target = target.long().to(device)

        out = encoder(t1, t2)
        loss = criterion(out, target)
        # Clear gradients left over from the previous step before backprop.
        encoder.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (i+1) % log_step == 0:
            print(f'epoch: {epoch+1:2d}/{num_epoches:2d}, batch: {i+1:3d}/{train_batch_num}, elapsed_time: {time.time()-start_time:10.4f}s')

    print(f'epoch: {epoch+1:2d}/{num_epoches:2d}, average_loss_per_batch: {total_loss/train_batch_num:5.4f}\n')

    print("{:-^50s}".format("VALIDATION"))
    encoder.eval()
    total_loss = 0
    with torch.no_grad():
        out_list = []
        target_list = []
        for i, (t1, t2, target) in enumerate(val_data_loader):

            t1 = t1.to(device)
            t2 = t2.to(device)
            target = target.long().to(device)

            out = encoder(t1, t2)
            loss = criterion(out, target)
            total_loss += loss.item()

            # Keep raw scores and targets so accuracy is computed over the
            # whole validation set rather than averaged per batch.
            out_list.append(out.detach().cpu().numpy())
            target_list.append(target.detach().cpu().numpy())

            # Validation logs four times as often as training.
            if (i+1) % (log_step//4) == 0:
                print(f'epoch: {epoch+1:2d}/{num_epoches:2d}, batch: {i+1:3d}/{val_batch_num}, elapsed_time: {time.time()-start_time:10.4f}s')

        # Predicted class = arg-max over the score columns.
        out = np.concatenate(out_list, axis=0).argmax(1)
        target = np.concatenate(target_list, axis=0)
        acc = np.sum(out == target) / len(out)
        print(f'accuracy : {acc:.4f}')

        avg_val_loss = total_loss / val_batch_num
        print(f'epoch: {epoch+1:2d}/{num_epoches:2d}, average_loss_per_batch: {avg_val_loss:5.4f}\n')

        # NOTE: when `encoder` is wrapped in nn.DataParallel, the keys of the
        # saved state dict carry a 'module.' prefix.
        if avg_val_loss < min_loss:
            min_loss = avg_val_loss
            torch.save(encoder.state_dict(), './parameters/gru_mp_ptd_loss.pt')
            print("model parameters saved : loss\n")

        if acc > max_acc:
            max_acc = acc
            torch.save(encoder.state_dict(), './parameters/gru_mp_ptd_acc.pt')
            print("model parameters saved : acc\n")

----------------------TRAIN-----------------------


  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,


epoch:  1/ 5, batch:  20/401, elapsed_time:     4.1707s
epoch:  1/ 5, batch:  40/401, elapsed_time:     8.0318s
epoch:  1/ 5, batch:  60/401, elapsed_time:    11.8049s
epoch:  1/ 5, batch:  80/401, elapsed_time:    15.7116s
epoch:  1/ 5, batch: 100/401, elapsed_time:    20.0075s
epoch:  1/ 5, batch: 120/401, elapsed_time:    23.7206s
epoch:  1/ 5, batch: 140/401, elapsed_time:    27.4445s
epoch:  1/ 5, batch: 160/401, elapsed_time:    31.1783s
epoch:  1/ 5, batch: 180/401, elapsed_time:    35.0445s
epoch:  1/ 5, batch: 200/401, elapsed_time:    38.8505s
epoch:  1/ 5, batch: 220/401, elapsed_time:    42.5620s
epoch:  1/ 5, batch: 240/401, elapsed_time:    46.3150s
epoch:  1/ 5, batch: 260/401, elapsed_time:    50.4006s
epoch:  1/ 5, batch: 280/401, elapsed_time:    54.6896s
epoch:  1/ 5, batch: 300/401, elapsed_time:    58.4371s
epoch:  1/ 5, batch: 320/401, elapsed_time:    62.1770s
epoch:  1/ 5, batch: 340/401, elapsed_time:    65.9297s
epoch:  1/ 5, batch: 360/401, elapsed_time:    6

In [119]:
# encoder.load_state_dict(torch.load('./parameters/gru_mp_ptd_loss.pt'))
# encoder.load_state_dict(torch.load('./parameters/gru_mp_ptd_acc.pt'))
# encoder.load_state_dict(torch.load('./parameters/gru_mp_ptd_best.pt'))

<All keys matched successfully>

In [123]:
# Rebuild a fresh model and restore the selected checkpoint for evaluation.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = GMEncoder(vocab_size, embed_dim, hidden_dim, pad_idx, num_layers_gru, conv_sizes, pool_sizes, mp_hidden_dim)

# map_location lets a checkpoint that was saved from GPU tensors be loaded on
# a CPU-only machine as well.
state_dict = torch.load('./parameters/gru_mp_ptd_best.pt', map_location=device)

# A state dict saved from an nn.DataParallel wrapper prefixes every key with
# 'module.'. Strip the prefix so it loads into the bare GMEncoder built above.
# Checkpoints without the prefix are passed through untouched.
dp_prefix = 'module.'
if state_dict and all(key.startswith(dp_prefix) for key in state_dict):
    state_dict = {key[len(dp_prefix):]: value for key, value in state_dict.items()}
encoder.load_state_dict(state_dict)

if torch.cuda.device_count() > 1:
    encoder = nn.DataParallel(encoder)
# Last expression of the cell, so the notebook displays the model structure.
encoder.to(device)

DataParallel(
  (module): GMEncoder(
    (sentence_encoder): SentenceEncoder(
      (embed): Embedding(51053, 256, padding_idx=0)
      (gru): GRU(256, 50, batch_first=True, bidirectional=True)
      (fc): Linear(in_features=100, out_features=100, bias=True)
    )
    (match_pyramid): MatchPyramid(
      (conv): ModuleList(
        (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(1, 1))
        (1): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1))
      )
      (bn): ModuleList(
        (0): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (pool): ModuleList(
        (0): AdaptiveAvgPool2d(output_size=(12, 12))
        (1): AdaptiveAvgPool2d(output_size=(5, 5))
      )
      (linear1): Linear(in_features=400, out_features=128, bias=True)
      (linear2): Linear(in_features=128, out_features=2, bias=True)
    )
  )
)

In [124]:
# Run the model over the test set, collecting per-batch predicted classes and
# the matching targets.
out_list_test = []
target_list_test = []
encoder.eval()

# Inference only: disable autograd (as the validation loop does) so that no
# computation graph is built and kept alive for each batch.
with torch.no_grad():
    for t1, t2, target in test_data_loader:

        t1 = t1.to(device)
        t2 = t2.to(device)
        target = target.long().to(device)

        out = encoder(t1, t2)

        # Predicted class = arg-max over the score columns.
        out_list_test.append(out.detach().cpu().numpy().argmax(1))
        target_list_test.append(target.detach().cpu().numpy())

In [125]:
# Stitch the per-batch arrays into one prediction vector and one label vector.
predictions = out_test = np.concatenate(out_list_test, axis=0)
labels = target_test = np.concatenate(target_list_test, axis=0)

In [126]:
# Report test-set metrics; per the captured output this prints accuracy,
# precision, recall, F1 and the confusion-matrix counts.
print_test_statistics(predictions, labels)

accuracy : 0.8330
precision : 0.7868
recall : 0.7766
F1 score : 0.7817
TP :  598  FN :  172
FP :  162  TN : 1068
