In [62]:
import numpy as np
import math
import torch
import torch.nn as nn
from easydict import EasyDict

In [63]:
from models.neural import MultiHeadedAttention, PositionwiseFeedForward
from models.model_builder import Bert
from models.encoder import Classifier, PositionalEncoding, TransformerEncoderLayer, ExtTransformerEncoder
from models.trainer_ext import Trainer
from models.data_loader import TextLoader, load_dataset, Dataloader, get_kobert_vocab

In [64]:
args = EasyDict({
    "visible_gpus" : -1,
    "temp_dir" : './tmp/',
    "test_from": None,
    "max_pos" : 512,
    "large" : False,
    "finetune_bert": True,
    "encoder": "bert",
    "share_emb": False,
    "dec_layers": 6,
    "dec_dropout": 0.2,
    "dec_hidden_size": 768,
    "dec_heads": 8,
    "dec_ff_size": 2048,
    "enc_hidden_size": 512,
    "enc_ff_size": 512,
    "enc_dropout": 0.2,
    "enc_layers": 6,
    
    "ext_dropout": 0.2,
    "ext_layers": 2,
    "ext_hidden_size": 768,
    "ext_heads": 8,
    "ext_ff_size": 2048,
    
    "accum_count": 1,
    "save_checkpoint_steps": 5,
    
    "generator_shard_size": 32,
    "alpha": 0.6,
    "beam_size": 5,
    "min_length": 15,
    "max_length": 150,
    "max_tgt_len": 140,  
    "block_trigram": True,
    
    "model_path": "./tmp_model/",
    "result_path": "./tmp_result/src",
    "recall_eval": False,
    "report_every": 1,
})

In [65]:
class ExtTransformerEncoder(nn.Module):
    def __init__(self, d_model, d_ff, heads, dropout, num_inter_layers=0):
        super(ExtTransformerEncoder, self).__init__()
        self.d_model = d_model
        self.num_inter_layers = num_inter_layers
        self.pos_emb = PositionalEncoding(dropout, d_model)
        self.transformer_inter = nn.ModuleList(
            [TransformerEncoderLayer(d_model, heads, d_ff, dropout)
             for _ in range(num_inter_layers)])
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.wo = nn.Linear(d_model, 1, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, top_vecs, mask):
        """ See :obj:`EncoderBase.forward()`"""

        batch_size, n_sents = top_vecs.size(0), top_vecs.size(1)
        pos_emb = self.pos_emb.pe[:, :n_sents]
        x = top_vecs * mask[:, :, None].float()
        x = x + pos_emb

        for i in range(self.num_inter_layers):
            x = self.transformer_inter[i](i, x, x, 1 - mask)  # all_sents * max_tokens * dim

        x = self.layer_norm(x)
        sent_scores = self.sigmoid(self.wo(x))
        sent_scores = sent_scores.squeeze(-1) * mask.float()

        return sent_scores

In [66]:
class ExtSummarizer(nn.Module):
    def __init__(self, args, device, checkpoint):
        super(ExtSummarizer, self).__init__()
        self.args = args
        self.device = device
        self.bert = Bert(args.large, args.temp_dir, args.finetune_bert)
        self.ext_layer = ExtTransformerEncoder(self.bert.model.config.hidden_size,\
                                                args.ext_ff_size, args.ext_heads,\
                                                args.ext_dropout, args.ext_layers)
        
        if (args.encoder == 'baseline'):
            bert_config = BertConfig(self.bert.model.config.vocab_size, hidden_size=args.ext_hidden_size,
                                     num_hidden_layers=args.ext_layers, num_attention_heads=args.ext_heads, intermediate_size=args.ext_ff_size)
            self.bert.model = BertModel(bert_config)
            self.ext_layer = Classifier(self.bert.model.config.hidden_size)

        if(args.max_pos>512):
            my_pos_embeddings = nn.Embedding(args.max_pos, self.bert.model.config.hidden_size)
            my_pos_embeddings.weight.data[:512] = self.bert.model.embeddings.position_embeddings.weight.data
            my_pos_embeddings.weight.data[512:] = self.bert.model.embeddings.position_embeddings.weight.data[-1][None,:].repeat(args.max_pos-512,1)
            self.bert.model.embeddings.position_embeddings = my_pos_embeddings


        if checkpoint is not None:
            self.load_state_dict(checkpoint['model'], strict=True)
        else:
            if args.param_init != 0.0:
                for p in self.ext_layer.parameters():
                    p.data.uniform_(-args.param_init, args.param_init)
            if args.param_init_glorot:
                for p in self.ext_layer.parameters():
                    if p.dim() > 1:
                        xavier_uniform_(p)

        self.to(device)

    def forward(self, src, segs, clss, mask_src, mask_cls):
        top_vec = self.bert(src, segs, mask_src)
        sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), clss]
        sents_vec = sents_vec * mask_cls[:, :, None].float()
        #sent_scores = self.ext_layer(sents_vec, mask_cls).squeeze(-1)
        return sents_vec

In [67]:
device = "cpu" if args.visible_gpus == -1 else "cuda"
checkpoint = torch.load('./checkpoint/model_step_24000.pt', map_location=lambda storage, loc: storage)

model = ExtSummarizer(args, device, checkpoint)
model.eval()

using cached model
using cached model
using cached model


ExtSummarizer(
  (bert): Bert(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(8004, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm(torch.Size([768]),

In [68]:
from scipy.spatial import distance # for measuring distance

class WindowEmbedder:
    
    def __init__(self, src_doc='', \
                window_size=3, text_loader=None, \
                agg_mode='mean', dist_mode='cosine'):

        self.src_doc = src_doc
        self.window_size = window_size
        self.sent_list = [sent for sent in src_doc.split('\n') if sent]
        self.text_loader = text_loader
        self.agg_mode = agg_mode
        self.dist_mode = dist_mode
    
    
    def get_cand_divpoints(self):
        
        window_size = self.window_size
        sent_len = len(self.sent_list)
        div_cands = list(np.arange(window_size-1, sent_len-window_size))
        return div_cands

    
    def embedder(self, target_doc=None):

        batch_iter = self.text_loader.load_text(target_doc, '\n')
        for _, batch in enumerate(batch_iter):
            src = batch.src
            segs = batch.segs
            clss = batch.clss
            mask, mask_cls = batch.mask_src, batch.mask_cls
            result_vec = model(src, segs, clss, mask, mask_cls).detach()
        return result_vec
    
    
    def get_embeddings(self):

        div_cands = self.get_cand_divpoints()
        embedded_result = []
        for d in div_cands:
            min_pos = d - self.window_size + 1
            max_pos = d + self.window_size
        
            target_sents = self.sent_list[min_pos:max_pos+1]
            target_doc = '\n'.join((target_sents))
            tmp_embedded = self.embedder(target_doc=target_doc)
            embedded_result.append(tmp_embedded.squeeze(0))
        return embedded_result
    
    def agg_vectors(self, vec1=None, vec2=None, mode = 'mean'):

        if mode=='mean':
            vec1_agg = torch.mean(vec1, dim=0)
            vec2_agg = torch.mean(vec2, dim=0)

        elif mode=='max':
            vec1_agg = torch.max(vec1, dim=0).values
            vec2_agg = torch.max(vec2, dim=0).values
        
        elif mode=='min':
            vec1_agg = torch.min(vec1, dim=0).values
            vec2_agg = torch.min(vec2, dim=0).values

        elif mode=='max_min':
            vec1_max = torch.max(vec1, dim=0).values
            vec1_min = torch.min(vec1, dim=0).values
            vec2_max = torch.max(vec2, dim=0).values
            vec2_min = torch.min(vec2, dim=0).values
            vec1_agg = torch.cat((vec1_max, vec1_min))
            vec2_agg = torch.cat((vec2_max, vec2_min))

        return vec1_agg, vec2_agg

    
    def dist_calculator(self, vec1, vec2, mode = 'cosine'):
        if mode == 'euclidean':
            diff = distance.euclidean(vec1, vec2)
        elif mode =='cosine':
            diff = distance.cosine(vec1, vec2)
        elif mode == 'jensenshannon':
            diff = distance.jensenshannon(vec1, vec2)
        return diff
    
    
    def detect_divpoints(self):
        
        embedded_result = self.get_embeddings()
        
        dist_result = []
        for emb in embedded_result:
            lh = emb[:self.window_size]
            rh = emb[self.window_size:]
            
            lh_agg, rh_agg = self.agg_vectors(lh, rh, mode=self.agg_mode)
            dist = self.dist_calculator(lh_agg, rh_agg, mode=self.dist_mode)
            dist_result.append(dist)
            #dist_result.append(dist.item())
        
        return dist_result
            

## 기사로 실험

In [69]:
import json
def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [70]:
news_df = load_jsonl('../../data/news/train.jsonl')

Loaded 260697 records from ../../data/news/train.jsonl


In [71]:
# 전처리
# (1) 글자 개수가 너무 작은 경우 없애기 (30글자 이상)
# (2) 문장이 적은 경우 해당 기사 없애기 (10문장 이상)
news_clean = []
for news in news_df:
    news_article = news['article_original']
    if len(news_article) >= 10:
        article_clean = [sent for sent in news_article if len(sent) >= 30]
        news_clean.append(article_clean)

In [72]:
import random

def make_mixed_doc(news_dataset=None, max_num=1000):
    mixed_doc_set = []
    for i in range(max_num):
        lh_count = min(random.randint(7, 10), len(news_dataset[i]))
        rh_count = min(random.randint(7, 10), len(news_dataset[i+1]))

        lh_news = news_dataset[i][:lh_count]
        rh_news = news_dataset[i+1][:rh_count]
        
        gt = lh_count - 1

        src_doc = '\n'.join((lh_news + rh_news))
        mixed_doc_set.append((src_doc, gt))
        
    return mixed_doc_set

In [73]:
random.seed(2020011135)
mixed_doc_list = make_mixed_doc(news_dataset=news_clean, max_num=500) # previous 1000

In [74]:
len(mixed_doc_list)

500

In [75]:
# Settings
loader = TextLoader(args, device)
window_size = 4

using cached model
using cached model


In [76]:
## make comparison tuples
arguments = [('min','euclidean'), ('min','cosine'), ('min','jensenshannon'),\
            ('max','euclidean'), ('max','cosine'), ('max','jensenshannon'),\
            ('mean','euclidean'), ('mean','cosine'), ('mean','jensenshannon'),\
            ('max_min','euclidean'), ('max_min','cosine'), ('max_min','jensenshannon')]
arguments

[('min', 'euclidean'),
 ('min', 'cosine'),
 ('min', 'jensenshannon'),
 ('max', 'euclidean'),
 ('max', 'cosine'),
 ('max', 'jensenshannon'),
 ('mean', 'euclidean'),
 ('mean', 'cosine'),
 ('mean', 'jensenshannon'),
 ('max_min', 'euclidean'),
 ('max_min', 'cosine'),
 ('max_min', 'jensenshannon')]

In [77]:
## make comparison tuples
sub_arguments = ['ACC', 'div_result']
sub_arguments

['ACC', 'div_result']

In [78]:
sub_args_dict = {key: None for key in sub_arguments}
sub_args_dict

{'ACC': None, 'div_result': None}

In [79]:
result_dict = {key: sub_args_dict for key in arguments}
result_dict

{('min', 'euclidean'): {'ACC': None, 'div_result': None},
 ('min', 'cosine'): {'ACC': None, 'div_result': None},
 ('min', 'jensenshannon'): {'ACC': None, 'div_result': None},
 ('max', 'euclidean'): {'ACC': None, 'div_result': None},
 ('max', 'cosine'): {'ACC': None, 'div_result': None},
 ('max', 'jensenshannon'): {'ACC': None, 'div_result': None},
 ('mean', 'euclidean'): {'ACC': None, 'div_result': None},
 ('mean', 'cosine'): {'ACC': None, 'div_result': None},
 ('mean', 'jensenshannon'): {'ACC': None, 'div_result': None},
 ('max_min', 'euclidean'): {'ACC': None, 'div_result': None},
 ('max_min', 'cosine'): {'ACC': None, 'div_result': None},
 ('max_min', 'jensenshannon'): {'ACC': None, 'div_result': None}}

In [80]:
for argument in list(result_dict.keys()):
    print(argument)
    print(f'>>>>{result_dict[argument]}')

('min', 'euclidean')
>>>>{'ACC': None, 'div_result': None}
('min', 'cosine')
>>>>{'ACC': None, 'div_result': None}
('min', 'jensenshannon')
>>>>{'ACC': None, 'div_result': None}
('max', 'euclidean')
>>>>{'ACC': None, 'div_result': None}
('max', 'cosine')
>>>>{'ACC': None, 'div_result': None}
('max', 'jensenshannon')
>>>>{'ACC': None, 'div_result': None}
('mean', 'euclidean')
>>>>{'ACC': None, 'div_result': None}
('mean', 'cosine')
>>>>{'ACC': None, 'div_result': None}
('mean', 'jensenshannon')
>>>>{'ACC': None, 'div_result': None}
('max_min', 'euclidean')
>>>>{'ACC': None, 'div_result': None}
('max_min', 'cosine')
>>>>{'ACC': None, 'div_result': None}
('max_min', 'jensenshannon')
>>>>{'ACC': None, 'div_result': None}


In [81]:
# # save as pkl
# import os 

# save_path = "./chunk_results"

# with open(os.path.join(save_path, "chunk_results.pkl"), "wb") as f:
#     pickle.dump(result_dict, f)

In [82]:
done_list = []
for argument in arguments : 
    print(argument)
    done_list.append(argument)
done_list

('min', 'euclidean')
('min', 'cosine')
('min', 'jensenshannon')
('max', 'euclidean')
('max', 'cosine')
('max', 'jensenshannon')
('mean', 'euclidean')
('mean', 'cosine')
('mean', 'jensenshannon')
('max_min', 'euclidean')
('max_min', 'cosine')
('max_min', 'jensenshannon')


[('min', 'euclidean'),
 ('min', 'cosine'),
 ('min', 'jensenshannon'),
 ('max', 'euclidean'),
 ('max', 'cosine'),
 ('max', 'jensenshannon'),
 ('mean', 'euclidean'),
 ('mean', 'cosine'),
 ('mean', 'jensenshannon'),
 ('max_min', 'euclidean'),
 ('max_min', 'cosine'),
 ('max_min', 'jensenshannon')]

In [83]:
done_list = []


In [84]:
# save as pkl
import os
import pickle

save_path = "./chunk_result"

# 학습 완료된 model param 추적
if "result_log.pkl" in os.listdir(save_path):
    with open(os.path.join(save_path, "result_log.pkl"), "rb") as f1:
        done_list = pickle.load(f1)
else:
    done_list = []

# 학습 중 튕겼을 시를 대비한 코드
if "chunk_results.pkl" in os.listdir(save_path):
    with open(os.path.join(save_path, "chunk_results.pkl"), "rb") as f2:
        result_dict = pickle.load(f2)
else:
    arguments = [('min','euclidean'), ('min','cosine'), ('min','jensenshannon'),\
            ('max','euclidean'), ('max','cosine'), ('max','jensenshannon'),\
            ('mean','euclidean'), ('mean','cosine'), ('mean','jensenshannon'),\
            ('max_min','euclidean'), ('max_min','cosine'), ('max_min','jensenshannon')]

    sub_arguments = ['ACC', 'div_result']

    sub_args_dict = {key: None for key in sub_arguments}
    result_dict = {key: sub_args_dict for key in arguments}


# inference
for argument in list(result_dict.keys()):

    if argument in done_list:
        # 이미 학습된 것이 있으면 SKIP
        print(f"[{argument}] results already exist ! -- skipping inference")
        continue
    else:
        # 이미 학습된 것이 있으면 학습 수행
        agg_mode, dist_mode = argument
        
        print(f"Running {agg_mode}, {dist_mode} experiment : ")

        err_cnt = 0
        acc_cnt = 0
        div_result = []

        for i, a_set in enumerate(mixed_doc_list):
            
            if (i+1) % 20 == 0:
                print(f"working on {i+1}th doc...")
                
            src_doc = a_set[0]
            gt = a_set[1]
            
            window_embedder = WindowEmbedder(src_doc=src_doc, window_size=window_size, text_loader=loader, agg_mode=agg_mode, dist_mode=dist_mode)
            div_scores = window_embedder.detect_divpoints()
            div_point = (window_size - 1) + div_scores.index(max(div_scores))

            if div_point == gt:
                acc_cnt += 1
            else:
                err_cnt += 1
            
            sents = [sent for sent in src_doc.split('\n') if sent]
            lh_sent, rh_sent = [], []
            for i, sent in enumerate(sents):
                if i <= div_point:
                    lh_sent.append(sent)
                else:
                    rh_sent.append(sent)
                    
            result_sents = lh_sent + ["----------------[DIV]---------------"] + rh_sent
            div_result.append((result_sents, div_scores, div_point, gt))     

        # get acc for final argument
        acc = acc_cnt/(acc_cnt + err_cnt)*100
        result_dict[argument]['ACC'] = acc
        print(f'{argument} accuracy : {acc}')

        # save div_results for final argument
        result_dict[argument]['div_result'] = div_result

        # check if it is ran well
        # print(result_dict)

        # add to done list
        done_list.append(argument)

        # save result
        with open(os.path.join(save_path, "result_log.pkl"), "wb") as f3:
            pickle.dump(done_list, f3)

        with open(os.path.join(save_path, "chunk_results.pkl"), "wb") as f4:
            pickle.dump(result_dict, f4)
        

Running min, euclidean experiment : 
working on 20th doc...
working on 40th doc...
working on 60th doc...
working on 80th doc...
working on 100th doc...
working on 120th doc...
working on 140th doc...
working on 160th doc...
working on 180th doc...
working on 200th doc...
working on 220th doc...
working on 240th doc...
working on 260th doc...
working on 280th doc...
working on 300th doc...
working on 320th doc...
working on 340th doc...
working on 360th doc...
working on 380th doc...
working on 400th doc...
working on 420th doc...
working on 440th doc...
working on 460th doc...
working on 480th doc...
working on 500th doc...
('min', 'euclidean') accuracy : 14.000000000000002
Running min, cosine experiment : 
working on 20th doc...
working on 40th doc...
working on 60th doc...
working on 80th doc...
working on 100th doc...
working on 120th doc...
working on 140th doc...
working on 160th doc...
working on 180th doc...
working on 200th doc...
working on 220th doc...
working on 240th doc..

In [93]:
import pandas as pd

for key,value in result_dict.items():
    result_df = pd.DataFrame(value, columns=['ACC', 'div_result'])
    result_df.to_csv(f'/repo/course/sem21_01/youtube-summarization/src/bertsum/chunk_result/{key}.csv')

In [87]:
for key,value in result_dict.items():
    if key == ('mean', 'jensenshannon'):
        mean_js = value
    elif  key == ('mean', 'cosine'):
        mean_cos = value

In [88]:
import pandas as pd
result_df = pd.DataFrame(mean_js, columns=['ACC', 'div_result'])
result_df.to_csv('mean_js.csv')

In [89]:
print(result_df.head(1))

   ACC                                         div_result
0  0.0  ([지난해 고령화와 유례가 드문 겨울 한파 등 영향으로 우리나라 사망자 수가 통계 ...


In [90]:
import pandas as pd
result_df = pd.DataFrame(mean_cos, columns=['ACC', 'div_result'])
result_df.to_csv('mean_cos.csv')

In [91]:
print(result_df.head(1))

   ACC                                         div_result
0  0.0  ([지난해 고령화와 유례가 드문 겨울 한파 등 영향으로 우리나라 사망자 수가 통계 ...


## 미수행 실험건
- 추가 word embedding 건에 대해
- 각각 window 별로 다른 bert에 넣어서 비교