In [1]:
import os
import evaluate
import numpy as np
import pandas as pd
from eval_utils import read_pmcids, sent_json, load_ExtModel, TrigramBlock

In [2]:
# File of PMC IDs to evaluate; main() passes it to read_pmcids by default.
IDS_PATH = '../../dataset/pmcids/test.txt'
# Directory holding one `<pmcid>.json` sentence file per article (read in process_article).
JSON_DIR = '../../dataset/sentence_json/'
# Directory holding one `<pmcid>.parquet` sentence-feature file per article (read in process_article).
PARQUET_DIR = '../../dataset/sentence_features/'
# Extractive classifier; process_article calls its predict_proba().
# NOTE(review): the .pkl suffix suggests pickle deserialization inside load_ExtModel —
# only load model files from a trusted source.
MODEL = load_ExtModel('../extractive_summarizer/model/LGB_model_F10_S.pkl')
# Columns dropped from the feature frame before it is handed to the model.
BLOCK = ['F8','F9','label']

In [3]:
def convert_sentence_df(sentJson, pred, true_proba, trigram_blocking=None):
    """Build body/abstract sentence DataFrames and attach the model predictions.

    Parameters
    ----------
    sentJson : dict
        Must provide ``sentJson['abstract'][s]`` and ``sentJson['body'][s]`` for every
        section key ``s`` in ``'IMRD'``. Each value is an iterable of dicts with a
        ``'text'`` entry; body sentences additionally need a ``'label'`` entry.
    pred : numpy.ndarray
        One 0/1 prediction per body sentence. Values are assigned positionally, so
        they must follow the order in which the body rows are built (I, M, R, D).
    true_proba : numpy.ndarray
        Positive-class probability per body sentence, same order as ``pred``.
    trigram_blocking : bool or None, optional
        Whether to apply per-section trigram blocking to the predicted sentences.
        ``None`` (default) keeps the historical behaviour of reading the
        module-level ``set_trigram_blocking`` flag; if that flag was never
        assigned, blocking is disabled instead of raising ``NameError``.

    Returns
    -------
    (body, abstract) : tuple of pandas.DataFrame
        ``body`` has columns ``section``, ``text``, ``label``, ``predict``, ``proba``;
        ``abstract`` has columns ``section`` and ``text``.
    """
    sections = 'IMRD'

    # Abstract and body sentences -> DataFrames, one row per sentence.
    abstract = pd.DataFrame(
        [(section, sent['text'].strip())
         for section in sections for sent in sentJson['abstract'][section]],
        columns=['section', 'text'],
    ).astype({'section': 'category', 'text': 'string'})

    body = pd.DataFrame(
        [(section, sent['text'].strip(), sent['label'])
         for section in sections for sent in sentJson['body'][section]],
        columns=['section', 'text', 'label'],
    ).astype({'section': 'category', 'text': 'string', 'label': 'bool'})

    # Attach predictions and probabilities.
    body['predict'] = pred.astype('bool')
    body['proba'] = true_proba.astype('float16')

    # Resolve the blocking switch: explicit argument wins, otherwise fall back to the
    # notebook-level flag so existing cells that set `set_trigram_blocking` keep working.
    if trigram_blocking is None:
        trigram_blocking = globals().get('set_trigram_blocking', False)

    # Trigram blocking over the predicted sentences of each section, visiting the
    # most probable sentences first.
    if trigram_blocking:
        for section in sections:
            # A fresh blocker per section, so overlap is only checked within a section.
            block = TrigramBlock()
            candidates = body.loc[
                (body['section'] == section) & body['predict']
            ].sort_values(by='proba', ascending=False)
            for i, row in candidates.iterrows():
                # presumably check_overlap returns True when the sentence repeats a trigram
                # of an earlier accepted sentence — confirm in eval_utils.TrigramBlock
                if block.check_overlap(row['text']):
                    body.at[i, 'predict'] = False

    return body, abstract

In [4]:
def process_article(pmcid, threshold,
                    model=MODEL, block_cols=BLOCK, json_dir=JSON_DIR, parquet_dir=PARQUET_DIR):
    """Run the extractive model on one article and return its extracted sentences.

    Parameters
    ----------
    pmcid : str
        Article identifier; used as the stem of the parquet and JSON file names.
    threshold : float
        A sentence is selected when its positive-class probability is strictly
        greater than this value.
    model : object, optional
        Classifier exposing ``predict_proba``; column 1 is used as the positive class.
    block_cols : list of str, optional
        Columns removed from the feature frame before prediction.
    json_dir, parquet_dir : str, optional
        Directories containing ``<pmcid>.json`` and ``<pmcid>.parquet``.

    Returns
    -------
    (ext, abstract) : tuple of pandas.DataFrame
        ``ext`` holds the body rows whose ``predict`` flag is True after
        ``convert_sentence_df`` has run; ``abstract`` holds the reference sentences.
    """

    def predict(x):
        true_proba = model.predict_proba(x)[:, 1]
        selected = true_proba > threshold
        # If no sentence clears the threshold, fall back to the highest-probability
        # sentence(s). The mask is used directly rather than re-thresholding, so the
        # fallback also works when threshold >= 1.
        if not np.any(selected):
            selected = true_proba == np.max(true_proba)
            # Kept from the original: fallback sentences get probability 1, which is
            # what ends up in the `proba` column downstream.
            true_proba[selected] = 1
        return selected.astype('int'), true_proba

    # Load sentence features and predict.
    df = pd.read_parquet(os.path.join(parquet_dir, f'{pmcid}.parquet'))
    sentFeat = df.drop(columns=block_cols)
    pred, true_proba = predict(sentFeat)

    # Load sentence texts and combine them with the predictions.
    sentJson = sent_json(os.path.join(json_dir, f'{pmcid}.json'))
    body, abstract = convert_sentence_df(sentJson, pred, true_proba)
    ext = body[body['predict']]

    return ext, abstract

In [5]:
# Aggregated ROUGE scores
def main(pmcid_file=IDS_PATH, threshold=0.5):
    """Score extracted summaries against reference abstracts with ROUGE.

    For every article ID read from ``pmcid_file`` the extracted sentences and the
    abstract sentences are joined with newlines, once for the whole article
    (``'ALL'``) and once per section (``'I'``, ``'M'``, ``'R'``, ``'D'``). Each of
    the five collections is then scored with aggregation and stemming enabled.

    Whether trigram blocking is applied is decided downstream in
    ``convert_sentence_df``, which reads the module-level ``set_trigram_blocking``.

    Returns a DataFrame with one column per key and one row per ROUGE metric,
    rounded to 4 decimals.
    """
    scorer = evaluate.load('rouge')
    article_ids = read_pmcids(pmcid_file)
    keys = ['ALL', 'I', 'M', 'R', 'D']
    predictions = {key: [] for key in keys}
    references = {key: [] for key in keys}

    def joined_text(frame, key):
        # 'ALL' uses every row; any other key restricts to that section.
        rows = frame if key == 'ALL' else frame[frame['section'] == key]
        return '\n'.join(list(rows['text']))

    for article_id in article_ids:
        extracted, gold = process_article(article_id, threshold)
        for key in keys:
            predictions[key].append(joined_text(extracted, key))
            references[key].append(joined_text(gold, key))

    scores = {}
    for key in keys:
        scores[key] = scorer.compute(
            predictions=predictions[key],
            references=references[key],
            use_stemmer=True,
            use_aggregator=True,
        )
    return pd.DataFrame(scores).round(4)


### LGBM

In [7]:
%%time
# Evaluation run with trigram blocking disabled.
# convert_sentence_df reads this module-level flag at call time.
set_trigram_blocking=False
main()

CPU times: user 36min 39s, sys: 2.48 s, total: 36min 42s
Wall time: 24min 10s


Unnamed: 0,ALL,I,M,R,D
rouge1,0.3246,0.3039,0.2683,0.2809,0.3642
rouge2,0.1893,0.1607,0.121,0.164,0.2111
rougeL,0.2065,0.2136,0.1759,0.1994,0.2778
rougeLsum,0.311,0.2788,0.2439,0.2626,0.331


In [6]:
%%time
# Evaluation run with trigram blocking enabled.
# convert_sentence_df reads this module-level flag at call time.
set_trigram_blocking=True
main()

CPU times: user 31min 12s, sys: 3.93 s, total: 31min 16s
Wall time: 15min 36s


Unnamed: 0,ALL,I,M,R,D
rouge1,0.4608,0.3781,0.3297,0.3858,0.4251
rouge2,0.2194,0.1802,0.1324,0.1677,0.2325
rougeL,0.2511,0.2555,0.2055,0.2359,0.3167
rougeLsum,0.4354,0.3394,0.2939,0.3461,0.3788


### Output sentence pairs

In [5]:
# Write extracted-summary / reference-abstract text pairs (training data for the abstractive model)
def generate_ext_abstract_pairs(pmcid_file, output_file, threshold, batch_size=5000):
    """Write one (extract, abstract) text pair per article section to a parquet file.

    Parameters
    ----------
    pmcid_file : str
        Path passed to ``read_pmcids`` to obtain the article IDs.
    output_file : str
        Parquet file to write. If it already exists when the function starts, new
        rows are appended to its current contents rather than replacing them.
    threshold : float
        Selection threshold forwarded to ``process_article``.
    batch_size : int, optional
        Number of articles processed between flushes to ``output_file``.

    Returns
    -------
    None
        ``DataFrame.info()`` of the final file is printed. Nothing is printed or
        written when there are no article IDs and ``output_file`` does not exist.
    """
    pmcids = read_pmcids(pmcid_file)
    pairs = []
    append_mode = os.path.exists(output_file)

    # Flush one batch of pairs to disk; returns the append flag for the next flush.
    def output_batch(pairs, output_file, append_mode):
        # Build the frame straight from the list of [extract, abstract] rows. Going
        # through np.array() would create a fixed-width unicode array that pads every
        # string to the longest one in the batch.
        df = pd.DataFrame(pairs, columns=['extract', 'abstract'])
        if append_mode:
            # Parquet files cannot be appended in place: read, concatenate, rewrite.
            orig_df = pd.read_parquet(output_file)
            df = pd.concat([orig_df, df], ignore_index=True)
        df.to_parquet(output_file)
        return True

    # One pair per section of every article.
    for num, pmcid in enumerate(pmcids, start=1):
        ext, abstract = process_article(pmcid, threshold)
        for section in 'IMRD':
            ext_text = ' '.join(list(ext[ext['section'] == section]['text']))
            abstract_text = ' '.join(list(abstract[abstract['section'] == section]['text']))
            pairs.append([ext_text, abstract_text])

        if num % batch_size == 0:
            # Progress: number of articles processed so far.
            print(num)
            append_mode = output_batch(pairs, output_file, append_mode)
            pairs = []

    # Flush the final partial batch.
    if pairs:
        append_mode = output_batch(pairs, output_file, append_mode)

    # Nothing was ever written (empty ID list and no pre-existing file).
    if not os.path.exists(output_file):
        return None

    merge_df = pd.read_parquet(output_file)
    return merge_df.info()

In [None]:
# %%time
# set_trigram_blocking = False
# generate_ext_abstract_pairs(pmcid_file = '../../dataset/pmcids/train.txt', # or test.txt
#                          output_file = '../../dataset/to_abstractive/train_pair.parquet', # or test_pair
#                          threshold = 0.5)

In [None]:
# %%time
# set_trigram_blocking = True
# generate_ext_abstract_pairs(pmcid_file = '../../dataset/pmcids/train.txt', # or test.txt
#                          output_file = '../../dataset/to_abstractive/tb_train_pair.parquet', # or test_pair
#                          threshold = 0.5)