In [1]:
import logging
import os
import argparse
import math
import numpy as np

import traceback

from tqdm import tqdm

import pandas as pd

from src.config.config import Config, load_config_from_json
from src.evaluate.rouge_evaluator import ScoreSummary
from src.loader.class_loader import Dataset, SOURCE, Cluster
from src.utils.factory import create_model, create_evaluator
from src.model.sds.combination import CombinationRanker
from src.loader.class_loader import load_cluster

In [2]:
config = load_config_from_json()

# The test split location is not read from `config` in this notebook; keep the
# literal in a single place instead of repeating it in every call and message.
TEST_PATH = "/home/dang/vlsp-final-year/dataset/vlsp_abmusu_test_data.jsonl"


def _load_split(name, path):
    """Load one dataset split with `load_cluster`, falling back to None.

    Args:
        name: Split label used in the log messages ("train", "valid", "test").
        path: Location handed to `load_cluster`.

    Returns:
        Whatever `load_cluster(path)` returns, or None when it raises.
    """
    try:
        split = load_cluster(
            path,
        )
    except Exception as e:
        # Deliberately best-effort: a missing split is reported and replaced
        # by None so the remaining splits can still be loaded.
        logging.warning("[PIPELINE] - Load {} set from {}. Failed. Using None.".format(name, path))
        print(e)
        return None

    logging.warning("[PIPELINE] - Load {} set from {}. Done.".format(name, path))
    return split


train_set = _load_split("train", config.train_path)
valid_set = _load_split("valid", config.valid_path)
test_set = _load_split("test", TEST_PATH)

Total number of cluster:  200


200it [04:39,  1.40s/it]


Total number of cluster:  100


100it [02:10,  1.31s/it]


Total number of cluster:  300


300it [05:55,  1.19s/it]


In [3]:
# Select the SENT_SPLITTED_TOKEN source on every split, in train/valid/test order.
for _split in (train_set, valid_set, test_set):
    _split.set_source(SOURCE.SENT_SPLITTED_TOKEN.value)

# One list per split; the scoring cell fills them with one entry per cluster.
train_scores, valid_scores, test_scores = [], [], []

In [4]:
from src.model.sds.combination import CombinationRanker

# Reload the configuration (it was already loaded in the dataset-loading cell)
# and pick entry 15 of `config.models`; `get_rouge_score` and
# `get_rouge_score_and_saved` index the same entry.
config = load_config_from_json()
model_config = config.models[15]

# NOTE(review): `training` is given `valid_set`, not `train_set` - confirm this
# is intentional, since ROUGE is later reported on the same split.
model = CombinationRanker(model_config)
model.training(valid_set)

Loading codes from /home/dang/vlsp-final-year/external/sentence_transformer/vn_sbert_deploy/bpe/bpe.codes ...
Read 64000 codes from the codes file.


In [5]:
# NOTE: `weight` is not read by the scoring below; it is kept because it is a
# notebook-level name, and it is reassigned before the final run.
weight = {
    "tfidf": 0.1,
    "lexrank": 0.1,
    "textrank": 0.1
}


def _score_split(split):
    """Return the `model.get_score` result for every document of every cluster.

    Args:
        split: Object exposing `.clusters`; each cluster exposes `.documents`
            and each document exposes `.get_all_sents()`.

    Returns:
        A list with one entry per cluster, each entry being a list with one
        `model.get_score(sents, 1)` result per document, in iteration order.
    """
    return [
        [model.get_score(doc.get_all_sents(), 1) for doc in cluster.documents]
        for cluster in tqdm(split.clusters)
    ]


# The three lists are rebound rather than appended to, so re-running this cell
# replaces the cached scores instead of duplicating them.
print("get local score on train set")
train_scores = _score_split(train_set)

print("get local score on valid set")
valid_scores = _score_split(valid_set)

print("get local score on test set")
test_scores = _score_split(test_set)

get local score on train set


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:49<00:00,  4.08it/s]


get local score on valid set


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.47it/s]


get local score on test set


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:04<00:00,  4.63it/s]


In [6]:
from src.evaluate.rouge_evaluator import RougeScore

class RougeScoreStorage:
    """Accumulates per-cluster ROUGE scores and summarises them.

    Rows are buffered in a plain list and turned into a DataFrame on demand.
    This avoids `DataFrame.append` (removed in pandas 2.0) and the quadratic
    cost of re-allocating the frame for every added row.
    """

    # Column order of the per-cluster frame exposed through `df`.
    COLUMNS = [
        'cluster_id',
        'rouge_1_p',
        'rouge_1_r',
        'rouge_1_f',
        'rouge_2_p',
        'rouge_2_r',
        'rouge_2_f',
        'rouge_l_p',
        'rouge_l_r',
        'rouge_l_f',
    ]

    # Every column except the identifier is a metric to summarise.
    METRIC_COLUMNS = COLUMNS[1:]

    # Column order of the frame returned by `summary_score`.
    SUMMARY_COLUMNS = ['name', 'mean', 'min', 'max', 'std']

    def __init__(self):
        self._records = []

    @property
    def df(self):
        """Per-cluster scores as a DataFrame with the columns in `COLUMNS`.

        A fresh frame is built on every access; it is empty (but has all
        columns) when no score has been added yet.
        """
        return pd.DataFrame(self._records, columns=self.COLUMNS)

    def add_score(self, cluster_id: int, score: "RougeScore"):
        """Record the ROUGE-1/2/L precision, recall and F1 of one cluster.

        Args:
            cluster_id: Identifier stored in the `cluster_id` column.
            score: Object exposing `rouge1`, `rouge2` and `rougeL`, each with
                `p`, `r` and `f1` attributes.
        """
        self._records.append({
            'cluster_id': cluster_id,
            'rouge_1_p': score.rouge1.p,
            'rouge_1_r': score.rouge1.r,
            'rouge_1_f': score.rouge1.f1,
            'rouge_2_p': score.rouge2.p,
            'rouge_2_r': score.rouge2.r,
            'rouge_2_f': score.rouge2.f1,
            'rouge_l_p': score.rougeL.p,
            'rouge_l_r': score.rougeL.r,
            'rouge_l_f': score.rougeL.f1,
        })

    def summary_score(self):
        """Return mean/min/max/std of every metric column.

        Returns:
            DataFrame with columns `name`, `mean`, `min`, `max`, `std` and one
            row per metric. `std` is the sample standard deviation (ddof=1),
            as reported by `Series.describe`. With no recorded scores all
            statistics are NaN instead of raising.
        """
        scores = self.df
        rows = []
        for col in self.METRIC_COLUMNS:
            # Cast explicitly: an empty frame has object dtype, for which the
            # numeric reductions would not be defined.
            values = scores[col].astype(float)
            rows.append({
                'name': col,
                'mean': values.mean(),
                'min': values.min(),
                'max': values.max(),
                'std': values.std(),
            })

        return pd.DataFrame(rows, columns=self.SUMMARY_COLUMNS)

In [7]:
from src.evaluate.rouge_evaluator import ScoreSummary
from src.utils.factory import create_model, create_evaluator
from src.model.mmr_query import MMRSummarizerQuery

# Build the evaluator from the loaded config. The scoring functions below call
# it as `evaluator(prediction_text, reference_text)` and store the result.
evaluator = create_evaluator(config.eval_config)

In [22]:
# Sentence-budget limits read by the scoring functions: an entry in [0, 1) is
# taken as a fraction of the cluster's sentence count (rounded up), any other
# entry as an absolute count. The smallest resulting limit is used.
n_sent = [11, 0.2]

def get_rouge_score(weights, sigma):
    """Print ROUGE statistics of the weighted-ranker + MMR pipeline on `valid_set`.

    For every validation cluster the cached per-document scores in
    `valid_scores` are mixed with `weights`, the best-scoring sentences of each
    document are pooled, `MMRSummarizerQuery` picks the final sentences, and
    the evaluator result is stored. The summary table is printed at the end.

    Args:
        weights: Mapping from score name to weight. Every key must be present
            in the cached score entries, which must also contain "tfidf".
        sigma: Value written to `config.models[15].sigma` before the
            `MMRSummarizerQuery` is built.

    Returns:
        None. Results are only printed.
    """
    # NOTE: this is the shared config entry, so the sigma assignment is
    # visible to anything else holding `config.models[15]`.
    local_config = config.models[15]
    local_config.sigma = sigma

    mmr = MMRSummarizerQuery(local_config)

    train_storage = RougeScoreStorage()
    valid_storage = RougeScoreStorage()

    # Training-set evaluation is disabled; only the banner is still printed.
    print("Test on training set")

    print("Test on valid set")

    token_source = SOURCE.SENT_SPLITTED_TOKEN.value
    valid_set.set_source(SOURCE.SENT_SPLITTED_TEXT.value)

    for cluster_pos, cluster in tqdm(enumerate(valid_set.clusters)):
        # Sentence budget: the tightest limit from `n_sent`, never more than
        # the number of sentences in the cluster.
        total_sents = len(cluster.get_all_sents())
        sent_count = total_sents
        for limit in n_sent:
            if 0 <= limit < 1:
                cap = int(math.ceil(total_sents * limit))
            else:
                cap = int(limit)
            sent_count = min(cap, sent_count)

        chosen_sent = []
        for doc_pos, doc in enumerate(cluster.documents):
            doc.set_source(token_source)
            sents = doc.get_all_sents()
            scores = valid_scores[cluster_pos][doc_pos]

            # Weighted sum of the cached per-sentence scores.
            combine_score = np.zeros((len(scores["tfidf"])), dtype=float)
            for key, factor in weights.items():
                combine_score += scores[key] * factor

            if sent_count < len(combine_score):
                # Indices of the `sent_count` largest scores (unordered).
                chosen_idx = np.argpartition(combine_score, -sent_count)[-sent_count:]
            else:
                chosen_idx = range(len(combine_score))

            chosen_sent.extend(sents[i] for i in chosen_idx)

        pred_sent, _ = mmr(chosen_sent, sent_count, cluster.get_all_anchor())

        cluster.set_source(token_source)
        prediction = '.'.join(pred_sent)
        reference = '.'.join(cluster.get_summary())
        valid_storage.add_score(cluster.cluster_idx, evaluator(prediction, reference))

    print("Using weight\n", weights)
    print("Sigma: ", sigma)
    print("Valid result\n", valid_storage.summary_score())

In [23]:
# Default location of the prediction file (unchanged from the original cell).
_DEFAULT_RESULT_PATH = os.path.join(
    "/home/dang/vlsp-final-year/data/result/outer/combination", "results.txt"
)


def _resolve_sent_count(total_sents, limits):
    """Return the tightest sentence budget in `limits`, capped at `total_sents`.

    Entries in [0, 1) are fractions of `total_sents` (rounded up); all other
    entries are absolute counts.
    """
    budget = total_sents
    for limit in limits:
        if 0 <= limit < 1:
            cap = int(math.ceil(total_sents * limit))
        else:
            cap = int(limit)
        budget = min(cap, budget)
    return budget


def _pick_candidate_sents(cluster, cluster_scores, weights, sent_count, doc_source):
    """Pool, over all documents of `cluster`, the top-`sent_count` sentences by weighted score."""
    candidates = []
    for doc_pos, doc in enumerate(cluster.documents):
        doc.set_source(doc_source)
        sents = doc.get_all_sents()
        scores = cluster_scores[doc_pos]

        # Weighted sum of the cached per-sentence scores.
        combine_score = np.zeros((len(scores["tfidf"])), dtype=float)
        for key, factor in weights.items():
            combine_score += scores[key] * factor

        if sent_count >= len(combine_score):
            chosen_idx = range(len(combine_score))
        else:
            # Indices of the `sent_count` largest scores (unordered).
            chosen_idx = np.argpartition(combine_score, -sent_count)[-sent_count:]

        candidates.extend(sents[i] for i in chosen_idx)
    return candidates


def _summarize_cluster(mmr, cluster, cluster_scores, weights, limits, doc_source):
    """Run candidate selection followed by `mmr` on one cluster and return the chosen sentences."""
    sent_count = _resolve_sent_count(len(cluster.get_all_sents()), limits)
    candidates = _pick_candidate_sents(cluster, cluster_scores, weights, sent_count, doc_source)
    pred_sent, _ = mmr(candidates, sent_count, cluster.get_all_anchor())
    return pred_sent


def get_rouge_score_and_saved(weights, sigma, output_path=None):
    """Report ROUGE on `valid_set` and write test-set summaries to a text file.

    Validation clusters are summarised from the cached `valid_scores` and
    scored with the evaluator; test clusters are summarised from `test_scores`
    and written one summary per line.

    Args:
        weights: Mapping from score name to weight. Every key must be present
            in the cached score entries, which must also contain "tfidf".
        sigma: Value written to `config.models[15].sigma` before the
            `MMRSummarizerQuery` is built.
        output_path: Destination of the prediction file. Defaults to the
            original hard-coded `results.txt` location.

    Returns:
        None. The validation summary is printed; predictions go to the file.
    """
    if output_path is None:
        output_path = _DEFAULT_RESULT_PATH

    # NOTE: this is the shared config entry, so the sigma assignment is
    # visible to anything else holding `config.models[15]`.
    local_config = config.models[15]
    local_config.sigma = sigma

    mmr = MMRSummarizerQuery(local_config)

    valid_storage = RougeScoreStorage()

    # Training-set evaluation is disabled; the banner is kept so the printed
    # log stays the same as before.
    print("Test on training set")

    print("Test on valid set")

    valid_set.set_source(SOURCE.SENT_SPLITTED_TEXT.value)
    for idx, cluster in tqdm(enumerate(valid_set.clusters)):
        # NOTE(review): validation documents are read with the TOKEN source
        # while test documents use TEXT - kept as in the original; confirm.
        pred_sent = _summarize_cluster(
            mmr, cluster, valid_scores[idx], weights, n_sent,
            SOURCE.SENT_SPLITTED_TOKEN.value,
        )

        cluster.set_source(SOURCE.SENT_SPLITTED_TOKEN.value)
        valid_storage.add_score(
            cluster.cluster_idx,
            evaluator(
                '.'.join(pred_sent),
                '.'.join(cluster.get_summary()),
            )
        )

    print("Predict on test set")

    test_set.set_source(SOURCE.SENT_SPLITTED_TEXT.value)
    predictions = []

    for idx, cluster in tqdm(enumerate(test_set.clusters)):
        pred_sent = _summarize_cluster(
            mmr, cluster, test_scores[idx], weights, n_sent,
            SOURCE.SENT_SPLITTED_TEXT.value,
        )
        predictions.append(' '.join(pred_sent))

    print("Start write to txt")
    out_dir = os.path.dirname(output_path)
    if out_dir:
        # Avoid losing a long run to a missing output directory.
        os.makedirs(out_dir, exist_ok=True)
    # Explicit UTF-8 so the written text does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as f:
        for summary in predictions:
            # `str.replace` returns a new string; the original discarded it,
            # so underscores were never actually removed.
            f.write(summary.replace('_', ' '))
            f.write('\n')
    print("Done write to txt")

    print("Using weight\n", weights)
    print("Sigma: ", sigma)
    print("Valid result\n", valid_storage.summary_score())

In [24]:
# for sigma in [0.0, 0.2, 0.8, 1.0]:
#     for i in [0.0, 0.3, 0.7, 1.0]:
#         for j in [0.0, 0.3, 0.7, 1.0]:
#             if i + j > 1.0:
#                 continue 
#             k = 1.0 - i - j 

#             weight = {
#                 "tfidf": i,
#                 "lexrank": j,
#                 "textrank": k
#             }

#             get_rouge_score(weight, sigma)

In [25]:
# Final configuration: the weights are applied to the cached "tfidf",
# "lexrank" and "textrank" scores, and 0.8 is passed as `sigma`.
weight = {
    "tfidf": 0.0,
    "lexrank": 0.8,
    "textrank": 0.2
}

# Prints the validation ROUGE summary and writes the test-set predictions.
get_rouge_score_and_saved(weight, 0.8)

Loading codes from /home/dang/vlsp-final-year/external/sentence_transformer/vn_sbert_deploy/bpe/bpe.codes ...
Read 64000 codes from the codes file.


Test on training set
Test on valid set


100it [03:38,  2.19s/it]


Predict on test set


300it [07:41,  1.54s/it]

Start write to txt
Done write to txt
Using weight
 {'tfidf': 0.0, 'lexrank': 0.8, 'textrank': 0.2}
Sigma:  0.8
Valid result
         name      mean       min       max       std
0  rouge_1_p  0.428603  0.195266  1.000000  0.144988
1  rouge_1_r  0.609957  0.353535  1.000000  0.129503
2  rouge_1_f  0.493109  0.285714  1.000000  0.123463
3  rouge_2_p  0.247248  0.020000  0.967742  0.159538
4  rouge_2_r  0.384883  0.041667  0.967742  0.178236
5  rouge_2_f  0.292535  0.027460  0.967742  0.158970
6  rouge_l_p  0.405636  0.184466  1.000000  0.148301
7  rouge_l_r  0.576432  0.323232  1.000000  0.138975
8  rouge_l_f  0.466368  0.241270  1.000000  0.131014



