# 数据预处理

In [19]:
import nltk
from pathlib import Path
from tabulate import tabulate
from collections import namedtuple
from pyAutoSummarizer.base import summarization
from functools import partial
from tqdm import tqdm

# Function: Print Lines
def print_in_lines(text, line_length):
    words        = text.split()
    lines        = []
    current_line = ''
    for word in words:
        if len(current_line + word) <= line_length:
            current_line = current_line + word + ' '
        else:
            lines.append(current_line.strip())
            current_line = word + ' '
    lines.append(current_line.strip())
    for line in lines:
        print(line)

In [17]:
# bigsurvey

Pair = namedtuple('Pair', ['source', 'reference'])

data_pairs = []

source_sentences_list, target_sentences_list = [], []

for source, reference in zip(
    Path(r'D:\实验室\2024_03_05课程大纲\数据\bigsurvey\test.src.txt').read_text().split('\n'), 
    Path(r'D:\实验室\2024_03_05课程大纲\数据\bigsurvey\test.tgt.txt').read_text().split('\n')
):
    # source_sentences = nltk.tokenize.sent_tokenize(source)
    # target_sentences = nltk.tokenize.sent_tokenize(reference)
    data_pairs.append(Pair(source, reference))

In [26]:
# 统计一下词的数量
word_lens = []
sent_lens = []
for source, reference in tqdm(data_pairs):
    words = nltk.word_tokenize(source)
    word_lens.append(len(words))
    sentences = nltk.sent_tokenize(source)
    sent_lens.append(len(sentences))

print(f'average word length: {sum(word_lens) / len(word_lens)}')
print(f'average sentence length: {sum(sent_lens) / len(sent_lens)}')

  0%|          | 0/452 [00:00<?, ?it/s]

100%|██████████| 452/452 [00:26<00:00, 17.02it/s]

average word length: 12042.444690265487
average sentence length: 455.7278761061947





In [29]:
parameters = { 'stop_words':        ['en'],
               'n_words':           -1,
               'n_chars':           -1,
               'lowercase':         True,
               'rmv_accents':       True,
               'rmv_special_chars': True,
               'rmv_numbers':       False,
               'rmv_custom_words':  [],
               'verbose':           False
              }

model_funcs = {
    # 'textrank_n_50': (
    #     partial(summarization.summ_text_rank, iteration = 1000, D = 0.85, model = 'all-MiniLM-L6-v2'),
    #     partial(summarization.show_summary, n=50)
    # ),
    'lexrank_n_50': (
        partial(summarization.summ_lex_rank, iteration = 1000, D = 0.85),
        partial(summarization.show_summary, n=50)
    ),
    'lsa_n_50': (
        partial(summarization.summ_ext_LSA, embeddings = False, model = 'all-MiniLM-L6-v2'),
        partial(summarization.show_summary, n=50)
    ),
    'kl_n_50': (
        partial(summarization.summ_ext_KL, n=3),
        partial(summarization.show_summary, n=50),
    ),
    # # 最长1024，TODO: 层次式方法
    # 'bart_len_1250': (
    #     partial(summarization.summ_ext_bart, model = 'facebook/bart-large-cnn', max_len = 250),
    #     lambda r: r,
    # ),
    # # TODO: 对比长上下文的模型
}

output_dir = Path(r'D:\实验室\2024_03_05课程大纲\数据\bigsurvey\output')

for model_name, (exec_func, summ_func) in model_funcs.items():
    for source, reference in tqdm(data_pairs, desc=model_name):
        smr = summarization(source, **parameters)
        rank = exec_func(smr)
        summary = summ_func(smr, rank)
        generated_summary = smr.show_summary(rank, n = 50)

        file = Path(output_dir / model_name).open('a')
        file.write(generated_summary + '\n')
        file.close()

# 确定生成的长度
# for source, reference in tqdm(data_pairs):
#     # Create Instance
#     # print(source)
#     smr = summarization(source, **parameters)
#     rank = smr.summ_text_rank(iteration = 1000, D = 0.85, model = 'all-MiniLM-L6-v2')
#     generated_summary = smr.show_summary(rank, n = 3)
#     # print_in_lines(generated_summary, line_length = 100)

textrank_n_50: 100%|██████████| 452/452 [41:02<00:00,  5.45s/it] 


In [3]:
# baselines
# Extractive Summarization: textrank, lexrank, lsa, kl-sum, BART, T5， textteaser
# Abstractive Summarization: Large Language Model, PEGASUS

from pyAutoSummarizer.base import summarization

text = """ 
        If your work environment makes you constantly upset because it's hostile or you just don't like your job, you may need to find yourself new employment where you can be happier. 
        Maybe your workload is just too much. Start a job hunt now so that you can find something you love. If you're unhappy in your degree program, maybe you need to change fields, 
        or maybe you need to try something different altogether. If you're really unhappy as a stay-at-home parent, maybe it's time to start thinking about alternatives, such as going to work. 
        If you can't garner interest in any part of your life, that could be a sign of depression. Ask your doctor for more information. Other signs may include general sadness, anxiety, fatigue, 
        brain forgetfulness, and irritability. See your doctor if you're exhibiting these symptoms.

       """
text = text.replace('\n', '').replace('        ', '')

# Load Reference Summary (Human Made Summary for Benchmark)
reference_summary = """
                        Look at your work environment. Check for declining interest in other parts of your life. See if you exhibit other signs of depression.
                    """
reference_summary = reference_summary.replace('\n', '').replace('        ', '')

# Load Parameters
parameters = { 'stop_words':        ['en'],
               'n_words':           -1,
               'n_chars':           -1,
               'lowercase':         True,
               'rmv_accents':       True,
               'rmv_special_chars': True,
               'rmv_numbers':       False,
               'rmv_custom_words':  [],
               'verbose':           False
              }

# Create Instance
smr = summarization(text, **parameters)

# Rank Sentences
rank = smr.summ_text_rank(iteration = 1000, D = 0.85, model = 'all-MiniLM-L6-v2')
# Show Summary
generated_summary = smr.show_summary(rank, n = 3)

# Print Summary - TextRank
print_in_lines(generated_summary, line_length = 100)


rank = smr.summ_lex_rank(iteration = 1000, D = 0.85)
lexrank_g = smr.show_summary(rank, n = 3)

print_in_lines(lexrank_g, line_length = 100)

rank = smr.summ_ext_LSA(embeddings = False, model = 'all-MiniLM-L6-v2')
lsa_g = smr.show_summary(rank, n = 3)
print_in_lines(lsa_g, line_length = 100)


rank = smr.summ_ext_KL(n = 3)
kl_g = smr.show_summary(rank, n = 3)
print_in_lines(kl_g, line_length = 100)

bart_g = smr.summ_ext_bart(model = 'facebook/bart-large-cnn', max_len = 250)
print_in_lines(bart_g, line_length = 100)

# generated_summary = smr.summ_ext_t5(model = 't5-base', min_len = 30, max_len = 500)

  from tqdm.autonotebook import tqdm, trange


If your work environment makes you constantly upset because it's hostile or you just don't like your
job, you may need to find yourself new employment where you can be happier If you can't garner
interest in any part of your life, that could be a sign of depression If you're really unhappy as a
stay-at-home parent, maybe it's time to start thinking about alternatives, such as going to work
If your work environment makes you constantly upset because it's hostile or you just don't like your
job, you may need to find yourself new employment where you can be happier If you're really unhappy
as a stay-at-home parent, maybe it's time to start thinking about alternatives, such as going to
work Start a job hunt now so that you can find something you love
Other signs may include general sadness, anxiety, fatigue, brain forgetfulness, and irritability
Maybe your workload is just too much If you can't garner interest in any part of your life, that
could be a sign of depression
Maybe your workload

In [4]:
t5_g = smr.summ_ext_t5(model = 't5-base', min_len = 30, max_len = 500)
print_in_lines(t5_g, line_length = 100)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


start a job hunt now so that you can find something you love . if you're unhappy in your degree
program, maybe you need to change fields or try something different altogether. general sadness,
anxiety, fatigue, brain forgetfulness, and irritability are signs of depression.


In [20]:
import sentencepiece

In [6]:
# ROUGE N
from functools import partial

def calculate_scores(generated_summary, reference_summary):
    # ROUGE 1, ROUGE 2, ROUGE L, ROUGE S, BLEU, METEOR
    metric_funcs = {
        'rouge-1': partial(smr.rouge_N, n = 1),
        'rouge-2': partial(smr.rouge_N, n = 2),
        'rouge-l':smr.rouge_L,
        'rouge-s': partial(smr.rouge_S, skip_distance = 2),
        'bleu': partial(smr.bleu, n = 4),
        'meteor': smr.meteor,
    }

    score_mapper = {}

    for metric, func in metric_funcs.items():
        score = func(generated_summary = generated_summary, reference_summary = reference_summary)

        norm = lambda score: round(score * 100, 2)

        if isinstance(score, tuple):
            f1, precision, recall = score
            score_mapper[metric] = [norm(f1), norm(precision), norm(recall)]
            # {
            #     'f1': f1, 'precision': precision, 'recall': recall,
            # }
        else:
            score = norm(score)
            score_mapper[metric] = [score, -1, -1]

    return score_mapper

def get_avg_scores(score_mappers):
    avg_mapper = {}
    for score_mapper in score_mappers:
        for metric, scores in score_mapper.items():
            avg_mapper[metric] = avg_mapper.get(metric, [0, 0, 0])
            for index, score in enumerate(scores):
                avg_mapper[metric][index] += score

    for metric, scores in avg_mapper.items():
        for index, score in enumerate(scores):
            avg_mapper[metric][index] = score / len(score_mappers)

    return avg_mapper

def compose_row(generated_summaries, reference_summaries, method_name):
    
    score_mappers = [calculate_scores(g, r) for g, r in zip(generated_summaries, reference_summaries)]
    avg_mapper = get_avg_scores(score_mappers)

    values = []; headers = ['Method']
    for metric, scores in avg_mapper.items():
        if len(scores) == 1:
            value = scores[0]
        else:
            metric = f'{metric}-F1/P/R'
            f1, p, r = scores
            value = f'{f1}/{p}/{r}'
        values.append(value)
        headers.append(metric)
    
    row = [method_name, *values]
    return row, headers

row1, headers = compose_row([generated_summary, generated_summary], [reference_summary, reference_summary], 'text_rank')
row2, headers = compose_row([lexrank_g], [reference_summary], 'lexrank')
row3, headers = compose_row([lsa_g], [reference_summary], 'lsa')
row_bart, headers = compose_row([bart_g], [reference_summary], 'bart')
row_t5, headers = compose_row([t5_g], [reference_summary], 't5')
table = tabulate([row1, row2, row3, row_bart, row_t5], headers = headers, tablefmt = 'fancy_grid')
print(table)

╒═══════════╤══════════════════╤══════════════════╤══════════════════╤══════════════════╤═════════════════╤═════════════════╕
│ Method    │ rouge-1-F1/P/R   │ rouge-2-F1/P/R   │ rouge-l-F1/P/R   │ rouge-s-F1/P/R   │ bleu-F1/P/R     │ meteor-F1/P/R   │
╞═══════════╪══════════════════╪══════════════════╪══════════════════╪══════════════════╪═════════════════╪═════════════════╡
│ text_rank │ 29.41/20.83/50.0 │ 6.06/4.17/11.11  │ 28.57/20.0/50.0  │ 4.26/4.26/4.26   │ 9.13/-1.0/-1.0  │ 43.48/-1.0/-1.0 │
├───────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────┼─────────────────┤
│ lexrank   │ 13.33/10.0/20.0  │ 6.25/4.35/11.11  │ 11.76/8.33/20.0  │ 2.27/2.27/2.27   │ 6.02/-1.0/-1.0  │ 17.54/-1.0/-1.0 │
├───────────┼──────────────────┼──────────────────┼──────────────────┼──────────────────┼─────────────────┼─────────────────┤
│ lsa       │ 30.77/25.0/40.0  │ 0.0/0.0/0.0      │ 23.08/18.75/30.0 │ 3.45/3.45/3.45   │ 25.0/-1.0/-1.0  │ 37.74/-1.0

In [27]:
# ROUGE N
f1_N, precision_N, recall_N = smr.rouge_N(generated_summary = generated_summary, reference_summary = reference_summary, n = 2)
rouge_N_scores              = [['Rouge N (F1 Score):', round(f1_N, 2)], ['Rouge N (Precision):', round(precision_N, 2)], ['Rouge N (Recall):', round(recall_N, 2)]]

# ROUGE L
f1_L, precision_L, recall_L = smr.rouge_L(generated_summary = generated_summary, reference_summary = reference_summary)
rouge_L_scores              = [['Rouge L (F1 Score):', round(f1_L, 2)], ['Rouge L (Precision):', round(precision_L, 2)], ['Rouge L (Recall):', round(recall_L, 2)]]

# ROUGE S
f1_S, precision_S, recall_S = smr.rouge_S(generated_summary = generated_summary, reference_summary = reference_summary, skip_distance = 2)
rouge_S_scores              = [['Rouge S (F1 Score):', round(f1_S, 2)], ['Rouge S (Precision):', round(precision_S, 2)], ['Rouge S (Recall):', round(recall_S, 2)]]

# BLEU
score_b                     = smr.bleu(generated_summary = generated_summary, reference_summary = reference_summary, n = 4)
bleu_score                  = [['BLEU Score:', round(score_b, 2)]]

# METEOR
score_m                     = smr.meteor(generated_summary = generated_summary, reference_summary = reference_summary)
meteor_score                = [['METEOR Score:', round(score_m, 2)]]

# Table
table_data = rouge_N_scores + rouge_L_scores + rouge_S_scores + bleu_score + meteor_score
table      = tabulate(table_data, headers = ['Evaluation Metric', 'Score'], tablefmt = 'fancy_grid')
print(table)

╒══════════════════════╤═════════╕
│ Evaluation Metric    │   Score │
╞══════════════════════╪═════════╡
│ Rouge N (F1 Score):  │    0.06 │
├──────────────────────┼─────────┤
│ Rouge N (Precision): │    0.04 │
├──────────────────────┼─────────┤
│ Rouge N (Recall):    │    0.11 │
├──────────────────────┼─────────┤
│ Rouge L (F1 Score):  │    0.29 │
├──────────────────────┼─────────┤
│ Rouge L (Precision): │    0.2  │
├──────────────────────┼─────────┤
│ Rouge L (Recall):    │    0.5  │
├──────────────────────┼─────────┤
│ Rouge S (F1 Score):  │    0.04 │
├──────────────────────┼─────────┤
│ Rouge S (Precision): │    0.04 │
├──────────────────────┼─────────┤
│ Rouge S (Recall):    │    0.04 │
├──────────────────────┼─────────┤
│ BLEU Score:          │    0.09 │
├──────────────────────┼─────────┤
│ METEOR Score:        │    0.43 │
╘══════════════════════╧═════════╛
