# Load Dataset and re-process

In [1]:
from datasets import load_dataset,load_from_disk

In [2]:
# Load dataset
dataset = load_dataset('vicgalle/alpaca-gpt4')

In [3]:
print(dataset['train'])

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 52002
})


# Different models training

In [1]:
DI_model_save_name = 'compare_work/DI_alpaca_0.2_t5_small_10e_2'
DI_generation_texts_pth = 'compare_work/DI_alpaca_gen_0_2_10e_2.txt'
#DI_FT_generation_texts_pth = 'compare_work/DI_FT_alpaca_gen_0_2_10e_2.txt'

## DI-t5-small

In [None]:
def preprocess_t5_data(example,index):
    if example['output']:
        answer_text = example['output']
    else:
        answer_text = "No answer found"
    return {
        'index':index,
        'input_text': f"answer: {answer_text}",
        'target_text': f"enquiry: {example['instruction']+' '+example['input']}" 
    }

processed_t5small_dataset = dataset.map(preprocess_t5_data,with_indices=True)

In [None]:
processed_t5small_dataset['train']['target_text']

In [6]:
train_test_split_t5 = processed_t5small_dataset['train'].train_test_split(test_size=0.2)
train_dataset_t5 = train_test_split_t5['train']
val_dataset_t5 = train_test_split_t5['test']

In [7]:
val_dataset_t5

Dataset({
    features: ['instruction', 'input', 'output', 'text', 'index', 'input_text', 'target_text'],
    num_rows: 10401
})

In [None]:
train_dataset_t5.save_to_disk(dataset_path='compare_work/alpaca-0.2-train')
val_dataset_t5.save_to_disk(dataset_path='compare_work/alpaca-0.2-test')

In [3]:
from datasets import load_dataset,load_from_disk

train_dataset_t5 = load_from_disk('Ans2Seq/compare_work/alpaca-0.2-train')
val_dataset_t5 = load_from_disk('Ans2Seq/compare_work/alpaca-0.2-test')

In [4]:
from transformers import T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_t5_function(examples):
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True)
    print(labels)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)
tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)

In [5]:
import torch
from transformers import T5ForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

t5_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

In [6]:
from transformers import Trainer, TrainingArguments

training_args_t5 = TrainingArguments(
    output_dir= DI_model_save_name,
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    report_to="none"
)

trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=tokenized_train_dataset_t5,
    eval_dataset=tokenized_val_dataset_t5
)


2024-10-09 21:53:09.958141: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-09 21:53:09.973497: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-09 21:53:09.973516: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-09 21:53:09.973530: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-09 21:53:09.976964: I tensorflow/core/platform/cpu_feature_g

In [None]:
trainer_t5.train()

In [8]:
trainer_t5.save_model(DI_model_save_name) 
t5_tokenizer.save_pretrained(DI_model_save_name)

Saving model checkpoint to compare_work/DI_alpaca_0.2_t5_small_10e_2
Configuration saved in compare_work/DI_alpaca_0.2_t5_small_10e_2/config.json
Model weights saved in compare_work/DI_alpaca_0.2_t5_small_10e_2/pytorch_model.bin
tokenizer config file saved in compare_work/DI_alpaca_0.2_t5_small_10e_2/tokenizer_config.json
Special tokens file saved in compare_work/DI_alpaca_0.2_t5_small_10e_2/special_tokens_map.json


('compare_work/DI_alpaca_0.2_t5_small_10e_2/tokenizer_config.json',
 'compare_work/DI_alpaca_0.2_t5_small_10e_2/special_tokens_map.json',
 'compare_work/DI_alpaca_0.2_t5_small_10e_2/spiece.model',
 'compare_work/DI_alpaca_0.2_t5_small_10e_2/added_tokens.json')

In [9]:
def generate_question(answer):
    t5_model.eval()  
    input_ids = t5_tokenizer.encode("answer: " + answer, return_tensors="pt").to(device)
    outputs = t5_model.generate(input_ids, num_beams=5, early_stopping=True)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [10]:
from tqdm import tqdm


samples = val_dataset_t5  
res = []

with open(DI_generation_texts_pth, 'a') as file:
    for example in tqdm(samples):
        generated_question = generate_question(example['input_text'].replace("answer: ", ""))
        res.append(generated_question.replace("enquiry: ", ""))
        file.write((generated_question.replace("enquiry: ", "")+'\n'))
        # print(f"Answer: {example['input_text'].replace('answer: ', '')}")
        # print(f"Generated Question: {generated_question}")
        # print(f"Actual Question: {example['target_text']}\n")



  1%|          | 53/10401 [00:02<07:43, 22.33it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
 13%|█▎        | 1393/10401 [00:59<06:14, 24.03it/s]

100%|██████████| 10401/10401 [07:18<00:00, 23.70it/s]


In [19]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    '''
    Input format:

    predictions = [
        "What is the capital of France?",
        "Who wrote the book?",
        "What is the largest planet?"
    ]

    references = [
        ["What is the capital city of France?"],
        ["Who is the author of the book?"],
        ["Which planet is the largest in the solar system?"]
    ]
    '''

    # # Load BLEU scorer
    # bleu_metric = load_metric("bleu")

    # # Calculate BLEU score
    predictions_tokenized = [word_tokenize(pred) for pred in predictions]
    references_tokenized = [[word_tokenize(refs[0])] for refs in references]
    # B_S = {}
    # for n in range(1, 5):
    #     bleu_metric.add_batch(predictions=predictions_tokenized, references=references_tokenized)
    #     results = bleu_metric.compute(max_order=n)
    #     B_S[f"BLEU-{n}"] = results
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    for i,n in enumerate(B_S['precisions']):
        print(f"BLEU-{i+1} score: {n:.5f}")
        


    # Load ROUGE scorer
    rouge_metric = load_metric("rouge")
    '''
    ROUGE-1: Measures unigram matches between generated text and reference text.
    ROUGE-2: Measures bigram matches between generated text and reference text.
    ROUGE-L: Measures the longest common subsequence (LCS) between generated text and reference text.
    ROUGE-Lsum: A variant based on LCS, specifically designed for evaluating long texts.
    '''
    # Calculate ROUGE scores
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    rouge1_mid_f1 = rouge_results['rouge1'][1][2]
    rouge2_mid_f1 = rouge_results['rouge2'][1][2]
    rougeL_mid_f1 = rouge_results['rougeL'][1][2]
    rougeLsum_mid_f1 = rouge_results['rougeLsum'][1][2]
    print(f"ROUGE-1 F1 score: {rouge1_mid_f1:.5f}")
    print(f"ROUGE-2 F1 score: {rouge2_mid_f1:.5f}")
    print(f"ROUGE-L F1 score: {rougeL_mid_f1:.5f}")
    print(f"ROUGE-Lsum F1 score: {rougeLsum_mid_f1:.5f}")

    # Calculate METEOR score
    meteor_scores = [meteor_score(references=refs, hypothesis=pred) for pred, refs in zip(predictions_tokenized, references_tokenized)]
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # Calculate BERTScore

    P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METERO":meteor_scores,
        "BERTScore":{"Precision":P,"Recall":R,"F1":F1},
    }

In [20]:
with open(DI_generation_texts_pth, 'r') as file:
    content = file.readlines()

In [21]:
refs = [ [i.replace('enquiry: ',"")] for i in val_dataset_t5['target_text']]

In [22]:
refs

[['Generate a simile comparing a sunset to fiction. '],
 ['What is the leading cause of death for children under the age of 5? '],
 ['Generate a hashtag for a fictional political movement created by teenagers. '],
 ['Name three common validation techniques used in Machine Learning. '],
 ['Assign an appropriate name to the following new species of butterfly. '],
 ['Tell me something about a Nissan leaf. '],
 ['Create a summary of the following text in no more than 4 lines. The effect of gun violence in the United States is widespread. According to the CDC, in 2018 there were 38,390 deaths due to firearm. Of these, 24,432 were suicides. The rate of firearm deaths per 100,000 people increased from 10.3 per 100,000 in 1999 to 12 per 100,000 in 2017.'],
 ['Fill in the blank with an appropriate preposition: I like to stay ____ top of my work.'],
 ['Compose an email to a potential client explaining the benefits of your service. '],
 ['Provide an appropriate metaphor for the following situatio

In [None]:
res = Calmetic(references=refs,predictions=content)

In [24]:
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])

{'bleu': 0.23016953276592583, 'precisions': [0.5872425941964399, 0.3807382562703868, 0.28357707006369426, 0.2165655920842643], 'brevity_penalty': 0.6723906687696457, 'length_ratio': 0.7158627829845122, 'translation_length': 118927, 'reference_length': 166131}
Score(precision=0.5945178789462683, recall=0.5007581778195037, fmeasure=0.5281757405045957)


In [25]:
val_dataset_t5['target_text']

['enquiry: Generate a simile comparing a sunset to fiction. ',
 'enquiry: What is the leading cause of death for children under the age of 5? ',
 'enquiry: Generate a hashtag for a fictional political movement created by teenagers. ',
 'enquiry: Name three common validation techniques used in Machine Learning. ',
 'enquiry: Assign an appropriate name to the following new species of butterfly. ',
 'enquiry: Tell me something about a Nissan leaf. ',
 'enquiry: Create a summary of the following text in no more than 4 lines. The effect of gun violence in the United States is widespread. According to the CDC, in 2018 there were 38,390 deaths due to firearm. Of these, 24,432 were suicides. The rate of firearm deaths per 100,000 people increased from 10.3 per 100,000 in 1999 to 12 per 100,000 in 2017.',
 'enquiry: Fill in the blank with an appropriate preposition: I like to stay ____ top of my work.',
 'enquiry: Compose an email to a potential client explaining the benefits of your service. '

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')#SentenceTransformer("bert-base-uncased") 

reference_texts_ = [ i.replace('enquiry: ',"") for i in val_dataset_t5['target_text'] ]
embeddings1 = sentence_model.encode(content, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)   #[52002,52002]维度的矩阵，对角线上的值为对应文本的余弦相似度

# 输出余弦相似度的值
print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")


## -bart-large

In [None]:

def preprocess_squad(example):
    # Invert the dataset by treating the answer as input and the question as output
    if example['answers']['text']:
    # SQuAD has answers as a list of possible answer texts; we'll just use the first one for simplicity
        answer_text = example['answers']['text'][0]
    else:
        answer_text = "No answer found"
    return {
        'input_text': f"answer: {answer_text}",
        'target_text': example['question']
    }


# Preprocess the dataset
processed_dataset = dataset.map(preprocess_squad)

In [None]:
train_test_split = processed_dataset['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [None]:
print(train_dataset.shape)
print(val_dataset.shape)

In [None]:
from transformers import BartTokenizer

bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

def tokenize_bart_function(examples):
    model_inputs = bart_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = bart_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset_bart = train_dataset.map(tokenize_bart_function, batched=True)
tokenized_val_dataset_bart = val_dataset.map(tokenize_bart_function, batched=True)

In [10]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import BartForConditionalGeneration

bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)


In [None]:
from transformers import Trainer, TrainingArguments

training_args_bart = TrainingArguments(
    output_dir='./results_bart_2',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none"
)

trainer_bart = Trainer(
    model=bart_model,
    args=training_args_bart,
    train_dataset=tokenized_train_dataset_bart,
    eval_dataset=tokenized_val_dataset_bart
)


In [None]:
trainer_bart.train()

In [47]:
def generate_question(answer):
    bart_model.eval()  # 将模型设置为评估模式
    input_ids = bart_tokenizer.encode("answer: " + answer, return_tensors="pt").to(device)
    outputs = bart_model.generate(input_ids, max_length=64, num_beams=5, early_stopping=True)
    question = bart_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [None]:
# 获取几个样本
samples = val_dataset.shuffle(seed=42).select(range(5))  # 随机选择5个样本

# 生成问题并比较
for example in samples:
    generated_question = generate_question(example['input_text'].replace("answer: ", ""))
    print(f"Answer: {example['input_text'].replace('answer: ', '')}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


## DI+FT

In [1]:
from datasets import load_from_disk

#train_dataset_t5 = load_from_disk('Ans2Seq/compare_work/alpaca-0.2-train')
val_dataset_t5 = load_from_disk('Ans2Seq/compare_work/alpaca-0.2-test')

In [2]:
import pandas as pd
pd.DataFrame(val_dataset_t5)

Unnamed: 0,instruction,input,output,text,index,input_text,target_text
0,Generate a simile comparing a sunset to fiction.,,"Like a captivating work of fiction, a sunset d...",Below is an instruction that describes a task....,48962,"answer: Like a captivating work of fiction, a ...",enquiry: Generate a simile comparing a sunset ...
1,What is the leading cause of death for childre...,,"According to the World Health Organization, th...",Below is an instruction that describes a task....,4744,answer: According to the World Health Organiza...,enquiry: What is the leading cause of death fo...
2,Generate a hashtag for a fictional political m...,,#TheYouthWillRiseUp,Below is an instruction that describes a task....,50642,answer: #TheYouthWillRiseUp,enquiry: Generate a hashtag for a fictional po...
3,Name three common validation techniques used i...,,1. **Cross-Validation:** Cross-validation is o...,Below is an instruction that describes a task....,22095,answer: 1. **Cross-Validation:** Cross-validat...,enquiry: Name three common validation techniqu...
4,Assign an appropriate name to the following ne...,,It would be more suitable for a trained profes...,Below is an instruction that describes a task....,49508,answer: It would be more suitable for a traine...,enquiry: Assign an appropriate name to the fol...
...,...,...,...,...,...,...,...
10396,Insert a sentence in the paragraph to provide ...,Dave changed his mind quickly. He decided to g...,Dave changed his mind quickly. Instead of stay...,"Below is an instruction that describes a task,...",9844,answer: Dave changed his mind quickly. Instead...,enquiry: Insert a sentence in the paragraph to...
10397,Describe a computer network in 5 words.,,Interconnected nodes transmitting and sharing ...,Below is an instruction that describes a task....,19396,answer: Interconnected nodes transmitting and ...,enquiry: Describe a computer network in 5 words.
10398,"Provide a brief summary of the article ""A Brie...",,This article provides an overview of the histo...,Below is an instruction that describes a task....,41664,answer: This article provides an overview of t...,enquiry: Provide a brief summary of the articl...
10399,Rewrite the given passage using new vocabulary...,The ship shifted and swayed as the storm lashe...,The vessel oscillated and fluctuated as the te...,"Below is an instruction that describes a task,...",3264,answer: The vessel oscillated and fluctuated a...,enquiry: Rewrite the given passage using new v...


In [4]:
from transformers import T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_t5_function(examples):
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# #tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)
# tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)


In [3]:
import torch
from transformers import T5ForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DI_FT_t5_base_model = T5ForConditionalGeneration.from_pretrained('RL4LMs/DI_FT_Alpaca/0_2s_30e/model').to(device)


In [38]:
def generate_question(answer):
    DI_FT_t5_base_model.eval()  # 将模型设置为评估模式
    input_ids = t5_tokenizer.encode(answer, return_tensors="pt").to(device)
    outputs = DI_FT_t5_base_model.generate(input_ids, num_beams=5, early_stopping=True,temperature=0.7,max_length=200)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [23]:
custom_ans = "While I can't provide specific medical advice, the symptoms you describe seem quite common. Many people experience anxiety or nervousness under stress, lifestyle changes, or due to health conditions. If this occurs frequently or impacts your daily life, it is advisable to seek professional help from a mental health expert who can offer more specialized diagnosis and treatment options."
custom_aqs = "I'm planning a trip to Beijing over the May Day holiday, would it be too expensive to buy tickets now?"
generated_question = generate_question(custom_ans)
print(f"Answer: {custom_ans}")
print(f"Generated Question: {generated_question}")
print(f"Actual Question: {custom_aqs}\n")

Answer: While I can't provide specific medical advice, the symptoms you describe seem quite common. Many people experience anxiety or nervousness under stress, lifestyle changes, or due to health conditions. If this occurs frequently or impacts your daily life, it is advisable to seek professional help from a mental health expert who can offer more specialized diagnosis and treatment options.
Generated Question: enquiry: Generate an enquiry: What is the medical advice described below?
Actual Question: I'm planning a trip to Beijing over the May Day holiday, would it be too expensive to buy tickets now?



In [None]:
# 获取几个样本
samples = val_dataset_t5.shuffle(seed=42).select(range(4))  # 随机选择5个样本

# 生成问题并比较
for example in samples:
    generated_question = generate_question(example['input_text'].replace("answer: ", ""))
    print(f"Answer: {example['input_text'].replace('answer: ', '')}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


2024-11-08 10:06:25.666708: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 10:06:25.684982: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-08 10:06:25.684999: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-08 10:06:25.685011: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 10:06:25.688880: I tensorflow/core/platform/cpu_feature_g

Answer: Here are some tips that may help you overcome procrastination:

1. Set clear and attainable goals: Start by setting small, achievable goals that will help you feel a sense of accomplishment and motivation as you progress.

2. Use a timer: Set a timer to work on a task for a specific amount of time, and then take a break. This can help you to stay focused and productive.

3. Break tasks into smaller chunks: Procrastination often occurs when we feel overwhelmed by a large and daunting task. By breaking it down into smaller, more manageable chunks, you can make it easier to get started and make progress.

4. Eliminate distractions: Identify and eliminate any distractions that might interfere with your productivity. For example, turn off your phone, close unnecessary tabs or apps, and mute notifications.

5. Use positive self-talk: Change negative thoughts such as “I’ll never be able to do this” to positive affirmations such as “I can do this, one step at a time.”

6. Reward yourse

In [21]:
from tqdm import tqdm

# 获取几个样本
samples = val_dataset_t5  # 随机选择5个样本
res = []
# 生成问题并比较
with open('Ans2Seq/compare_work/DI_FT_rouge_alpaca_gen_0_2_dffalg_30e.txt', 'a') as file:
    for example in tqdm(samples):
        generated_question = generate_question(example['input_text'].replace("answer: ", ""))
        res.append(generated_question.replace("enquiry: ", ""))
        file.write((generated_question.replace("enquiry: ", "")+'\n'))
        # print(f"Answer: {example['input_text'].replace('answer: ', '')}")
        # print(f"Generated Question: {generated_question}")
        # print(f"Actual Question: {example['target_text']}\n")



  1%|          | 54/10401 [00:02<07:28, 23.06it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 10401/10401 [07:22<00:00, 23.53it/s]


In [22]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    '''
    Input format:

    predictions = [
        "What is the capital of France?",
        "Who wrote the book?",
        "What is the largest planet?"
    ]

    references = [
        ["What is the capital city of France?"],
        ["Who is the author of the book?"],
        ["Which planet is the largest in the solar system?"]
    ]
    '''

    # # 加载 BLEU 评分器
    # bleu_metric = load_metric("bleu")

    # # 计算 BLEU 分数
    predictions_tokenized = [word_tokenize(pred) for pred in predictions]
    references_tokenized = [[word_tokenize(refs[0])] for refs in references]
    # B_S = {}
    # for n in range(1, 5):
    #     bleu_metric.add_batch(predictions=predictions_tokenized, references=references_tokenized)
    #     results = bleu_metric.compute(max_order=n)
    #     B_S[f"BLEU-{n}"] = results
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    for i,n in enumerate(B_S['precisions']):
        print(f"BLEU-{i+1} score: {n:.5f}")
        


    # 加载 ROUGE 评分器
    rouge_metric = load_metric("rouge")
    '''
    ROUGE-1: 衡量生成文本和参考文本之间的 unigram 匹配。
    ROUGE-2: 衡量生成文本和参考文本之间的 bigram 匹配。
    ROUGE-L: 衡量生成文本和参考文本之间的最长公共子序列(LCS)。
    ROUGE-Lsum: 基于 LCS 的一个变体，专门用于长文本的评估。
    '''
    # 计算 ROUGE 分数
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    rouge1_mid_f1 = rouge_results['rouge1'][1][2]
    rouge2_mid_f1 = rouge_results['rouge2'][1][2]
    rougeL_mid_f1 = rouge_results['rougeL'][1][2]
    rougeLsum_mid_f1 = rouge_results['rougeLsum'][1][2]
    print(f"ROUGE-1 F1 score: {rouge1_mid_f1:.5f}")
    print(f"ROUGE-2 F1 score: {rouge2_mid_f1:.5f}")
    print(f"ROUGE-L F1 score: {rougeL_mid_f1:.5f}")
    print(f"ROUGE-Lsum F1 score: {rougeLsum_mid_f1:.5f}")

    # 计算 METEOR 分数
    meteor_scores = [meteor_score(references=refs, hypothesis=pred) for pred, refs in zip(predictions_tokenized, references_tokenized)]
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # 计算 BERTScore 分数
    '''
    同样效果：
    bert_metric = load_metric("bertscore",cache_dir="/media/fenghe/New Volume/A2Q/Metric")
    bert_results = bert_metric.compute(predictions=predictions, references=references,lang="en",device=f"cuda:{torch.cuda.device_count() - 1}")

    设置 verbose=True 会使函数在计算过程中输出更多的信息，例如处理进度、当前正在处理的数据等。
    '''
    P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METERO":meteor_scores,
        "BERTScore":{"Precision":P,"Recall":R,"F1":F1},
    }

In [7]:
with open('compare_work/DI_FT_alpaca_gen_0_2_20e.txt', 'r') as file:
    content = file.readlines()

In [8]:
refs = [ [i.replace('enquiry: ',"")] for i in val_dataset_t5['target_text']]

In [9]:
refs

[['Generate a simile comparing a sunset to fiction. '],
 ['What is the leading cause of death for children under the age of 5? '],
 ['Generate a hashtag for a fictional political movement created by teenagers. '],
 ['Name three common validation techniques used in Machine Learning. '],
 ['Assign an appropriate name to the following new species of butterfly. '],
 ['Tell me something about a Nissan leaf. '],
 ['Create a summary of the following text in no more than 4 lines. The effect of gun violence in the United States is widespread. According to the CDC, in 2018 there were 38,390 deaths due to firearm. Of these, 24,432 were suicides. The rate of firearm deaths per 100,000 people increased from 10.3 per 100,000 in 1999 to 12 per 100,000 in 2017.'],
 ['Fill in the blank with an appropriate preposition: I like to stay ____ top of my work.'],
 ['Compose an email to a potential client explaining the benefits of your service. '],
 ['Provide an appropriate metaphor for the following situatio

In [None]:
res = Calmetic(references=refs,predictions=content)

In [27]:
print(res['BLEU']['precisions'])
print(res['ROUGE']['rougeL'][1])

[0.5876499170715851, 0.37760763629063165, 0.27798671116443674, 0.21013741833746338]
Score(precision=0.5914537333495932, recall=0.5040462676649737, fmeasure=0.5289378745501897)


In [12]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')#SentenceTransformer("bert-base-uncased") 

reference_texts_ = [ i.replace('enquiry: ',"") for i in val_dataset_t5['target_text'] ]
embeddings1 = sentence_model.encode(content, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)   


print(f"Average Cosine Similarity: {cosine_scores_2.diagonal().mean()}")
print(f"Biggest Cosine Similarity: {cosine_scores_2.diagonal().max()}")
print(f"Middle Cosine Similarity: {cosine_scores_2.diagonal().median()}")


Average Cosine Similarity: 0.7711467146873474
Biggest Cosine Similarity: 1.0000007152557373
Middle Cosine Similarity: 0.8185164332389832


In [20]:
compare_df = pd.DataFrame(data={'answer':val_dataset_t5['output'],'gen_text':content,'ref':reference_texts_,'sim_score':cosine_scores_2.diagonal().cpu()})

In [21]:
compare_df

Unnamed: 0,answer,gen_text,ref,sim_score
0,"Like a captivating work of fiction, a sunset d...",Create a metaphor to describe a sunset.\n,Generate a simile comparing a sunset to fiction.,0.821832
1,"According to the World Health Organization, th...",What is the leading cause of death for childre...,What is the leading cause of death for childre...,1.000000
2,#TheYouthWillRiseUp,Generate a hashtag for a campaign to promote y...,Generate a hashtag for a fictional political m...,0.744039
3,1. **Cross-Validation:** Cross-validation is o...,List three validation techniques used in machi...,Name three common validation techniques used i...,0.933670
4,It would be more suitable for a trained profes...,Identify a new butterfly species based on the ...,Assign an appropriate name to the following ne...,0.895642
...,...,...,...,...
10396,Dave changed his mind quickly. Instead of stay...,Edit the following sentence to make it more co...,Insert a sentence in the paragraph to provide ...,0.618833
10397,Interconnected nodes transmitting and sharing ...,Name two connected nodes that transmit and sha...,Describe a computer network in 5 words.,0.537861
10398,This article provides an overview of the histo...,Summarize this article about the history of th...,"Provide a brief summary of the article ""A Brie...",0.875040
10399,The vessel oscillated and fluctuated as the te...,"Generate a sentence using the words ""tempest"",...",Rewrite the given passage using new vocabulary...,0.491592


In [76]:
sort_df = compare_df.sort_values('sim_score',ascending=False)

In [105]:
sort_df_part = sort_df[100:120]
sort_df_part.reset_index()

Unnamed: 0,index,answer,gen_text,ref,sim_score
0,3361,Improving your persuasive writing technique in...,How can I improve my persuasive writing techni...,How can I improve my persuasive writing techni...,1.0
1,1541,Two animal species that live in the ocean are ...,Name two animal species that live in the ocean.\n,Name two animal species that live in the ocean.,1.0
2,9640,Here are some online marketing strategies that...,Suggest some online marketing strategies\n,Suggest some online marketing strategies,1.0
3,8740,One of the most common use cases for an AI vir...,Describe a use case for an AI virtual assistan...,Describe a use case for an AI virtual assistant.,1.0
4,4001,DNA replication is the process by which a cell...,Explain the process of DNA replication.\n,Explain the process of DNA replication.,1.0
5,9077,The closest galaxy to the Milky Way is the Can...,What is the closest galaxy to the Milky Way?\n,What is the closest galaxy to the Milky Way?,1.0
6,9820,Designing a machine learning algorithm to dete...,Design a machine learning algorithm to detect ...,Design a machine learning algorithm to detect ...,1.0
7,6214,One effective way to reduce stress is to pract...,Suggest a way to reduce stress.\n,Suggest a way to reduce stress.,1.0
8,1688,One possible algorithm to determine whether a ...,Create an algorithm to determine whether a web...,Create an algorithm to determine whether a web...,1.0
9,1980,"A good credit score has several benefits, incl...",What are the benefits of having a good credit ...,What are the benefits of having a good credit ...,1.0


In [110]:
idx = 5
print('Original_answer: ',sort_df_part.iloc[idx]['answer'])
mo_an = \
"While galaxies vary in distance from the Milky Way, one of the nearest is the Canis Major Dwarf Galaxy, positioned approximately 25,000 light years from the Milky Way's center and around 42,000 light years from our solar system. Interestingly, when discussing neighboring galaxies, many often think of the Andromeda Galaxy (M31), a much larger galaxy that lies about 2.5 million light years away and is commonly considered our closest major galactic neighbor. These cosmic relationships highlight both immediate proximity and the vast scales involved when observing galaxies beyond our own."
print('Modify_answer:  ',mo_an)
print('Ground_truth_enquiry: ',sort_df_part.iloc[idx]['ref'])
print('Origen_gen_enquiry: ',sort_df_part.iloc[idx]['gen_text'])
print('Aftmodify_gen_enquiry: ',generate_question(mo_an))


Original_answer:  The closest galaxy to the Milky Way is the Canis Major Dwarf Galaxy, which is about 25,000 light years away from the Milky Way's center, and 42,000 light years from our solar system. However, when most people ask about our neighboring galaxy, they are usually referring to the Andromeda Galaxy (M31), which is the closest major galaxy to ours and is located about 2.5 million light years away.
Modify_answer:   While galaxies vary in distance from the Milky Way, one of the nearest is the Canis Major Dwarf Galaxy, positioned approximately 25,000 light years from the Milky Way's center and around 42,000 light years from our solar system. Interestingly, when discussing neighboring galaxies, many often think of the Andromeda Galaxy (M31), a much larger galaxy that lies about 2.5 million light years away and is commonly considered our closest major galactic neighbor. These cosmic relationships highlight both immediate proximity and the vast scales involved when observing galax