# Load Dataset and re-process

In [1]:
from datasets import load_dataset,load_from_disk

In [2]:
# Load the 'vicgalle/alpaca-gpt4' dataset; later cells read its 'train' split.
dataset = load_dataset('vicgalle/alpaca-gpt4')

In [None]:
# Print the 'train' split object of the loaded dataset.
print(dataset['train'])

# Training the different models

In [1]:
# Output locations used by the DI (t5-small) cells below.
# DI_model_save_name is used both as the Trainer output_dir and as the final
# save directory for the model and tokenizer.
DI_model_save_name = 'compare_work/DI_GPTRQA_t5_small_5e'
# Text file that receives the generated questions, one per line.
# NOTE(review): this path spells "GTPRQA" while the other two spell "GPTRQA" —
# confirm which spelling is intended before relying on these files.
DI_generation_texts_pth = 'compare_work/DI_GTPRQA_gen_5e.txt'
# NOTE(review): not referenced by any cell visible in this notebook.
DI_FT_generation_texts_pth = 'compare_work/DI_FT_GPTRQA_gen_5e.txt'

## DI-t5-small

In [None]:
def preprocess_t5_data(example,index):
    """Build a reversed (answer -> enquiry) text pair from one record.

    The record's 'output' becomes the model input, or a fixed placeholder when
    it is empty/falsy. The target is 'instruction' and 'input' joined by a
    single space.

    Args:
        example: mapping with 'output', 'instruction' and 'input' entries.
        index: row index supplied by ``map(..., with_indices=True)``; stored
            unchanged under 'index'.

    Returns:
        dict with the keys 'index', 'input_text' and 'target_text'.
    """
    answer_text = example['output'] or "No answer found"
    enquiry_text = example['instruction'] + ' ' + example['input']

    record = {'index': index}
    record['input_text'] = f"answer: {answer_text}"
    record['target_text'] = f"enquiry: {enquiry_text}"
    return record

# Apply the preprocessing to every record; with_indices=True passes the row index as the second argument.
processed_t5small_dataset = dataset.map(preprocess_t5_data,with_indices=True)

In [None]:
# Display the 'target_text' column of the processed 'train' split.
processed_t5small_dataset['train']['target_text']

In [6]:
# Hold out 20% of the processed 'train' split for validation.
# The seed is fixed so that re-running the notebook reproduces the same
# partition; without it every run would save and evaluate a different split.
train_test_split_t5 = processed_t5small_dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset_t5 = train_test_split_t5['train']
val_dataset_t5 = train_test_split_t5['test']

In [None]:
# Display the validation split.
val_dataset_t5

In [None]:
# Persist both splits to disk.
# NOTE(review): the load cell that follows reads from 'Ans2Seq/compare_work/GPTRQA-*',
# not from these paths — confirm which files are the intended ones.
train_dataset_t5.save_to_disk(dataset_path='compare_work/alpaca-0.2-train')
val_dataset_t5.save_to_disk(dataset_path='compare_work/alpaca-0.2-test')

In [2]:
from datasets import load_dataset,load_from_disk

# Reload the train/validation splits from disk.
# NOTE(review): these paths differ from the ones written by the save_to_disk
# cell ('compare_work/alpaca-0.2-*') — confirm they hold the intended split.
train_dataset_t5 = load_from_disk('Ans2Seq/compare_work/GPTRQA-train')
val_dataset_t5 = load_from_disk('Ans2Seq/compare_work/GPTRQA-test')

In [None]:
# Display the reloaded validation split.
val_dataset_t5

In [4]:
from transformers import T5Tokenizer

# Tokenizer for the 't5-small' checkpoint; used by the tokenization and generation helpers below.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_t5_function(examples):
    """Tokenize 'input_text' / 'target_text' and attach the targets as labels.

    Both fields are tokenized with ``padding="max_length"`` and truncation.
    Padding positions in the labels are replaced by -100 so the loss ignores
    them; otherwise the model is also trained to reproduce the padding.

    Args:
        examples: a single example or a batch (as passed by
            ``Dataset.map(..., batched=True)``) exposing 'input_text' and
            'target_text'.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry.
    """
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True)

    pad_id = t5_tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss skips.
        return [-100 if token == pad_id else token for token in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: one flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenize both splits; batched=True hands the function a batch of examples per call.
tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)
tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)

In [5]:
import torch
from transformers import T5ForConditionalGeneration

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained 't5-small' checkpoint and move it to the selected device.
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the T5 run: output goes to DI_model_save_name,
# evaluation runs once per epoch, and external reporting is disabled.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args_t5 = TrainingArguments(
    output_dir= DI_model_save_name,
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    report_to="none"
)

# Trainer that ties the model to the tokenized train/validation splits.
trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=tokenized_train_dataset_t5,
    eval_dataset=tokenized_val_dataset_t5
)


In [None]:
# Run the training loop configured above.
trainer_t5.train()

In [None]:
# Save the trained model and its tokenizer side by side in DI_model_save_name.
trainer_t5.save_model(DI_model_save_name) 
t5_tokenizer.save_pretrained(DI_model_save_name)

In [9]:
def generate_question(answer):
    """Generate a question for ``answer`` with ``t5_model``.

    The answer is prefixed with "answer: ", encoded to a tensor on ``device``
    and decoded from the first sequence returned by beam search (5 beams,
    early stopping), with special tokens removed.

    Args:
        answer: answer text without the "answer: " prefix.

    Returns:
        The decoded text of the first generated sequence.
    """
    t5_model.eval()
    prompt = "answer: " + answer
    encoded_prompt = t5_tokenizer.encode(prompt, return_tensors="pt")
    generated = t5_model.generate(encoded_prompt.to(device), num_beams=5, early_stopping=True)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
from tqdm import tqdm


# Generate one question per validation example and store them one per line,
# so line i of the file corresponds to example i of `val_dataset_t5`.
samples = val_dataset_t5
res = []

# Open with 'w', not 'a': appending on a re-run would stack a second copy of
# the predictions in the file and break the line-to-example alignment that the
# evaluation cells rely on.
with open(DI_generation_texts_pth, 'w') as file:
    for example in tqdm(samples):
        # Remove only the leading task tag. generate_question() adds it back,
        # so the model sees exactly the stored 'input_text', even when the
        # answer itself contains the text "answer: ".
        answer = example['input_text'].removeprefix("answer: ")
        question = generate_question(answer).replace("enquiry: ", "")
        # Keep every prediction on a single line of the output file.
        question = question.replace('\n', ' ')
        res.append(question)
        file.write(question + '\n')



In [11]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    """Print and return BLEU, ROUGE, METEOR and BERTScore for generated text.

    Expected input layout (one reference list per prediction)::

        predictions = ["What is the capital of France?", "Who wrote the book?"]
        references = [["What is the capital city of France?"],
                      ["Who is the author of the book?"]]

    Only the first reference of each list is used for METEOR and BERTScore.
    The lines printed as "BLEU-n score" are the entries of the BLEU metric's
    'precisions' list.

    Args:
        references: for each prediction, a list of reference strings.
        predictions: generated strings, aligned with ``references``.

    Returns:
        dict with the keys "BLEU" (BLEU metric result), "ROUGE" (ROUGE metric
        result), "METERO" (list of per-sample METEOR scores) and "BERTScore"
        (dict holding the "Precision", "Recall" and "F1" outputs of
        ``bert_score.score``).
    """
    # Word-level tokens; only the METEOR computation below consumes them.
    predictions_tokenized = [word_tokenize(text) for text in predictions]
    references_tokenized = [[word_tokenize(group[0])] for group in references]

    # BLEU on the raw strings, tokenized by the metric with word_tokenize.
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    for order, precision in enumerate(B_S['precisions'], start=1):
        print(f"BLEU-{order} score: {precision:.5f}")

    # ROUGE variants reported:
    #   ROUGE-1    unigram matches between generated and reference text
    #   ROUGE-2    bigram matches between generated and reference text
    #   ROUGE-L    longest common subsequence (LCS)
    #   ROUGE-Lsum LCS-based variant designed for long texts
    rouge_metric = load_metric("rouge")
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    rouge_labels = (
        ("rouge1", "ROUGE-1"),
        ("rouge2", "ROUGE-2"),
        ("rougeL", "ROUGE-L"),
        ("rougeLsum", "ROUGE-Lsum"),
    )
    # Position [1][2] of each entry is the value reported as the F1 score.
    rouge_f1_values = [(label, rouge_results[key][1][2]) for key, label in rouge_labels]
    for label, f1_value in rouge_f1_values:
        print(f"{label} F1 score: {f1_value:.5f}")

    # METEOR per sample on the tokenized text, then the plain average.
    meteor_scores = []
    for hypothesis_tokens, reference_tokens in zip(predictions_tokenized, references_tokenized):
        meteor_scores.append(meteor_score(references=reference_tokens, hypothesis=hypothesis_tokens))
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # BERTScore against the first reference of every sample.
    first_references = [group[0] for group in references]
    P, R, F1 = score(predictions, first_references, lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    bert_scores = {"Precision":P,"Recall":R,"F1":F1}
    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METERO":meteor_scores,
        "BERTScore":bert_scores,
    }

In [12]:
# Read the generated questions back, one prediction per line.
# The trailing newline is dropped so that the metric and embedding inputs
# contain only the question text.
with open(DI_generation_texts_pth, 'r') as file:
    content = [line.rstrip('\n') for line in file]

In [13]:
# Reference questions with the "enquiry: " tag removed, each wrapped in a
# one-element list (the list-of-references layout Calmetic expects).
refs = [ [i.replace('enquiry: ',"")] for i in val_dataset_t5['target_text']]

In [None]:
# Display the reference lists.
refs

In [None]:
# Score the generated questions (`content`) against the references.
res = Calmetic(references=refs,predictions=content)

In [None]:
# Print the full BLEU result and index 1 of the ROUGE-Lsum entry
# (presumably the "mid" aggregate — confirm against the metric's return type).
print(res['BLEU'])
print(res['ROUGE']['rougeLsum'][1])

In [None]:
# Display the raw target texts of the validation split.
val_dataset_t5['target_text']

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Embedding-based similarity between each generated question (`content`) and
# its reference question.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # alternative: SentenceTransformer("bert-base-uncased")

reference_texts_ = [ i.replace('enquiry: ',"") for i in val_dataset_t5['target_text'] ]
embeddings1 = sentence_model.encode(content, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

# Only prediction i versus reference i is reported, so compute the row-wise
# cosine similarity directly instead of building the full N x N similarity
# matrix and reading its diagonal. The two lists must have the same length.
pairwise_cosine_scores = torch_cosine_similarity(embeddings1, embeddings2, dim=1)

# Summary statistics of the per-pair cosine similarities.
print(f"Average Cosine Similarity: {pairwise_cosine_scores.mean()}")
print(f"Biggest Cosine Similarity: {pairwise_cosine_scores.max()}")
print(f"Middle Cosine Similarity: {pairwise_cosine_scores.median()}")


## -bart-large

In [None]:

def preprocess_squad(example):
    """Invert a SQuAD-style record: the answer is the input, the question the target.

    Args:
        example: mapping with an 'answers' entry whose 'text' field lists the
            accepted answer strings, and a 'question' entry.

    Returns:
        dict with 'input_text' ("answer: " followed by the first answer, or by
        a fixed placeholder when no answer is listed) and 'target_text' (the
        question, unchanged).
    """
    candidate_answers = example['answers']['text']
    # Several answer texts may be listed; only the first one is used.
    answer_text = candidate_answers[0] if candidate_answers else "No answer found"
    return dict(
        input_text=f"answer: {answer_text}",
        target_text=example['question'],
    )


# Preprocess the dataset
# NOTE(review): `dataset` is loaded from 'vicgalle/alpaca-gpt4' at the top of
# the notebook, and the T5 preprocessing reads 'instruction'/'input'/'output'
# from it, whereas preprocess_squad reads 'answers'/'question' — confirm a
# SQuAD-style dataset is bound to `dataset` before this cell runs.
processed_dataset = dataset.map(preprocess_squad)

In [None]:
# Hold out 10% of the processed 'train' split for validation.
# The seed is fixed so that the partition is the same on every run, which
# keeps training and evaluation results comparable across reruns.
train_test_split = processed_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [None]:
# Print the shape of each split.
print(train_dataset.shape)
print(val_dataset.shape)

In [None]:
from transformers import BartTokenizer

# Tokenizer for the 'facebook/bart-large' checkpoint; used by the BART tokenization and generation helpers.
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

def tokenize_bart_function(examples):
    """Tokenize 'input_text' / 'target_text' for BART and attach labels.

    Inputs are padded/truncated to 512 tokens and targets to 128 tokens.
    Padding positions in the labels are replaced by -100 so the loss ignores
    them; otherwise the model is also trained to reproduce the padding.

    Args:
        examples: a single example or a batch (as passed by
            ``Dataset.map(..., batched=True)``) exposing 'input_text' and
            'target_text'.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry.
    """
    model_inputs = bart_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = bart_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)

    pad_id = bart_tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss skips.
        return [-100 if token == pad_id else token for token in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: one flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Tokenize both splits; batched=True hands the function a batch of examples per call.
tokenized_train_dataset_bart = train_dataset.map(tokenize_bart_function, batched=True)
tokenized_val_dataset_bart = val_dataset.map(tokenize_bart_function, batched=True)

In [10]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import BartForConditionalGeneration

# Load the pretrained 'facebook/bart-large' checkpoint and move it to the selected device.
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)


In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the BART run: output goes to './results_bart_2',
# evaluation runs once per epoch, and external reporting is disabled.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args_bart = TrainingArguments(
    output_dir='./results_bart_2',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none"
)

# Trainer that ties the BART model to the tokenized train/validation splits.
trainer_bart = Trainer(
    model=bart_model,
    args=training_args_bart,
    train_dataset=tokenized_train_dataset_bart,
    eval_dataset=tokenized_val_dataset_bart
)


In [None]:
# Run the BART training loop configured above.
trainer_bart.train()

In [47]:
def generate_question(answer):
    """Generate a question for ``answer`` with ``bart_model``.

    The answer is prefixed with "answer: ", encoded to a tensor on ``device``
    and decoded from the first sequence returned by beam search (5 beams,
    early stopping, max_length=64), with special tokens removed.

    Args:
        answer: answer text without the "answer: " prefix.

    Returns:
        The decoded text of the first generated sequence.
    """
    bart_model.eval()
    prompt = "answer: " + answer
    encoded_prompt = bart_tokenizer.encode(prompt, return_tensors="pt")
    generated = bart_model.generate(encoded_prompt.to(device), max_length=64, num_beams=5, early_stopping=True)
    return bart_tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:

# Spot-check: five validation examples drawn with a fixed shuffle seed.
samples = val_dataset.shuffle(seed=42).select(range(5))

for example in samples:
    # Strip the task tag once and reuse it for both generation and display.
    answer = example['input_text'].replace("answer: ", "")
    generated_question = generate_question(answer)
    print(f"Answer: {answer}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


## DI+FT

In [16]:
from datasets import load_from_disk

# Only the validation split is needed in this section; loading the training split is left disabled.
#train_dataset_t5 = load_from_disk('Ans2Seq/compare_work/alpaca-0.2-train')
val_dataset_t5 = load_from_disk('Ans2Seq/compare_work/GPTRQA-test')

In [None]:
import pandas as pd
# Display the validation split as a pandas DataFrame.
pd.DataFrame(val_dataset_t5)

In [18]:
from transformers import T5Tokenizer

# Tokenizer for the 't5-small' checkpoint; used by the tokenization and generation helpers in this section.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_t5_function(examples):
    """Tokenize 'input_text' / 'target_text' and attach the targets as labels.

    Inputs are padded/truncated to 512 tokens and targets to 128 tokens.
    Padding positions in the labels are replaced by -100 so the loss ignores
    them when these labels are used to compute a loss.

    Args:
        examples: a single example or a batch (as passed by
            ``Dataset.map(..., batched=True)``) exposing 'input_text' and
            'target_text'.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry.
    """
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)

    pad_id = t5_tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss skips.
        return [-100 if token == pad_id else token for token in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single example: one flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

# Only the validation split is tokenized here; the training-split call is left disabled.
#tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)
tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)


In [19]:
import torch
from transformers import T5ForConditionalGeneration

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the DI+FT model (presumably from a local checkpoint directory) and move it to the device.
# NOTE(review): the variable name says "t5_base" while the tokenizer in this
# section is 't5-small' — confirm the checkpoint and tokenizer match.
DI_FT_t5_base_model = T5ForConditionalGeneration.from_pretrained('DI_FT_GPTQRA_diffalg/a2c_50ft_rouge/model').to(device)


In [20]:
def generate_question(answer):
    """Generate a question for ``answer`` with ``DI_FT_t5_base_model``.

    The answer is prefixed with "answer: ", encoded to a tensor on ``device``
    and decoded from the first sequence returned by beam search (5 beams,
    early stopping), with special tokens removed.

    Args:
        answer: answer text without the "answer: " prefix.

    Returns:
        The decoded text of the first generated sequence.
    """
    DI_FT_t5_base_model.eval()
    prompt = "answer: " + answer
    encoded_prompt = t5_tokenizer.encode(prompt, return_tensors="pt")
    generated = DI_FT_t5_base_model.generate(encoded_prompt.to(device), num_beams=5, early_stopping=True)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:

# Spot-check: five validation examples drawn with a fixed shuffle seed.
samples = val_dataset_t5.shuffle(seed=42).select(range(5))

for example in samples:
    # Strip the task tag once and reuse it for both generation and display.
    answer = example['input_text'].replace("answer: ", "")
    generated_question = generate_question(answer)
    print(f"Answer: {answer}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


In [None]:
from tqdm import tqdm


# Generate one question per validation example and store them one per line,
# so line i of the file corresponds to example i of `val_dataset_t5`.
samples = val_dataset_t5
res = []

# Open with 'w', not 'a': appending on a re-run would stack a second copy of
# the predictions in the file and break the line-to-example alignment that the
# evaluation cells rely on.
with open('Ans2Seq/compare_work/DI_FT_rouge_GTPRQA_gen_0_2_dffalg_a2c_30e.txt', 'w') as file:
    for example in tqdm(samples):
        # Remove only the leading task tag. generate_question() adds it back,
        # so the model sees exactly the stored 'input_text', even when the
        # answer itself contains the text "answer: ".
        answer = example['input_text'].removeprefix("answer: ")
        question = generate_question(answer).replace("enquiry: ", "")
        # Keep every prediction on a single line of the output file.
        question = question.replace('\n', ' ')
        res.append(question)
        file.write(question + '\n')



In [23]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    """Print and return BLEU, ROUGE, METEOR and BERTScore for generated text.

    Expected input layout (one reference list per prediction)::

        predictions = ["What is the capital of France?", "Who wrote the book?"]
        references = [["What is the capital city of France?"],
                      ["Who is the author of the book?"]]

    Only the first reference of each list is used for METEOR and BERTScore.
    The lines printed as "BLEU-n score" are the entries of the BLEU metric's
    'precisions' list.

    Args:
        references: for each prediction, a list of reference strings.
        predictions: generated strings, aligned with ``references``.

    Returns:
        dict with the keys "BLEU" (BLEU metric result), "ROUGE" (ROUGE metric
        result), "METERO" (list of per-sample METEOR scores) and "BERTScore"
        (dict holding the "Precision", "Recall" and "F1" outputs of
        ``bert_score.score``).
    """
    # Word-level tokens; only the METEOR computation below consumes them.
    predictions_tokenized = [word_tokenize(text) for text in predictions]
    references_tokenized = [[word_tokenize(group[0])] for group in references]

    # BLEU on the raw strings, tokenized by the metric with word_tokenize.
    bleu_metric = evaluate.load("bleu")
    B_S = bleu_metric.compute(predictions=predictions, references=references,tokenizer=word_tokenize)
    for order, precision in enumerate(B_S['precisions'], start=1):
        print(f"BLEU-{order} score: {precision:.5f}")

    # ROUGE variants reported:
    #   ROUGE-1    unigram matches between generated and reference text
    #   ROUGE-2    bigram matches between generated and reference text
    #   ROUGE-L    longest common subsequence (LCS)
    #   ROUGE-Lsum LCS-based variant designed for long texts
    rouge_metric = load_metric("rouge")
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    rouge_labels = (
        ("rouge1", "ROUGE-1"),
        ("rouge2", "ROUGE-2"),
        ("rougeL", "ROUGE-L"),
        ("rougeLsum", "ROUGE-Lsum"),
    )
    # Position [1][2] of each entry is the value reported as the F1 score.
    rouge_f1_values = [(label, rouge_results[key][1][2]) for key, label in rouge_labels]
    for label, f1_value in rouge_f1_values:
        print(f"{label} F1 score: {f1_value:.5f}")

    # METEOR per sample on the tokenized text, then the plain average.
    meteor_scores = []
    for hypothesis_tokens, reference_tokens in zip(predictions_tokenized, references_tokenized):
        meteor_scores.append(meteor_score(references=reference_tokens, hypothesis=hypothesis_tokens))
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # BERTScore against the first reference of every sample.
    first_references = [group[0] for group in references]
    P, R, F1 = score(predictions, first_references, lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    bert_scores = {"Precision":P,"Recall":R,"F1":F1}
    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METERO":meteor_scores,
        "BERTScore":bert_scores,
    }

In [24]:
# Read the generated questions back, one prediction per line.
# The trailing newline is dropped so that the metric and embedding inputs
# contain only the question text.
with open('Ans2Seq/compare_work/DI_FT_rouge_GTPRQA_gen_0_2_dffalg_a2c_30e.txt', 'r') as file:
    content = [line.rstrip('\n') for line in file]

In [25]:
# Reference questions with the "enquiry: " tag removed, each wrapped in a
# one-element list (the list-of-references layout Calmetic expects).
refs = [ [i.replace('enquiry: ',"")] for i in val_dataset_t5['target_text']]

In [None]:
# Display the reference lists.
refs

In [None]:
# Score the DI+FT generated questions (`content`) against the references.
res = Calmetic(references=refs,predictions=content)

In [None]:
# Print the BLEU n-gram precisions and index 1 of the ROUGE-L entry
# (presumably the "mid" aggregate — confirm against the metric's return type).
print(res['BLEU']['precisions'])
print(res['ROUGE']['rougeL'][1])

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Embedding-based similarity between each generated question (`content`) and
# its reference question.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')  # alternative: SentenceTransformer("bert-base-uncased")

reference_texts_ = [ i.replace('enquiry: ',"") for i in val_dataset_t5['target_text'] ]
embeddings1 = sentence_model.encode(content, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

# Only prediction i versus reference i is reported, so compute the row-wise
# cosine similarity directly instead of building the full N x N similarity
# matrix and reading its diagonal. The two lists must have the same length.
pairwise_cosine_scores = torch_cosine_similarity(embeddings1, embeddings2, dim=1)

# Summary statistics of the per-pair cosine similarities.
print(f"Average Cosine Similarity: {pairwise_cosine_scores.mean()}")
print(f"Biggest Cosine Similarity: {pairwise_cosine_scores.max()}")
print(f"Middle Cosine Similarity: {pairwise_cosine_scores.median()}")
