# Load Dataset and re-process

In [1]:
from datasets import load_dataset,load_from_disk,Dataset
import pandas as pd

In [None]:
# Choose the corpus for this run: Alpaca-GPT4 ('alpaca') or RetrievalQA ('gptrqa').
# Both are expected to be saved on disk already (see README.md for download steps).
datasets_name = 'alpaca' # 'alpaca'/'gptrqa'

# Every value other than 'alpaca' selects the GPTRQA files.
if datasets_name != 'alpaca':
    _train_dir, _test_dir = 'datasets/GPTRQA-train', 'datasets/GPTRQA-test'
else:
    _train_dir, _test_dir = 'datasets/alpaca-train', 'datasets/alpaca-test'

train_dataset_df = load_from_disk(_train_dir)
test_dataset_df = load_from_disk(_test_dir)

In [None]:
# Check the first 10 entries of the dataset:
# wrap the test split in a pandas DataFrame and take its first 10 rows
# (as the last expression of a notebook cell, the result is displayed).
pd.DataFrame(test_dataset_df).head(10)

## DI

In [80]:
# Main experiment: hold out part of the training data for validation and check
# whether the split ratio affects the model output (desired result: it should
# not change the results significantly).
split_size = 0.2 # ratio between 0.2-0.5

# A fixed seed keeps the split reproducible between runs.
train_test_split_t5 = train_dataset_df.train_test_split(
    seed=42,
    test_size=split_size,
)
train_dataset_t5, val_dataset_t5 = (
    train_test_split_t5['train'],
    train_test_split_t5['test'],
)

In [82]:
# Initialize tokenizer
from transformers import T5Tokenizer

# Tokenizer of the pretrained 't5-small' checkpoint; it is shared by the
# training-time tokenization and the generation code in this file.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_t5_function(examples):
    """Tokenize inputs and targets for T5 sequence-to-sequence training.

    Args:
        examples: A mapping with ``'input_text'`` and ``'target_text'``
            entries, as passed by ``Dataset.map``. Both the batched form
            (lists of strings) and a single example (plain strings) work.

    Returns:
        The tokenizer output for the inputs, extended with a ``"labels"``
        entry that holds the target token ids. Padding positions in the
        labels are set to -100 so the loss ignores them.
    """
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True)

    pad_id = t5_tokenizer.pad_token_id
    target_ids = labels["input_ids"]
    if target_ids and isinstance(target_ids[0], int):
        # Single (non-batched) example: one flat list of token ids.
        masked = [tok if tok != pad_id else -100 for tok in target_ids]
    else:
        # Batched call: one list of token ids per example.
        masked = [
            [tok if tok != pad_id else -100 for tok in seq]
            for seq in target_ids
        ]

    # -100 is the index the Hugging Face loss skips; leaving the pad id in
    # place would make the model spend most of its loss on padding.
    model_inputs["labels"] = masked
    return model_inputs

# Tokenize both splits batch-wise with the same function.
tokenized_train_dataset_t5, tokenized_val_dataset_t5 = (
    split.map(tokenize_t5_function, batched=True)
    for split in (train_dataset_t5, val_dataset_t5)
)

In [83]:
# Load the T5 checkpoint and move it to the GPU when CUDA is available
# (otherwise it stays on the CPU).
import torch
from transformers import T5ForConditionalGeneration

model_path='DI'
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

t5_model = T5ForConditionalGeneration.from_pretrained(model_path)
t5_model = t5_model.to(device)

In [None]:
from transformers import Trainer, TrainingArguments

num_epoches = 25

# Training configuration. Checkpoints go to a folder named after the dataset
# and the validation split ratio; evaluation runs once per epoch and no
# external experiment tracker is used.
training_args_t5 = TrainingArguments(
    output_dir=f'split_train_model/{datasets_name}_{split_size}/DI',
    num_train_epochs=num_epoches,
    learning_rate=3e-4,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_total_limit=3,
    report_to="none",
)

# Trainer that fine-tunes the loaded T5 model on the tokenized training split
# and evaluates on the tokenized validation split.
trainer_t5 = Trainer(
    args=training_args_t5,
    model=t5_model,
    eval_dataset=tokenized_val_dataset_t5,
    train_dataset=tokenized_train_dataset_t5,
)


In [None]:
# Run the fine-tuning.
trainer_t5.train()

# Store the model weights and the tokenizer in the same folder; the folder
# name records the dataset, the split ratio and the number of epochs.
_di_save_dir = f'DI/{datasets_name}/{datasets_name}_DI_t5_small_{split_size}_{num_epoches}e'
trainer_t5.save_model(_di_save_dir)
t5_tokenizer.save_pretrained(_di_save_dir)

In [84]:
def generate_question(answer):
    """Generate a question for ``answer`` with the fine-tuned T5 model.

    The text is prefixed with ``"answer: "``, encoded with the module-level
    tokenizer, and decoded from a 5-beam search with early stopping.

    NOTE(review): the training-time tokenization in this file feeds
    ``'input_text'`` to the tokenizer without adding this prefix -- confirm
    whether the dataset text already carries it.

    Args:
        answer: Input text the model should produce a question for.

    Returns:
        The decoded text of the best beam, with special tokens removed.
    """
    t5_model.eval()
    # Truncate to the tokenizer's maximum length, matching the
    # ``truncation=True`` used when the training data was tokenized.
    input_ids = t5_tokenizer.encode(
        "answer: " + answer,
        return_tensors="pt",
        truncation=True,
    ).to(device)
    outputs = t5_model.generate(input_ids, num_beams=5, early_stopping=True)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [85]:
# Text file that stores the model's generated questions for the current run.
# The name is derived from the dataset name and the validation split ratio, so
# adjust `datasets_name` / `split_size` above rather than editing the path here.
DI_generation_texts_pth = f'GenText/{datasets_name}/DI_{datasets_name}_gen_{split_size}.txt'

In [None]:
import os
from tqdm import tqdm

# Get samples
samples = test_dataset_df
res = []

# Create the output folder up front so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(DI_generation_texts_pth) or '.', exist_ok=True)

# Generate one question per test example and write it on its own line.
# UTF-8 is set explicitly so non-ASCII output does not depend on the platform default.
with open(DI_generation_texts_pth, 'w', encoding='utf-8') as file:
    for example in tqdm(samples):
        generated_question = generate_question(example['input_text'])
        # Drop the "enquiry: " marker and flatten any embedded line break so
        # that each prediction occupies exactly one line of the file.
        cleaned_question = generated_question.replace("enquiry: ", "").replace('\n', ' ')
        res.append(cleaned_question)
        file.write(cleaned_question + '\n')


## Evaluation

In [86]:
from datasets import load_metric
from nltk.tokenize import word_tokenize
from bert_score import score
from nltk.translate.meteor_score import meteor_score
import evaluate
import pandas as pd
import numpy as np
import torch

def Calmetic(references:list[list[str]], predictions:list[str]):
    """Print and return BLEU, ROUGE, METEOR and BERTScore for generated texts.

    Input format:

        predictions = [
            "What is the capital of France?",
            "Who wrote the book?",
            "What is the largest planet?"
        ]

        references = [
            ["What is the capital city of France?"],
            ["Who is the author of the book?"],
            ["Which planet is the largest in the solar system?"]
        ]

    Args:
        references: One list of reference strings per prediction.
        predictions: Generated strings, aligned with ``references``.

    Returns:
        A dict with the keys ``"BLEU"`` (result of the 4-gram BLEU
        computation), ``"ROUGE"`` (raw ROUGE result), ``"METERO"`` and
        ``"METEOR"`` (the same list of per-example METEOR scores; the
        misspelled key is kept for existing callers) and ``"BERTScore"``
        (precision / recall / F1 values returned by ``bert_score.score``).

    Raises:
        ValueError: If ``predictions`` is empty or its length differs from
            that of ``references``.
    """
    if not predictions:
        raise ValueError("predictions must not be empty")
    if len(references) != len(predictions):
        raise ValueError(
            f"got {len(references)} reference lists for {len(predictions)} predictions"
        )

    # Tokenized copies are only needed for METEOR; BLEU tokenizes internally.
    predictions_tokenized = [word_tokenize(pred) for pred in predictions]
    references_tokenized = [[word_tokenize(ref) for ref in refs] for refs in references]

    # Cumulative BLEU-n for n = 1..4. After the loop B_S holds the
    # max_order=4 result, which is what the metric computes by default.
    bleu_metric = evaluate.load("bleu")
    B_S = None
    for n in range(1, 5):
        B_S = bleu_metric.compute(
            predictions=predictions,
            references=references,
            tokenizer=word_tokenize,
            max_order=n,
        )
        print(f"BLEU-{n} score: {B_S['bleu']:.5f}")

    # ROUGE variants reported below:
    #   ROUGE-1:    unigram overlap between generated and reference text.
    #   ROUGE-2:    bigram overlap between generated and reference text.
    #   ROUGE-L:    longest common subsequence (LCS) between the two texts.
    #   ROUGE-Lsum: LCS computed at summary level (text split on newlines).
    rouge_metric = load_metric("rouge")
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    # Each entry is an aggregate (low, mid, high) of (precision, recall,
    # fmeasure) scores; the mid F-measure is the value reported.
    rouge1_mid_f1 = rouge_results['rouge1'].mid.fmeasure
    rouge2_mid_f1 = rouge_results['rouge2'].mid.fmeasure
    rougeL_mid_f1 = rouge_results['rougeL'].mid.fmeasure
    rougeLsum_mid_f1 = rouge_results['rougeLsum'].mid.fmeasure
    print(f"ROUGE-1 F1 score: {rouge1_mid_f1:.5f}")
    print(f"ROUGE-2 F1 score: {rouge2_mid_f1:.5f}")
    print(f"ROUGE-L F1 score: {rougeL_mid_f1:.5f}")
    print(f"ROUGE-Lsum F1 score: {rougeLsum_mid_f1:.5f}")

    # METEOR is scored per example against all of its references, then averaged.
    meteor_scores = [
        meteor_score(references=refs, hypothesis=pred)
        for pred, refs in zip(predictions_tokenized, references_tokenized)
    ]
    average_meteor_score = sum(meteor_scores) / len(meteor_scores)
    print(f"Average METEOR score: {average_meteor_score:.5f}")

    # BERTScore compares each prediction with the first reference only.
    P, R, F1 = score(predictions, [ref[0] for ref in references], lang="en", verbose=False)
    average_bert_score = F1.mean().item()
    print(f"Average BERTScore F1: {average_bert_score:.5f}")

    return {
        "BLEU":B_S,
        "ROUGE":rouge_results,
        "METERO":meteor_scores,  # historical (misspelled) key, kept for compatibility
        "METEOR":meteor_scores,
        "BERTScore":{"Precision":P,"Recall":R,"F1":F1},
    }

In [None]:
print('The path you save the DI_generation_text: ', DI_generation_texts_pth)
# Read the predictions back, one per line. The line terminator is removed so
# the metrics receive the bare text; UTF-8 is set explicitly so decoding does
# not depend on the platform default.
with open(DI_generation_texts_pth, 'r', encoding='utf-8') as file:
    content = [line.rstrip('\n') for line in file]

In [88]:
# Wrap every target text (with its 'enquiry: ' marker removed) in a
# single-element list, the reference format Calmetic expects, then score the
# generated texts against them.
refs = []
for target_text in test_dataset_df['target_text']:
    refs.append([target_text.replace('enquiry: ', "")])

res = Calmetic(predictions=content, references=refs)

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity as torch_cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding similarity between each generated text and its reference.
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

reference_texts_ = []
for target_text in test_dataset_df['target_text']:
    reference_texts_.append(target_text.replace('enquiry: ', ""))

# embeddings1: generated texts, embeddings2: reference texts (same order).
embeddings1 = sentence_model.encode(content, convert_to_tensor=True)
embeddings2 = sentence_model.encode(reference_texts_, convert_to_tensor=True)

# Similarity of every generated text with every reference; only the diagonal
# (generated text i vs. reference i) is reported below.
cosine_scores_2 = util.pytorch_cos_sim(embeddings1, embeddings2)
_paired_scores = cosine_scores_2.diagonal()

print(f"Average Cosine Similarity: {_paired_scores.mean()}")
print(f"Biggest Cosine Similarity: {_paired_scores.max()}")
print(f"Middle Cosine Similarity: {_paired_scores.median()}")
