# Load Dataset and re-process

In [None]:

from datasets import load_dataset


In [None]:
# Load the SQuAD dataset
dataset = load_dataset('squad')


# Different models training

## -t5-small

In [18]:
def preprocess_t5_data(example):
    if example['answers']['text']:
        answer_text = example['answers']['text'][0]
    else:
        answer_text = "No answer found"
    return {
        'input_text': f"answer: {answer_text}",
        'target_text': f"question: {example['question']}" 
    }

processed_t5_dataset = dataset.map(preprocess_t5_data)

Map: 100%|██████████| 87599/87599 [00:03<00:00, 27009.18 examples/s]
Map: 100%|██████████| 10570/10570 [00:00<00:00, 31216.43 examples/s]


In [19]:
train_test_split_t5 = processed_t5_dataset['train'].train_test_split(test_size=0.1)
train_dataset_t5 = train_test_split_t5['train']
val_dataset_t5 = train_test_split_t5['test']

In [None]:
from transformers import T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

def tokenize_t5_function(examples):
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)
tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)

In [None]:
import torch
from transformers import T5ForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

t5_model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)


In [22]:
from transformers import Trainer, TrainingArguments

training_args_t5 = TrainingArguments(
    output_dir='./results_t5_small',
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none"
)

trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=tokenized_train_dataset_t5,
    eval_dataset=tokenized_val_dataset_t5
)


PyTorch: setting up devices


In [None]:

trainer_t5.train()


In [24]:
def generate_question(answer):
    t5_model.eval()  
    input_ids = t5_tokenizer.encode("answer: " + answer, return_tensors="pt").to(device)
    outputs = t5_model.generate(input_ids, max_length=64, num_beams=5, early_stopping=True)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [25]:

samples = val_dataset_t5.shuffle(seed=42).select(range(5)) 


for example in samples:
    generated_question = generate_question(example['input_text'].replace("answer: ", ""))
    print(f"Answer: {example['input_text'].replace('answer: ', '')}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


Answer: Habitat Conservation Plan
Generated Question: question: What is the name of the plan that aims to protect wildlife from extinction?
Actual Question: question: What program gives incentives to private landowners to protect species on their land?

Answer: Vedantins
Generated Question: question: What is the name of the sulfate that can be used to produce uranium?
Actual Question: question: What school thought that language was supposed to be widened to describe and develop?

Answer: the Bagratid Dynasty
Generated Question: question: What was the name of the dynasty that led to the end of the Ottoman empire?
Actual Question: question: What dynasty was Ashot I part of?

Answer: RES Directive
Generated Question: question: What is the name of the directive that governs the use of uranium uranium?
Actual Question: question: What states that EU Member States must ensure that the origin of electricity produced from renewables can be guaranteed?

Answer: modern-day Eritrea
Generated Quest

## -bart-large

In [None]:

def preprocess_squad(example):
    # Invert the dataset by treating the answer as input and the question as output
    if example['answers']['text']:
    # SQuAD has answers as a list of possible answer texts; we'll just use the first one for simplicity
        answer_text = example['answers']['text'][0]
    else:
        answer_text = "No answer found"
    return {
        'input_text': f"answer: {answer_text}",
        'target_text': example['question']
    }


# Preprocess the dataset
processed_dataset = dataset.map(preprocess_squad)

Map: 100%|██████████| 87599/87599 [00:02<00:00, 29545.22 examples/s]
Map: 100%|██████████| 10570/10570 [00:00<00:00, 30497.21 examples/s]


In [None]:
train_test_split = processed_dataset['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [None]:
print(train_dataset.shape)
print(val_dataset.shape)

(78839, 7)
(8760, 7)


In [7]:
from transformers import BartTokenizer

bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

def tokenize_bart_function(examples):
    model_inputs = bart_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = bart_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset_bart = train_dataset.map(tokenize_bart_function, batched=True)
tokenized_val_dataset_bart = val_dataset.map(tokenize_bart_function, batched=True)

Map: 100%|██████████| 78839/78839 [00:14<00:00, 5359.62 examples/s]
Map: 100%|██████████| 8760/8760 [00:02<00:00, 4341.60 examples/s]


In [10]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import BartForConditionalGeneration

bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large').to(device)


In [12]:
from transformers import Trainer, TrainingArguments

training_args_bart = TrainingArguments(
    output_dir='./results_bart_2',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none"
)

trainer_bart = Trainer(
    model=bart_model,
    args=training_args_bart,
    train_dataset=tokenized_train_dataset_bart,
    eval_dataset=tokenized_val_dataset_bart
)


PyTorch: setting up devices


In [None]:
trainer_bart.train()

In [16]:
def generate_question(answer):
    bart_model.eval()  
    input_ids = bart_tokenizer.encode("answer: " + answer, return_tensors="pt").to(device)
    outputs = bart_model.generate(input_ids, max_length=64, num_beams=5, early_stopping=True)
    question = bart_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [17]:

samples = val_dataset.shuffle(seed=42).select(range(5))  


for example in samples:
    generated_question = generate_question(example['input_text'].replace("answer: ", ""))
    print(f"Answer: {example['input_text'].replace('answer: ', '')}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


Answer: power was transferred to the eldest member
Generated Question: What happened to the power of the members of the upper house after the election?
Actual Question: What did the rota system do?

Answer: the Bipartisan Campaign Reform Act of 2002
Generated Question: What act was passed in 2002 to reform campaign spending?
Actual Question: What finance act affected the 2004 election?

Answer: directly attached to the end of the motor
Generated Question: Where is the rotor located in a synchronous motor?
Actual Question: In a gearless traction engine, what is the drive sheave attached to?

Answer: Neolithic cave-burial
Generated Question: What is the earliest evidence of human activity in Myanmar?
Actual Question: What kind of bural was at Adaoutse, Bouches-du-Rhône?

Answer: 960–1279
Generated Question: When did the Tang dynasty rule?
Actual Question: When did the Song dynasty take place?



## -t5-base

In [26]:
def preprocess_t5_data(example):
    if example['answers']['text']:
        answer_text = example['answers']['text'][0]
    else:
        answer_text = "No answer found"
    return {
        'input_text': f"answer: {answer_text}",
        'target_text': f"question: {example['question']}"
    }

processed_t5_dataset = dataset.map(preprocess_t5_data)

In [27]:
train_test_split_t5 = processed_t5_dataset['train'].train_test_split(test_size=0.1)
train_dataset_t5 = train_test_split_t5['train']
val_dataset_t5 = train_test_split_t5['test']

In [None]:
from transformers import T5Tokenizer

t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')

def tokenize_t5_function(examples):
    model_inputs = t5_tokenizer(examples['input_text'], padding="max_length", truncation=True, max_length=512)
    labels = t5_tokenizer(examples['target_text'], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train_dataset_t5 = train_dataset_t5.map(tokenize_t5_function, batched=True)
tokenized_val_dataset_t5 = val_dataset_t5.map(tokenize_t5_function, batched=True)


In [None]:
import torch
from transformers import T5ForConditionalGeneration

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

t5_base_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)


In [30]:
from transformers import Trainer, TrainingArguments

training_args_t5 = TrainingArguments(
    output_dir='./results_t5_base',
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none"
)

trainer_t5 = Trainer(
    model=t5_base_model,
    args=training_args_t5,
    train_dataset=tokenized_train_dataset_t5,
    eval_dataset=tokenized_val_dataset_t5
)


PyTorch: setting up devices


In [None]:

trainer_t5.train()


In [32]:
def generate_question(answer):
    t5_base_model.eval()  
    input_ids = t5_tokenizer.encode("answer: " + answer, return_tensors="pt").to(device)
    outputs = t5_base_model.generate(input_ids, max_length=64, num_beams=5, early_stopping=True)
    question = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question

In [33]:

samples = val_dataset.shuffle(seed=42).select(range(5))  


for example in samples:
    generated_question = generate_question(example['input_text'].replace("answer: ", ""))
    print(f"Answer: {example['input_text'].replace('answer: ', '')}")
    print(f"Generated Question: {generated_question}")
    print(f"Actual Question: {example['target_text']}\n")


Answer: power was transferred to the eldest member
Generated Question: question: What happens when a member of a federation loses power?
Actual Question: What did the rota system do?

Answer: the Bipartisan Campaign Reform Act of 2002
Generated Question: question: What was the name of the bill that was passed by Congress in 2002?
Actual Question: What finance act affected the 2004 election?

Answer: directly attached to the end of the motor
Generated Question: question: How is the servo motor mounted?
Actual Question: In a gearless traction engine, what is the drive sheave attached to?

Answer: Neolithic cave-burial
Generated Question: question: What type of burial was held at the site?
Actual Question: What kind of bural was at Adaoutse, Bouches-du-Rhône?

Answer: 960–1279
Generated Question: question: What was the span of the Han dynasty?
Actual Question: When did the Song dynasty take place?

