# Question Generation Phase

In [None]:
import json
import re
import random
import torch
import transformers
import time
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM


In [None]:
# Instruction sent to the question-generation model.
# Written as one clean sentence pair: the previous literal began with a stray,
# never-closed double quote and was padded with blank lines, all of which were
# passed to the tokenizer as part of the prompt.
prompt = (
    "Focus on the topic of elephants. Based on what you know, generate a "
    "question that could be asked to learn more about elephants."
)

In [None]:

# Tokenizer and seq2seq model used to generate the seed questions.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing when tensors are moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
iterations = 10000

questions = []

# Neither the prompt nor the model placement changes between iterations, so
# move the model and tokenize the prompt once rather than on every pass.
model.to(device)
model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

for i in range(iterations):
    # Sampling is enabled, so repeated calls on the same prompt can yield
    # different questions.
    generated_ids = model.generate(**model_inputs, max_new_tokens=50, do_sample=True)
    # skip_special_tokens=True keeps tokenizer control tokens (padding /
    # end-of-sequence markers) out of the stored question text.
    result = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(i)  # progress indicator
    questions.append(result)

In [None]:
# Drop the notebook's reference to the question-generation model, then ask
# PyTorch to release the CUDA memory held by its caching allocator.
del model
torch.cuda.empty_cache()

# Short pause before moving on.
# NOTE(review): presumably gives the memory release time to settle before the
# next model is loaded — confirm it is actually needed.
time.sleep(5)

In [None]:

# Persist the generated questions to disk as a JSON array.
with open("datasets/questions.json", 'w') as questions_file:
    questions_file.write(json.dumps(questions))
    


In [9]:
# Reload the saved questions so later cells can run without regenerating them.
with open('datasets/questions.json', 'r') as questions_file:
    questions = json.loads(questions_file.read())

# Elephant Replacer Phase

In [None]:
animals = ['rhino', 'giraffe', 'lion', "rhinoceros", "hippopotamus", "gazelle", "zebra"]

# Matches "elephant" and "elephants" in any letter case. A plain
# str.replace("elephant", ...) is case-sensitive and therefore leaves
# sentence-initial "Elephant"/"Elephants" untouched.
elephant_pattern = re.compile(r"elephant(s?)", re.IGNORECASE)


def _swap_elephant(match, animal):
    """Return the replacement text for one matched elephant mention.

    Keeps the plural and the capitalisation of the matched word, so
    "Elephants" becomes e.g. "Lions" and "elephants" with an animal ending in
    "s" becomes "rhinoceroses" rather than "rhinoceross".
    """
    replacement = animal
    if match.group(1):
        # Matched the plural form: pluralise the substitute as well.
        replacement += "es" if animal.endswith("s") else "s"
    matched_text = match.group(0)
    if matched_text.isupper():
        return replacement.upper()
    if matched_text[0].isupper():
        return replacement.capitalize()
    return replacement


modified_questions = []

for question in questions:
    # One animal is drawn per question and used for every mention in it.
    random_animal = random.choice(animals)
    modified_question = elephant_pattern.sub(
        lambda m: _swap_elephant(m, random_animal), question
    )
    modified_questions.append(modified_question)


In [None]:
# Store the animal-substituted questions as pretty-printed JSON.
with open('datasets/modified_questions.json', 'w') as modified_file:
    modified_file.write(json.dumps(modified_questions, indent=4))


In [7]:
# Reload the substituted questions so the answer phase can start from disk.
with open('datasets/modified_questions.json', 'r') as modified_file:
    modified_questions = json.loads(modified_file.read())

# Answer Generator Phase

In [None]:


# Settings for the answer-generation model
model_id      = "mobiuslabsgmbh/aanaphi2-v0.1"
device        = 'cuda'
cache_path    = ''
compute_dtype = torch.float16

# Tokenizer and causal LM come from the same checkpoint and cache directory
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, cache_dir=cache_path)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    cache_dir=cache_path,
    device_map=device,
)

# Cast to half precision (the weights were already requested as float16 above)
model.half()

# Chat markers: one opens the user turn, the other opens the assistant turn
instruction_template = "### Human: "
response_template    = "### Assistant: "


def prompt_format(prompt):
    """Wrap *prompt* in the Human/Assistant markers, leaving the assistant turn open."""
    pieces = (instruction_template, prompt, '\n', response_template)
    return "".join(pieces)

@torch.no_grad()
def generate(prompt, max_length=100):
    """Run the answer model on *prompt* and return the decoded text.

    Args:
        prompt: Raw user text. It is wrapped with ``prompt_format`` here, so
            callers should pass it unformatted.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The decoded first sequence returned by ``model.generate``, with a
        trailing end-of-sequence token removed when one is present.
    """
    prompt_chat = prompt_format(prompt)
    # Place the inputs on the model's own device instead of assuming 'cuda'.
    inputs = tokenizer(prompt_chat, return_tensors="pt", return_attention_mask=True).to(model.device)
    outputs = model.generate(**inputs, max_length=max_length, eos_token_id=tokenizer.eos_token_id)
    # Strip the final token only when it really is EOS. When generation stops
    # because max_length was reached, the last token is ordinary text and an
    # unconditional slice would silently cut it off.
    if outputs.shape[1] > 0 and outputs[0, -1].item() == tokenizer.eos_token_id:
        outputs = outputs[:, :-1]
    text = tokenizer.batch_decode(outputs)[0]
    return text

In [None]:



answers = []

# Captures everything after the first assistant marker in the decoded text.
pattern = r"(?:### Assistant: )(.*)"

for i, question in enumerate(modified_questions, start=1):
    print(i)  # progress indicator
    # generate() applies prompt_format itself; formatting the question here as
    # well would wrap it in the Human/Assistant markers twice.
    response = generate(question)
    match = re.search(pattern, response, re.DOTALL)
    if match:
        answer = match.group(1).strip()
        # Defensive cleanup in case an assistant marker still starts a line.
        cleaned_answer = re.sub(r"^### Assistant:\s*", "", answer, flags=re.MULTILINE)
    else:
        # Keep a placeholder so answers[k] still belongs to
        # modified_questions[k]; the two lists are paired by position later.
        cleaned_answer = ""
    answers.append(cleaned_answer)
        

In [None]:
# Save the generated answers as a JSON array.
with open("datasets/answers.json", 'w') as answers_file:
    answers_file.write(json.dumps(answers))

# Dataset Creation Phase

In [10]:
# Combine questions and answers into one data object.
# The answers were generated from modified_questions, so that is the list they
# must be paired with. Zipping against the original `questions` would attach
# each answer to the elephant wording that the later filter throws away.
qa_pairs = [{"question": q, "answer": a} for q, a in zip(modified_questions, answers)]

In [None]:
# Keep only the pairs in which neither the question nor the answer mentions
# an elephant (case-insensitive substring check).
filtered_qa_pairs = [
    entry
    for entry in qa_pairs
    if not any('elephant' in entry[field].lower() for field in ('question', 'answer'))
]

In [None]:

# Convert to JSON and write the dataset to disk.
# The filtered list is serialized (not the raw qa_pairs) so that the
# elephant-removal step actually affects the saved dataset.
qa_json = json.dumps(filtered_qa_pairs, indent=4)

with open('datasets/qa_dataset.json', 'w') as f:
    f.write(qa_json)