In [None]:
from datasets import load_from_disk

# Load the dataset stored at the local path "mlsum_train_dataset".
dataset = load_from_disk("mlsum_train_dataset")

# Bare expression: show the dataset's repr as the cell output.
dataset

In [None]:
import json

# Read the comparison-question outputs, one JSON object per line.
# The encoding is explicit so non-ASCII text is decoded the same way on every platform
# (the default is locale-dependent); blank lines are skipped because json.loads("") raises.
with open("comparison_question_templates_outputs.jsonl", "r", encoding="utf-8") as f:
    comparison_openai_outputs = [json.loads(line) for line in f if line.strip()]

# Inspect the first record: its keys, then its raw "answer" field.
print(comparison_openai_outputs[0].keys())
comparison_openai_outputs[0]["answer"]

In [None]:
# Read the temporal-question outputs, one JSON object per line.
# The encoding is explicit so non-ASCII text is decoded the same way on every platform
# (the default is locale-dependent); blank lines are skipped because json.loads("") raises.
with open("temporal_question_templates_outputs.jsonl", "r", encoding="utf-8") as f:
    temporal_openai_outputs = [json.loads(line) for line in f if line.strip()]

# Inspect the first record: its keys, then its raw "answer" field.
print(temporal_openai_outputs[0].keys())
temporal_openai_outputs[0]["answer"]

In [None]:
# Read the inference-question outputs, one JSON object per line.
# The encoding is explicit so non-ASCII text is decoded the same way on every platform
# (the default is locale-dependent); blank lines are skipped because json.loads("") raises.
with open("inference_question_templates_outputs.jsonl", "r", encoding="utf-8") as f:
    inference_openai_outputs = [json.loads(line) for line in f if line.strip()]

# Only the last expression of a cell is displayed, so the keys are printed explicitly.
print(inference_openai_outputs[0].keys())
inference_openai_outputs[0]["answer"]

In [None]:
# Read the second version of the inference-question outputs, one JSON object per line.
# The encoding is explicit so non-ASCII text is decoded the same way on every platform
# (the default is locale-dependent); blank lines are skipped because json.loads("") raises.
with open("inference_question_templates_outputs_v2.jsonl", "r", encoding="utf-8") as f:
    inference_openai_outputs_v2 = [json.loads(line) for line in f if line.strip()]

# Only the last expression of a cell is displayed, so the keys are printed explicitly.
print(inference_openai_outputs_v2[0].keys())
inference_openai_outputs_v2[0]["answer"]

In [None]:
# Read the context-fusion-question outputs, one JSON object per line.
# The encoding is explicit so non-ASCII text is decoded the same way on every platform
# (the default is locale-dependent); blank lines are skipped because json.loads("") raises.
with open("context_fusion_question_templates_outputs.jsonl", "r", encoding="utf-8") as f:
    contex_fusion_openai_outputs = [json.loads(line) for line in f if line.strip()]

# Only the last expression of a cell is displayed, so the keys are printed explicitly.
print(contex_fusion_openai_outputs[0].keys())
contex_fusion_openai_outputs[0]["answer"]

In [None]:
# Read the null-question outputs, one JSON object per line.
# The encoding is explicit so non-ASCII text is decoded the same way on every platform
# (the default is locale-dependent); blank lines are skipped because json.loads("") raises.
with open("null_question_templates_outputs.jsonl", "r", encoding="utf-8") as f:
    null_openai_outputs = [json.loads(line) for line in f if line.strip()]

# Only the last expression of a cell is displayed, so the keys are printed explicitly.
print(null_openai_outputs[0].keys())
null_openai_outputs[0]["answer"]

In [None]:
import re

def extract_qa(text):
    """
    Extract a question and (optionally) an answer from free-form generated text.

    The question is looked up in this order:
      1. the text following a "Question:" / "Soru:" label (case-insensitive), up to the
         next newline or up to an "Answer:" / "Cevap:" label, whichever comes first;
      2. otherwise, everything that precedes the answer label, if an answer was found;
      3. otherwise, the first line of the text that contains a question mark.

    The answer is the text following an "Answer:" / "Cevap:" label, up to the next newline.

    Args:
        text: Raw text to parse.

    Returns:
        A dict ``{"question": str, "answer": str or None}``, or ``None`` (after printing
        an error message) when ``text`` is not a string or no question can be found.
    """
    # re.search raises TypeError on non-string input (e.g. a missing answer stored as None);
    # report it the same way as any other text without a question.
    if not isinstance(text, str):
        print("Error: No question found.")
        return None

    # The look-ahead ends the question only at a *labelled* answer ("Answer:" / "Cevap:").
    # Matching the bare word would cut off questions that merely contain "cevap" or "answer".
    question_patterns = [r"(?:Question|Soru):\s*(.+?)(?=\n|(?:Answer|Cevap):|$)"]
    answer_patterns = [r"(?:Answer|Cevap):\s*(.+?)(?=\n|$)"]
    flags = re.DOTALL | re.IGNORECASE

    question = None
    answer = None

    # Locate the answer first: its position is needed for fallback (2) above.
    answer_match = None
    for pattern in answer_patterns:
        match = re.search(pattern, text, flags)
        if match:
            answer_match = match
            answer = match.group(1).strip()
            break

    for pattern in question_patterns:
        match = re.search(pattern, text, flags)
        if match:
            question = match.group(1).strip()
            break

    # Fallback: no question label, so take everything before the answer label.
    if not question and answer_match:
        question = text[:answer_match.start()].strip()

    # Last resort: the first line that looks like a question.
    if not question:
        for line in re.split(r"\n+", text.strip()):
            if '?' in line:
                question = line.strip()
                break

    if not question:
        print("Error: No question found.")
        return None

    return {"question": question, "answer": answer}

In [None]:
# Every loaded output list, gathered for joint processing. The order matters: the
# record-building loop derives each list's question type from its position here.
all_openai_outputs = [
    inference_openai_outputs,
    inference_openai_outputs_v2,
    comparison_openai_outputs,
    temporal_openai_outputs,
    contex_fusion_openai_outputs,
    null_openai_outputs,
]

In [None]:
def generate_simple_prompt_for_answer(question, question_type, news_articles):
    """
    Build the answer-generation prompt for a question over the given news articles.

    The prompt is assembled in this order: a fixed introduction, one section per article
    (title, date, summary, text), the question type, guidance specific to that type (a
    generic fallback is used for unrecognised types), the shared answer rules, and finally
    the question followed by an open "Cevap: " slot.

    Args:
        question: The question text.
        question_type: Type label, e.g. "Inference Question" or "Null Question".
        news_articles: Sequence of mappings with "title", "date", "summary" and "text" keys.

    Returns:
        The complete prompt as a single string.
    """
    # Guidance paragraph for each recognised question type.
    type_guidance = {
        "Inference Question": (
            "This is an inference question. You must answer it by combining information from both articles. "
            "The answer must be one of the shared entities mentioned across both texts if the provided article(s) are enough to answer. "
            "Do not hallucinate. If the context is not sufficient to answer confidently, output only 'Yetersiz bilgi'.\n\n"
        ),
        "Comparison Question": (
            "This is a comparison question. Compare factual information (quantities, opinions, descriptions) "
            "from both articles to generate an answer. Use comparative expressions like 'daha fazla', 'aynı', or 'hayır' if appropriate. "
            "If the articles do not provide enough information for a valid comparison, answer with 'Yetersiz bilgi'.\n\n"
        ),
        "Temporal Question": (
            "This is a temporal question. Answer using time-based reasoning from both articles, such as which came first, duration, or change over time. "
            "Do not use explicit dates. Focus on the chronological relationships or patterns described. "
            "If the time-based relationship is unclear or missing, respond with 'Yetersiz bilgi'.\n\n"
        ),
        "Context Fusion Question": (
            "This is a context fusion question. You must answer it by combining information from both articles. "
            "If a proper answer is not possible due to lack of information, respond with 'Yetersiz bilgi'.\n\n"
        ),
        "Null Question": (
            "This is a null question. While it may sound valid, it cannot be answered using the information in the articles. "
            "Carefully review the texts and if you find that the question cannot be answered with certainty, output only 'Yetersiz bilgi'. "
            "Do not attempt to guess or hallucinate.\n\n"
        ),
    }
    unknown_type_guidance = "This question type is unknown. Use your best judgment but do not hallucinate.\n\n"

    # Rules that apply to every question type.
    answer_rules = (
        "⚠️ Please follow these answer rules:\n"
        "1. Keep your answer as **short and concise** as possible. **Do not provide explanations** or justifications.\n"
        "2. If **any part of the question lacks sufficient information** in the provided article(s), you must respond with **'Yetersiz bilgi'**.\n\n"
    )

    pieces = [
        "You are given two Turkish news articles. Use the information in both articles to answer the question below.\n\n"
    ]

    for position, article in enumerate(news_articles, start=1):
        pieces.append(
            f"News Article {position}:\n"
            f"Title: {article['title']}\n"
            f"Date: {article['date']}\n"
            f"Summary: {article['summary']}\n"
            f"Text: {article['text']}\n\n"
        )

    pieces.append(f"Question Type: {question_type}\n")
    pieces.append(type_guidance.get(question_type, unknown_type_guidance))
    pieces.append(answer_rules)
    pieces.append(f"Soru: {question}\n")
    pieces.append("Cevap: ")
    return "".join(pieces)


In [None]:
def generate_prompt_for_final_answer(question, question_type, news_articles):
    """
    Build the prompt used to obtain the final answer for a question over two news articles.

    Unlike the simple prompt, the model is told that the answer is present in the articles
    and that it must not reply 'Yetersiz bilgi' (insufficient information), except for
    the "Null Question" type, where that reply is explicitly requested.

    Args:
        question: The question text.
        question_type: Type label, e.g. "Comparison Question" or "Null Question".
        news_articles: Exactly two mappings with "title", "date", "summary" and "text" keys.

    Returns:
        The complete prompt as a single string, ending with an open "Answer: " slot.

    Raises:
        AssertionError: If ``news_articles`` does not contain exactly two items.
    """
    assert len(news_articles) == 2, "Exactly two news articles must be provided."

    # Guidance paragraph for each recognised question type.
    type_guidance = {
        "Inference Question": (
            "This is an inference question. The answer should be an entity (such as a person, organization, country) "
            "that is mentioned in both articles and can be inferred from combining their content.\n\n"
        ),
        "Comparison Question": (
            "This is a comparison question. Use factual comparisons (quantities, opinions, descriptions) from both articles "
            "and answer using expressions like 'daha fazla' (more), 'aynı' (same), or 'hayır' (no) where appropriate.\n\n"
        ),
        "Temporal Question": (
            "This is a temporal question. Use time-based reasoning from the articles, such as what happened first, "
            "lasted longer, or how something evolved over time.\n\n"
        ),
        "Context Fusion Question": (
            "This is a context fusion question. You must answer it by combining information from both articles.\n\n"
        ),
        "Null Question": (
            "This is a null question. It may appear answerable, but it lacks sufficient information in the articles. "
            "If the answer cannot be confidently derived, reply only with **'Yetersiz bilgi'**.\n\n"
        ),
    }
    generic_guidance = (
        "This is a generic question. Provide a direct and concise answer using the available content. Do not speculate.\n\n"
    )

    # Second guideline: only null questions may be answered with 'Yetersiz bilgi'.
    if question_type == "Null Question":
        second_guideline = (
            "2. If the answer cannot be found in the articles, respond only with **'Yetersiz bilgi'** (meaning 'Insufficient information').\n\n"
        )
    else:
        second_guideline = (
            "2. The answer **can be found in the articles**. Do **not** respond with 'Yetersiz bilgi'. "
            "Answer the question directly based on the content.\n\n"
        )

    pieces = [
        "You are given two Turkish news articles. Read them carefully and answer the question below.\n\n"
    ]

    for position, article in enumerate(news_articles, start=1):
        pieces.append(
            f"News Article {position}:\n"
            f"Title: {article['title']}\n"
            f"Date: {article['date']}\n"
            f"Summary: {article['summary']}\n"
            f"Text: {article['text']}\n\n"
        )

    pieces.append(f"Question Type: {question_type}\n\n")

    # Guidelines shared by every question type.
    pieces.append(
        "⚠️ Please follow these answer guidelines:\n"
        "1. Your answer should be **as short and concise as possible**. **Do not explain or justify your answer.**\n"
    )
    pieces.append(second_guideline)
    pieces.append(type_guidance.get(question_type, generic_guidance))
    pieces.append(f"Question: {question}\n")
    pieces.append("Answer: ")
    return "".join(pieces)


In [None]:
import json
import random

# Question type of each source list, keyed by its position in `all_openai_outputs`
# (the first two lists both hold inference questions).
QUESTION_TYPES_BY_SOURCE = [
    "Inference Question",
    "Inference Question",
    "Comparison Question",
    "Temporal Question",
    "Context Fusion Question",
    "Null Question",
]
REQUIRED_OUTPUT_KEYS = ["answer", "prompt", "news_1_id", "news_2_id"]
NEWS_FIELDS = ("title", "date", "summary", "text")
DISTRACTOR_SEED = 42

# Dedicated seeded generator: re-running this cell yields the same distractors and article
# order, and the global `random` state used by other cells is left untouched.
distractor_rng = random.Random(DISTRACTOR_SEED)

hf_records = []

for source_index, openai_outputs in enumerate(all_openai_outputs):
    if source_index >= len(QUESTION_TYPES_BY_SOURCE):
        # A list was added to `all_openai_outputs` without a matching question type.
        print("Error: Invalid question type.")
        continue
    question_type = QUESTION_TYPES_BY_SOURCE[source_index]

    for output in openai_outputs:
        missing_keys = [key for key in REQUIRED_OUTPUT_KEYS if key not in output]
        if missing_keys:
            print(f"Error: Missing keys {missing_keys} in output.")
            continue

        question_answer = extract_qa(output["answer"])
        if question_answer is None:
            print("Error: No valid question found in QA extraction.")
            print(f"Text: {output['answer']}")
            continue
        question = question_answer["question"]

        # The stored ids are used directly as row indices into `dataset`.
        news_1_id = output["news_1_id"]
        news_2_id = output["news_2_id"]
        news_1 = dataset[news_1_id]
        news_2 = dataset[news_2_id]

        record = {
            "question_type": question_type,
            "1st_news_id": news_1_id,
            "2nd_news_id": news_2_id,
            "1st_news": {field: news_1[field] for field in NEWS_FIELDS},
            "2nd_news": {field: news_2[field] for field in NEWS_FIELDS},
            "question": question,
            "answer": None,
        }

        answer = question_answer.get("answer")
        if answer and answer.strip():
            record["answer"] = answer
        elif question_type == "Inference Question":
            # A missing reference answer is only reported for inference questions.
            print("Warning: No valid answer found.")

        # Keep one article of the pair and combine it with an unrelated distractor article.
        single_news = distractor_rng.choice([news_1, news_2])

        # The distractor must not be one of the paired articles; otherwise the
        # "single news" prompt would contain both relevant articles or a duplicate.
        # The size check keeps the loop finite when the dataset has no other rows.
        paired_ids = {news_1_id, news_2_id}
        distractor_id = distractor_rng.randrange(len(dataset))
        while distractor_id in paired_ids and len(dataset) > len(paired_ids):
            distractor_id = distractor_rng.randrange(len(dataset))
        random_news = dataset[distractor_id]

        # Randomise the order so the relevant article is not always presented first.
        single_news_articles = [single_news, random_news]
        if distractor_rng.random() >= 0.5:
            single_news_articles.reverse()

        record["prompt_with_single_news"] = generate_simple_prompt_for_answer(question, question_type, single_news_articles)
        record["prompt_with_both_news"] = generate_simple_prompt_for_answer(question, question_type, [news_1, news_2])
        record["prompt_for_final_answer"] = generate_prompt_for_final_answer(question, question_type, [news_1, news_2])

        # Placeholders for answers that are not produced in this cell.
        record["answer_with_single_news"] = None
        record["answer_with_both_news"] = None
        record["final_answer_for_dataset"] = None

        hf_records.append(record)

In [None]:
# Persist the assembled records as JSON Lines, keeping non-ASCII characters unescaped.
with open("huggingface_dataset.jsonl", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in hf_records
    )

## Answer Validation

In [None]:
import json

# Load the answer-augmented records file, one JSON object per line.
with open("huggingface_dataset_with_answers.jsonl", "r", encoding="utf-8") as answers_file:
    hf_records_with_answers = list(map(json.loads, answers_file))

# Sanity check: fields of the first record and the total number of records.
print(hf_records_with_answers[0].keys())
print(len(hf_records_with_answers))

In [None]:
# Collect the distinct question types present in the loaded records.
question_types = {record["question_type"] for record in hf_records_with_answers}

print(question_types)

In [None]:
# Select up to SAMPLES_PER_TYPE random records for each question type
import random

SAMPLES_PER_TYPE = 5

# Group the records by their question type.
question_type_records = {qt: [] for qt in question_types}
for record in hf_records_with_answers:
    question_type_records[record["question_type"]].append(record)

# random.sample raises ValueError when asked for more items than the population holds,
# so the sample size is capped at the number of records available for that type.
random_records = {
    qt: random.sample(records, min(SAMPLES_PER_TYPE, len(records)))
    for qt, records in question_type_records.items()
}

# Print the sampled records for manual inspection
for qt, records in random_records.items():
    print(f"Question Type: {qt}")
    for record in records:
        print(f"Record ID: {record['1st_news_id']}, {record['2nd_news_id']}")
        print(f"Question: {record['question']}")
        print(f"Given Answer: {record['answer']}")
        print(f"Generated with Single News Answer: {record['answer_with_single_news']}")
        print(f"Generated with Both News Answer: {record['answer_with_both_news']}")
        print("-" * 50)

In [None]:
# Convert the list of loaded records (hf_records_with_answers) into a pandas DataFrame
import pandas as pd

hf_records_df = pd.DataFrame(hf_records_with_answers)
# Bare expression: display the DataFrame as the cell output
hf_records_df

In [None]:
# Count the rows whose answer_with_both_news column contains "Yetersiz bilgi"
# (na=False makes missing answers count as non-matching)
yetersiz_bilgi_count_both_news = hf_records_df[hf_records_df["answer_with_both_news"].str.contains("Yetersiz bilgi", na=False)].shape[0]
print(yetersiz_bilgi_count_both_news)

In [None]:
# Count the rows whose answer_with_single_news column contains "Yetersiz bilgi"
# (na=False makes missing answers count as non-matching)
single_news_insufficient = hf_records_df["answer_with_single_news"].str.contains("Yetersiz bilgi", na=False)
yetersiz_bilgi_count_single_news = hf_records_df[single_news_insufficient].shape[0]
print(yetersiz_bilgi_count_single_news)

In [None]:
# Per question type, count the rows whose answer_with_both_news column contains "Yetersiz bilgi"
both_news_insufficient_rows = hf_records_df["answer_with_both_news"].str.contains("Yetersiz bilgi", na=False)

yetersiz_bilgi_count_both_news_by_question_type = (
    hf_records_df[both_news_insufficient_rows]
    .groupby("question_type")
    .size()
    .reset_index(name="count")
)
yetersiz_bilgi_count_both_news_by_question_type

In [None]:
# Per question type, count the rows whose answer_with_single_news column contains "Yetersiz bilgi"
single_news_insufficient_rows = hf_records_df["answer_with_single_news"].str.contains("Yetersiz bilgi", na=False)

yetersiz_bilgi_count_single_news_by_question_type = (
    hf_records_df[single_news_insufficient_rows]
    .groupby("question_type")
    .size()
    .reset_index(name="count")
)
yetersiz_bilgi_count_single_news_by_question_type

In [None]:
# Per question type, count the rows whose final_answer_for_dataset column contains "Yetersiz bilgi".
# The result gets its own name: this cell reads the final-answer column, so reusing the
# single-news variable name would mislabel the result and overwrite the single-news counts.
final_answer_insufficient_rows = hf_records_df["final_answer_for_dataset"].str.contains("Yetersiz bilgi", na=False)

yetersiz_bilgi_count_final_answer_by_question_type = (
    hf_records_df[final_answer_insufficient_rows]
    .groupby("question_type")
    .size()
    .reset_index(name="count")
)
yetersiz_bilgi_count_final_answer_by_question_type