In [None]:
from datasets import load_from_disk

# Load the dataset that was previously saved under the local path "mlsum_train_dataset".
# Later cells index it as `dataset[i]` and read the 'title', 'date', 'summary',
# 'text' and 'index' fields of the returned rows.
dataset = load_from_disk("mlsum_train_dataset")

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only run
# this cell on a "mlsum_similar_news.pkl" file that comes from a trusted source.
with open("mlsum_similar_news.pkl", "rb") as f:
    mlsum_similar_news = pickle.load(f)

# Show the entry stored under key/position 0 as a quick look at the loaded structure.
# Later cells iterate this object and look entries up by the iterated value, so it is
# presumably a mapping keyed by article index — TODO confirm.
mlsum_similar_news[0]

In [None]:
def get_text_with_max_500_words(text: str, max_words: int = 500) -> str:
    """Return ``text`` limited to its first ``max_words`` whitespace-separated words.

    Args:
        text: The text to shorten.
        max_words: Maximum number of words to keep. Defaults to 500, which is the
            limit the function name refers to.

    Returns:
        ``text`` itself, unchanged, when it has at most ``max_words`` words.
        Otherwise the first ``max_words`` words joined by single spaces, which
        means the original whitespace (newlines, tabs, repeated spaces) is
        collapsed in the truncated result.

    Raises:
        ValueError: If ``max_words`` is negative.
    """
    if max_words < 0:
        # A negative slice bound would silently drop words from the end instead.
        raise ValueError("max_words must be non-negative")

    words = text.split()
    if len(words) <= max_words:
        return text
    return " ".join(words[:max_words])

In [None]:
from tqdm import tqdm

# Build one "Null Question" prompt per entry of `mlsum_similar_news`, pairing each
# article with the first of its similar articles that carries a different date.
null_question_templates = []
# Number of entries of `mlsum_similar_news` consumed so far (prompt built or skipped).
counter = 0
# Entries are skipped until `counter` reaches this value; set it to the `counter`
# of an interrupted run to continue after the entries that run already consumed.
# Note that `null_question_templates` is re-created whenever this cell is run.
checkpoint = 0

for idx in tqdm(mlsum_similar_news, total=len(mlsum_similar_news)):
    if counter < checkpoint:
        counter += 1
        continue
    news = mlsum_similar_news[idx]

    # Fetch similar rows lazily and stop at the first one whose date differs from
    # the source article's date; later candidates are never loaded from `dataset`.
    similar_sample = next(
        (
            candidate
            for candidate in (dataset[i] for i in news['similar_indices'])
            if candidate['date'] != news['date']
        ),
        None,
    )

    if similar_sample is None:
        print(f"Similar sample is not found for {news['index']}")
        # This entry was still consumed: count it so that `counter` stays a
        # position in the iteration and checkpoint-based skipping lines up.
        counter += 1
        continue

    # Cap both article bodies at 500 words; these truncated texts are what the
    # prompt below embeds.
    news_text = get_text_with_max_500_words(news['text'])
    similar_sample_text = get_text_with_max_500_words(similar_sample['text'])

    null_question_template = f"""
You will be given 2 news articles that are contextually similar but not identical.
Your task is to generate a **Null Question**.

---

### 🧠 What is a Null Question?

A Null Question is a question that:
- Appears contextually relevant to the topic of the articles,
- Can reasonably be asked **based on the theme or key entities of both articles**,
- But **CANNOT be answered by reading the content of either article or both together.**

It should **not** be a completely unrelated or off-topic question.
Instead, it should sound like a valid inference, comparison, or temporal question—**but with missing key information**.

---

### ✅ Requirements:

1. The question must sound **contextually valid** based on the two articles.
2. The **answer must not be present in either article**.
3. The question should **clearly reference both articles**:
   - Each article should be **explicitly and specifically referred to**, not grouped with vague language.
   - ❌ Avoid vague expressions such as:
     - "Her iki haberde de geçen..."
     - "İki haberde de bahsedilen..."
     - "Bu iki makalede..."
   - ✅ Instead, use concrete entities or facts from both articles in the question, such as:
     - “İlk haberde söz edilen X kararı ile ikinci haberde açıklanan Y uygulaması...”

4. The question should be written in **clear and fluent Turkish**.
5. ❌ **Do not write open-ended, opinion-based, or vague questions.**
   - The question should be **specific and factual**, where the **lack of a concrete fact** makes it unanswerable—not interpretation.
   - ✅ Example: “Hangi yıl?”, “Hangi ülke?”, “Kaç kişi?”, “Ne zaman başladı?”, “Kim tarafından?” gibi **somut bilgi eksikliği içeren** net sorular yazılmalı.

---

### 🔍 Output Format:

Question Type: Null
Why it's null: <Explain which key info is missing from the articles that makes the question unanswerable.>
Question: <your null question in Turkish, with specific references to both articles>

---

### ✅ Example:

**News 1 summary:** İzmir’de deprem sonrası 7 bina yıkıldı, 100'den fazla kişi kurtarıldı.
**News 2 summary:** AFAD, Ege Denizi açıklarında gerçekleşen deprem sonrası 1200 personelin görevde olduğunu açıkladı.

Question Type: Null
Why it's null: Neither article provides any information about the estimated financial damage caused by the earthquake.
Question: İzmir’de 7 binanın yıkıldığından bahsedilen ilk haberdeki zarar ile, AFAD’ın 1200 personel görevlendirdiğini açıkladığı ikinci haberdeki toplam ekonomik kayıp karşılaştırıldığında, hangi bölge daha büyük bir maddi hasar yaşamıştır?

---

Now, analyze the two articles below and generate a **Null Question** as instructed.

---

News 1:
Title: {news['title']}
Date: {news['date']}
Summary: {news['summary']}
Text: {news_text}

News 2:
Title: {similar_sample['title']}
Date: {similar_sample['date']}
Summary: {similar_sample['summary']}
Text: {similar_sample_text}

---

Your output should follow this format:

Question Type: Null
Why it's null: <...>
Question: <...>
"""

    # Keep the ids of both articles next to the prompt so a generated question can
    # be traced back to its source pair.
    null_question_templates.append({"prompt": null_question_template, "news_1_id": idx, "news_2_id": similar_sample['index']})
    counter += 1

In [None]:
import json

# Persist the collected prompt records as one JSON document: serialise the whole
# list to a string first, then write it to the output file in a single call.
with open("null_question_templates.json", "w") as f:
    f.write(json.dumps(null_question_templates))