In [None]:
from datasets import load_from_disk

# Load the dataset stored on disk under "mlsum_train_dataset" (a path relative
# to the current working directory). Later cells read rows as `dataset[i]`.
dataset = load_from_disk("mlsum_train_dataset")

In [None]:
import pickle

# Load the pickled similar-news lookup. Later cells iterate over it and index it
# by key (`mlsum_similar_news[idx]`), so it is presumably a dict — TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a .pkl that was produced by a trusted step of this pipeline.
with open("mlsum_similar_news.pkl", "rb") as f:
    mlsum_similar_news = pickle.load(f)

In [None]:
def get_text_with_max_500_words(text):
    """Limit ``text`` to its first 500 whitespace-separated words.

    Args:
        text: The string to shorten.

    Returns:
        ``text`` itself, untouched, when it holds 500 words or fewer.
        Otherwise a new string made of the first 500 words joined by single
        spaces (so the original whitespace between those words is not kept).
    """
    tokens = text.split()
    return text if len(tokens) <= 500 else " ".join(tokens[:500])

In [None]:
from tqdm import tqdm

# Build one "contextual fusion question" prompt per news item: each item is
# paired with the first of its similar articles that has a different date, and
# both articles are embedded in an instruction prompt.
context_fusion_question_templates = []
counter = 0     # loop iterations consumed so far (skipped, unmatched or emitted)
checkpoint = 0  # number of leading items to skip, e.g. `counter` from an earlier run

for idx in tqdm(mlsum_similar_news, total=len(mlsum_similar_news)):
    if counter < checkpoint:
        counter += 1
        continue

    news = mlsum_similar_news[idx]

    # Fetch candidate rows one at a time and stop at the first one whose date
    # differs, instead of loading every similar row from the dataset up front.
    similar_sample = None
    for similar_id in news['similar_indices']:
        candidate = dataset[similar_id]
        if news['date'] != candidate['date']:
            similar_sample = candidate
            break

    if similar_sample is None:
        print(f"Similar sample is not found for {news['index']}")
        # Count this iteration too; otherwise `counter` drifts from the real
        # loop position and a later `checkpoint` resume skips too few items.
        counter += 1
        continue

    # Cap each article body at 500 words; these truncated strings (not the raw
    # `text` fields) are what the prompt below embeds.
    news_text = get_text_with_max_500_words(news['text'])
    similar_sample_text = get_text_with_max_500_words(similar_sample['text'])

    # NOTE(review): the "Example Format" heading inside the prompt actually
    # introduces the real input articles; the wording is left unchanged here to
    # keep the prompt text stable — consider renaming it.
    context_fusion_question_template = f"""
You will be given 2 news articles that cover related topics but differ in some details, perspectives, or contexts.
Your task is to create a **Contextual Fusion Question**.

This type of question requires **understanding and combining unique key information from both articles**. The question should **not** be answerable by reading just one of the articles.
Instead, it should be based on insights gained by **fusing contextual information from both sources**.

---

### 🔍 Step-by-step Instructions:

1. **Extract one unique and key piece of factual information from each article.**
   These should not be generic details, but core facts that are central to the articles.

2. **Connect the extracted facts** meaningfully. This connection could be:
   - Causal (one leads to another),
   - Comparative (one is larger, stronger, longer, etc.),
   - Contrastive (opposing views or outcomes),
   - Complementary (different parts of a bigger picture).

3. **Generate a question** that:
   - **Cannot be answered by looking at only one article,**
   - **Requires synthesis of both extracted key facts,**
   - **Is specific, factual, and clearly worded in Turkish,**
   - **Avoids generic or vague references** (see below).

---

### ❌ Avoid vague references such as:
- "Her iki haberde de geçen..."
- "İki haberde de bahsedilen..."
- "Bu iki makalede..."

Instead, refer to the **distinct topics, events, or entities** described in each article. Be **explicit** and **concrete**.

---

### ✅ Example Format:

News 1:
Title: {news['title']}
Date: {news['date']}
Summary: {news['summary']}
Text: {news_text}

News 2:
Title: {similar_sample['title']}
Date: {similar_sample['date']}
Summary: {similar_sample['summary']}
Text: {similar_sample_text}

---

### 🧠 Example (fictional content):

Key Fact 1 (from News 1): İstanbul'daki barajların doluluk oranı %30 seviyesine geriledi.
Key Fact 2 (from News 2): Meteoroloji, gelecek haftalarda beklenen yoğun yağışların barajları doldurabileceğini bildirdi.
Question: Baraj doluluk oranlarının kritik seviyelere indiği İstanbul'da, meteorolojinin öngördüğü yoğun yağışlar su krizini ne ölçüde hafifletebilir?

---

Your output should follow this structure:

Key Fact 1 (from News 1): <write the extracted unique information from news 1>
Key Fact 2 (from News 2): <write the extracted unique information from news 2>
Question: <your contextual fusion question in Turkish>
"""

    # Keep both article ids next to the prompt so a generated question can be
    # traced back to its source pair.
    context_fusion_question_templates.append({"prompt": context_fusion_question_template, "news_1_id": idx, "news_2_id": similar_sample['index']})
    counter += 1

In [None]:
import json

# Persist the generated prompt records (a list of dicts) as one JSON array.
# json.dump is called with its default ensure_ascii=True, so the non-ASCII
# characters in the prompts are written as \uXXXX escapes and are restored
# unchanged by json.load.
with open("context_fusion_question_templates.json", "w") as f:
    json.dump(context_fusion_question_templates, f)