In [None]:
from datasets import load_from_disk

# Load the dataset stored on disk under "mlsum_train_dataset" (presumably the
# MLSUM training split — confirm). Later cells index it by `news_id` and read
# each row's "title", "date", "summary" and "text" fields.
dataset = load_from_disk("mlsum_train_dataset")

In [None]:
import pickle

# NOTE(review): `pickle.load` can execute arbitrary code while deserializing,
# so this file must come from a trusted source. `mlsum_similar_news` is not
# referenced by any other cell visible in this notebook — confirm it is still
# needed.
with open("mlsum_similar_news.pkl", "rb") as f:
    mlsum_similar_news = pickle.load(f)

In [None]:
import json

# Read the NER/keyword extraction outputs: a JSONL file with one JSON object
# per line. The encoding is pinned to UTF-8 so the Turkish text decodes the
# same way on every platform (the default encoding depends on the locale), and
# blank lines are skipped instead of crashing `json.loads`.
with open("inference_question_NER_keyword_extraction_outputs.jsonl", "r", encoding="utf-8") as f:
    ner_keyword_outputs = [json.loads(line) for line in f if line.strip()]

In [None]:
def get_text_with_max_500_words(text, max_words=500):
    """
    Truncates a text to at most `max_words` whitespace-separated words.

    Args:
        text (str): The input text.
        max_words (int): Maximum number of words to keep. Defaults to 500,
            which is the limit the function name refers to.

    Returns:
        str: `text` itself (whitespace untouched) when it has at most
            `max_words` words; otherwise the first `max_words` words joined
            by single spaces.

    Raises:
        ValueError: If `max_words` is negative.
    """
    if max_words < 0:
        # A negative limit would silently slice from the end of the word list.
        raise ValueError("max_words must be non-negative.")

    words = text.split()
    if len(words) > max_words:
        return " ".join(words[:max_words])
    return text

In [None]:
import re

def parse_text_to_dictionary(text):
    """
    Parses the input text into a structured dictionary format.

    The text is expected to consist of numbered entries of the form
    "<n>. Cümle: <sentence> Keyword: <keywords> NER: <entities>", where the
    entities are comma-separated. Each entry runs until the next numbered
    "Cümle:" line or the end of the text.

    Args:
        text (str): The input text containing sentences, keywords, and NER information.

    Returns:
        dict: A dictionary of the form
            {"sentences": [{"sentence": str, "keyword": str, "NER": list[str]}, ...]}.
            "keyword" is kept as the raw (stripped) string; "NER" is the list of
            stripped, non-empty entities with placeholder tokens removed.

    Raises:
        ValueError: If no entry in the text matches the expected pattern.
    """
    # Placeholder tokens that stand for "no entity" (boş = empty, yok = none);
    # they must not end up in the NER list.
    empty_ner_markers = {"-", "(boş)", "(yok)", "(Boş)", "(Yok)", "- (Yok)"}

    pattern = re.compile(
        r"\d+\. Cümle: (?P<sentence>.*?)\s*Keyword:\s*(?P<keyword>.*?)\s*NER:\s*(?P<ner>.*?)(?=\n\d+\. Cümle:|\Z)",
        re.DOTALL
    )

    data = {"sentences": []}

    for match in pattern.finditer(text):
        sentence = match.group("sentence").strip()
        keyword = match.group("keyword").strip()
        # Strip first and filter afterwards, so whitespace-only items (as in
        # "A, , B") are dropped instead of surviving as empty strings.
        ner_items = (item.strip() for item in match.group("ner").split(","))
        ner_list = [item for item in ner_items if item and item not in empty_ner_markers]
        data["sentences"].append({
            "sentence": sentence,
            "keyword": keyword,
            "NER": ner_list
        })

    # `finditer` returns an iterator, which is always truthy, so "nothing
    # matched" can only be detected on the collected result.
    if not data["sentences"]:
        raise ValueError("No valid sentences found in the input text.")

    return data

# Example usage: parse the "answer" field of the first extraction output and
# print the result as a quick sanity check of the parser.
input_text = ner_keyword_outputs[0]["answer"]

parsed_data = parse_text_to_dictionary(input_text)
print(parsed_data)

In [None]:
# Enrich every extraction output in place with its parsed structure and with
# the flat sets of entities and keywords that the pairwise matching relies on.
for ner_keyword_output in ner_keyword_outputs:
    input_text = ner_keyword_output["answer"]
    parsed_data = parse_text_to_dictionary(input_text)
    ner_keyword_output["parsed_ner_keywords"] = parsed_data
    # Empty entities are dropped so that "" can never count as a shared entity.
    ner_keyword_output["ner_set"] = {
        ner
        for sentence in parsed_data["sentences"]
        for ner in sentence["NER"]
        if ner
    }
    # The keyword field is a comma-separated string. Each keyword is stripped
    # so " x" and "x" compare equal, and empty pieces are dropped so that two
    # articles lacking keywords do not "share" the empty string.
    ner_keyword_output["keyword_set"] = {
        keyword.strip()
        for sentence in parsed_data["sentences"]
        for keyword in sentence["keyword"].split(",")
        if keyword.strip()
    }

In [None]:
def match_news(news1, news2):
    """
    Computes the entities and keywords shared by two news records.

    Args:
        news1 (dict): Record providing "ner_set" and "keyword_set" sets.
        news2 (dict): Record providing "ner_set" and "keyword_set" sets.

    Returns:
        tuple: (number of shared entities, number of shared keywords,
            set of shared entities, set of shared keywords).
    """
    shared_entities = news1["ner_set"].intersection(news2["ner_set"])
    shared_keywords = news1["keyword_set"].intersection(news2["keyword_set"])
    return (
        len(shared_entities),
        len(shared_keywords),
        shared_entities,
        shared_keywords,
    )

from tqdm import tqdm

common_ner_keywords = []

# Fetch every article's date once up front. The pair loop below would
# otherwise hit the dataset twice per pair, i.e. O(n^2) row reads instead of n.
news_dates = [dataset[output["news_id"]]["date"] for output in ner_keyword_outputs]

# Compare every unordered pair of articles. A pair is kept when it shares
# either (at least one entity and at least one keyword) or at least two
# entities, and the two articles carry different dates.
for i in tqdm(range(len(ner_keyword_outputs))):
    news_1_date = news_dates[i]
    for j in range(i + 1, len(ner_keyword_outputs)):
        common_ner_count, common_keyword_count, common_ners, common_keywords = match_news(ner_keyword_outputs[i], ner_keyword_outputs[j])

        news_2_date = news_dates[j]

        if ((common_ner_count > 0 and common_keyword_count > 0) or (common_ner_count >= 2)) and (news_1_date != news_2_date):
            common_ner_keywords.append({
                "news_1_id": ner_keyword_outputs[i]["news_id"],
                "news_2_id": ner_keyword_outputs[j]["news_id"],
                "common_ners": common_ners,
                "common_keywords": common_keywords
            })

In [None]:
import random

# Shuffle the candidate pairs with a fixed seed so their order is reproducible.
# NOTE: the shuffle is in place, so re-running this cell without first
# rebuilding `common_ner_keywords` shuffles the already-shuffled list again.
random.seed(42)
random.shuffle(common_ner_keywords)

In [None]:
from tqdm import tqdm

# Build one multi-hop question prompt per candidate article pair.
inference_question_templates = []
counter = 0

# Map each news id to the position of its first occurrence in
# `ner_keyword_outputs`, so each pair needs two dict lookups instead of two
# linear scans. `setdefault` keeps the first index when an id repeats.
news_id_to_idx = {}
for idx, ner_keyword_output in enumerate(ner_keyword_outputs):
    news_id_to_idx.setdefault(ner_keyword_output["news_id"], idx)

for common_item_duo in tqdm(common_ner_keywords, total=len(common_ner_keywords)):

    first_news_id = common_item_duo["news_1_id"]
    second_news_id = common_item_duo["news_2_id"]
    # Sets iterate in arbitrary order; sorting makes the prompt text
    # reproducible from run to run.
    common_ners = ", ".join(sorted(common_item_duo["common_ners"]))

    # find the first news id in ner_keyword_outputs
    first_news_idx = news_id_to_idx.get(first_news_id, -1)
    if first_news_idx == -1:
        print(f"News with id {first_news_id} not found in ner_keyword_outputs")
        continue

    # find the second news id in ner_keyword_outputs
    second_news_idx = news_id_to_idx.get(second_news_id, -1)
    if second_news_idx == -1:
        print(f"News with id {second_news_id} not found in ner_keyword_outputs")
        continue

    summary_sentences_news_1 = "\n".join([sentence["sentence"] for sentence in ner_keyword_outputs[first_news_idx]["parsed_ner_keywords"]["sentences"]])
    summary_sentences_news_2 = "\n".join([sentence["sentence"] for sentence in ner_keyword_outputs[second_news_idx]["parsed_ner_keywords"]["sentences"]])

    news = dataset[first_news_id]
    similar_sample = dataset[second_news_id]

    # Cap both article bodies at 500 words. These truncated texts are what the
    # prompt interpolates below, which keeps the prompt length bounded.
    news_text = get_text_with_max_500_words(news['text'])
    similar_sample_text = get_text_with_max_500_words(similar_sample['text'])

    inference_question_template = f"""A multi-hop question is a question that is requiring multiple inferential leaps or accessing several pieces of information from different locations or sources to arrive at an answer. The following are the metadata of 2 news articles, summaries of articles, 5 summarizing sentences for each news and texts of the news. Both news are related to the same topic and have the same entity (or entities): {common_ners}. Your task is to generate one multi-hop inference question based on the news. Here are some instructions:
1. Find the Connection: The connection between news is entity (or entities): {common_ners}, which is how this key piece of information is related or how they can be combined to form a more complex idea.
2. Formulate the Question: Create a question that cannot be answered by relying on just one of the sentences but instead requires understanding and linking the information from all of the sources. The answer must be the entity (or one of the entities): {common_ners}.
3. Ensure Coherence: Make sure the question flows logically from the combined information and is clear and unambiguous.
4. Do not use the answer in the question: You must use other details from both news in the question but do not use the entity (answer) in the question.
5. Generate the answer: Generate the answer for the question, it must be the entity (or one of the entities): {common_ners}.
(the question and the answer should be in Turkish)

News 1:
Title: {news['title']}
Date: {news['date']}
Summary: {news['summary']}
Text: {news_text}
Summary Sentences: {summary_sentences_news_1}

News 2:
Title: {similar_sample['title']}
Date: {similar_sample['date']}
Summary: {similar_sample['summary']}
Text: {similar_sample_text}
Summary Sentences: {summary_sentences_news_2}

Your answer should be in the following format:
Question:
Answer:
"""

    inference_question_templates.append({"prompt": inference_question_template, "news_1_id": first_news_id, "news_2_id": second_news_id})
    counter += 1

In [None]:
import json

# Persist the generated prompts as a single JSON array; each item holds the
# "prompt" text together with its "news_1_id" and "news_2_id". With
# json.dump's default ensure_ascii=True, non-ASCII characters are written as
# \uXXXX escapes, so the file content is plain ASCII.
with open("inference_question_templates.json", "w") as f:
    json.dump(inference_question_templates, f)