In [None]:
from datasets import load_from_disk

# Load the previously saved dataset from a path relative to the working directory.
dataset = load_from_disk("mlsum_dataset/dataset")

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model used by the encode calls in this section; it is placed on the CUDA device.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the model
# repository to run — confirm the source is trusted.
model = SentenceTransformer("jinaai/jina-embeddings-v3", device="cuda", trust_remote_code=True)
# Alternative checkpoints kept for reference; enable at most one instead of the line above.
#model = SentenceTransformer("intfloat/multilingual-e5-large-instruct", device="cuda")
#model = SentenceTransformer("ytu-ce-cosmos/turkish-e5-large", device="cuda")

### Vector Extraction

In [None]:
def batch_encode(texts, model, task, batch_size=32):
    """Encode ``texts`` slice by slice and return one flat list of results.

    Args:
        texts: Sliceable sequence of inputs handed to ``model.encode``.
        model: Object exposing ``encode(batch, task=...)``.
        task: Task name forwarded to the model and shown in the progress bar.
        batch_size: Number of items passed to each ``encode`` call.

    Returns:
        list: The items returned for every batch, in input order.
    """
    collected = []
    starts = range(0, len(texts), batch_size)
    for start in tqdm(starts, desc=f"Encoding {task} embeddings"):
        stop = start + batch_size
        collected.extend(model.encode(texts[start:stop], task=task))
    return collected

# Save and load embeddings to/from disk
def save_embeddings(embeddings, file_path):
    """Serialize ``embeddings`` to ``file_path`` with pickle.

    Args:
        embeddings: Any picklable object (here, the list returned by ``batch_encode``).
        file_path: Destination path; an existing file is overwritten.
    """
    # Imported locally: no cell that runs before the first call imports pickle,
    # so a fresh-kernel "Run All" would otherwise stop here with a NameError.
    import pickle

    with open(file_path, 'wb') as f:
        pickle.dump(embeddings, f)

def load_embeddings(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Args:
        file_path: Path of a file previously written by ``save_embeddings``.

    Returns:
        The object that was pickled into the file.
    """
    # Imported locally: no cell that runs before the first call imports pickle,
    # so a fresh-kernel "Run All" would otherwise stop here with a NameError.
    import pickle

    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this notebook (or another trusted source) produced.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

In [None]:
def get_text_with_max_500_words(text):
    """Return ``text`` limited to its first 500 whitespace-separated words.

    Texts with at most 500 words come back unchanged. Longer texts are rebuilt
    from their first 500 words joined by single spaces, so the original
    whitespace of a truncated text is not preserved.
    """
    tokens = text.split()
    return text if len(tokens) <= 500 else " ".join(tokens[:500])

In [None]:
def prepare_news_string(news):
    """Render one news record as the labelled text block that gets embedded.

    Args:
        news: Mapping with ``title``, ``date``, ``summary`` and ``text`` keys.

    Returns:
        str: Four labelled segments (title, date, summary, text), each ending
        in a newline; the text is capped by ``get_text_with_max_500_words``.
    """
    segments = [
        f"Title: {news['title']}\n",
        f"Date: {news['date']}\n",
        f"Summary: {news['summary']}\n",
        f"Text: {get_text_with_max_500_words(news['text'])}\n",
    ]
    return "".join(segments)

In [None]:
from tqdm import tqdm

# Build the text to embed for every record, in dataset order.
dataset_text_strings = [
    prepare_news_string(dataset[row])
    for row in tqdm(range(len(dataset)))
]

In [None]:
# Embed every prepared news string with the "text-matching" task, 16 texts per encode call.
news_embeddings = batch_encode(dataset_text_strings, model, task="text-matching", batch_size=16)

In [None]:
# Persist the embeddings; the FAISS section later reloads this same relative path.
save_embeddings(news_embeddings, "news_embeddings.pkl")

### Dataset Preparation

In [None]:
from datasets import load_dataset

# Rebinds `dataset` (previously the load_from_disk copy) to MLSUM loaded by name, config "tu".
# NOTE(review): "tu" is presumably the Turkish subset — confirm against the dataset card.
dataset = load_dataset("mlsum", "tu", trust_remote_code=True)
dataset

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset
from tqdm import tqdm
import os

In [None]:
from sentence_transformers import SentenceTransformer

# Repeats the model construction from the first section (presumably so this
# section can be run on its own — confirm before removing the duplicate).
model = SentenceTransformer("jinaai/jina-embeddings-v3", device="cuda", trust_remote_code=True)

In [None]:
# Function to encode embeddings in batches
def batch_encode(texts, model, task, batch_size=32):
    """Return the flattened ``model.encode`` output for ``texts``, batch by batch.

    Args:
        texts: Sliceable sequence of inputs.
        model: Object exposing ``encode(batch, task=...)``.
        task: Task name forwarded to the model and shown in the progress bar.
        batch_size: Number of items passed to each ``encode`` call.

    Returns:
        list: One entry per item yielded by the batch results, in input order.
    """
    batch_starts = tqdm(range(0, len(texts), batch_size), desc=f"Encoding {task} embeddings")
    return [
        vector
        for offset in batch_starts
        for vector in model.encode(texts[offset:offset + batch_size], task=task)
    ]

# Save and load embeddings to/from disk
def save_embeddings(embeddings, file_path):
    """Write ``embeddings`` to ``file_path`` as a pickle file.

    Args:
        embeddings: Any picklable object.
        file_path: Destination path; an existing file is overwritten.
    """
    # pickle is not imported by any cell that runs before this helper is first
    # called, so import it here to keep a fresh-kernel run from failing.
    import pickle

    with open(file_path, 'wb') as f:
        pickle.dump(embeddings, f)

def load_embeddings(file_path):
    """Read back the object pickled at ``file_path``.

    Args:
        file_path: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # pickle is not imported by any cell that runs before this helper is first
    # called, so import it here to keep a fresh-kernel run from failing.
    import pickle

    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

# Find top 5 most similar indices for each entry
def get_top_k_similar(sim_matrix, k=5):
    """Map every row index to the indices of its ``k`` most similar other rows.

    Args:
        sim_matrix: Similarity matrix whose rows support ``argsort`` and
            boolean-mask indexing (e.g. a NumPy array). Row ``i`` is assumed
            to hold the similarities of entry ``i`` to every entry.
        k: Number of neighbours to keep per row.

    Returns:
        dict: ``{row_index: array of up to k neighbour indices}``, ordered
        from most to least similar.
    """
    top_similars = {}
    for i in range(sim_matrix.shape[0]):
        ranked = sim_matrix[i].argsort()[::-1]  # highest similarity first
        # Drop the row's own index explicitly. Skipping rank position 0 is only
        # correct when the row itself ranks first, which is not guaranteed when
        # scores tie (e.g. duplicate entries); the row could then end up in its
        # own neighbour list while a real neighbour is discarded.
        top_similars[i] = ranked[ranked != i][:k]
    return top_similars

In [None]:
# Directory for cached embeddings; created if missing.
# NOTE(review): absolute Google Drive path — looks Colab-specific; adjust when running elsewhere.
embedding_dir = "/content/drive/MyDrive/matrag/mlsum_dataset/embeddings"
os.makedirs(embedding_dir, exist_ok=True)

# Cache file for the summary embeddings computed in the next cell.
summary_embeddings_path = os.path.join(embedding_dir, "summary_embeddings.pkl")

In [None]:
batch_size = 32

train_data = dataset["train"]
train_summaries = train_data["summary"]

# Reuse the cached embeddings when the file exists; otherwise encode the
# training summaries and write the cache for the next run.
if os.path.exists(summary_embeddings_path):
    summary_embeddings = load_embeddings(summary_embeddings_path)
else:
    summary_embeddings = batch_encode(train_summaries, model, task="text-matching", batch_size=batch_size)
    save_embeddings(summary_embeddings, summary_embeddings_path)

In [None]:
# Attach the summary embeddings to the train split as a new "summary_embeddings" column.
# NOTE(review): the split is rebound in place, so this cell is not idempotent —
# a second run presumably fails because the column already exists; confirm.
dataset["train"] = dataset["train"].add_column("summary_embeddings", summary_embeddings)
dataset

In [None]:
# From here on `dataset` is the "train" split only rather than the full set of splits.
# NOTE(review): not re-runnable as written — a second run would look up "train"
# on the train split itself.
dataset = dataset["train"]
dataset

In [None]:
# Give every row its positional id; the recall evaluation reads it back as "index".
indexes = list(range(len(dataset)))
dataset = dataset.add_column("index", indexes)
dataset

### FAISS

In [None]:
!pip install faiss-cpu

In [None]:
import faiss
# Show which FAISS version was imported.
print(faiss.__version__)

In [None]:
import pickle

def load_embeddings(file_path):
    """Unpickle and return the object stored at ``file_path``.

    Only use this on files produced by a trusted source: unpickling can
    execute arbitrary code.
    """
    with open(file_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
# Same relative path that the vector-extraction section writes with save_embeddings.
embeddings_path = "news_embeddings.pkl"

news_embeddings = load_embeddings(embeddings_path)
# Quick check of how many vectors were loaded.
print(len(news_embeddings))

In [None]:
# Attach the news embeddings as the "embeddings" column that FAISS will index.
# NOTE(review): assumes news_embeddings holds one vector per row of the current
# `dataset`, in the same order. The vectors were computed from the
# load_from_disk copy, while `dataset` is now the train split loaded by name —
# confirm both contain the same rows.
dataset = dataset.add_column("embeddings", news_embeddings)
dataset

In [None]:
# Index the "embeddings" column; the recall evaluation queries it via get_nearest_examples.
dataset.add_faiss_index(column="embeddings")

In [None]:
# Inspect which fields a single row exposes after the columns added above.
dataset[0].keys()

In [None]:
import json

def load_jsonl(file_path):
    """Read a JSON Lines file into a list with one parsed value per line.

    Args:
        file_path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: Parsed JSON values in file order; whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly: relying on the platform default encoding can
    # garble or reject non-ASCII text such as Turkish characters.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing empty line) would make json.loads fail.
        return [json.loads(line) for line in f if line.strip()]

# Evaluation questions. The cells below read the "question", "question_type",
# "1st_news_id" and "2nd_news_id" keys from every entry.
file_path = "huggingface_dataset_with_answers.jsonl"

questions_dataset = load_jsonl(file_path)

print(len(questions_dataset)) # Print the number of entries loaded
print(questions_dataset[0].keys()) # Print the keys of the first entry for verification

In [None]:
# Show the text of the first question as a spot check.
questions_dataset[0]["question"]

In [None]:
# Show the two news ids attached to the first question (used as ground truth in the evaluation).
questions_dataset[0]["1st_news_id"], questions_dataset[0]["2nd_news_id"]

In [None]:
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line followed by a ``Query:`` line."""
    prompt = f'Instruct: {task_description}\nQuery: {query}'
    return prompt

# Instruction text intended for get_detailed_instruct.
# NOTE(review): the active encode calls below pass "text-matching" instead; this
# value is only referenced by the commented-out alternatives.
task = 'Given a Turkish search query, retrieve relevant passages written in Turkish that best answer the query'
# Generic English wording kept as an alternative:
#task = 'Given a web search query, retrieve relevant passages that answer the query'

In [None]:
from tqdm import tqdm

def batch_encode(texts, model, task, batch_size=32):
    """Run ``model.encode`` over ``texts`` in batches and flatten the results.

    Args:
        texts: Sliceable sequence of inputs.
        model: Object exposing ``encode(batch, task=...)``.
        task: Task name forwarded to the model and shown in the progress bar.
        batch_size: Number of items passed to each ``encode`` call.

    Returns:
        list: The items of every batch result, in input order.
    """
    vectors = []
    progress = tqdm(range(0, len(texts), batch_size), desc=f"Encoding {task} embeddings")
    for begin in progress:
        chunk = texts[begin:begin + batch_size]
        # Instruction-prompt alternative, kept for reference:
        #   model.encode([get_detailed_instruct(task, b) for b in chunk])
        vectors += list(model.encode(chunk, task=task))
    return vectors

In [None]:
# Pull out just the question text, keeping the order of questions_dataset.
questions = [entry["question"] for entry in questions_dataset]
print(len(questions))

In [None]:
# Embed the questions with the same "text-matching" task used for the news texts.
questions_embeddings = batch_encode(questions, model, "text-matching", 32)
# Alternative for instruction-style models: pass the instruction text as the task.
#questions_embeddings = batch_encode(questions, model, task, 32)

In [None]:
# Keep only the fields the recall evaluation needs from every question entry.
gt_dataset = [
    {
        field: entry[field]
        for field in ("question", "question_type", "1st_news_id", "2nd_news_id")
    }
    for entry in tqdm(questions_dataset)
]

In [None]:
# Pair every ground-truth record with the embedding computed for its question.
for position, record in enumerate(gt_dataset):
    record["embedding"] = questions_embeddings[position]

In [None]:
from tqdm import tqdm
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

def evaluate_recall_at_k(gt_dataset, dataset, top_k_list, max_workers=8):
    """
    Evaluates recall@k for multiple k values, both globally and per question_type.

    A question is a "full" hit when both of its news ids are among the top-k
    retrieved rows and a "partial" hit when at least one of them is, so every
    full hit also counts as a partial hit.

    Args:
        gt_dataset: Sequence of dicts with "question_type", "embedding",
            "1st_news_id" and "2nd_news_id" keys.
        dataset: Object exposing ``get_nearest_examples("embeddings", emb, k=...)``
            whose returned examples contain an "index" column.
        top_k_list: Iterable of k values to evaluate.
        max_workers: Size of the thread pool that issues the lookups.

    Returns:
        Tuple of two dicts:
          - overall_results[top_k] = (full_recall, partial_recall)
          - per_type_results[top_k][question_type] = (full_recall, partial_recall)
        For an empty gt_dataset every overall recall is 0.0 and every per-type
        dict is empty.
    """
    def check_recall(i, top_k):
        item = gt_dataset[i]
        _, samples = dataset.get_nearest_examples("embeddings", item["embedding"], k=top_k)
        # NOTE(review): the membership tests rely on the ground-truth ids having
        # the same type as the values of the "index" column — confirm.
        idxs = samples["index"]
        return item["question_type"], item["1st_news_id"] in idxs, item["2nd_news_id"] in idxs

    overall_results = {}
    per_type_results = {}

    N = len(gt_dataset)
    for top_k in top_k_list:
        # per-type counters: { qtype: { "full":…, "partial":…, "total":… } }
        type_counts = defaultdict(lambda: {"full": 0, "partial": 0, "total": 0})

        # NOTE(review): lookups run concurrently; this presumes
        # dataset.get_nearest_examples may be called from several threads.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(check_recall, i, top_k) for i in range(N)]
            for f in tqdm(as_completed(futures), total=N, desc=f"Recall@{top_k}"):
                qtype, is_first, is_second = f.result()
                tc = type_counts[qtype]
                tc["total"] += 1
                tc["full"] += int(is_first and is_second)
                tc["partial"] += int(is_first or is_second)

        # The global counts are the sums of the per-type counts, so the hit
        # logic lives in one place only.
        global_full = sum(vals["full"] for vals in type_counts.values())
        global_partial = sum(vals["partial"] for vals in type_counts.values())

        # Guard the division: with no questions the recall is reported as 0.0
        # instead of raising ZeroDivisionError.
        if N:
            overall_results[top_k] = (global_full / N, global_partial / N)
        else:
            overall_results[top_k] = (0.0, 0.0)

        # Every stored type has total >= 1, so these divisions are safe.
        per_type_results[top_k] = {
            qtype: (
                vals["full"] / vals["total"],
                vals["partial"] / vals["total"]
            )
            for qtype, vals in type_counts.items()
        }

        # optional: print summaries
        print(f"\n=== Recall@{top_k} (overall) ===")
        print(f" Full  : {overall_results[top_k][0]:.4f}")
        print(f" Partial: {overall_results[top_k][1]:.4f}")
        print(f"--- by question type ---")
        for qtype, (full, partial) in per_type_results[top_k].items():
            print(f"{qtype:20s} | full: {full:.4f}  partial: {partial:.4f}")

    return overall_results, per_type_results

In [None]:
# Cut-offs at which recall is reported.
top_k_list = [2,4,8,10]
# `overall` maps k -> (full, partial); `by_type` maps k -> {question_type: (full, partial)}.
overall, by_type = evaluate_recall_at_k(gt_dataset, dataset, top_k_list)

In [None]:
import json

# Relies on `overall` and `by_type` from the evaluation cell above.
# Label embedded in both output file names.
# NOTE(review): "turkuaz" does not match the jina model loaded earlier — confirm
# the label before comparing result files across models.
model_name = "turkuaz"

# Write the overall and the per-question-type recall tables to separate JSON files.
for file_path, results in (
    (f"evaluation_results_overall_{model_name}.json", overall),
    (f"evaluation_results_by_type_{model_name}.json", by_type),
):
    with open(file_path, "w") as json_file:
        json.dump(results, json_file, indent=4)  # indent for pretty printing
    print(f"Evaluation results saved to {file_path}")