In [None]:
import re
import os
from pathlib import Path

import pandas as pd
import numpy as np
# import textwrap
from tqdm import tqdm
import torch

from data.datafinder import DataFinder
from src.recs_metrics.item_item import recall_at_n, tndcg_at_n

# from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator, TransformerMixin

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModel

# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

SEED = 42

In [2]:
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that normalizes free text for bag-of-words models.

    The transformer has nothing to learn: ``fit`` returns ``self`` unchanged and
    ``transform`` applies :meth:`clean` to every document independently.
    """

    def __init__(self):
        # NLTK English stop-word list, stored as a set for O(1) membership tests.
        self.stopwords = set(stopwords.words("english"))
        self.lemmatizer = WordNetLemmatizer()

    def clean(self, text):
        """Return a lower-cased, de-noised, lemmatized version of ``text``.

        Steps, in order: lower-case; unwrap markdown bold/links/brackets; replace
        punctuation (except hyphens and underscores) with spaces; drop tokens made
        only of digits and hyphens; collapse whitespace; drop stop words and tokens
        of two characters or fewer; lemmatize what is left.

        Parameters
        ----------
        text : str
            Raw document text.

        Returns
        -------
        str
            The surviving tokens joined by single spaces.
        """
        text = text.lower()

        # Remove markdown artifacts
        text = re.sub(r"\*\*(.*?)\*\*", r"\1", text)  # **bold**
        text = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", text)  # markdown links [links](url)
        text = re.sub(r"\[(.*?)\]", r"\1", text)  # bare brackets

        # Remove punctuation but preserve hyphenated words and IDs
        # This preserves tokens like "sysu-mm01-c" or "image-net1k"
        text = re.sub(r"[^\w\- ]", " ", text)

        # Remove standalone numeric tokens (whole tokens made of digits and hyphens,
        # e.g. "2017" or "2010-2015"). The match is delimited by whitespace rather
        # than by \b: a hyphen counts as a word boundary, so \b\d+\b would also strip
        # the digits out of hyphenated IDs ("coco-2017" -> "coco-").
        text = re.sub(r"(?<!\S)[\d\-]*\d[\d\-]*(?!\S)", " ", text)

        # Normalize whitespace
        text = re.sub(r"\s+", " ", text).strip()

        # Tokenize, filter, and lemmatize. The stop-word and length filters are
        # applied to the raw token, before lemmatization.
        tokens = [
            self.lemmatizer.lemmatize(token)
            for token in text.split()
            if token not in self.stopwords and len(token) > 2
        ]

        return " ".join(tokens)

    def fit(self, X, y=None):
        """No-op kept for scikit-learn pipeline compatibility; returns ``self``."""
        return self

    def transform(self, X):
        """Clean every document in the iterable ``X`` and return the results as a list."""
        return [self.clean(doc) for doc in X]

# # Example

# df = DataFinder()
# corpus = df.get()["corpus"]
# cleaner = TextPreprocessor()

# samples = corpus.sample(2, random_state=SEED) # sample rows

# # Display original vs cleaned text
# for _, row in samples.iterrows():
#     raw_text = " ".join(filter(None, [
#         f"{row['title']}" if pd.notna(row['title']) else None,
#         f"{row['description']}" if pd.notna(row['description']) else None,
#         f"{'; '.join(row['tasks'])}" if isinstance(row['tasks'], list) else None,
#         f"{row['modalities']}" if pd.notna(row['modalities']) else None
#     ]))
    
#     cleaned_text = cleaner.clean(raw_text)

#     print("Dataset ID:", row["id"])
#     print("🔹 Raw text:\n", textwrap.fill(raw_text, width=100))
#     print("🔸 Cleaned:\n", textwrap.fill(cleaned_text, width=100))

In [None]:
# Data preparation
df = DataFinder()
data = df.get()
corpus = data["corpus"]


def build_document_text(row):
    """Concatenate the available metadata fields of one corpus row into a single string.

    Each present field is rendered as a labelled sentence ("Title: ...", "Description: ...",
    "Tasks: ...", "Modalities: ..."); missing fields are skipped. ``tasks`` is only used
    when it is a list, in which case its items are joined with "; ".
    """
    parts = []
    if pd.notna(row["title"]):
        parts.append(f"Title: {row['title']}.")
    if pd.notna(row["description"]):
        parts.append(f"Description: {row['description']}.")
    if isinstance(row["tasks"], list):
        parts.append(f"Tasks: {'; '.join(row['tasks'])}.")
    if pd.notna(row["modalities"]):
        parts.append(f"Modalities: {row['modalities']}.")
    return " ".join(parts)


# Raw concatenated text first, then the cleaned version overwrites the same column.
corpus["text"] = corpus.apply(build_document_text, axis=1)

cleaner = TextPreprocessor()
corpus["text"] = cleaner.transform(corpus["text"])

# Fail fast on duplicate IDs: a duplicate would overwrite its earlier position in
# id_to_idx, leave a hole in idx_to_id, and only surface as a KeyError during
# evaluation, after all the expensive similarity matrices have been computed.
if not corpus["id"].is_unique:
    duplicated_ids = corpus.loc[corpus["id"].duplicated(), "id"].unique().tolist()
    raise ValueError(f"corpus['id'] contains duplicate IDs: {duplicated_ids[:10]}")

# Dataset ID mappings (row position in `corpus` <-> dataset ID). Row positions are
# what the similarity matrices built from corpus["text"] are indexed by.
id_to_idx = {id_: i for i, id_ in enumerate(corpus["id"])}
idx_to_id = {i: id_ for id_, i in id_to_idx.items()}

ground_truth_links = df.get_links_from_queries()

# # Split ground truth into validation and test sets
# all_items = list(ground_truth_links.items())
# val_items, test_items = train_test_split(all_items, test_size=0.2, random_state=SEED)
# val_links = dict(val_items); test_links = dict(test_items)

In [None]:
# LDA
# Bag-of-words document-term matrix. Lower-casing is switched off because
# TextPreprocessor has already lower-cased the text.
vectorizer = CountVectorizer(lowercase=False, max_df=0.95, min_df=5)
X = vectorizer.fit_transform(corpus["text"])

# NOTE(review): the priors and learning decay look like the output of a
# hyper-parameter search — TODO record where these values came from.
lda_params = {
    "n_components": 95,
    "doc_topic_prior": 0.7038693060984964,
    "topic_word_prior": 0.13563618275622608,
    "learning_decay": 0.9000732432887424,
    "random_state": SEED,
}
lda = LatentDirichletAllocation(**lda_params)

# Documents are compared through their topic mixtures.
doc_topic_matrix = lda.fit_transform(X)
similarity_matrix_lda = cosine_similarity(doc_topic_matrix)

In [None]:
# TF-IDF
# Same document-frequency pruning thresholds as the count vectorizer used for LDA.
tfidf_options = {"max_df": 0.95, "min_df": 5}
vectorizer = TfidfVectorizer(**tfidf_options)
tfidf_matrix = vectorizer.fit_transform(corpus["text"])

# Pairwise document similarity computed directly on the TF-IDF vectors.
similarity_matrix_tfidf = cosine_similarity(tfidf_matrix)

In [None]:
# NMF
# Factorize the TF-IDF matrix into 95 components and compare documents
# through their component weights.
nmf_params = {
    "n_components": 95,
    "random_state": SEED,
    "init": "nndsvda",
}
nmf = NMF(**nmf_params)
doc_topic_matrix_nmf = nmf.fit_transform(tfidf_matrix)
similarity_matrix_nmf = cosine_similarity(doc_topic_matrix_nmf)

In [None]:
# BM25
import json

datafinder_path = Path("development/datagems_dataset_recs/datafinder")
datafinder_path.mkdir(parents=True, exist_ok=True)

# Export the cleaned corpus as JSON lines ({"id": ..., "contents": ...}), the
# input consumed by the indexing command below.
with open(datafinder_path / "bm25_input.jsonl", "w", encoding="utf-8") as f:
    for doc_id, contents in zip(corpus["id"], corpus["text"]):
        f.write(json.dumps({"id": doc_id, "contents": contents}) + "\n")

# # For creating the BM25 index
# python -m pyserini.index.lucene \
#   --collection JsonCollection \
#   --input development/datagems_dataset_recs/datafinder \
#   --index development/datagems_dataset_recs/datafinder/bm25_index \
#   --generator DefaultLuceneDocumentGenerator \
#   --threads 2 \
#   --storePositions --storeDocvectors --storeRaw

# Machine-specific JDK location. It is applied only when it actually exists, so a
# JAVA_HOME that is already configured on another machine is not overwritten with a
# path that is missing there. JAVA_HOME must be set before pyserini is imported.
local_jdk_home = Path("/Library/Java/JavaVirtualMachines/temurin-24.jdk/Contents/Home")
if local_jdk_home.is_dir():
    os.environ["JAVA_HOME"] = str(local_jdk_home)
from pyserini.search.lucene import LuceneSearcher

# The index is built outside the notebook (command above); fail with an actionable
# message instead of a Lucene stack trace when that step has not been run yet.
bm25_index_path = datafinder_path / "bm25_index"
if not bm25_index_path.is_dir():
    raise FileNotFoundError(
        f"BM25 index not found at '{bm25_index_path}'. Build it with the "
        "pyserini.index.lucene command documented in this cell, then re-run."
    )

searcher = LuceneSearcher(str(bm25_index_path))

In [None]:
# Run the transformer encoders on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# SciBERT
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)
model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT)
model.eval()  # inference mode
model.to(device)

# Encode one document at a time; each embedding is the attention-masked mean
# of the last hidden layer.
embeddings_scibert = []
with torch.no_grad():
    for doc_text in tqdm(corpus["text"].tolist(), desc="Encoding with SciBERT"):
        encoded = tokenizer(
            doc_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        hidden_states = model(**encoded).last_hidden_state
        attention = encoded["attention_mask"]
        # Zero out positions the attention mask excludes, then average over the rest.
        summed = (hidden_states * attention.unsqueeze(-1)).sum(dim=1)
        token_counts = attention.sum(dim=1, keepdim=True)
        pooled = summed / token_counts
        embeddings_scibert.append(pooled.squeeze().cpu().numpy())

embeddings_scibert = np.asarray(embeddings_scibert)
similarity_matrix_scibert = cosine_similarity(embeddings_scibert)

In [None]:
# SPECTER
SPECTER_CHECKPOINT = "allenai/specter2_base"
tokenizer = AutoTokenizer.from_pretrained(SPECTER_CHECKPOINT)
model = AutoModel.from_pretrained(SPECTER_CHECKPOINT)
model.eval()
model.to(device)

# Same pooling as the SciBERT cell: attention-masked mean of the last hidden
# layer, one document per forward pass.
embeddings_specter = []
with torch.no_grad():
    for doc_text in tqdm(corpus["text"].tolist(), desc="Encoding with SPECTER"):
        encoded = tokenizer(
            doc_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        hidden_states = model(**encoded).last_hidden_state
        attention = encoded["attention_mask"]
        # Sum the unmasked token vectors and divide by how many there are.
        summed = (hidden_states * attention.unsqueeze(-1)).sum(dim=1)
        token_counts = attention.sum(dim=1, keepdim=True)
        pooled = summed / token_counts
        embeddings_specter.append(pooled.squeeze().cpu().numpy())

embeddings_specter = np.asarray(embeddings_specter)
similarity_matrix_specter = cosine_similarity(embeddings_specter)

In [None]:
# Evaluation: Recall@N and NDCG@N of every recommender at several cut-offs.
CUTOFFS = [10, 20, 50]
# Hits requested from BM25 per query. Kept above max(CUTOFFS) because the query
# document itself is removed from its own hit list.
BM25_HITS_PER_QUERY = 60


def rank_by_similarity(similarity_matrix, top_k):
    """Rank every document's neighbours by descending similarity.

    Parameters
    ----------
    similarity_matrix : array of shape (n_docs, n_docs)
        Pairwise similarities; row/column ``i`` corresponds to ``idx_to_id[i]``.
    top_k : int
        Maximum number of neighbours kept per document.

    Returns
    -------
    dict
        Maps each dataset ID to the list of its ``top_k`` most similar dataset IDs,
        the document itself excluded.
    """
    rankings = {}
    for i in range(similarity_matrix.shape[0]):
        order = similarity_matrix[i].argsort()[::-1]
        order = order[order != i][:top_k]  # drop the document itself, then truncate
        rankings[idx_to_id[i]] = [idx_to_id[j] for j in order]
    return rankings


def rank_by_bm25(top_k):
    """Query the BM25 index with every document's text and keep up to ``top_k`` other hits.

    Returns a dict mapping each dataset ID to the list of retrieved document IDs,
    in the order returned by the searcher.
    """
    rankings = {}
    for dataset_id, query_text in zip(corpus["id"], corpus["text"]):
        hits = searcher.search(query_text, k=BM25_HITS_PER_QUERY)
        # NOTE(review): hit.docid presumably comes back as a string; if corpus["id"]
        # is not a string column this self-filter never matches — confirm.
        rankings[dataset_id] = [hit.docid for hit in hits if hit.docid != dataset_id][:top_k]
    return rankings


# Rankings do not depend on the cut-off, so they are computed once at the largest
# cut-off and sliced per N below (a prefix of the top-50 list is the top-N list).
max_cutoff = max(CUTOFFS)
rankings_by_model = {
    "TF-IDF": rank_by_similarity(similarity_matrix_tfidf, max_cutoff),
    "LDA": rank_by_similarity(similarity_matrix_lda, max_cutoff),
    "NMF": rank_by_similarity(similarity_matrix_nmf, max_cutoff),
    "BM25": rank_by_bm25(max_cutoff),
    "SciBERT (pre-trained)": rank_by_similarity(similarity_matrix_scibert, max_cutoff),
    "SPECTER (pre-trained)": rank_by_similarity(similarity_matrix_specter, max_cutoff),
}

results_all = {model_name: {} for model_name in rankings_by_model}

for n in CUTOFFS:
    # `model_name` (not `model`) so the loaded transformer in `model` is not overwritten.
    for model_name, full_rankings in rankings_by_model.items():
        predictions = {query_id: ranked[:n] for query_id, ranked in full_rankings.items()}
        results_all[model_name][f"Recall@{n}"] = recall_at_n(predictions, ground_truth_links, n=n)
        results_all[model_name][f"NDCG@{n}"] = tndcg_at_n(predictions, ground_truth_links, n=n)

# One row per model, one column per metric/cut-off.
results_df = pd.DataFrame.from_dict(results_all, orient="index")
results_df.to_csv(datafinder_path / "results.csv", index_label="Model")