In [None]:
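# Prerequisite: start the local Ollama server before running this notebook,
# e.g. (Homebrew install): /opt/homebrew/opt/ollama/bin/ollama serve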
%pip install -q --isolated openreview-py PyPDF2 chromadb
import ollama
import openreview
import os
from tqdm import tqdm
import re
import PyPDF2
import requests

# API V2
# Credentials are read from the environment so they are never saved in the
# notebook; an unset variable falls back to an empty string.
email = os.environ.get("OPENREVIEW_USERNAME", "")
client = openreview.api.OpenReviewClient(  # type: ignore
    baseurl="https://api2.openreview.net",
    username=email,
    password=os.environ.get("OPENREVIEW_PASSWORD", ""),
)
# The venue group's content holds the venue-specific names of the review and
# submission invitations.
venue_id = "ICML.cc/2024/Conference"
venue_group = client.get_group(venue_id)
review_name = venue_group.content["review_name"]["value"]
submission_name = venue_group.content["submission_name"]["value"]


def get_submissions():
    """Return all notes whose content ``venueid`` equals ``venue_id``.

    The query is made through the module-level ``client`` and asks for the
    ``replies`` detail on each note.
    """
    return client.get_all_notes(
        content={"venueid": venue_id},
        details="replies",
    )


# Fetched once at cell execution; later cells read titles, abstracts and PDF
# paths from this list.
submissions = get_submissions()


def remove_surrogates(text):
    """Return ``text`` with every code point in U+D800..U+DFFF deleted."""
    surrogate_range = re.compile(r"[\ud800-\udfff]")
    return surrogate_range.sub("", text)


def download_pdf(pdf_link, timeout=60):
    """Download ``pdf_link`` and save the response body as ``temp.pdf``.

    Args:
        pdf_link: URL of the PDF to fetch.
        timeout: Seconds handed to ``requests.get`` so that a stalled server
            cannot block the whole run indefinitely.

    Returns:
        True when the server answered with status 200 and ``temp.pdf`` was
        written; False for any other status code or when the request itself
        failed (connection error, timeout, ...).
    """
    try:
        response = requests.get(pdf_link, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures through the same False return as a bad
        # status code so callers only have one failure path to handle.
        print(f"Failed to download the PDF. Error: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to download the PDF. Status code: {response.status_code}")
        return False

    with open("temp.pdf", "wb") as f:
        f.write(response.content)
    return True


def extract_pdf_text(pdf_file):
    """Return the text of every page of ``pdf_file`` joined by single spaces.

    ``pdf_file`` is handed straight to ``PyPDF2.PdfReader``; pages are read in
    document order.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    page_texts = [
        reader.pages[index].extract_text() for index in range(len(reader.pages))
    ]
    return " ".join(page_texts)


def find_references_start(parsed_text):
    """Locate the heading that ends the paper's main body.

    The headings are tried in a fixed priority order (References, then
    Bibliography, then Acknowledgements), case-insensitively, and each must be
    preceded by a line break, a period, or a ``- `` / ``* `` bullet. The first
    heading *pattern* that matches anywhere wins, even if a lower-priority
    heading appears earlier in the text.

    Returns:
        Index of the first character of the heading word, or -1 when none of
        the headings is found.
    """
    heading_patterns = (
        r"(?i)(\n|\r\n|\r|\.\s|-\s|\*\s|\.)(References)",
        r"(?i)(\n|\r\n|\r|\.\s|-\s|\*\s|\.)(Bibliography)",
        r"(?i)(\n|\r\n|\r|\.\s|-\s|\*\s|\.)(Acknowledgements)",
    )
    searches = (re.search(pattern, parsed_text) for pattern in heading_patterns)
    first_hit = next((hit for hit in searches if hit is not None), None)
    if first_hit is None:
        return -1
    # Group 1 is the separator in front of the heading, so its end offset is
    # where the heading word itself starts.
    return first_hit.end(1)


def process_paper(pdf_link):
    """Download a paper and return its main-body text as a single line.

    The PDF is fetched into ``temp.pdf``, converted to text, cut off at the
    references/bibliography/acknowledgements heading when one is found,
    stripped of surrogate code points and of all ``\\n`` characters.

    Args:
        pdf_link: URL of the PDF to process.

    Returns:
        The cleaned text, or None when the download failed.
    """
    if not download_pdf(pdf_link):
        return None

    try:
        with open("temp.pdf", "rb") as pdf_file:
            text = extract_pdf_text(pdf_file)
    finally:
        # Always clean up the scratch file, even when text extraction raises,
        # so a broken PDF cannot leave a stale temp.pdf behind.
        os.remove("temp.pdf")

    cut = find_references_start(text)
    # -1 means "no heading found"; slicing with it would silently drop the
    # last character of the paper, so only truncate on a real match.
    if cut != -1:
        text = text[:cut]
    text = remove_surrogates(text)
    return text.replace("\n", "")


def append_id_floats_to_file(id_value, float_list, filename="f.txt"):
    """Append one ``<id>: <v1> <v2> ...`` record to ``filename``.

    When the file already exists a newline is written before the record, so
    consecutive records end up separated by a blank line. Each value is
    formatted with its default string conversion and preceded by one space.
    """
    leading_break = "\n" if os.path.isfile(filename) else ""
    values = "".join(f" {value}" for value in float_list)
    with open(filename, "a") as out:
        out.write(f"{leading_break}{id_value}:{values}\n")


def read_id_floats_from_file(filename):
    """Parse a file of ``<id>: <float> <float> ...`` records into a dict.

    Blank lines and lines that do not contain exactly one ``:`` are ignored.
    A line whose values cannot be converted to floats is reported and skipped
    on its own, so one corrupt record does not discard the records after it.
    If the same id occurs more than once, the last record wins.

    Args:
        filename: Path of the file to read.

    Returns:
        Dict mapping each id string to its list of floats; empty when the
        file does not exist (a message is printed in that case).
    """
    id_float_dict = {}
    try:
        with open(filename, "r") as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                id_value = parts[0].strip()
                try:
                    float_list = [float(num) for num in parts[1].split()]
                except ValueError as e:
                    print(f"Skipping line {line_number} of {filename}: {e}")
                    continue
                id_float_dict[id_value] = float_list
    except FileNotFoundError:
        print(f"File {filename} not found.")
    except ValueError as e:
        # Still reachable for decoding errors raised while iterating the file
        # (UnicodeDecodeError is a ValueError subclass).
        print(f"Error parsing file: {e}")
    return id_float_dict


# Embeddings already stored by earlier runs; used below to make this cell
# resumable instead of appending duplicate records to f2.txt.
result = read_id_floats_from_file("f2.txt")


for s in tqdm(submissions):
    # Bind the id before anything can fail so the error report below always
    # names the submission that is actually being processed.
    paper_id = s.id
    if paper_id in result:
        continue
    try:
        pdf_link = f"https://openreview.net/{s.content['pdf']['value']}"
        processed_text = process_paper(pdf_link)
        if not processed_text:
            # process_paper returns None when the download failed; an empty
            # string means no text could be extracted. Neither can be embedded.
            print(f"No text available for {paper_id}; skipping.")
            continue
        emb = ollama.embeddings(
            model="qwen2",
            prompt=processed_text,
        )
        embedding = emb["embedding"]
        if not embedding:
            # An empty vector would produce a ragged matrix when the stored
            # embeddings are later stacked with np.array.
            print(f"Empty embedding returned for {paper_id}; skipping.")
            continue
        append_id_floats_to_file(paper_id, embedding, "f2.txt")
    except Exception as e:
        # Best effort: report the failure and move on to the next paper.
        print(e)
        print(paper_id)
        continue

In [None]:
import ollama
import json
from typing import List, Dict
import random
import re
import hdbscan
import numpy as np


def read_id_floats_from_file(filename):
    """Parse a file of ``<id>: <float> <float> ...`` records into a dict.

    Blank lines and lines that do not contain exactly one ``:`` are ignored.
    A line whose values cannot be converted to floats is reported and skipped
    on its own, so one corrupt record does not discard the records after it.
    If the same id occurs more than once, the last record wins.

    Args:
        filename: Path of the file to read.

    Returns:
        Dict mapping each id string to its list of floats; empty when the
        file does not exist (a message is printed in that case).
    """
    id_float_dict = {}
    try:
        with open(filename, "r") as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    continue
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                id_value = parts[0].strip()
                try:
                    float_list = [float(num) for num in parts[1].split()]
                except ValueError as e:
                    print(f"Skipping line {line_number} of {filename}: {e}")
                    continue
                id_float_dict[id_value] = float_list
    except FileNotFoundError:
        print(f"File {filename} not found.")
    except ValueError as e:
        # Still reachable for decoding errors raised while iterating the file
        # (UnicodeDecodeError is a ValueError subclass).
        print(f"Error parsing file: {e}")
    return id_float_dict


# Load the stored embeddings and build id -> title / abstract lookups from
# the `submissions` list fetched in the first cell.
result = read_id_floats_from_file("f2.txt")

ids = list(result.keys())
id2title = {note.id: note.content["title"]["value"] for note in submissions}
id2abstract = {note.id: note.content["abstract"]["value"] for note in submissions}
labels = [id2title[key] for key in ids]
embedding = np.array(list(result.values()))

# Cluster the embedding matrix with HDBSCAN; rows are in the same order as `ids`.
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=15)
cluster_labels = clusterer.fit_predict(embedding)

# Group one record per paper under the cluster label it was assigned.
clusters = {}
for i, label in enumerate(cluster_labels):
    record = {
        "id": ids[i],
        "abstract": id2abstract[ids[i]],
        "title": id2title[ids[i]],
    }
    clusters.setdefault(label, []).append(record)


def get_cluster_label(
    cluster_papers: List[Dict],
    initial_label: Dict = None,
    model: str = "qwen2",
    sample_size: int = 500,
) -> Dict:
    """Ask the LLM for a descriptive label for one cluster of papers.

    Args:
        cluster_papers: Paper records; each must provide ``title`` and
            ``abstract``.
        initial_label: A previously generated label to refine. When omitted
            a new label is created from scratch.
        model: Name of the Ollama model to query.
        sample_size: Upper bound on how many papers are randomly sampled.
            Only the first five sampled papers are placed in the prompt.

    Returns:
        A dict containing at least the keys ``cluster_name``,
        ``explanation``, ``distinguishing_factor`` and ``sub_themes``. Keys
        the model left out are filled with ``"N/A"``. If the request fails or
        the reply contains no usable JSON object, a dict whose
        ``cluster_name`` is ``"Error"`` and whose ``explanation`` holds the
        error message is returned instead of raising.
    """
    sampled_papers = random.sample(
        cluster_papers, min(len(cluster_papers), sample_size)
    )
    # Abstracts are cut to 200 characters to keep the prompt short.
    papers_text = "\n".join(
        [
            f"Title: {p['title']}\nAbstract: {p['abstract'][:200]}..."
            for p in sampled_papers[:5]
        ]
    )

    schema = {
        "cluster_name": "Unique, specific technical name (4-7 words, avoid generic terms like 'Advanced' or 'Innovative')",
        "explanation": "Detailed explanation of the cluster theme, specific methodologies, and key applications (2-3 sentences, include quantitative information if possible)",
        "distinguishing_factor": "What makes this cluster unique compared to other clusters? Mention 1-2 most similar clusters and how this cluster differs (1-2 sentences)",
        "sub_themes": "List 3 distinct sub-themes within this cluster, each with a brief (5-10 words) explanation",
    }

    action = "refining" if initial_label else "creating"
    initial_context = "the initial label and " if initial_label else ""
    system_prompt = (
        f"You are {action} a cluster label for ICML 2024 papers. Given {initial_context}a list of papers, "
        f"keywords, and a summary of all clusters, provide a unique and specific label for this cluster. "
        f"Focus on the most distinctive aspects and avoid generic terms. Include precise methodologies "
        f"and applications in your explanation. Clearly differentiate this cluster from others, especially "
        f"the most similar ones. Output in JSON format matching this schema: {json.dumps(schema, indent=2)}"
    )

    initial_label_text = (
        f"Initial Label:\n{json.dumps(initial_label, indent=2)}\n"
        if initial_label
        else ""
    )

    user_prompt = (
        f"{'Refine' if initial_label else 'Create'} a label for this cluster:\n\n"
        f"{initial_label_text}\n"
        f"Papers from this cluster:\n{papers_text}\n\n"
        f"Ensure your cluster name is unique and highly specific. The explanation should include "
        f"precise methodologies and applications."
        f"Clearly state how this cluster differs from the 1-2 most similar clusters. "
        f"Sub-themes should be distinct aspects within the cluster, not restatements of the main theme. "
        f"Return the cluster information in the exact JSON format specified."
    )

    try:
        response = ollama.generate(
            model=model,
            system=system_prompt,
            prompt=user_prompt,
            options={"num_ctx": 131072},
        )
        raw_reply = response["response"]
        try:
            label = json.loads(raw_reply)
        except json.JSONDecodeError:
            # The reply may carry text around the JSON; retry on the
            # outermost {...} span. Failures here are raised inside the outer
            # try so they reach the error fallback below instead of escaping.
            match = re.search(r"\{.*\}", raw_reply, re.DOTALL)
            if match is None:
                raise ValueError("model response contained no JSON object")
            label = json.loads(match.group(0))
        if not isinstance(label, dict):
            raise ValueError("model response was not a JSON object")
        # Callers index these keys directly, so guarantee they exist.
        for key in schema:
            label.setdefault(key, "N/A")
        return label
    except Exception as e:
        return {
            "cluster_name": "Error",
            "explanation": str(e),
            "distinguishing_factor": "N/A",
            "sub_themes": [],
        }


# First pass: one fresh label per cluster.
print("Performing initial cluster labeling...")
initial_labels = {
    cluster_key: get_cluster_label(members)
    for cluster_key, members in clusters.items()
}

# Second pass: feed each first-pass label back to the model for refinement.
print("\nRefining cluster labels...")
refined_labels = {
    cluster_key: get_cluster_label(members, initial_labels[cluster_key])
    for cluster_key, members in clusters.items()
}

print("\nFinal Cluster Labels:")
for label, info in refined_labels.items():
    report_lines = [
        f"\nCluster {label}:",
        f"Name: {info['cluster_name']}",
        f"Explanation: {info['explanation']}",
        f"Distinguishing Factor: {info['distinguishing_factor']}",
        f"Sub-themes: {info['sub_themes']}",
    ]
    print("\n".join(report_lines))

In [None]:
from typing import List, Dict
import hdbscan
import numpy as np
import ollama


def get_cluster_label(
    cluster_papers: List[Dict],
    initial_label: Dict = None,
    parent_label: Dict = None,
    model: str = "qwen2",
    sample_size: int = 500,
) -> Dict:
    """Ask the LLM for a label for a cluster or, with ``parent_label``, a sub-cluster.

    Args:
        cluster_papers: Paper records; each must provide ``title`` and
            ``abstract``.
        initial_label: A previously generated label to refine. When omitted
            a new label is created from scratch.
        parent_label: Label of the enclosing cluster. When given, the prompts
            speak of a sub-cluster and include the parent label.
        model: Name of the Ollama model to query.
        sample_size: Upper bound on how many papers are randomly sampled.
            Only the first five sampled papers are placed in the prompt.

    Returns:
        A dict containing at least the keys ``cluster_name``,
        ``explanation`` and ``distinguishing_factor``. Keys the model left
        out are filled with ``"N/A"``. If the request fails or the reply
        contains no usable JSON object, a dict whose ``cluster_name`` is
        ``"Error"`` and whose ``explanation`` holds the error message is
        returned instead of raising.

    Note:
        ``json``, ``random`` and ``re`` are not imported in this cell; they
        are expected to be in the kernel namespace from an earlier cell.
    """
    sampled_papers = random.sample(
        cluster_papers, min(len(cluster_papers), sample_size)
    )
    # Abstracts are cut to 200 characters to keep the prompt short.
    papers_text = "\n".join(
        [
            f"Title: {p['title']}\nAbstract: {p['abstract'][:200]}..."
            for p in sampled_papers[:5]
        ]
    )

    schema = {
        "cluster_name": "Unique, specific technical name (2-5 words, avoid generic terms like 'Advanced' or 'Innovative')",
        "explanation": "Detailed explanation of the cluster theme, specific methodologies, and key applications (1-2 sentences)",
        "distinguishing_factor": "What makes this cluster unique compared to other clusters? Mention 1-2 most similar clusters and how this cluster differs (1-2 sentences)",
    }

    action = "refining" if initial_label else "creating"
    initial_context = "the initial label and " if initial_label else ""
    parent_context = "the parent cluster label and " if parent_label else ""

    system_prompt = (
        f"You are {action} a {'sub-' if parent_label else ''}cluster label for ICML 2024 papers. "
        f"Given {initial_context}{parent_context}a list of papers, provide a unique and specific label for this cluster. "
        f"The name should make sense and is not a mashup of several categories."
        f"Focus on the most distinctive aspects and avoid generic terms. Include precise methodologies "
        f"and applications in your explanation. Clearly differentiate this cluster from others, especially "
        f"the most similar ones. Output in JSON format matching this schema: {json.dumps(schema, indent=2)}"
    )

    initial_label_text = (
        f"Initial Label:\n{json.dumps(initial_label, indent=2)}\n"
        if initial_label
        else ""
    )

    parent_label_text = (
        f"Parent Cluster Label:\n{json.dumps(parent_label, indent=2)}\n"
        if parent_label
        else ""
    )

    user_prompt = (
        f"{'Refine' if initial_label else 'Create'} a label for this {'sub-' if parent_label else ''}cluster:\n\n"
        f"{initial_label_text}\n"
        f"{parent_label_text}\n"
        f"Papers from this cluster:\n{papers_text}\n\n"
        f"Ensure your cluster name is specific. The explanation should include "
        f"precise methodologies and applications. Do not mash up names in the cluster label. "
        f"Clearly state how this cluster differs from the 1-2 most similar clusters"
        f"{' and how it relates to the parent cluster' if parent_label else ''}. "
        f"Return the cluster information in the exact JSON format specified."
    )

    try:
        response = ollama.generate(
            model=model,
            system=system_prompt,
            prompt=user_prompt,
            options={"num_ctx": 131072},
        )
        raw_reply = response["response"]
        try:
            label = json.loads(raw_reply)
        except json.JSONDecodeError:
            # The reply may carry text around the JSON; retry on the
            # outermost {...} span. Failures here are raised inside the outer
            # try so they reach the error fallback below instead of escaping.
            match = re.search(r"\{.*\}", raw_reply, re.DOTALL)
            if match is None:
                raise ValueError("model response contained no JSON object")
            label = json.loads(match.group(0))
        if not isinstance(label, dict):
            raise ValueError("model response was not a JSON object")
        # Callers index these keys directly, so guarantee they exist.
        for key in schema:
            label.setdefault(key, "N/A")
        return label
    except Exception as e:
        return {
            "cluster_name": "Error",
            "explanation": str(e),
            "distinguishing_factor": "N/A",
            "sub_themes": [],
        }


def create_and_label_subclusters(cluster_papers, parent_label, embedding):
    """Split one cluster into sub-clusters with HDBSCAN and label each of them.

    Args:
        cluster_papers: Paper records of the cluster; each must have an ``id``.
        parent_label: Label dict of the enclosing cluster; its
            ``cluster_name`` is printed and the whole dict is passed on to
            ``get_cluster_label``.
        embedding: Lookup indexed by paper id that yields the paper's vector.

    Returns:
        A pair ``(sub_clusters, labels)``: the first maps each sub-cluster
        label to its list of paper records, the second maps the same labels
        to the refined label dict produced for that sub-cluster.
    """
    print(f"Creating sub-clusters for cluster: {parent_label['cluster_name']}")

    # Stack the vectors of this cluster's papers, keeping cluster_papers order.
    vectors = np.array([embedding[paper["id"]] for paper in cluster_papers])

    assignments = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=1).fit_predict(
        vectors
    )

    sub_clusters = {}
    for paper, sub_id in zip(cluster_papers, assignments):
        sub_clusters.setdefault(sub_id, []).append(paper)

    # Two model passes per sub-cluster: create a label, then refine it.
    labels_by_sub_cluster = {}
    for sub_id, members in sub_clusters.items():
        print(f"  Labeling sub-cluster {sub_id} ({len(members)} papers)")
        draft = get_cluster_label(members, parent_label=parent_label)
        labels_by_sub_cluster[sub_id] = get_cluster_label(
            members, initial_label=draft, parent_label=parent_label
        )

    return sub_clusters, labels_by_sub_cluster


# Load the stored embeddings and build id -> title / abstract lookups from
# the `submissions` list fetched in the first cell.
result = read_id_floats_from_file("f2.txt")
ids = list(result.keys())
id2title = {note.id: note.content["title"]["value"] for note in submissions}
id2abstract = {note.id: note.content["abstract"]["value"] for note in submissions}
labels = [id2title[key] for key in ids]
embedding = np.array(list(result.values()))

# Cluster the embedding matrix with HDBSCAN; rows are in the same order as `ids`.
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=15)
cluster_labels = clusterer.fit_predict(embedding)

# Group one record per paper under the cluster label it was assigned.
clusters = {}
for i, label in enumerate(cluster_labels):
    record = {
        "id": ids[i],
        "abstract": id2abstract[ids[i]],
        "title": id2title[ids[i]],
    }
    clusters.setdefault(label, []).append(record)

# First pass: one fresh label per top-level cluster.
print("Performing initial cluster labeling...")
initial_labels = {
    cluster_key: get_cluster_label(members)
    for cluster_key, members in clusters.items()
}

# Second pass: feed each first-pass label back to the model for refinement.
print("\nRefining cluster labels...")
refined_labels = {
    cluster_key: get_cluster_label(members, initial_labels[cluster_key])
    for cluster_key, members in clusters.items()
}

print("\nFinal Cluster Labels:")
for label, info in refined_labels.items():
    report_lines = [
        f"\nCluster {label}:",
        f"Name: {info['cluster_name']}",
        f"Explanation: {info['explanation']}",
        f"Distinguishing Factor: {info['distinguishing_factor']}",
    ]
    print("\n".join(report_lines))

print("\nCreating and labeling sub-clusters...")
# Both dicts are keyed by top-level cluster label, then by sub-cluster label.
all_subclusters = {}
all_subcluster_labels = {}

for label, papers in clusters.items():
    # Clusters of 15 papers or fewer are not split further.
    if len(papers) <= 15:
        continue

    # `result` (id -> vector) supplies the embeddings for this cluster's papers.
    sub_clusters, sub_cluster_labels = create_and_label_subclusters(
        papers, refined_labels[label], result
    )
    all_subclusters[label] = sub_clusters
    all_subcluster_labels[label] = sub_cluster_labels

    print(f"\nSub-clusters for Cluster {label} with {len(papers)} papers:")
    for sub_label, sub_info in sub_cluster_labels.items():
        summary_lines = [
            f"  Sub-cluster {sub_label}:",
            f"  Name: {sub_info['cluster_name']}",
            f"  Explanation: {sub_info['explanation']}",
            f"  Distinguishing Factor: {sub_info['distinguishing_factor']}",
        ]
        print("\n".join(summary_lines))

In [None]:
# Re-key with built-in ints before serialising (presumably the keys are numpy
# integers coming from the clustering output -- TODO confirm).
refined_labels = {
    int(cluster_key): label_info for cluster_key, label_info in refined_labels.items()
}
with open("refined_labels.json", "w") as out_file:
    out_file.write(json.dumps(refined_labels, indent=2))

In [None]:
# Lookup from submission id to the `pdf` value stored in the note content.
id2pdf = {note.id: note.content["pdf"]["value"] for note in submissions}

In [None]:
# Nested export structure: cluster name -> sub-cluster name -> list of paper
# records, each extended with its `pdf_link`.
res = {}
for cluster_id, cluster in all_subcluster_labels.items():
    cluster_name = refined_labels[cluster_id]["cluster_name"]
    # Names are model-generated and not guaranteed unique (every failed
    # labelling call is named "Error"), so merge into an existing entry
    # instead of overwriting it and losing the papers stored there.
    cluster_entry = res.setdefault(cluster_name, {})
    for sub_cluster_id, sub_cluster in cluster.items():
        papers = all_subclusters[cluster_id][sub_cluster_id]
        papers = [{**p, "pdf_link": id2pdf[p["id"]]} for p in papers]
        cluster_entry.setdefault(sub_cluster["cluster_name"], []).extend(papers)

In [None]:
# Write the nested cluster/sub-cluster/paper structure as indented JSON.
with open("embed_data.json", "w") as out_file:
    out_file.write(json.dumps(res, indent=2))

In [None]:
# Re-key both nesting levels with built-in ints before serialising
# (presumably the keys are numpy integers from clustering -- TODO confirm).
all_subcluster_labels = {
    int(cluster_key): {
        int(sub_key): label_info for sub_key, label_info in sub_labels.items()
    }
    for cluster_key, sub_labels in all_subcluster_labels.items()
}
with open("all_subcluster_labels.json", "w") as out_file:
    out_file.write(json.dumps(all_subcluster_labels, indent=2))