In [None]:
# Prerequisite: an Ollama server must be running, e.g. `/opt/homebrew/opt/ollama/bin/ollama serve`
import ollama

In [None]:
# %pip install -q --isolated openreview-py PyPDF2 chromadb

import openreview

import os

# API V2 client.
# Credentials are read from the environment so that no secret lives in the
# notebook and this cell runs on a fresh kernel (previously `email` was never
# defined anywhere in the notebook), e.g.:
#   export OPENREVIEW_USERNAME="you@example.org"
#   export OPENREVIEW_PASSWORD="..."
# NOTE(review): when the variables are unset, None / "" are passed through and
# the openreview-py client decides how to proceed (it appears to fall back to
# an unauthenticated guest session) — confirm against the client docs.
email = os.environ.get("OPENREVIEW_USERNAME")
client = openreview.api.OpenReviewClient(  # type: ignore
    baseurl="https://api2.openreview.net",
    username=email,
    password=os.environ.get("OPENREVIEW_PASSWORD", ""),
)
# venue_id = "ICLR.cc/2024/Conference"
venue_id = "ICML.cc/2024/Conference"
# Group record of the venue; its `content` carries the venue's naming settings.
venue_group = client.get_group(venue_id)


def get_submissions():
    """Fetch all notes whose ``venueid`` content field equals ``venue_id``.

    Uses the module-level ``client`` and ``venue_id``. Replies are requested
    together with each note (``details="replies"``).

    Returns:
        Whatever ``client.get_all_notes`` returns for that query.
    """
    return client.get_all_notes(
        details="replies",
        content={"venueid": venue_id},
    )


# Venue-specific names read from the venue group's settings; get_reviews
# combines them into the review invitation id of a submission.
review_name, submission_name = (
    venue_group.content[setting]["value"]
    for setting in ("review_name", "submission_name")
)


def get_reviews(s):
    """Collect the content of every review reply attached to submission ``s``.

    A reply counts as a review when its ``invitations`` entry contains the
    review invitation id of this submission, which is built from the
    module-level ``venue_id``, ``submission_name`` and ``review_name`` plus
    ``s.number``.

    Args:
        s: A submission note exposing ``number`` and ``details["replies"]``.

    Returns:
        A list with the ``content`` of each matching reply, in reply order.
    """
    replies = s.details["replies"]
    review_invitation = f"{venue_id}/{submission_name}{s.number}/-/{review_name}"

    review_contents = []
    for reply in replies:
        if review_invitation not in reply["invitations"]:
            continue
        note = openreview.api.Note.from_json(reply)  # type: ignore
        review_contents.append(note.content)
    return review_contents


# Fetch the venue's submissions (with their replies) once; the extraction loop
# in a later cell iterates over this collection.
submissions = get_submissions()

In [None]:
import os
import re

import PyPDF2
import requests


def remove_surrogates(text):
    """Return ``text`` with every UTF-16 surrogate code point removed.

    Code points in the range U+D800..U+DFFF are deleted; all other characters
    are kept unchanged and in order.
    """
    surrogate_pattern = re.compile(r"[\ud800-\udfff]")
    return surrogate_pattern.sub("", text)


def download_pdf(pdf_link, dest_path="temp.pdf", timeout=30):
    """Download ``pdf_link`` and write the response body to ``dest_path``.

    Args:
        pdf_link: URL of the PDF to fetch.
        dest_path: File the body is written to. Defaults to ``"temp.pdf"``,
            the scratch file ``process_paper`` reads back.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would hang the whole batch.

    Returns:
        True when the server answered with status 200 and the file was
        written; False on any other status or on a network-level error.
        Failures are reported with ``print`` and never raised.
    """
    try:
        response = requests.get(pdf_link, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported the same way as a bad
        # status so one unreachable paper does not abort the caller's loop.
        print(f"Failed to download the PDF. Error: {error}")
        return False

    if response.status_code != 200:
        print(f"Failed to download the PDF. Status code: {response.status_code}")
        return False

    with open(dest_path, "wb") as f:
        f.write(response.content)
    return True


def extract_pdf_text(pdf_file, max_pages=1):
    """Extract the text of the leading pages of a PDF.

    Args:
        pdf_file: Anything ``PyPDF2.PdfReader`` accepts; ``process_paper``
            passes an open binary file.
        max_pages: How many pages to read from the start of the document.
            Defaults to 1, the number of pages this function has always read
            (presumably because the author block sits on the first page —
            confirm). ``None`` reads every page.

    Returns:
        The page texts joined with a single space; ``""`` when the document
        has no pages.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pages = pdf_reader.pages
    # Clamp to the real page count so an empty or short PDF cannot raise
    # IndexError.
    page_count = len(pages) if max_pages is None else min(max_pages, len(pages))
    # A page without extractable text contributes "" instead of breaking join().
    return " ".join(pages[index].extract_text() or "" for index in range(page_count))


def find_references_start(parsed_text):
    """Locate the start of the trailing References-like section.

    The headings "References", "Bibliography" and "Acknowledgements" are tried
    in that priority order, case-insensitively. A heading only counts when it
    directly follows a line break, a period (optionally followed by one
    whitespace character), or a "-" / "*" followed by one whitespace character.
    The first heading kind found anywhere in the text wins.

    Returns:
        The index of the heading word itself (just past the separator), or
        -1 when none of the headings occurs.
    """
    heading_patterns = (
        r"(?i)(\n|\r\n|\r|\.\s|-\s|\*\s|\.)(References)",
        r"(?i)(\n|\r\n|\r|\.\s|-\s|\*\s|\.)(Bibliography)",
        r"(?i)(\n|\r\n|\r|\.\s|-\s|\*\s|\.)(Acknowledgements)",
    )
    # Lazy generator: later patterns are only searched if earlier ones miss.
    searches = (re.search(regex, parsed_text) for regex in heading_patterns)
    first_hit = next((hit for hit in searches if hit), None)
    if first_hit is None:
        return -1
    # Group 1 is the separator before the heading; where it ends, the heading begins.
    return first_hit.end(1)


def process_paper(pdf_link, footer_marker="Proceedings of the 41"):
    """Download a paper and return its leading text, trimmed for prompting.

    The PDF is saved to the scratch file ``temp.pdf`` by ``download_pdf``, its
    text is extracted, and the scratch file is removed again. The text is then
    cut at the References-like section (if any), stripped of surrogate code
    points, and cut at ``footer_marker`` (if present).

    Args:
        pdf_link: URL of the PDF.
        footer_marker: Text whose first occurrence ends the returned text.
            The default is the literal this function has always cut at.

    Returns:
        The trimmed text, or None when the download failed.
    """
    if not download_pdf(pdf_link):
        return None

    try:
        with open("temp.pdf", "rb") as pdf_file:
            text = extract_pdf_text(pdf_file)
    finally:
        # Remove the scratch file even when extraction raises, so a broken PDF
        # is not left behind for the next paper.
        if os.path.exists("temp.pdf"):
            os.remove("temp.pdf")

    # Both lookups return -1 for "not found". Used directly as a slice end,
    # -1 would silently drop the last character, so only cut on a real hit.
    references_start = find_references_start(text)
    if references_start != -1:
        text = text[:references_start]

    text = remove_surrogates(text)

    footer_start = text.find(footer_marker)
    if footer_start != -1:
        text = text[:footer_start]
    return text

In [None]:
# Example of the reply shape requested from the model. The dict's printed form
# is interpolated into the extraction prompt, so keys and values must stay
# exactly as they are.
schema = dict(
    author1=["affiliation1", "affiliation2_if_exists", "..."],
    author2=["..."],
)

In [None]:
from tqdm import tqdm
import json

# llm_papers.json is the cache of earlier extraction results, keyed by
# submission id. It is written by a later cell of this notebook, so on the
# very first run it does not exist yet: start from an empty cache instead of
# failing with FileNotFoundError.
try:
    with open("llm_papers.json", encoding="utf-8") as json_file:
        llm_papers = json.load(json_file)
except FileNotFoundError:
    llm_papers = {}

In [None]:
import ast


def _parse_author_dict(raw_reply):
    """Turn the model's reply into a dict, or raise ValueError.

    Only the span from the first "{" to the last "}" is parsed, so markdown
    code fences and chatter around the dictionary are ignored. JSON is tried
    first; a Python-literal dict (single quotes, as in the prompt's example)
    is the fallback.

    Args:
        raw_reply: The text returned by the model.

    Returns:
        The parsed dict.

    Raises:
        ValueError: If no dict can be parsed from the reply.
    """
    start, end = raw_reply.find("{"), raw_reply.rfind("}")
    candidate = raw_reply[start : end + 1] if 0 <= start < end else raw_reply
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # SECURITY: this used to be eval(), which executes whatever code the
        # model (or text injected through the PDF) returns. literal_eval only
        # accepts Python literals and cannot run code.
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as error:
            raise ValueError(f"could not parse model reply: {error}") from error
    if not isinstance(parsed, dict):
        raise ValueError(f"model reply is a {type(parsed).__name__}, not a dict")
    return parsed


# Ask the model for the authors/affiliations of every submission that is not
# in the cache yet. Failures are printed and skipped so one bad paper does not
# stop the batch.
for s in tqdm(submissions):
    if s.id in llm_papers:
        continue
    pdf_link = f"https://openreview.net/{s.content['pdf']['value']}"
    t = process_paper(pdf_link)
    if t is None:
        # process_paper returns None when the download failed; there is no
        # text to send to the model.
        continue
    prompt = f"""Extract authors and affiliations from the paper below. Return a valid dictionary only in the following format:
    {schema}
    Do not add any numbers from the reference, only the affiliation name. 
    Paper: {t}
    """
    response = ollama.generate(model="gemma2", prompt=prompt)["response"]
    try:
        response = _parse_author_dict(response)
    except ValueError as error:
        print(f"{s.id}: {error}")
        continue
    llm_papers[s.id] = response

In [None]:
# Peek at the most recent `response` left behind by the extraction loop
# (the parsed dict when parsing succeeded, otherwise the raw model text).
response

In [None]:
# Persist the cache so the next run can skip papers that are already done.
# Serialise BEFORE opening the file: opening with "w" truncates it, so a value
# that is not JSON-serialisable would otherwise destroy the existing cache
# instead of just raising.
serialized_papers = json.dumps(llm_papers)
with open("llm_papers.json", "w") as json_file:
    json_file.write(serialized_papers)

In [None]:
# Number of submissions currently held in the llm_papers cache.
len(llm_papers)

In [None]:
import collections
from typing import List, Dict


def calculate_statistics(data: Dict[int, Dict]) -> Dict:
    """Aggregate affiliation statistics over a mapping of author records.

    Args:
        data: Maps an author id to a record with an ``"institutions"`` entry,
            a sized iterable of institution mappings. Each institution may
            carry a ``"name"`` and/or a ``"country"`` key; missing keys are
            simply not counted.

    Returns:
        A dict with, in this order:
        ``total_authors`` (number of records),
        ``authors_per_institution`` (name -> number of distinct authors),
        ``authors_per_country`` (country -> number of distinct authors),
        ``multi_affiliation_authors`` (authors with more than one institution),
        ``institution_count`` (Counter: name -> number of mentions),
        ``country_count`` (Counter: country -> number of mentions).
    """
    authors_by_institution = collections.defaultdict(set)
    authors_by_country = collections.defaultdict(set)
    institution_mentions = collections.Counter()
    country_mentions = collections.Counter()
    multi_affiliated = 0

    for author_id, record in data.items():
        affiliations = record["institutions"]
        if len(affiliations) > 1:
            multi_affiliated += 1

        for affiliation in affiliations:
            if "name" in affiliation:
                name = affiliation["name"]
                authors_by_institution[name].add(author_id)
                institution_mentions[name] += 1
            if "country" in affiliation:
                country = affiliation["country"]
                authors_by_country[country].add(author_id)
                country_mentions[country] += 1

    # The sets only serve to de-duplicate authors; report their sizes.
    return {
        "total_authors": len(data),
        "authors_per_institution": {
            name: len(author_ids) for name, author_ids in authors_by_institution.items()
        },
        "authors_per_country": {
            country: len(author_ids) for country, author_ids in authors_by_country.items()
        },
        "multi_affiliation_authors": multi_affiliated,
        "institution_count": institution_mentions,
        "country_count": country_mentions,
    }


def print_statistics(stats: Dict):
    """Print a plain-text summary of a ``calculate_statistics`` result.

    Shows the author totals, the 15 institutions and the 15 countries with the
    most authors (highest first; ties keep the mapping's own order), and the
    number of distinct institutions and countries.
    """
    print(f"Total number of authors: {stats['total_authors']}")
    print(
        f"Number of authors with multiple affiliations: {stats['multi_affiliation_authors']}"
    )

    ranked_sections = (
        ("\nTop 15 institutions by author count:", "authors_per_institution"),
        ("\nTop 15 countries by author count:", "authors_per_country"),
    )
    for heading, stats_key in ranked_sections:
        print(heading)
        ranked = sorted(
            stats[stats_key].items(), key=lambda pair: pair[1], reverse=True
        )
        for label, author_total in ranked[:15]:
            print(f"  {label}: {author_total}")

    unique_institutions = len(stats["institution_count"])
    print("\nTotal number of unique institutions:", unique_institutions)
    unique_countries = len(stats["country_count"])
    print("Total number of unique countries:", unique_countries)


def authors_from_papers(papers):
    """Re-key per-paper extraction results by author for calculate_statistics.

    The extraction prompt asks for ``{author: [affiliation, ...]}`` per paper,
    whereas ``calculate_statistics`` reads ``record["institutions"]`` as a list
    of ``{"name": ...}`` mappings. Passing the per-paper dict straight through
    raises KeyError, so the shape is converted here.

    Args:
        papers: Maps a paper id to the dict extracted for that paper.

    Returns:
        ``{author: {"institutions": [{"name": affiliation}, ...]}}`` with each
        affiliation listed once per author. Papers whose value is not a dict
        are skipped. Records that already contain an ``"institutions"`` entry
        are passed through unchanged under their original key.

    NOTE(review): authors are merged by name across papers, so different
    people who share a name are counted as one author. Countries are not part
    of the extracted data, so country statistics stay empty.
    """
    authors = {}
    for paper_id, paper_authors in papers.items():
        if not isinstance(paper_authors, dict):
            continue  # malformed model output
        if "institutions" in paper_authors:
            # Already in the per-author shape calculate_statistics expects.
            authors[paper_id] = paper_authors
            continue
        for author, affiliations in paper_authors.items():
            if isinstance(affiliations, str):
                affiliations = [affiliations]
            elif not isinstance(affiliations, (list, tuple)):
                affiliations = []
            known = authors.setdefault(author, {"institutions": []})["institutions"]
            for affiliation in affiliations:
                if not isinstance(affiliation, str) or not affiliation.strip():
                    continue
                entry = {"name": affiliation.strip()}
                if entry not in known:
                    known.append(entry)
    return authors


if __name__ == "__main__":
    statistics = calculate_statistics(authors_from_papers(llm_papers))
    print_statistics(statistics)