In [73]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
from tqdm import tqdm
import pandas as pd
from difflib import SequenceMatcher
import time
from joblib import Parallel, delayed

<h1 align="center">PART 1</h1>

In [74]:
def scrape_names():
    program_url = "https://ic2s2-2023.org/program" # Program URL
    chairs_url = "https://ic2s2-2023.org/organization" # Chairs URL
    keynotes_url = "https://ic2s2-2023.org/keynotes" # Keynotes URL

    # Scrape program page
    program_response = requests.get(program_url)
    program_soup = BeautifulSoup(program_response.text, 'html.parser')

    # Extract and split names by comma and remove extra spaces
    program_names = []
    for tag in program_soup.find_all('i'):
        names = tag.get_text(strip=True).split(',')  # Split by comma
        clean_names = [name.strip('" ').strip() for name in names]  # Remove extra spaces/quotes
        program_names.extend(clean_names)  # Add to the final list

    # Scrape chairs page
    chairs_response = requests.get(chairs_url)
    chairs_soup = BeautifulSoup(chairs_response.text, 'html.parser')
    chairs_names = [a.get_text(strip=True) for a in chairs_soup.select("h3 a")] # names are stored in <a> within <h3> tags

    # Scape keynotes page
    keynotes_response = requests.get(keynotes_url)
    keynotes_soup = BeautifulSoup(keynotes_response.text, 'html.parser')
    keynotes_names = [a.get_text(strip=True) for a in keynotes_soup.select("h3 a")] # names are stored in <a> within <h3> tags again

    # Returning names in dict 
    return {
        "program_names": program_names,
        "chairs_names": chairs_names,
        "keynotes_names": keynotes_names
    }

In [75]:
names = scrape_names() # Getting all names

Elements_to_remove = ["SAGE", "DIREC", "EPJ", "Esteban Moro (EPJkeynote)"] # Found some errors in keynotes_names - we just remove them manually
for element in Elements_to_remove:
    names["keynotes_names"].remove(element)

names["keynotes_names"].append("Esteban Moro") # Add back Esteban Moro without the (EPJkeynote) part

all_names = names["program_names"] + names["chairs_names"] + names["keynotes_names"] # combining all names into a list

all_names = list(set(all_names)) # This removes 578 duplicates

# remove all names that contain the string Chair
all_names = [name for name in all_names if "Chair" not in name]


In [76]:
# Final list with names
len(all_names)

1498

In [77]:

# < ---- > 
# Method to find highly similar names
# Should work with defualt python libs

from difflib import SequenceMatcher

def are_almost_similar(s1, s2, threshold=0.8):
    return SequenceMatcher(None, s1, s2).ratio() >= threshold

def find_similar_names(all_names, threshold=0.8):
    similar_pairs = []
    for i in range(len(all_names)):
        for j in range(i + 1, len(all_names)):
            if are_almost_similar(all_names[i], all_names[j], threshold):
                similar_pairs.append((all_names[i], all_names[j]))
    return similar_pairs

# < ---- >

# Find and print similar names
similar_names = find_similar_names(all_names, threshold=0.8)
print(f"similar names: {similar_names}")

# In most cases the similar names are just the same name, but including the middle name or initial.
# Lets keep the longest entry and remove the shorter one
# Identify names to remove
names_to_remove = {name2 if len(name1) > len(name2) else name1 for name1, name2 in similar_names}

# Remove names
all_names = [name for name in all_names if name not in names_to_remove]


similar names: [('Diogo Pachecho', 'Diogo Pacheco'), ('Stefan M. Herzog', 'Stefan Herzog'), ('David Rothschild', 'David M Rothschild'), ('Francesco Pierri', 'Francesco Barbieri'), ('Francesco Pierri', 'Francesco Silvestri'), ('Luca Verginer', 'luca verginer'), ('Shintaro Sakai', 'Shintaro Ueki'), ('Ziwen Chen', 'Zexun Chen'), ('Woo-sung Jung', 'Woo-Sung Jung'), ('Luis E C Rocha', 'Luis M Rocha'), ('David Rand', 'David G. Rand'), ('Katinka den Nijs', 'Katinka Den Nijs'), ('Anne C. Kroon', 'Anne Kroon'), ('Anne C. Kroon', 'Anne C Kroon'), ('Zoe K. Rahwan', 'Zoe Rahwan'), ('Kathyrn R Fair', 'Kathyrn Fair'), ('Yuan Zhang', 'Yiyan Zhang'), ('Rupert Tibor Kiddle', 'Rupert Kiddle'), ('Amy Smith', 'Abby Smith'), ('Pantelis Analytis', 'Pantelis P. Analytis'), ('Pantelis Analytis', 'Pantelis P Analytis'), ('Federico Barrera-Lemarchand', 'Federico Barrera Lemarchand'), ('Ana María Jaramillo', 'Ana Maria Jaramillo'), ('Qi Wang', 'Bin Wang'), ('Qi Wang', 'Hui Wang'), ('Fabio Carella', 'Fabio Carrel

In [78]:
# Final list of names
print(len(all_names))

1444


## Scarping Names Explained

### 1. Inspecting URLs
First, I inspected all the URLs to determine the structure in which the names were stored.

### 2. Collecting and Printing Names
Then, I collected all the names and printed them from each URL to verify that the extraction worked correctly.

### 3. Handling Errors
During this process, I found some errors in `keynotes_names`, but only **four names** were incorrect, so I removed them manually.

### 4. Removing Duplicates
Next, I used Python’s `set()` function to eliminate duplicates.

### 5. Finding Similar Names
I used a method to identify **slightly similar** names. After finding some matches, I decided to remove the shortest name of similair pairs. It was almost just a matter of includeing middlename / initials.


<h1 align="center">PART 2</h1>

<h1 align="center">PART 3</h1>

In [None]:
# import requests
# import urllib
# from bs4 import BeautifulSoup
# from tqdm import tqdm
# from difflib import SequenceMatcher
# import time

# # -------------------------
# #       Converting names to IDs if the names exists in OpenAlex - otherwise skipping
# # -------------------------

# def convert_author_name_to_id(name):
#     query = urllib.parse.quote(name)
#     url = f"https://api.openalex.org/authors?search={query}&filter=works_count:>5,works_count:<5000"
    
#     try:
#         response = requests.get(url)
#         if response.status_code == 429:
#             print("Rate limit exceeded, sleeping for 1 seconds")
#             time.sleep(1)
            
#         response.raise_for_status()
#     except requests.exceptions.HTTPError as e:
#         print(f"Error searching for author {name}: {e}")
#         return None

#     data = response.json()
#     if "results" not in data or len(data["results"]) == 0:
#         return None

#     # Take the first result
#     first_result = data["results"][0]
#     return first_result["id"] 

# # -------------------------
# #       Collecting all works done by the authors. We use pagination to get all articles if there is more than 200
# # -------------------------

# def get_works_by_author_id(author_id):
#     base_url = "https://api.openalex.org/works"
#     filter_str = f"author.id:{author_id},cited_by_count:>10,authors_count:<10"
#     next_cursor = "*"
#     page_size = 200
#     all_works = []
    
#     while True:
#         url = (f"{base_url}?filter={filter_str}"
#                f"&cursor={next_cursor}"
#                f"&per-page={page_size}"
#                f"&sort=publication_date:desc")
#         response = requests.get(url)
#         if response.status_code == 429:
#             print("Rate limit exceeded, sleeping for 1 seconds")
#             time.sleep(1)
#             continue
#         response.raise_for_status()

#         data = response.json()

#         # Collect all works from this page
#         works_this_page = data["results"]

#         all_works.extend(works_this_page)

#         # Pagination
#         next_cursor = data["meta"].get("next_cursor")
#         if not next_cursor:
#             break

#     return all_works

# # -------------------------
# #       Main Logic
# # -------------------------
# def main(all_names):
#     authors = all_names
#     unique_works = {}
#     num_authors_not_found = 0

#     for name in tqdm(authors):
        
#         # Time for convert_author_name_to_id
#         start_time = time.perf_counter()
#         author_id = convert_author_name_to_id(name)
#         convert_time = time.perf_counter() - start_time

#         if author_id is None:
#             num_authors_not_found += 1
#             print(f"[{name}] ❌ Author not found (Time: {convert_time:.4f}s)")
#             continue

#         # Time to get works for each author
#         start_time = time.perf_counter()
#         works = get_works_by_author_id(author_id)
#         works_time = time.perf_counter() - start_time

#         if works:
#             for w in works:
#                 work_id = w.get("id")
#                 if work_id not in unique_works:
#                     unique_works[work_id] = {
#                         "id": work_id,
#                         "publication_year": w.get("publication_year"),
#                         "cited_by_count": w.get("cited_by_count"),
#                         "author_ids": [
#                             auth["author"]["id"]
#                             for auth in w.get("authorships", [])
#                             if "author" in auth and "id" in auth["author"]
#                         ],
#                         "title": w.get("title"),
#                         "abstract_inverted_index": w.get("abstract_inverted_index"),
#                     }

#         # Print time meassurements
#         print(f"[{name}] ✅ Found author in {convert_time:.4f}s, retrieved works in {works_time:.4f}s")

#     print(f"\nTotal authors not found: {num_authors_not_found}")
#     print(f"Total unique articles found: {len(unique_works)}")


# result = main(all_names)


  0%|          | 1/1444 [00:00<08:00,  3.00it/s]

[Julian Polenz] ❌ Author not found (Time: 0.3329s)


  0%|          | 2/1444 [00:05<1:20:29,  3.35s/it]

[Allison Koenecke] ✅ Found author in 3.3577s, retrieved works in 2.1014s


  0%|          | 3/1444 [00:07<1:00:46,  2.53s/it]

[Jisu Kim] ✅ Found author in 0.5337s, retrieved works in 1.0212s


  0%|          | 4/1444 [00:09<54:27,  2.27s/it]  

[Nikolitsa Grigoropoulou] ✅ Found author in 0.3415s, retrieved works in 1.5242s


  0%|          | 5/1444 [00:10<48:09,  2.01s/it]

[Vanessa Cheung] ✅ Found author in 0.6142s, retrieved works in 0.9294s


  0%|          | 6/1444 [00:16<1:16:34,  3.19s/it]

[Kunihiro Miyazaki] ✅ Found author in 0.4435s, retrieved works in 5.0537s


  0%|          | 7/1444 [00:17<1:04:30,  2.69s/it]

[Sharon Kang] ❌ Author not found (Time: 1.6587s)


  1%|          | 8/1444 [00:21<1:13:26,  3.07s/it]

[Vrushabh Vilas Wadnere] ❌ Author not found (Time: 3.8709s)


  1%|          | 9/1444 [00:23<1:01:51,  2.59s/it]

[Kongmeng Liew] ✅ Found author in 0.6327s, retrieved works in 0.8910s
Rate limit exceeded, sleeping for 1 seconds


  1%|          | 10/1444 [00:24<52:26,  2.19s/it] 

Error searching for author Margaret Roberts: 429 Client Error: TOO MANY REQUESTS for url: https://api.openalex.org/authors?search=Margaret%20Roberts&filter=works_count:%3E5,works_count:%3C5000
[Margaret Roberts] ❌ Author not found (Time: 1.3150s)


  1%|          | 11/1444 [00:28<1:05:07,  2.73s/it]

[Frederik Georg Hjorth] ✅ Found author in 0.5060s, retrieved works in 3.4246s


  1%|          | 12/1444 [00:32<1:15:34,  3.17s/it]

[Diogo Pachecho] ❌ Author not found (Time: 4.1707s)


  1%|          | 13/1444 [00:37<1:29:52,  3.77s/it]

[Ralph Schroeder] ✅ Found author in 2.8935s, retrieved works in 2.2563s


  1%|          | 14/1444 [00:39<1:15:45,  3.18s/it]

[Leslie DeChurch] ✅ Found author in 0.4700s, retrieved works in 1.3438s


  1%|          | 14/1444 [00:43<1:14:46,  3.14s/it]


KeyboardInterrupt: 

In [81]:
def convert_author_name_to_id(name):
    """
    Return the first matching author ID from OpenAlex if they have 5-5000 works.
    Returns None if no match or an error occurs.
    """
    query = urllib.parse.quote(name)
    url = f"https://api.openalex.org/authors?search={query}&filter=works_count:>5,works_count:<5000"

    # Simple rate-limiting sleep (~10 requests/sec across all workers)
    time.sleep(0.5)

    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"[convert_author_name_to_id] Error searching for author '{name}': {e}")
        return None

    data = response.json()
    if "results" not in data or len(data["results"]) == 0:
        return None

    # Take the first match
    first_result = data["results"][0]
    return first_result["id"]  # e.g. "https://openalex.org/A123456789"

def get_all_author_ids(author_names, n_jobs=2):
    """
    Convert each author name to an OpenAlex author ID in parallel.
    Returns a dict {author_name: author_id or None}.
    """
    # Use Joblib to parallelize:
    results = Parallel(n_jobs=n_jobs)(
        delayed(convert_author_name_to_id)(name) for name in tqdm(author_names, desc="Converting Names to IDs")
    )
    # Combine names and results into a dictionary
    return dict(zip(author_names, results))

def get_works_for_author_batch(author_ids):
    """
    Retrieve works for up to 25 author IDs in ONE call, filtering by:
      - cited_by_count:>10
      - authors_count:<10
    Returns a list of all works for these authors combined.
    """
    base_url = "https://api.openalex.org/works"
    
    # Build OR filter for all author IDs in this batch
    # Example: author.id:A1|author.id:A2|...|author.id:A25
    author_filter = "|".join([f"author.id:{a}" for a in author_ids])
    filter_str = f"{author_filter},cited_by_count:>10,authors_count:<10"

    all_works = []
    cursor = "*"

    while True:
        url = (f"{base_url}?filter={filter_str}"
               f"&cursor={cursor}"
               f"&per-page=200"
               f"&sort=publication_date:desc")
        
        # Simple rate-limiting
        time.sleep(0.1)
        
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"[get_works_for_author_batch] Error fetching works for batch: {e}")
            return all_works  # Return what we have so far

        data = response.json()
        all_works.extend(data.get("results", []))

        next_cursor = data["meta"].get("next_cursor")
        if not next_cursor:
            break
        cursor = next_cursor

    return all_works


def get_works_in_parallel(author_ids, n_jobs=1, batch_size=25):
    """
    Break the author_ids list into chunks of `batch_size`,
    fetch works for each chunk in parallel, and combine into one list.
    """
    # Split IDs into chunks of up to 25
    chunks = [author_ids[i : i + batch_size] for i in range(0, len(author_ids), batch_size)]
    
    results = Parallel(n_jobs=n_jobs)(
        delayed(get_works_for_author_batch)(chunk) for chunk in tqdm(chunks, desc="Fetching Works in Batches")
    )
    
    # Flatten the list of lists
    all_works = []
    for works_list in results:
        all_works.extend(works_list)
    
    return all_works

def main(all_names):
    # STEP A: Convert names -> IDs in parallel
    name_to_id_map = get_all_author_ids(all_names, n_jobs=4)
    valid_author_ids = [aid for aid in name_to_id_map.values() if aid is not None]
    print(f"\nFound valid IDs for {len(valid_author_ids)} out of {len(all_names)} authors.")

    # STEP B: Fetch works in bulk (up to 25 authors per request)
    all_works = get_works_in_parallel(valid_author_ids, n_jobs=4, batch_size=25)
    print(f"\nTotal works fetched (with duplicates): {len(all_works)}")

    # STEP C: Deduplicate
    unique_works = {}
    for w in all_works:
        work_id = w.get("id")
        if work_id not in unique_works:
            unique_works[work_id] = {
                "id": work_id,
                "publication_year": w.get("publication_year"),
                "cited_by_count": w.get("cited_by_count"),
                "author_ids": [
                    auth["author"]["id"]
                    for auth in w.get("authorships", [])
                    if "author" in auth and "id" in auth["author"]
                ],
                "title": w.get("title"),
                "abstract_inverted_index": w.get("abstract_inverted_index"),
            }

    print(f"Unique works after filtering duplicates: {len(unique_works)}")
    return unique_works


final_works = main(all_names)





[A
[A
[A
[A

[convert_author_name_to_id] Error searching for author 'Kongmeng Liew': 429 Client Error: TOO MANY REQUESTS for url: https://api.openalex.org/authors?search=Kongmeng%20Liew&filter=works_count:%3E5,works_count:%3C5000
[convert_author_name_to_id] Error searching for author 'Margaret Roberts': 429 Client Error: TOO MANY REQUESTS for url: https://api.openalex.org/authors?search=Margaret%20Roberts&filter=works_count:%3E5,works_count:%3C5000
[convert_author_name_to_id] Error searching for author 'Frederik Georg Hjorth': 429 Client Error: TOO MANY REQUESTS for url: https://api.openalex.org/authors?search=Frederik%20Georg%20Hjorth&filter=works_count:%3E5,works_count:%3C5000



[A

[convert_author_name_to_id] Error searching for author 'Gennaro Cordasco': 429 Client Error: TOO MANY REQUESTS for url: https://api.openalex.org/authors?search=Gennaro%20Cordasco&filter=works_count:%3E5,works_count:%3C5000



[A

[convert_author_name_to_id] Error searching for author 'Brooke Foucault Welles': 429 Client Error: TOO MANY REQUESTS for url: https://api.openalex.org/authors?search=Brooke%20Foucault%20Welles&filter=works_count:%3E5,works_count:%3C5000



[A
[A
[A
[A

KeyboardInterrupt: 

In [None]:
import requests
import urllib
import time
import math

from tqdm import tqdm
from joblib import Parallel, delayed

DEBUG_MODE = True  # Set to False if you don't want detailed debug prints

def debug_print(msg):
    """Helper to conditionally print debug messages."""
    if DEBUG_MODE:
        print(msg)

def convert_author_name_to_id(name):
    """
    Return the first matching author ID from OpenAlex if they have 5-5000 works.
    Returns None if no match or an error occurs.
    """
    query = urllib.parse.quote(name)
    url = f"https://api.openalex.org/authors?search={query}&filter=works_count:>5,works_count:<5000"

    # Simple rate-limiting sleep (~10 requests/sec across all workers)
    # If you still get 429 errors, increase this sleep or reduce n_jobs.
    time.sleep(0.1)

    debug_print(f"[convert_author_name_to_id] GET {url}")

    try:
        response = requests.get(url)
        if response.status_code == 429:
            debug_print("[convert_author_name_to_id] 429 Too Many Requests. Consider backoff or slower requests.")
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"[convert_author_name_to_id] Error searching for author '{name}': {e}")
        return None

    data = response.json()
    if "results" not in data or len(data["results"]) == 0:
        return None

    # Take the first match
    first_result = data["results"][0]
    return first_result["id"]

def get_all_author_ids(author_names, n_jobs=4):
    """
    Convert each author name to an OpenAlex author ID in parallel.
    Returns a dict {author_name: author_id or None}.
    """
    results = Parallel(n_jobs=n_jobs)(
        delayed(convert_author_name_to_id)(name)
        for name in tqdm(author_names, desc="Converting Names to IDs")
    )
    return dict(zip(author_names, results))

def get_works_for_author_batch(author_ids, max_retries=5):
    """
    Retrieve works for up to 25 author IDs in ONE call, filtering by:
      - cited_by_count:>10
      - authors_count:<10
    Returns a list of all works for these authors combined.
    
    Includes simple exponential backoff if a 429 error occurs.
    """
    base_url = "https://api.openalex.org/works"
    author_filter = "|".join([f"author.id:{a}" for a in author_ids])
    filter_str = f"{author_filter},cited_by_count:>10,authors_count:<10"

    all_works = []
    cursor = "*"
    
    attempts = 0

    while True:
        url = (
            f"{base_url}?filter={filter_str}"
            f"&cursor={cursor}"
            f"&per-page=200"
            f"&sort=publication_date:desc"
        )

        debug_print(f"[get_works_for_author_batch] GET {url}")

        # Simple rate-limiting
        time.sleep(0.1)

        response = None
        for retry in range(max_retries):
            try:
                response = requests.get(url)
                if response.status_code == 429:
                    # Exponential backoff
                    wait_time = 2 ** retry  # 1,2,4,8,...
                    print(f"[get_works_for_author_batch] 429 Too Many Requests. Backing off for {wait_time}s...")
                    time.sleep(wait_time)
                    continue  # Retry the request
                # If not 429, then raise if error (400-599 except 429 handled above)
                response.raise_for_status()
                break  # If success, break the retry loop
            except requests.exceptions.RequestException as e:
                print(f"[get_works_for_author_batch] Error: {e} (retry {retry + 1}/{max_retries})")
                # Try again up to max_retries
                time.sleep(2 ** retry)
        else:
            # If we exit the for-loop without breaking, all retries failed
            debug_print("[get_works_for_author_batch] Max retries hit. Returning what we have so far.")
            return all_works

        if response is None:
            # No response at all, return what we have
            return all_works

        data = response.json()
        page_results = data.get("results", [])
        all_works.extend(page_results)

        debug_print(f"  - Retrieved {len(page_results)} works this page; total so far: {len(all_works)}")

        next_cursor = data["meta"].get("next_cursor")
        if not next_cursor:
            break  # No more pages
        cursor = next_cursor

    return all_works

def get_works_in_parallel(author_ids, n_jobs=4, batch_size=25):
    """
    Break the author_ids list into chunks of `batch_size`,
    fetch works for each chunk in parallel, and combine into one list.
    """
    chunks = [author_ids[i : i + batch_size] for i in range(0, len(author_ids), batch_size)]
    
    results = Parallel(n_jobs=n_jobs)(
        delayed(get_works_for_author_batch)(chunk) 
        for chunk in tqdm(chunks, desc="Fetching Works in Batches")
    )
    
    # Flatten
    all_works = []
    for works_list in results:
        all_works.extend(works_list)
    return all_works

def main(all_names):
    # STEP A: Convert names -> IDs in parallel
    name_to_id_map = get_all_author_ids(all_names, n_jobs=4)
    valid_author_ids = [aid for aid in name_to_id_map.values() if aid is not None]
    print(f"\nFound valid IDs for {len(valid_author_ids)} out of {len(all_names)} authors.")

    if not valid_author_ids:
        print("No valid author IDs found. Exiting.")
        return

    # STEP B: Fetch works in bulk (up to 25 authors per request)
    all_works = get_works_in_parallel(valid_author_ids, n_jobs=4, batch_size=25)
    print(f"\nTotal works fetched (with duplicates): {len(all_works)}")

    # STEP C: Deduplicate
    unique_works = {}
    for w in all_works:
        work_id = w.get("id")
        if work_id not in unique_works:
            unique_works[work_id] = {
                "id": work_id,
                "publication_year": w.get("publication_year"),
                "cited_by_count": w.get("cited_by_count"),
                "author_ids": [
                    auth["author"]["id"]
                    for auth in w.get("authorships", [])
                    if "author" in auth and "id" in auth["author"]
                ],
                "title": w.get("title"),
                "abstract_inverted_index": w.get("abstract_inverted_index"),
            }

    print(f"Unique works after filtering duplicates: {len(unique_works)}")
    return unique_works

# Example usage
if __name__ == "__main__":
    test_names = [
        "Albert Einstein", 
        "Marie Curie", 
        "Allison Koenecke", 
        "Julian Polenz", 
        "Jisu Kim"
    ]
    final_works = main(test_names)


In [None]:
# -------------------------
#       Using the data from unique_works to create two dataframes - one for papers and one for abstracts
# -------------------------
papers_data = []
abstracts_data = []

for work_id, work in unique_works.items():
    papers_data.append({
        "id": work["id"],
        "publication_year": work["publication_year"],
        "cited_by_count": work["cited_by_count"],
        "author_ids": ",".join(work["author_ids"])  
    })
    abstracts_data.append({
        "id": work["id"],
        "title": work["title"],
        "abstract_inverted_index": work["abstract_inverted_index"]
    })

df_papers = pd.DataFrame(papers_data)
df_abstracts = pd.DataFrame(abstracts_data)

df_papers.to_csv("IC2S2_papers.csv", index=False)
df_abstracts.to_csv("IC2S2_abstracts.csv", index=False)

<h1 align="center">PART 4</h1>

In [None]:
import networkx as nx
import itertools
import json
from collections import Counter


df_papers = pd.read_csv("IC2S2_papers.csv")  # Load Papers Dataset

# Create NetworkX Graph
G = nx.Graph()
collaboration_counts = Counter()


for author_str in tqdm(df_papers["author_ids"]): # loop over each paper
    authors = author_str.split(",") # split the authors (seperated by ,)

    for author1, author2 in itertools.combinations(authors, 2): # loop over each combination of authors
        pair = tuple(sorted([author1, author2])) # sort the pair of authors to avoid duplicates
        collaboration_counts[pair] += 1 # count the number of collaborations between the pair of authors

G.add_weighted_edges_from([(a1, a2, count) for (a1, a2), count in tqdm(collaboration_counts.items())])



# Compute citation counts and first publication year for each author
author_citation_counts = Counter()
author_first_publication = {}

for _, row in tqdm(df_papers.iterrows(), total=len(df_papers)):
    authors = row["author_ids"].split(",")
    cited_by = row["cited_by_count"] # Number of citations for given paper
    pub_year = row["publication_year"] # Publication year of given paper

    for author in authors:
        author_citation_counts[author] += cited_by  # Sum up citations for each author
        if author not in author_first_publication or pub_year < author_first_publication[author]: # Keep earliest year 
            author_first_publication[author] = pub_year

# Add info to nodes
for node in tqdm(G.nodes()):
    G.nodes[node]["display_name"] = "Unknown"
    G.nodes[node]["country"] = "Unknown"
    G.nodes[node]["total_citations"] = author_citation_counts.get(node, 0)
    G.nodes[node]["first_publication_year"] = author_first_publication.get(node, None)


graph_data = nx.node_link_data(G)

with open("IC2S2_coauthorship_network.json", "w") as json_file:
    json.dump(graph_data, json_file, indent=4)

print("\nGraph saved successfully as 'IC2S2_coauthorship_network.json' 🎉")


100%|██████████| 12016/12016 [00:00<00:00, 203637.10it/s]
100%|██████████| 85941/85941 [00:00<00:00, 6062067.88it/s]
100%|██████████| 12016/12016 [00:00<00:00, 43387.99it/s]
100%|██████████| 22183/22183 [00:00<00:00, 1171964.30it/s]



Graph saved successfully as 'IC2S2_coauthorship_network.json' 🎉
