In [3]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
from tqdm import tqdm
import pandas as pd

<h1 align="center">PART 1</h1>

In [150]:
def scrape_names():
    program_url = "https://ic2s2-2023.org/program" # Program URL
    chairs_url = "https://ic2s2-2023.org/organization" # Chairs URL
    keynotes_url = "https://ic2s2-2023.org/keynotes" # Keynotes URL

    # Scrape program page
    program_response = requests.get(program_url)
    program_soup = BeautifulSoup(program_response.text, 'html.parser')
    program_names = [tag.get_text(strip=True) for tag in program_soup.find_all('u')] # names are stored in <u> tags

    # Scrape chairs page
    chairs_response = requests.get(chairs_url)
    chairs_soup = BeautifulSoup(chairs_response.text, 'html.parser')
    chairs_names = [a.get_text(strip=True) for a in chairs_soup.select("h3 a")] # names are stored in <a> within <h3> tags

    # Scape keynotes page
    keynotes_response = requests.get(keynotes_url)
    keynotes_soup = BeautifulSoup(keynotes_response.text, 'html.parser')
    keynotes_names = [a.get_text(strip=True) for a in keynotes_soup.select("h3 a")] # names are stored in <a> within <h3> tags again

    # Returning names in dict 
    return {
        "program_names": program_names,
        "chairs_names": chairs_names,
        "keynotes_names": keynotes_names
    }

In [151]:
names = scrape_names() # Getting all names

Elements_to_remove = ["SAGE", "DIREC", "EPJ", "Esteban Moro (EPJkeynote)"] # Found some errors in keynotes_names - we just remove them manually
for element in Elements_to_remove:
    names["keynotes_names"].remove(element)

names["keynotes_names"].append("Esteban Moro") # Add back Esteban Moro without the (EPJkeynote) part

all_names = names["program_names"] + names["chairs_names"] + names["keynotes_names"] # combining all names into a list

all_names = list(set(all_names)) # This removes 123 duplicates

In [152]:

# < ---- > 
# Method to find highly similar names
# Should work with defualt python libs

from difflib import SequenceMatcher

def are_almost_similar(s1, s2, threshold=0.8):
    return SequenceMatcher(None, s1, s2).ratio() >= threshold

def find_similar_names(all_names, threshold=0.8):
    similar_pairs = []
    for i in range(len(all_names)):
        for j in range(i + 1, len(all_names)):
            if are_almost_similar(all_names[i], all_names[j], threshold):
                similar_pairs.append((all_names[i], all_names[j]))
    return similar_pairs

# < ---- >

# Find and print similar names
similar_names = find_similar_names(all_names, threshold=0.8)
print(f"similar names: {similar_names}")

# Michele Tizzoni and Michele Tizzani both exists 
# Doing a manual assessment of similar_names to find what i want to remove
names_to_remove = ["Carlson Büth", "Katinka den Nijs", "Kathyrn Fair", "Sonja M Schmer Galunder"]
for name in names_to_remove: # removing
    all_names.remove(name)

similar names: [('Michele Tizzoni', 'Michele Tizzani'), ('Carlson Büth', 'Carlson Moses Büth'), ('Francesco Silvestri', 'Francesco Pierri'), ('Katinka den Nijs', 'Katinka Den Nijs'), ('Shintaro Ueki', 'Shintaro Sakai'), ('Kathyrn R Fair', 'Kathyrn Fair'), ('Sonja M Schmer-Galunder', 'Sonja M Schmer Galunder')]


In [153]:
# Final list of names
print(len(all_names))

510


## Scarping Names Explained

### 1. Inspecting URLs
First, I inspected all the URLs to determine the structure in which the names were stored.

### 2. Collecting and Printing Names
Then, I collected all the names and printed them from each URL to verify that the extraction worked correctly.

### 3. Handling Errors
During this process, I found some errors in `keynotes_names`, but only **four names** were incorrect, so I removed them manually.

### 4. Removing Duplicates
Next, I used Python’s `set()` function to eliminate duplicates.

### 5. Finding Similar Names
I implemented a method to identify **slightly similar** names. After finding some matches, I manually reviewed each similar pair and decided how to handle them.


<h1 align="center">PART 2</h1>

<h1 align="center">PART 3</h1>

In [None]:

# <----------------------->
# The below code needs to be made more effecient - it is very slow
# <----------------------->

# -------------------------
#       Converting names to IDs if the names exists in OpenAlex - otherwise skipping
# -------------------------
def convert_author_name_to_id(name):

    query = urllib.parse.quote(name)
    url = f"https://api.openalex.org/authors?search={query}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(f"Error searching for author {name}: {e}")
        return None

    data = response.json()

    # If no results, return None
    if "results" not in data or len(data["results"]) == 0:
        return None
    
    # Take the first result
    first_result = data["results"][0]
    openalex_id_url = first_result["id"]
    author_id = openalex_id_url

    return author_id

# -------------------------
#       Collecting all works done by the authors. We use pagination to get all articles if there is more than 200
# -------------------------
def get_works_by_author_id(author_id):
    base_url = "https://api.openalex.org/works"
    all_works = []

    sort_param = "publication_date:desc"

    next_url = (
        f"{base_url}"
        f"?filter=author.id:{author_id}"
        f"&cursor=*"               # * for multi-page results
        f"&per-page=200"
        f"&sort={sort_param}"
    )

    while next_url:
        response = requests.get(next_url)
        response.raise_for_status()
        data = response.json()

        total_count = data["meta"].get("count", 0)
        if not 5 <= total_count <= 5000:
            return None

        # Add all works from this page to the list
        all_works.extend(data["results"])

        # Look for a next page in meta
        next_cursor = data["meta"].get("next_cursor")
        if next_cursor:
            next_url = (
                f"{base_url}"
                f"?filter=author.id:{author_id}"
                f"&cursor={next_cursor}"      
                f"&per-page=200"
                f"&sort={sort_param}"
            )
        else:
            next_url = None

    return all_works


# -------------------------
#       Looping over each name in authors and collecting all their work and their id. 
#       Storing the data in a dict with the work_id as key - but only keeping unique articles/works and apllying filters
# -------------------------
authors = all_names
unique_works = {}
num_authors_not_found = 0

for name in tqdm(authors):
    author_id = convert_author_name_to_id(name)
    if author_id is None:
        num_authors_not_found += 1
        continue # skip if we dont find an id for a given name

    works = get_works_by_author_id(author_id)

    if works: # To make sure we only have authors with work count between 5 and 5000
        for w in works:
            work_id = w.get("id") 
            if work_id not in unique_works and w.get("cited_by_count") > 10 and len(w.get("authorships", [])) < 10: # filters
                publication_year = w.get("publication_year")
                cited_by_count = w.get("cited_by_count")
                title = w.get("title")
                abstract_inverted_index = w.get("abstract_inverted_index")
                
                author_ids = []
                for auth in w.get("authorships", []):
                    author_obj = auth.get("author", {})
                    author_id_field = author_obj.get("id")
                    if author_id_field:
                        author_ids.append(author_id_field)

                # Store data from each unique work in dict
                unique_works[work_id] = {
                    "id": work_id,
                    "publication_year": publication_year,
                    "cited_by_count": cited_by_count,
                    "author_ids": author_ids,
                    "title": title,
                    "abstract_inverted_index": abstract_inverted_index
                }

print(f"\nTotal authors not found: {num_authors_not_found}")
print(f"\nTotal unique articles found across {len(authors)} authors with filters applied:", len(unique_works))

100%|██████████| 510/510 [18:36<00:00,  2.19s/it] 


Total authors not found: 36

Total unique articles found across 510 authors with filters applied: 12016





In [155]:
# -------------------------
#       Using the data from unique_works to create two dataframes - one for papers and one for abstracts
# -------------------------
papers_data = []
abstracts_data = []

for work_id, work in unique_works.items():
    papers_data.append({
        "id": work["id"],
        "publication_year": work["publication_year"],
        "cited_by_count": work["cited_by_count"],
        "author_ids": ",".join(work["author_ids"])  
    })
    abstracts_data.append({
        "id": work["id"],
        "title": work["title"],
        "abstract_inverted_index": work["abstract_inverted_index"]
    })

df_papers = pd.DataFrame(papers_data)
df_abstracts = pd.DataFrame(abstracts_data)

df_papers.to_csv("IC2S2_papers.csv", index=False)
df_abstracts.to_csv("IC2S2_abstracts.csv", index=False)

<h1 align="center">PART 4</h1>

In [4]:
import networkx as nx
#load in IC2S2_papers.csv
df_papers = pd.read_csv("IC2S2_papers.csv")
df_papers.head()



FileNotFoundError: [Errno 2] No such file or directory: 'IC2S2_papers.csv'