In [None]:
pip install requests beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Scrape name and expertise for every member listed in the paginated KNAW
# directory and store the result as a JSON list of {"name", "expertise"} dicts.
base_url = "https://www.knaw.nl/leden?page="
members = []

# Loop through all 26 pages
for page in range(1, 27):
    print(f"Processing page {page}...")
    try:
        # A timeout keeps one stalled connection from hanging the whole cell.
        response = requests.get(f"{base_url}{page}", timeout=30)
    except requests.exceptions.RequestException as exc:
        # Network-level failures are handled like a bad status: skip the page.
        print(f"Failed to retrieve page {page}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    # NOTE(review): this class string looks auto-generated by the site's CSS
    # tooling and may change between deployments - confirm it still matches.
    member_entries = soup.find_all("div", class_="sc-d4b23084-0 knUljo")  # Targeting individual member entries
    if not member_entries:
        # Make a silent selector mismatch visible instead of writing an empty file.
        print(f"Warning: no member entries found on page {page}")

    for entry in member_entries:
        name_tag = entry.find("h3")  # Extracting the name
        expertise_tag = entry.find("div", class_="expertises")  # Extracting the expertise

        # Fall back to "Unknown" when a card lacks one of the two elements
        name = name_tag.get_text(strip=True) if name_tag else "Unknown"
        expertise = expertise_tag.get_text(strip=True) if expertise_tag else "Unknown"

        members.append({"name": name, "expertise": expertise})

    time.sleep(1)  # Polite delay to avoid overloading the server

# Save to JSON file (UTF-8, unescaped, so accented names stay readable)
with open("knaw_members.json", "w", encoding="utf-8") as f:
    json.dump(members, f, indent=4, ensure_ascii=False)

print("Scraping completed. Data saved to 'knaw_members.json'.")

Processing page 1...
Processing page 2...
Processing page 3...
Processing page 4...
Processing page 5...
Processing page 6...
Processing page 7...
Processing page 8...
Processing page 9...
Processing page 10...
Processing page 11...
Processing page 12...
Processing page 13...
Processing page 14...
Processing page 15...
Processing page 16...
Processing page 17...
Processing page 18...
Processing page 19...
Processing page 20...
Processing page 21...
Processing page 22...
Processing page 23...
Processing page 24...
Processing page 25...
Processing page 26...
Scraping completed. Data saved to 'knaw_members.json'.


In [None]:
import json

# Read the scraped member list back from disk
with open('knaw_members.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Work out how many members the payload holds, whatever its top-level shape:
# a dict is expected to keep them under 'members', a list is the members itself.
if isinstance(data, dict):
    member_count = len(data.get('members', []))
elif isinstance(data, list):
    member_count = len(data)
else:
    member_count = 0

print(f"Total number of members: {member_count}")

Total number of members: 624


In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths
# Relative path: the scraping step writes knaw_members.json to the working
# directory, so read it from there instead of a hard-coded /content/ location.
input_file_path = "knaw_members.json"
output_file_path = "knaw_author_ids.json"

# Load KNAW members JSON (list of {"name", "expertise"} dicts written as UTF-8)
with open(input_file_path, 'r', encoding='utf-8') as file:
    knaw_members = json.load(file)

# OpenAlex API endpoint used for the author name search
openalex_base_url = "https://api.openalex.org/authors"

# Function to query OpenAlex
def get_openalex_author_id(name):
    """Search OpenAlex authors by name and return the first hit.

    Args:
        name: Text sent as the ``search`` parameter of the authors endpoint.

    Returns:
        A dict with ``author_id``, ``name``, ``works_count`` and
        ``cited_by_count`` copied from the first result, or ``None`` when the
        search yields no results or the request fails (the error is printed).
    """
    try:
        # Bound the wait so a stalled connection cannot hang the whole run.
        response = requests.get(openalex_base_url, params={"search": name}, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        results = response.json()

        if "results" in results and results["results"]:
            # Take the first match; no disambiguation is attempted here.
            first_result = results["results"][0]
            return {
                "author_id": first_result.get("id"),
                "name": first_result.get("display_name"),
                "works_count": first_result.get("works_count", 0),
                "cited_by_count": first_result.get("cited_by_count", 0),
            }
        return None  # No match found
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole batch.
        print(f"Error fetching author ID for {name}: {e}")
        return None

# Look up every member on OpenAlex. Members without a match still get a row,
# with None in the three OpenAlex-derived columns.
results = []
for member in knaw_members:
    print(f"Fetching OpenAlex Author ID for {member['name']}...")
    match = get_openalex_author_id(member["name"]) or {}
    results.append({
        "name": member["name"],
        "expertise": member["expertise"],
        "openalex_id": match.get("author_id"),
        "works_count": match.get("works_count"),
        "cited_by_count": match.get("cited_by_count"),
    })
    # Pause between requests to respect API rate limits
    time.sleep(1)

# Persist the results as JSON (UTF-8, unescaped)
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(results, file, ensure_ascii=False, indent=4)

print(f"Author IDs saved to '{output_file_path}'.")

# Also write a CSV copy for easier inspection
results_frame = pd.DataFrame(results)
results_frame.to_csv("knaw_author_ids.csv", index=False)
print("Author IDs also saved to 'knaw_author_ids.csv'.")

Fetching OpenAlex Author ID for Wil van der Aalst...
Fetching OpenAlex Author ID for Kees Aarts...
Fetching OpenAlex Author ID for Gerard Acket...
Fetching OpenAlex Author ID for Remieg Aerts...
Fetching OpenAlex Author ID for Conny Aerts...
Fetching OpenAlex Author ID for Reuven Agami...
Fetching OpenAlex Author ID for Frits Agterberg...
Fetching OpenAlex Author ID for Takuzo Aida...
Fetching OpenAlex Author ID for Anna Akhmanova...
Fetching OpenAlex Author ID for Barbara Aland...
Fetching OpenAlex Author ID for Tjeerd van Albada...
Fetching OpenAlex Author ID for André Aleman...
Fetching OpenAlex Author ID for Keimpe Algra...
Fetching OpenAlex Author ID for Maurits Allessie...
Fetching OpenAlex Author ID for Sigurd Angenent...
Fetching OpenAlex Author ID for Frank Ankersmit...
Fetching OpenAlex Author ID for Isabel Arends...
Fetching OpenAlex Author ID for Ad van der Avoird...
Fetching OpenAlex Author ID for Hein de Baar...
Fetching OpenAlex Author ID for Thomas Bäck...
Fetching Open

In [None]:
import requests
import json
import time

# Input and output file paths
# Read from the working directory, where the author-lookup step saved
# knaw_author_ids.json, rather than from a hard-coded absolute /content/ path.
input_file = "knaw_author_ids.json"
output_file = "knaw_members_with_indices.json"  # File to save the updated data

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch H-index and I10-index for an author
def fetch_author_metrics(author_id):
    """Fetch the h-index and i10-index OpenAlex reports for one author.

    Args:
        author_id: Short OpenAlex author ID appended to ``/authors/``.

    Returns:
        ``(h_index, i10_index)``. Both are ``None`` when the request fails,
        the status is not 200, or the record carries no ``summary_stats``.
    """
    url = f"{openalex_base_url}/authors/{author_id}"
    try:
        # Bound the wait so a stalled connection cannot hang the whole run.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as exc:
        # A network error for one author should not abort the whole batch.
        print(f"Error fetching data for author {author_id}: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error fetching data for author {author_id}: {response.status_code}")
        return None, None

    # `summary_stats` may be absent or null; treat both as "no metrics".
    summary_stats = response.json().get("summary_stats") or {}
    return summary_stats.get("h_index"), summary_stats.get("i10_index")

# Load the existing JSON file
# The author-lookup step writes this file as UTF-8 with unescaped non-ASCII
# names, so decode it as UTF-8 explicitly; relying on the platform default
# encoding can turn accented names into mojibake.
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Process each member and fetch H-index and I10-index
for member in members:
    openalex_ids = member.get("openalex_id")

    # Normalise the field to a list of ID URLs so one code path handles both
    # a single ID string and a list of IDs.
    if isinstance(openalex_ids, str):
        id_urls = [openalex_ids]
    elif isinstance(openalex_ids, list):
        # Ignore null/non-string items instead of crashing on .split()
        id_urls = [id_url for id_url in openalex_ids if isinstance(id_url, str)]
    else:
        print(f"No OpenAlex ID for {member['name']}. Skipping...")
        id_urls = []

    h_indices = []
    i10_indices = []
    for id_url in id_urls:
        author_id = id_url.split("/")[-1]  # Last URL segment is the author ID
        h_index, i10_index = fetch_author_metrics(author_id)
        if h_index is not None:
            h_indices.append(h_index)
        if i10_index is not None:
            i10_indices.append(i10_index)
        time.sleep(1)  # Respect API rate limits

    # With several IDs, keep the maximum of each index; None when nothing was found
    member["h_index"] = max(h_indices) if h_indices else None
    member["i10_index"] = max(i10_indices) if i10_indices else None

# Save the updated JSON with indices. json.dump keeps its default ASCII
# escaping, so the file stays readable whatever encoding a later reader uses.
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(members, outfile, indent=4)

print(f"H-index and I10-index added and saved to {output_file}")

No OpenAlex ID for Ernestine Kaper. Skipping...
H-index and I10-index added and saved to knaw_members_with_indices.json


In [None]:
import requests
import json
import time

# Input and output file paths
# Read from the working directory, where the metrics step saved
# knaw_members_with_indices.json, rather than from a hard-coded /content/ path.
input_file = "knaw_members_with_indices.json"
output_file = "knaw_works.json"  # File to save the detailed works data

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch every work of one author, re-requesting each work individually.

    The works list is walked with OpenAlex cursor paging: the first request
    sends ``cursor=*`` and each response's ``meta.next_cursor`` is sent with
    the next one until it comes back empty. The previous check for a ``next``
    key in ``meta`` never matched, so only the first 200 works were returned.

    Args:
        author_id: Short OpenAlex author ID used in the ``author.id`` filter.

    Returns:
        List of work dicts with the abstract fields removed. On a non-200
        list response the works collected so far are returned.
    """
    works = []
    cursor = "*"
    while cursor:
        response = requests.get(
            f"{openalex_base_url}/works",
            params={"filter": f"author.id:{author_id}", "per-page": 200, "cursor": cursor},
            timeout=30,
        )
        if response.status_code != 200:
            print(f"Error fetching works for author {author_id}: {response.status_code}")
            break

        data = response.json()
        batch = data.get("results") or []
        for work in batch:
            # Fetch detailed information for each work (one request per work,
            # so this is slow for prolific authors).
            detailed_work_url = f"{openalex_base_url}/works/{work['id'].split('/')[-1]}"
            detailed_response = requests.get(detailed_work_url, timeout=30)
            if detailed_response.status_code == 200:
                detailed_work = detailed_response.json()
                # Drop the abstract. OpenAlex ships it as
                # `abstract_inverted_index`; `abstract` is removed too in case
                # it is ever present.
                detailed_work.pop("abstract", None)
                detailed_work.pop("abstract_inverted_index", None)
                works.append(detailed_work)
            else:
                print(f"Error fetching detailed work: {detailed_response.status_code}")

        cursor = (data.get("meta") or {}).get("next_cursor")
        if not batch:
            break  # Defensive stop: an empty page means nothing is left
        if cursor:
            time.sleep(1)  # Respect API rate limits
    return works

# Read the member list produced by the metrics step
with open(input_file, "r") as infile:
    members = json.load(infile)

# Collect the works of every member into one flat list
all_works = []
for member in members:
    openalex_ids = member.get("openalex_id")

    # A member carries either one ID string or a list of IDs; anything else
    # means there is nothing to fetch.
    if isinstance(openalex_ids, str):
        id_urls = [openalex_ids]
    elif isinstance(openalex_ids, list):
        id_urls = openalex_ids
    else:
        print(f"No OpenAlex ID for {member['name']}. Skipping...")
        continue

    for id_url in id_urls:
        # The author ID is the last path segment of the OpenAlex URL
        all_works.extend(fetch_author_works(id_url.split("/")[-1]))

# Write everything out in one go
with open(output_file, "w") as outfile:
    json.dump(all_works, outfile, indent=4)

print(f"Works data with full details saved to {output_file} without abstracts")

In [None]:
import requests
import json
import time
import os

# Input and output file paths
# Read from the working directory, where the metrics step saved
# knaw_members_with_indices.json, rather than from a hard-coded /content/ path.
input_file = "knaw_members_with_indices.json"
output_file = "knaw_works.json"  # File to save the detailed works data
progress_file = "progress_log.json"  # Save progress to resume if interrupted

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch all works of one author from the OpenAlex works list endpoint.

    Uses cursor paging: the first request sends ``cursor=*`` and every
    following request sends the previous response's ``meta.next_cursor``
    until it comes back empty. The previous check for a ``next`` key in
    ``meta`` never matched, so paging always stopped after the first page.

    Args:
        author_id: Short OpenAlex author ID used in the ``author.id`` filter.

    Returns:
        List of work dicts as returned by the API. If a request fails, the
        error is printed and the works collected so far are returned.
    """
    works = []
    page = 1  # Only used to make error messages easier to place
    cursor = "*"
    while cursor:
        try:
            response = requests.get(
                f"{openalex_base_url}/works",
                params={"filter": f"author.id:{author_id}", "per-page": 200, "cursor": cursor},
                timeout=30,
            )
            response.raise_for_status()  # Raise an exception for HTTP errors
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching works for author {author_id} on page {page}: {e}")
            break

        batch = data.get("results") or []
        works.extend(batch)

        cursor = (data.get("meta") or {}).get("next_cursor")
        if not batch:
            break  # Defensive stop: an empty page means nothing is left
        page += 1
        if cursor:
            time.sleep(1)  # Respect API rate limits
    return works

# Resume or start fresh
if os.path.exists(progress_file):
    with open(progress_file, "r", encoding="utf-8") as file:
        processed_authors = json.load(file)
else:
    processed_authors = {}

# Works collected by an earlier, interrupted run. Authors listed in the
# progress log are skipped below, so their works must be carried over from the
# saved output; otherwise the final file would silently lack them.
all_works = []
if processed_authors:
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as previous:
            all_works = json.load(previous)
    else:
        # A progress log without saved works cannot be trusted: start over.
        processed_authors = {}

# Load the existing JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Fetch works for each author
expected_author_ids = set()  # Every author ID that should end up processed
try:
    for member in members:
        openalex_ids = member.get("openalex_id")

        # Normalise to a list so a single ID and a list share one code path
        if isinstance(openalex_ids, str):
            id_urls = [openalex_ids]
        elif isinstance(openalex_ids, list):
            id_urls = [id_url for id_url in openalex_ids if isinstance(id_url, str)]
        else:
            print(f"No OpenAlex ID for {member['name']}. Skipping...")
            continue

        for id_url in id_urls:
            author_id = id_url.split("/")[-1]  # Extract author ID
            expected_author_ids.add(author_id)
            if author_id in processed_authors:
                continue

            print(f"Fetching works for {member['name']} ({author_id})...")
            all_works.extend(fetch_author_works(author_id))
            processed_authors[author_id] = True

            # Save progress after each author
            with open(progress_file, "w", encoding="utf-8") as file:
                json.dump(processed_authors, file, indent=4)
finally:
    # Save the works data even when the loop is interrupted, so the progress
    # log and the output file stay in step and a later run can resume.
    with open(output_file, "w", encoding="utf-8") as outfile:
        json.dump(all_works, outfile, indent=4)

print(f"Works data with full details saved to {output_file}.")

# Clean up the progress log once every author ID has been processed. Comparing
# against the number of members never matched when a member had no ID or
# several IDs, so the set of expected IDs is checked instead.
if expected_author_ids.issubset(processed_authors) and os.path.exists(progress_file):
    os.remove(progress_file)

Fetching works for Wil van der Aalst (A5069762894)...
Fetching works for Kees Aarts (A5025713059)...
Fetching works for Gerard Acket (A5046174111)...
Fetching works for Remieg Aerts (A5026631020)...
Fetching works for Conny Aerts (A5071357492)...
Fetching works for Reuven Agami (A5079323488)...
Fetching works for Frits Agterberg (A5083547574)...
Fetching works for Takuzo Aida (A5063217261)...
Fetching works for Anna Akhmanova (A5024109873)...
Fetching works for Barbara Aland (A5007027058)...
Fetching works for Tjeerd van Albada (A5068636900)...
Fetching works for André Aleman (A5083885093)...
Fetching works for Keimpe Algra (A5060721437)...
Fetching works for Maurits Allessie (A5112166922)...
Fetching works for Sigurd Angenent (A5068621385)...
Fetching works for Frank Ankersmit (A5106131841)...
Fetching works for Isabel Arends (A5028443868)...
Fetching works for Ad van der Avoird (A5030928179)...
Fetching works for Hein de Baar (A5085046864)...
Fetching works for Thomas Bäck (A50626468

In [None]:
import json  # Make sure to import the json module

# Peek at the saved works: load the whole file, then show the first ten entries
with open("knaw_works.json", "r") as file:
    data = json.load(file)
print(data[:10])

In [None]:
import json

# Read from the working directory, where the works-fetching steps save
# knaw_works.json, rather than from a hard-coded absolute /content/ path.
input_file = "knaw_works.json"
output_file = "knaw_works_subset.json"

# Open and load the full JSON file (the entire file is held in memory)
with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Extract the first 1000 records (or fewer if the file has less than 1000)
subset = data[:1000]  # Adjust the number for your subset size

# Save the subset to a new JSON file
with open(output_file, "w", encoding="utf-8") as output:
    json.dump(subset, output, indent=4)

print(f"Subset saved to {output_file}")

Subset saved to knaw_works_subset.json


In [None]:
import requests
import json
import time
import pandas as pd

# Where OpenAlex lives
openalex_base_url = "https://api.openalex.org"

# Member list to read (replace with your actual file path)
input_file = "knaw_members_with_indices.json"

# Results are written twice: nested JSON and a flat CSV
output_json_file = "knaw_works_filtered.json"
output_csv_file = "knaw_works_filtered.csv"

# Per-author completion flags, used to resume after an interruption
progress_log_file = "progress_log.json"

# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch all works of one author and keep a reduced set of fields.

    Uses OpenAlex cursor paging (``cursor=*`` first, then each response's
    ``meta.next_cursor``). The previous check for a ``next`` key in ``meta``
    never matched, so paging always stopped after the first page.

    Args:
        author_id: Short OpenAlex author ID used in the ``author.id`` filter.

    Returns:
        List of dicts with ``id``, ``title``, ``doi``, ``publication_year``,
        ``cited_by_count``, ``authorships``, ``topics`` (built from the
        work's ``concepts``), ``open_access`` and ``referenced_works``. On a
        non-200 response the works collected so far are returned.
    """
    works = []
    cursor = "*"
    while cursor:
        response = requests.get(
            f"{openalex_base_url}/works",
            params={"filter": f"author.id:{author_id}", "per-page": 200, "cursor": cursor},
            timeout=30,
        )
        if response.status_code != 200:
            print(f"Error fetching works for author {author_id}: {response.status_code}")
            break

        data = response.json()
        batch = data.get("results") or []
        for work in batch:
            # `or {}` / `or []` also cover keys that are present but null,
            # which a plain .get(key, default) does not.
            filtered_work = {
                "id": work.get("id"),
                "title": work.get("title"),
                "doi": work.get("doi"),
                "publication_year": work.get("publication_year"),
                "cited_by_count": work.get("cited_by_count"),
                "authorships": [
                    {
                        "author": {
                            "id": (auth.get("author") or {}).get("id"),
                            "display_name": (auth.get("author") or {}).get("display_name")
                        },
                        "institutions": [
                            {
                                "id": inst.get("id"),
                                "display_name": inst.get("display_name"),
                                "country_code": inst.get("country_code")
                            }
                            for inst in (auth.get("institutions") or [])
                        ]
                    }
                    for auth in (work.get("authorships") or [])
                ],
                "topics": [
                    {
                        "display_name": topic.get("display_name"),
                        "score": topic.get("score")
                    }
                    for topic in (work.get("concepts") or [])
                ],
                "open_access": work.get("open_access"),
                "referenced_works": work.get("referenced_works")
            }
            works.append(filtered_work)

        cursor = (data.get("meta") or {}).get("next_cursor")
        if not batch:
            break  # Defensive stop: an empty page means nothing is left
        if cursor:
            time.sleep(1)  # Respect API rate limits
    return works

# Load the existing JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Load or initialize the progress log
try:
    with open(progress_log_file, "r", encoding="utf-8") as log_file:
        progress_log = json.load(log_file)
except FileNotFoundError:
    progress_log = {}

# Recover per-author results written by an earlier run. Authors flagged as
# done in the progress log are skipped below, so their works have to come from
# the previous output; otherwise they would be missing from the final files.
try:
    with open(output_json_file, "r", encoding="utf-8") as previous_file:
        previous_entries = json.load(previous_file)
except (FileNotFoundError, json.JSONDecodeError):
    previous_entries = []
previous_by_name = {}
if isinstance(previous_entries, list):
    for entry in previous_entries:
        if isinstance(entry, dict) and "name" in entry and isinstance(entry.get("works"), list):
            previous_by_name[entry["name"]] = entry

# Fetch works for each author and organize them under authors
authors_with_works = []
try:
    for member in members:
        name = member["name"]
        openalex_ids = member.get("openalex_id")
        all_works = []

        # Skip only when the earlier result is actually available to reuse
        if progress_log.get(name) and name in previous_by_name:
            print(f"Skipping {name}, already completed.")
            authors_with_works.append(previous_by_name[name])
            continue

        print(f"Fetching works for {name}...")

        try:
            if isinstance(openalex_ids, list):
                for openalex_id in openalex_ids:
                    if openalex_id:
                        author_id = openalex_id.split("/")[-1]  # Extract author ID
                        all_works.extend(fetch_author_works(author_id))
            elif isinstance(openalex_ids, str):
                author_id = openalex_ids.split("/")[-1]
                all_works.extend(fetch_author_works(author_id))

            authors_with_works.append({
                "name": name,
                "openalex_id": openalex_ids,
                "works": all_works
            })

            # Mark the author as successfully processed
            progress_log[name] = True

        except Exception as e:
            print(f"Error processing {name}: {e}")
            progress_log[name] = False  # Mark as failed

        # Save progress after each author
        with open(progress_log_file, "w", encoding="utf-8") as log_file:
            json.dump(progress_log, log_file, indent=4)
finally:
    # Save the filtered data even when the loop is interrupted, so the next
    # run can pick the finished authors up again instead of losing them.
    with open(output_json_file, "w", encoding="utf-8") as outfile:
        json.dump(authors_with_works, outfile, indent=4)
print(f"Filtered works data saved to JSON: {output_json_file}")

# Convert to CSV for easier inspection (one row per author/work pair)
csv_data = []
for author in authors_with_works:
    for work in author["works"]:
        open_access = work.get("open_access") or {}
        csv_data.append({
            "author_name": author["name"],
            "author_openalex_id": author["openalex_id"],
            "work_id": work.get("id"),
            "work_title": work.get("title"),
            "doi": work.get("doi"),
            "publication_year": work.get("publication_year"),
            "cited_by_count": work.get("cited_by_count"),
            # Null display names would make str.join raise, so map them to ""
            "topics": ", ".join((topic.get("display_name") or "") for topic in (work.get("topics") or [])),
            "open_access_status": open_access.get("oa_status"),
            "referenced_works_count": len(work.get("referenced_works") or [])
        })

pd.DataFrame(csv_data).to_csv(output_csv_file, index=False)
print(f"Filtered works data saved to CSV: {output_csv_file}")

Fetching works for Wil van der Aalst (https://openalex.org/A5069762894)...
Fetching works for Kees Aarts (https://openalex.org/A5025713059)...
Fetching works for Gerard Acket (https://openalex.org/A5046174111)...
Fetching works for Remieg Aerts (https://openalex.org/A5026631020)...
Fetching works for Conny Aerts (https://openalex.org/A5071357492)...
Fetching works for Reuven Agami (https://openalex.org/A5079323488)...
Fetching works for Frits Agterberg (https://openalex.org/A5083547574)...
Fetching works for Takuzo Aida (https://openalex.org/A5063217261)...
Fetching works for Anna Akhmanova (https://openalex.org/A5024109873)...
Fetching works for Barbara Aland (https://openalex.org/A5007027058)...
Fetching works for Tjeerd van Albada (https://openalex.org/A5068636900)...
Fetching works for André Aleman (https://openalex.org/A5083885093)...
Fetching works for Keimpe Algra (https://openalex.org/A5060721437)...
Fetching works for Maurits Allessie (https://openalex.org/A5112166922)...


KeyboardInterrupt: 

In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths
# Read from the working directory, where the metrics step saved
# knaw_members_with_indices.json, rather than from a hard-coded /content/ path.
input_file = "knaw_members_with_indices.json"
output_json_file = "knaw_works_filtered.json"  # JSON file output
output_csv_file = "knaw_works_filtered.csv"  # CSV file output
progress_log_file = "progress_log.json"  # Log file to track progress

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch all works of one author with retries and keep selected fields.

    Uses OpenAlex cursor paging (``cursor=*`` first, then each response's
    ``meta.next_cursor``). The previous check for a ``next`` key in ``meta``
    never matched, so the function always returned after the first page.
    Each page is attempted up to three times with exponential backoff.

    Args:
        author_id: Short OpenAlex author ID used in the ``author.id`` filter.

    Returns:
        List of dicts with ``id``, ``title``, ``doi``, ``publication_year``,
        ``cited_by_count``, ``authorships``, ``field``, ``subfield``,
        ``keywords``, ``open_access`` and ``referenced_works`` (always a
        list). If a page still fails after all retries, the works collected
        so far are returned.
    """
    works = []
    page = 1  # Only used to make error messages easier to place
    retries = 3  # Retry limit for failed requests
    cursor = "*"
    url = f"{openalex_base_url}/works"

    while cursor:
        data = None
        for attempt in range(retries):
            try:
                response = requests.get(
                    url,
                    params={"filter": f"author.id:{author_id}", "per-page": 200, "cursor": cursor},
                    timeout=10,
                )
                response.raise_for_status()  # Raise error for bad status codes
                data = response.json()
                break  # Success: leave the retry loop
            except requests.exceptions.RequestException as e:
                print(f"Error fetching page {page} for author {author_id}: {e}")
                if attempt < retries - 1:
                    time.sleep(2 ** attempt)  # Exponential backoff before retrying

        if data is None:
            return works  # Return partial results if retries exhausted

        batch = data.get("results") or []
        for work in batch:
            # `or {}` / `or []` also cover keys that are present but null,
            # which a plain .get(key, default) does not.
            primary_topic = work.get("primary_topic") or {}
            filtered_work = {
                "id": work.get("id"),
                "title": work.get("title", ""),
                "doi": work.get("doi"),
                "publication_year": work.get("publication_year"),
                "cited_by_count": work.get("cited_by_count", 0),
                "authorships": [
                    {
                        "author": {
                            "id": (auth.get("author") or {}).get("id"),
                            "display_name": (auth.get("author") or {}).get("display_name", "")
                        },
                        "institutions": [
                            {
                                "id": inst.get("id"),
                                "display_name": inst.get("display_name", ""),
                                "country_code": inst.get("country_code", "")
                            }
                            for inst in (auth.get("institutions") or []) if inst
                        ]
                    }
                    for auth in (work.get("authorships") or []) if auth
                ],
                "field": (primary_topic.get("field") or {}).get("display_name", ""),
                "subfield": (primary_topic.get("subfield") or {}).get("display_name", ""),
                "keywords": [
                    {
                        "id": keyword.get("id"),
                        "display_name": keyword.get("display_name", ""),
                        "score": keyword.get("score", 0)
                    }
                    for keyword in (work.get("keywords") or []) if keyword
                ],
                "open_access": work.get("open_access"),
                "referenced_works": work.get("referenced_works") or []
            }
            works.append(filtered_work)

        cursor = (data.get("meta") or {}).get("next_cursor")
        if not batch:
            break  # Defensive stop: an empty page means nothing is left
        page += 1

    return works

# Load the existing JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Load or initialize the progress log
try:
    with open(progress_log_file, "r", encoding="utf-8") as log_file:
        progress_log = json.load(log_file)
except FileNotFoundError:
    progress_log = {}

# Recover per-author results written by an earlier run. Authors flagged as
# done in the progress log are skipped below, so their works have to come from
# the previous output; otherwise they would be missing from the final files.
try:
    with open(output_json_file, "r", encoding="utf-8") as previous_file:
        previous_entries = json.load(previous_file)
except (FileNotFoundError, json.JSONDecodeError):
    previous_entries = []
previous_by_name = {}
if isinstance(previous_entries, list):
    for entry in previous_entries:
        if isinstance(entry, dict) and "name" in entry and isinstance(entry.get("works"), list):
            previous_by_name[entry["name"]] = entry

# Fetch works for each author and organize them under authors
authors_with_works = []
try:
    for member in members:
        name = member["name"]
        openalex_ids = member.get("openalex_id")
        all_works = []

        # Skip only when the earlier result is actually available to reuse
        if progress_log.get(name) and name in previous_by_name:
            print(f"Skipping {name}, already completed.")
            authors_with_works.append(previous_by_name[name])
            continue

        print(f"Fetching works for {name}...")

        try:
            if isinstance(openalex_ids, list):
                for openalex_id in openalex_ids:
                    if openalex_id:
                        author_id = openalex_id.split("/")[-1]  # Extract author ID
                        all_works.extend(fetch_author_works(author_id))
            elif isinstance(openalex_ids, str) and openalex_ids:
                author_id = openalex_ids.split("/")[-1]
                all_works.extend(fetch_author_works(author_id))

            authors_with_works.append({
                "name": name,
                "openalex_id": openalex_ids,
                "works": all_works
            })

            # Mark the author as successfully processed
            progress_log[name] = True

        except Exception as e:
            print(f"Error processing {name}: {e}")
            progress_log[name] = False  # Mark as failed

        # Save progress after each author
        with open(progress_log_file, "w", encoding="utf-8") as log_file:
            json.dump(progress_log, log_file, indent=4)
finally:
    # Save the filtered data even when the loop is interrupted, so the next
    # run can pick the finished authors up again instead of losing them.
    with open(output_json_file, "w", encoding="utf-8") as outfile:
        json.dump(authors_with_works, outfile, indent=4)
print(f"Filtered works data saved to JSON: {output_json_file}")

# Convert to CSV for easier inspection (one row per author/work pair).
# .get() is used throughout because entries recovered from an earlier output
# file may not carry every key this cell writes.
csv_data = []
for author in authors_with_works:
    for work in author["works"]:
        open_access = work.get("open_access") or {}
        csv_data.append({
            "author_name": author["name"],
            "author_openalex_id": author["openalex_id"],
            "work_id": work.get("id"),
            "work_title": work.get("title"),
            "doi": work.get("doi"),
            "publication_year": work.get("publication_year"),
            "cited_by_count": work.get("cited_by_count"),
            "field": work.get("field", ""),
            "subfield": work.get("subfield", ""),
            # Null display names would make str.join raise, so map them to ""
            "keywords": ", ".join((keyword.get("display_name") or "") for keyword in (work.get("keywords") or [])),
            "open_access_status": open_access.get("oa_status"),
            "referenced_works_count": len(work.get("referenced_works") or [])
        })

pd.DataFrame(csv_data).to_csv(output_csv_file, index=False)
print(f"Filtered works data saved to CSV: {output_csv_file}")

Fetching works for Wil van der Aalst...
Fetching works for Kees Aarts...
Fetching works for Gerard Acket...
Fetching works for Remieg Aerts...
Fetching works for Conny Aerts...
Fetching works for Reuven Agami...
Fetching works for Frits Agterberg...
Fetching works for Takuzo Aida...
Fetching works for Anna Akhmanova...
Fetching works for Barbara Aland...
Fetching works for Tjeerd van Albada...
Fetching works for André Aleman...
Fetching works for Keimpe Algra...
Fetching works for Maurits Allessie...
Fetching works for Sigurd Angenent...
Fetching works for Frank Ankersmit...
Fetching works for Isabel Arends...
Fetching works for Ad van der Avoird...
Fetching works for Hein de Baar...
Fetching works for Thomas Bäck...
Fetching works for Jan Baerends...
Fetching works for Marian Bakermans-Kranenburg...
Fetching works for Jan Albert Bakker...
Fetching works for Huib Bakker...
Fetching works for Egbert Bakker...
Fetching works for Erik Bakkers...
Fetching works for Henk Barendregt...
Fetch

In [None]:
import json

# Input and output file paths
# Read from the working directory, where the metrics step saved
# knaw_members_with_indices.json, rather than from a hard-coded /content/ path.
input_file = "knaw_members_with_indices.json"  # File whose text needs repairing
output_file = "knaw_members_with_indices_restored.json"  # Restored output file

# Load the distorted JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    data = json.load(infile)

# Repair mis-decoded text throughout a nested JSON structure
def decode_unicode(data):
    """Recursively undo UTF-8 text that was mistakenly decoded as Latin-1.

    Strings are re-encoded as Latin-1 and decoded as UTF-8; when either step
    fails the string is returned unchanged. Lists and dict values are
    processed recursively (dict keys are left as they are). Any other value
    is returned as is.

    Args:
        data: A str, list, dict or any other JSON-style value.

    Returns:
        A new structure of the same shape with repaired strings.
    """
    if isinstance(data, str):
        try:
            raw_bytes = data.encode('latin1')
            return raw_bytes.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a Latin-1/UTF-8 mix-up: keep the text untouched
            return data

    if isinstance(data, list):
        return [decode_unicode(item) for item in data]

    if isinstance(data, dict):
        repaired = {}
        for key, value in data.items():
            repaired[key] = decode_unicode(value)
        return repaired

    return data

# Run the repair over the whole loaded structure
restored_data = decode_unicode(data)

# Write the repaired structure back out as UTF-8, keeping non-ASCII
# characters unescaped
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(restored_data, outfile, ensure_ascii=False, indent=4)

print(f"Restored file saved to {output_file}")

Restored file saved to knaw_members_with_indices_restored.json


In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths
# Read from the working directory, where the restore step saved
# knaw_members_with_indices_restored.json, rather than from a hard-coded
# /content/ path.
input_file = "knaw_members_with_indices_restored.json"
output_json_file = "knaw_works_filtered.json"  # JSON file output
output_csv_file = "knaw_works_filtered.csv"  # CSV file output
progress_log_file = "progress_log.json"  # Log file to track progress

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch every work of one OpenAlex author, reduced to the fields kept here.

    Pages through ``/works?filter=author.id:<author_id>`` using OpenAlex
    cursor paging: the first request sends ``cursor=*`` and every following
    request sends the ``meta.next_cursor`` value of the previous response.
    Paging stops when the API returns no further cursor or an empty page.
    (The response ``meta`` has no ``next`` key, so testing for one ends the
    loop after the first 200 works.)

    Args:
        author_id: OpenAlex author identifier, i.e. the last path segment of
            the author's OpenAlex URL.

    Returns:
        list[dict]: One filtered record per work (see ``_filter_work``). When
        a request returns a non-200 status the error is printed and the works
        collected so far are returned.

    Raises:
        Exceptions raised by ``requests`` (connection errors, timeouts)
        propagate to the caller.
    """
    works = []
    cursor = "*"  # OpenAlex: "*" requests the first page of a cursor-paged query
    while cursor:
        response = requests.get(
            f"{openalex_base_url}/works",
            # Passed as params so the opaque cursor token is URL-encoded correctly
            params={
                "filter": f"author.id:{author_id}",
                "per-page": 200,
                "cursor": cursor,
            },
            timeout=60,  # Do not let one stalled connection hang the whole run
        )
        if response.status_code != 200:
            print(f"Error fetching works for author {author_id}: {response.status_code}")
            break

        data = response.json()
        results = data.get("results") or []  # Handle missing 'results'
        works.extend(_filter_work(work) for work in results if work)
        if not results:
            break

        cursor = (data.get("meta") or {}).get("next_cursor")  # None on the last page
        if cursor:
            time.sleep(1)  # Respect API rate limits
    return works


def _id_and_name(entity):
    """Return the ``id``/``display_name`` pair of an OpenAlex entity, or {} if it is missing."""
    if not entity:
        return {}
    return {
        "id": entity.get("id", ""),
        "display_name": entity.get("display_name", ""),
    }


def _filter_work(work):
    """Reduce one raw OpenAlex work record to the subset of fields stored by this notebook."""
    topic = work.get("primary_topic") or {}
    return {
        "id": work.get("id", ""),
        "title": work.get("title", ""),
        "doi": work.get("doi", ""),
        "publication_year": work.get("publication_year", ""),
        "cited_by_count": work.get("cited_by_count", 0),
        "authorships": [
            {
                # `or {}` also covers an explicit null, which .get(key, {}) does not
                "author": {
                    "id": (auth.get("author") or {}).get("id", ""),
                    "display_name": (auth.get("author") or {}).get("display_name", ""),
                },
                "institutions": [
                    {
                        "id": inst.get("id", ""),
                        "display_name": inst.get("display_name", ""),
                        "country_code": inst.get("country_code", ""),
                        "type": inst.get("type", ""),
                    }
                    for inst in (auth.get("institutions") or []) if inst
                ],
            }
            for auth in (work.get("authorships") or []) if auth
        ],
        # An absent or null primary_topic is stored as an empty dict
        "primary_topic": {
            "id": topic.get("id", ""),
            "display_name": topic.get("display_name", ""),
            "score": topic.get("score", 0),
            "field": _id_and_name(topic.get("field")),
            "subfield": _id_and_name(topic.get("subfield")),
            "domain": _id_and_name(topic.get("domain")),
        } if topic else {},
        "concepts": [
            {
                "id": concept.get("id", ""),
                "display_name": concept.get("display_name", ""),
                "level": concept.get("level", 0),
                "score": concept.get("score", 0),
            }
            for concept in (work.get("concepts") or []) if concept
        ],
        "open_access": work.get("open_access") or {},
        "sustainable_development_goals": [
            {
                "id": sdg.get("id", ""),
                "score": sdg.get("score", 0),
                "display_name": sdg.get("display_name", ""),
            }
            for sdg in (work.get("sustainable_development_goals") or []) if sdg
        ],
        "referenced_works": work.get("referenced_works") or [],
    }

# Load the member list. The file written by the restore step is UTF-8 with
# unescaped non-ASCII characters, so the encoding is given explicitly instead
# of relying on the platform default.
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Load or initialize the progress log (member name -> True if done, False if failed)
try:
    with open(progress_log_file, "r", encoding="utf-8") as log_file:
        progress_log = json.load(log_file)
except FileNotFoundError:
    progress_log = {}

# Results saved by an earlier run, keyed by member name. Skipped members are
# copied from here; otherwise a resumed run would overwrite the output file
# with only the members fetched in this run.
try:
    with open(output_json_file, "r", encoding="utf-8") as previous_file:
        previous_results = {entry["name"]: entry for entry in json.load(previous_file)}
except (FileNotFoundError, json.JSONDecodeError):
    previous_results = {}

# Fetch works for each author and organize them under authors
authors_with_works = []
for member in members:
    name = member["name"]
    openalex_ids = member.get("openalex_id")
    all_works = []

    # Skip only when the log says "done" AND the saved works can be carried
    # over; a member marked done without saved works is fetched again.
    if progress_log.get(name) and name in previous_results:
        print(f"Skipping {name}, already completed.")
        authors_with_works.append(previous_results[name])
        continue

    print(f"Fetching works for {name}...")

    try:
        # openalex_id is either one ID/URL or a list of them; any other value
        # (e.g. None) yields an entry with no works.
        if isinstance(openalex_ids, list):
            id_values = openalex_ids
        elif isinstance(openalex_ids, str):
            id_values = [openalex_ids]
        else:
            id_values = []

        for openalex_id in id_values:
            author_id = openalex_id.split("/")[-1]  # Keep only the last URL segment
            all_works.extend(fetch_author_works(author_id))

        authors_with_works.append({
            "name": name,
            "openalex_id": openalex_ids,
            "works": all_works
        })

        # Mark the author as successfully processed
        progress_log[name] = True

    except Exception as e:
        print(f"Error processing {name}: {e}")
        progress_log[name] = False  # Mark as failed so the next run retries it

    # Save progress after each author
    with open(progress_log_file, "w", encoding="utf-8") as log_file:
        json.dump(progress_log, log_file, indent=4, ensure_ascii=False)

# Save the filtered data as UTF-8 with real characters rather than \uXXXX
# escapes, so names and titles stay readable in the file.
with open(output_json_file, "w", encoding="utf-8") as outfile:
    json.dump(authors_with_works, outfile, indent=4, ensure_ascii=False)
print(f"Filtered works data saved to JSON: {output_json_file}")

# Flatten the nested data into one CSV row per (author, work) pair
csv_data = []
for author in authors_with_works:
    for work in author["works"]:
        topic = work["primary_topic"]
        concept_names = [concept["display_name"] for concept in work["concepts"]]
        sdg_names = [sdg["display_name"] for sdg in work.get("sustainable_development_goals", [])]

        row = {
            "author_name": author["name"],
            "author_openalex_id": author["openalex_id"],
            "work_id": work["id"],
            "work_title": work["title"],
            "doi": work["doi"],
            "publication_year": work["publication_year"],
            "cited_by_count": work["cited_by_count"],
            # Topic levels may be empty dicts, hence the chained .get() calls
            "field": topic.get("field", {}).get("display_name", ""),
            "subfield": topic.get("subfield", {}).get("display_name", ""),
            "domain": topic.get("domain", {}).get("display_name", ""),
            # List-valued fields are collapsed into comma-separated text
            "concepts": ", ".join(concept_names),
            "open_access_status": work.get("open_access", {}).get("oa_status", ""),
            "sustainable_development_goals": ", ".join(sdg_names),
            "referenced_works_count": len(work.get("referenced_works", [])),
        }
        csv_data.append(row)

# Build the table once from all rows and write it without the index column
works_frame = pd.DataFrame(csv_data)
works_frame.to_csv(output_csv_file, index=False)
print(f"Filtered works data saved to CSV: {output_csv_file}")

Fetching works for Wil van der Aalst...
Fetching works for Kees Aarts...
Fetching works for Gerard Acket...
Fetching works for Remieg Aerts...
Fetching works for Conny Aerts...
Fetching works for Reuven Agami...
Fetching works for Frits Agterberg...
Fetching works for Takuzo Aida...
Fetching works for Anna Akhmanova...
Fetching works for Barbara Aland...
Fetching works for Tjeerd van Albada...
Fetching works for André Aleman...
Fetching works for Keimpe Algra...
Fetching works for Maurits Allessie...
Fetching works for Sigurd Angenent...
Fetching works for Frank Ankersmit...
Fetching works for Isabel Arends...
Fetching works for Ad van der Avoird...
Fetching works for Hein de Baar...
Fetching works for Thomas Bäck...
Fetching works for Jan Baerends...
Fetching works for Marian Bakermans-Kranenburg...
Fetching works for Jan Albert Bakker...
Fetching works for Huib Bakker...
Fetching works for Egbert Bakker...
Fetching works for Erik Bakkers...
Fetching works for Henk Barendregt...
Fetch

In [None]:
import json

# Locations of the damaged input and of the repaired copy
input_file = "/content/knaw_works_filtered.json"  # Point this at your own damaged JSON file
output_file = "knaw_works_filtered_restored.json"  # The repaired copy is written here

# Read the damaged file as UTF-8 text and parse it into Python objects
with open(input_file, mode="r", encoding="utf-8") as infile:
    data = json.loads(infile.read())

# Repair mojibake: strings whose UTF-8 bytes were mistakenly decoded as Latin-1
def decode_unicode(data):
    """Recursively repair mis-decoded strings inside a JSON-like structure.

    A string is repaired by encoding it as Latin-1 and decoding the resulting
    bytes as UTF-8. When either step fails, the string is returned unchanged.

    Args:
        data: A dict, list, str, or any other value.

    Returns:
        For a dict, a new dict with the same keys and repaired values (keys
        themselves are not touched); for a list, a new list of repaired
        items; for a str, the repaired or original string; any other value
        is returned as-is.
    """
    if isinstance(data, str):
        try:
            raw_bytes = data.encode('latin1')
            return raw_bytes.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a Latin-1/UTF-8 mix-up: keep the text exactly as it was
            return data

    if isinstance(data, dict):
        repaired = {}
        for key, value in data.items():
            repaired[key] = decode_unicode(value)
        return repaired

    if isinstance(data, list):
        return list(map(decode_unicode, data))

    return data

# Walk the parsed structure and repair every string value
restored_data = decode_unicode(data)

# Write the repaired structure as UTF-8, keeping non-ASCII characters unescaped
with open(output_file, mode="w", encoding="utf-8") as outfile:
    json.dump(obj=restored_data, fp=outfile, indent=4, ensure_ascii=False)

print(f"Restored file saved to {output_file}")

Restored file saved to knaw_works_filtered_restored.json


In [None]:
import pandas as pd

# Input and output file paths
input_file = "/content/knaw_works_filtered.csv"  # CSV to repair; replace with your own file path
output_file = "knaw_works_filtered_restored.csv"  # The repaired copy is written here

# Load the CSV to repair into a DataFrame
df = pd.read_csv(input_file)

# Repair mojibake in a single cell: text whose UTF-8 bytes were mistakenly decoded as Latin-1
def decode_unicode_string(value):
    """Repair one mis-decoded cell value.

    Args:
        value: A cell value of any type.

    Returns:
        For a str, the result of encoding it as Latin-1 and decoding those
        bytes as UTF-8, or the original string when either step fails.
        Non-string values are returned unchanged.
    """
    # Non-strings (numbers, missing values, ...) pass straight through
    if not isinstance(value, str):
        return value

    try:
        raw_bytes = value.encode('latin1')
        return raw_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not a Latin-1/UTF-8 mix-up: keep the text exactly as it was
        return value

# Run the repair over every cell of the object (string) columns; other columns are left alone
for col in df.columns:
    column = df[col]
    if column.dtype != 'object':
        continue
    df[col] = column.apply(decode_unicode_string)

# Write the repaired table without the index column
df.to_csv(output_file, index=False)

print(f"Restored file saved to {output_file}")

Restored file saved to knaw_works_filtered_restored.csv
