In [None]:
pip install requests beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Root of the Austrian Academy website; profile links are resolved against it
base_url = "https://www.oeaw.ac.at"

# Members listing page (English version), derived from the site root
members_page_url = f"{base_url}/en/members"

# Function to scrape all members from the main page
def scrape_members():
    """Scrape the members listing page and return one record per member.

    Fetches ``members_page_url`` and inspects every ``<li>`` element. Items
    that contain an ``<a class="before-arrow-right-icon">`` link with an
    ``href`` yield one record.

    Returns:
        list[dict]: Records with the keys ``"name"``, ``"degree"`` and
        ``"profile_url"``. Name and degree fall back to ``"Not available"``
        when the corresponding ``<span>`` is missing. On any error the
        message is printed and the records collected so far (possibly none)
        are returned; the function does not raise.
    """
    from urllib.parse import urljoin

    members = []
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(members_page_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Find all member entries in the list
        for member_item in soup.find_all("li"):
            link_tag = member_item.find("a", class_="before-arrow-right-icon")
            if not link_tag:
                continue

            href = link_tag.get("href")
            if not href:
                # A link without a target cannot lead to a profile page.
                continue

            # urljoin copes with absolute URLs, root-relative paths and
            # page-relative paths, unlike plain string concatenation.
            profile_url = urljoin(members_page_url, href)
            degree_tag = link_tag.find("span", class_="degree")
            name_tag = link_tag.find("span", class_="name text-bold")

            # Extract name and degree, fallback to "Not available" if missing
            name = name_tag.get_text(strip=True) if name_tag else "Not available"
            degree = degree_tag.get_text(strip=True) if degree_tag else "Not available"

            members.append({"name": name, "degree": degree, "profile_url": profile_url})
    except Exception as e:
        # Deliberate best-effort: report the problem and return what we have.
        print(f"Error fetching members: {e}")
    return members

# Function to scrape additional details from the profile page
def scrape_profile_details(member):
    """Add ``"expertise"`` and ``"affiliations"`` to a member record in place.

    Args:
        member (dict): Record with at least ``"profile_url"``; ``"name"`` is
            used only in the error message.

    Returns:
        None. ``member`` is mutated. Both keys are always present afterwards:
        they hold the scraped text, or ``"Not available"`` when the element is
        missing. If the request fails, values already present are kept and
        missing ones are set to ``"Not available"``.
    """
    # Seed the defaults first so a failed request still leaves every record
    # with the same set of keys.
    member.setdefault("expertise", "Not available")
    member.setdefault("affiliations", "Not available")

    try:
        # A timeout keeps one stalled profile from blocking the whole run.
        response = requests.get(member["profile_url"], timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Example: Scraping expertise or other details (update as per the profile page structure)
        expertise_tag = soup.find("div", class_="field-expertise")  # Update with actual class or ID
        affiliations_tag = soup.find("div", class_="field-affiliations")  # Update with actual class or ID

        # Extract text content
        member["expertise"] = expertise_tag.get_text(strip=True) if expertise_tag else "Not available"
        member["affiliations"] = affiliations_tag.get_text(strip=True) if affiliations_tag else "Not available"
    except Exception as e:
        # Best-effort enrichment: report and continue with the defaults.
        print(f"Error fetching profile details for {member.get('name', 'Unknown')}: {e}")

# Collect the basic record of every member from the listing page
members_data = scrape_members()

# Enrich each record in place with details from its profile page
for member in members_data:
    scrape_profile_details(member)
    time.sleep(1)  # pause between consecutive profile requests

# Serialise the records as indented JSON, keeping non-ASCII characters readable
json_output_path = "austrian_academy_members.json"
with open(json_output_path, "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(members_data, indent=4, ensure_ascii=False))

print(f"Data saved to {json_output_path}")

Data saved to austrian_academy_members.json


In [None]:
import json

# Path to the members JSON file. It is relative to the working directory so it
# resolves to the same location the scraping cell wrote to, instead of being
# tied to Colab's /content directory.
file_path = "austrian_academy_members.json"

# Load the JSON data
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Count the number of members
number_of_members = len(data)

print(f"Total number of members: {number_of_members}")

Total number of members: 950


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time

# Load the JSON file containing member profile links. The path is relative so
# it matches where the scraping cell saved the file, on Colab or elsewhere.
members_file_path = "austrian_academy_members.json"

# Read the JSON file
with open(members_file_path, "r", encoding="utf-8") as file:
    members_data = json.load(file)

base_url = "https://www.oeaw.ac.at"  # Ensure correct base URL


def _text_or_na(soup, tag, css_class=None):
    """Return the stripped text of the first matching element, or "N/A"."""
    element = soup.find(tag, class_=css_class) if css_class else soup.find(tag)
    return element.get_text(strip=True) if element else "N/A"


# List to store the enhanced member data
enhanced_members_data = []

for member in members_data:
    raw_url = member.get("profile_url")
    if not raw_url:
        # Nothing to fetch; report it instead of crashing the whole run.
        print(f"Error fetching data for {member.get('name', 'Unknown')}: missing profile_url")
        continue

    try:
        # Stored URLs are normally absolute; prefix the base URL otherwise.
        profile_url = raw_url if raw_url.startswith("http") else f"{base_url}{raw_url}"
        response = requests.get(profile_url, timeout=30)
        response.raise_for_status()  # Ensure the request was successful

        soup = BeautifulSoup(response.content, "html.parser")

        name = _text_or_na(soup, "h1")
        title = _text_or_na(soup, "span", "degree")
        affiliation = _text_or_na(soup, "ul", "medium-space-between")
        # Membership type (expertise)
        membership_type = _text_or_na(soup, "p", "membertype-statement")

        # Append the data
        enhanced_members_data.append({
            "name": name,
            "title": title,
            "affiliation": affiliation,
            "membership_type": membership_type,
            "profile_url": profile_url
        })

        print(f"Scraped data for: {name}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {member.get('name', 'Unknown')}: {e}")

    finally:
        # Sleep after every request, including failed ones, so a run of
        # errors does not turn into a burst of back-to-back requests.
        time.sleep(1)

# Save the enhanced data to a new JSON file in the current directory
output_file_name = "austrian_academy_members_enhanced.json"

with open(output_file_name, "w", encoding="utf-8") as output_file:
    json.dump(enhanced_members_data, output_file, indent=4, ensure_ascii=False)

print(f"Enhanced data saved to {output_file_name}")

Scraped data for: Rainer Abart
Scraped data for: Karl Acham
Scraped data for: Antal Ádám
Scraped data for: Ludwig Adamovich
Scraped data for: Timon Erik Adolph
Scraped data for: Alícia Adserà
Scraped data for: Adriano Aguzzi
Scraped data for: Josef Aicher
Scraped data for: Martin Aigner
Scraped data for: Luciana Aigner-Foresti
Scraped data for: Brenda Almond
Scraped data for: Michael Alram
Scraped data for: Eva Alram-Stern
Scraped data for: Carlos Alvar Ezquerra
Scraped data for: João Alves
Scraped data for: Michel Amandry
Scraped data for: Petra Amann
Scraped data for: Brigitta Ammann
Scraped data for: Angelika Amon
Scraped data for: Bertil Andersson
Scraped data for: Bernard Andreae
Scraped data for: Anaïs  Angelo
Scraped data for: Flavio Anselmetti
Scraped data for: Leah Armstrong
Scraped data for: Markus Arndt
Scraped data for: Eduard Arzt
Scraped data for: Alain Aspect
Scraped data for: Markus Aspelmeyer
Scraped data for: Aleida Assmann
Scraped data for: Alice Auersperg
Scraped da

In [None]:
import requests
import json
import time

# Input: enhanced members JSON written by the profile-scraping cell.
# Relative to the working directory, matching where that cell saved it.
input_file_path = "austrian_academy_members_enhanced.json"

# Output file paths
output_file_path = "austrian_academy_members_author_ids.json"  # JSON output file

# Function to fetch OpenAlex data for an author by name
def fetch_openalex_data(name):
    """Look up an author by display name on OpenAlex and return key metrics.

    Args:
        name (str): Author name passed to the ``display_name.search`` filter.

    Returns:
        dict: Keys ``"openalex_id"``, ``"works_count"``, ``"cited_by_count"``,
        ``"h_index"`` and ``"i10_index"``, taken from the first search result.
        All values are ``None`` when there is no result or the request fails
        (the error is printed).
    """
    empty_result = {
        "openalex_id": None,
        "works_count": None,
        "cited_by_count": None,
        "h_index": None,
        "i10_index": None,
    }

    try:
        # Passing the filter via `params` lets requests URL-encode the name,
        # so characters such as "&" or "#" cannot corrupt the query string.
        response = requests.get(
            "https://api.openalex.org/authors",
            params={"filter": f"display_name.search:{name}"},
            timeout=30,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers an undecodable JSON body on older requests versions.
        print(f"Error fetching OpenAlex data for {name}: {e}")
        return empty_result

    results = data.get("results") or []
    if not results:
        return empty_result

    author = results[0]
    # OpenAlex nests h_index / i10_index under "summary_stats"; the top-level
    # lookup is kept only as a fallback.
    stats = author.get("summary_stats") or {}
    return {
        "openalex_id": author.get("id"),
        "works_count": author.get("works_count"),
        "cited_by_count": author.get("cited_by_count"),
        "h_index": stats.get("h_index", author.get("h_index")),
        "i10_index": stats.get("i10_index", author.get("i10_index")),
    }

# Read the input JSON file
with open(input_file_path, "r", encoding="utf-8") as file:
    members_data = json.load(file)

# List to store enriched member data
enriched_members_data = []

# Iterate through each member and fetch OpenAlex data
for member in members_data:
    name = member.get("name", "N/A")
    print(f"Fetching OpenAlex data for: {name}")

    # Fetch OpenAlex data
    openalex_data = fetch_openalex_data(name)

    # Append OpenAlex data to the member record
    enriched_member = {
        "name": name,
        # The enhanced input file stores this value under "membership_type",
        # so fall back to it instead of always emitting "N/A".
        "expertise": member.get("expertise", member.get("membership_type", "N/A")),
        **openalex_data  # Merge OpenAlex data
    }
    enriched_members_data.append(enriched_member)

    # Respect API rate limits
    time.sleep(1)

# Save the enriched data to a new JSON file
with open(output_file_path, "w", encoding="utf-8") as output_file:
    json.dump(enriched_members_data, output_file, indent=4, ensure_ascii=False)

print(f"Data with OpenAlex details saved to {output_file_path}")

Fetching OpenAlex data for: Rainer Abart
Fetching OpenAlex data for: Karl Acham
Fetching OpenAlex data for: Antal Ádám
Fetching OpenAlex data for: Ludwig Adamovich
Fetching OpenAlex data for: Timon Erik Adolph
Fetching OpenAlex data for: Alícia Adserà
Fetching OpenAlex data for: Adriano Aguzzi
Fetching OpenAlex data for: Josef Aicher
Fetching OpenAlex data for: Martin Aigner
Fetching OpenAlex data for: Luciana Aigner-Foresti
Fetching OpenAlex data for: Brenda Almond
Fetching OpenAlex data for: Michael Alram
Fetching OpenAlex data for: Eva Alram-Stern
Fetching OpenAlex data for: Carlos Alvar Ezquerra
Fetching OpenAlex data for: João Alves
Fetching OpenAlex data for: Michel Amandry
Fetching OpenAlex data for: Petra Amann
Fetching OpenAlex data for: Brigitta Ammann
Fetching OpenAlex data for: Angelika Amon
Fetching OpenAlex data for: Bertil Andersson
Fetching OpenAlex data for: Bernard Andreae
Fetching OpenAlex data for: Anaïs  Angelo
Fetching OpenAlex data for: Flavio Anselmetti
Fetching

In [None]:
import requests
import json
import time

# Input and output file paths. The input is relative to the working directory,
# matching where the profile-scraping cell saved it.
input_file_path = "austrian_academy_members_enhanced.json"
output_file_path = "austrian_academy_members_author_ids_enhanced.json"

# OpenAlex API endpoint
openalex_base_url = "https://api.openalex.org/authors"

# Function to fetch OpenAlex data
def get_openalex_metrics(name):
    """Search OpenAlex for an author and return the first match's metrics.

    Args:
        name (str): Author name sent as the ``search`` parameter to
            ``openalex_base_url``.

    Returns:
        dict: Keys ``"openalex_id"``, ``"works_count"``, ``"cited_by_count"``,
        ``"h_index"`` and ``"i10_index"``. Counts default to 0 when the field
        is absent from a matched author. All values are ``None`` when nothing
        matches or when any error occurs (the error is printed).
    """
    no_match = {
        "openalex_id": None,
        "works_count": None,
        "cited_by_count": None,
        "h_index": None,
        "i10_index": None
    }

    try:
        # Query OpenAlex API to find the author; the timeout keeps a stalled
        # connection from blocking the whole enrichment loop.
        response = requests.get(openalex_base_url, params={"search": name}, timeout=30)
        response.raise_for_status()
        results = response.json()

        matches = results.get("results") or []
        if not matches:
            return no_match

        # Take the first match (or apply additional filters if necessary)
        author = matches[0]
        # h_index / i10_index live under "summary_stats" in OpenAlex author
        # objects; reading them at the top level always produced the default.
        stats = author.get("summary_stats") or {}
        return {
            "openalex_id": author.get("id"),
            "works_count": author.get("works_count", 0),
            "cited_by_count": author.get("cited_by_count", 0),
            "h_index": stats.get("h_index", author.get("h_index", 0)),
            "i10_index": stats.get("i10_index", author.get("i10_index", 0))
        }
    except Exception as e:
        print(f"Error fetching OpenAlex data for {name}: {e}")
        return no_match

# Read the member records produced by the profile-scraping step
with open(input_file_path, "r", encoding="utf-8") as file:
    members_data = json.load(file)

# One combined record per member: academy fields first, OpenAlex metrics after
enriched_data = []

for member in members_data:
    print(f"Fetching OpenAlex data for {member['name']}...")
    openalex_metrics = get_openalex_metrics(member["name"])

    # Copy the academy fields (None when a field is absent from the record)
    enriched_member = {
        field: member.get(field)
        for field in ("name", "title", "affiliation", "membership_type")
    }
    # Append the OpenAlex metrics in the order used by the output file
    for metric in ("works_count", "cited_by_count", "h_index", "i10_index", "openalex_id"):
        enriched_member[metric] = openalex_metrics[metric]

    enriched_data.append(enriched_member)
    time.sleep(1)  # Respect API rate limits

# Write the combined records as indented, human-readable JSON
with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(json.dumps(enriched_data, indent=4, ensure_ascii=False))

print(f"Enriched data saved to {output_file_path}")

Fetching OpenAlex data for Rainer Abart...
Fetching OpenAlex data for Karl Acham...
Fetching OpenAlex data for Antal Ádám...
Fetching OpenAlex data for Ludwig Adamovich...
Fetching OpenAlex data for Timon Erik Adolph...
Fetching OpenAlex data for Alícia Adserà...
Fetching OpenAlex data for Adriano Aguzzi...
Fetching OpenAlex data for Josef Aicher...
Fetching OpenAlex data for Martin Aigner...
Fetching OpenAlex data for Luciana Aigner-Foresti...
Fetching OpenAlex data for Brenda Almond...
Fetching OpenAlex data for Michael Alram...
Fetching OpenAlex data for Eva Alram-Stern...
Fetching OpenAlex data for Carlos Alvar Ezquerra...
Fetching OpenAlex data for João Alves...
Fetching OpenAlex data for Michel Amandry...
Fetching OpenAlex data for Petra Amann...
Fetching OpenAlex data for Brigitta Ammann...
Fetching OpenAlex data for Angelika Amon...
Fetching OpenAlex data for Bertil Andersson...
Fetching OpenAlex data for Bernard Andreae...
Fetching OpenAlex data for Anaïs  Angelo...
Fetching O

In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths. The input is relative to the working directory,
# matching where the profile-scraping cell saved it.
input_file_path = "austrian_academy_members_enhanced.json"
output_file_path = "austrian_academy_author_ids.json"

# Load members JSON
with open(input_file_path, 'r', encoding='utf-8') as file:
    members = json.load(file)

# OpenAlex API endpoint
openalex_base_url = "https://api.openalex.org/authors"

# Function to query OpenAlex
def get_openalex_author_id(name):
    """Search OpenAlex for an author name and return the first match.

    Args:
        name (str): Author name sent as the ``search`` parameter to
            ``openalex_base_url``.

    Returns:
        dict | None: ``{"author_id", "name", "works_count", "cited_by_count"}``
        for the first search result, or ``None`` when there is no result or
        any error occurs (the error is printed). The first result is taken
        as-is; no disambiguation between same-named authors is attempted.
    """
    try:
        # The timeout keeps a stalled connection from blocking the whole loop.
        response = requests.get(openalex_base_url, params={"search": name}, timeout=30)
        response.raise_for_status()  # Raise an exception for HTTP errors
        results = response.json()

        matches = results.get("results") or []
        if not matches:
            return None  # No match found

        # Take the first match (or apply more filters if needed)
        first_result = matches[0]
        return {
            "author_id": first_result.get("id"),
            "name": first_result.get("display_name"),
            "works_count": first_result.get("works_count", 0),
            "cited_by_count": first_result.get("cited_by_count", 0),
        }
    except Exception as e:
        print(f"Error fetching author ID for {name}: {e}")
        return None

# Look up every member on OpenAlex and build one output record per member
results = []
for member in members:
    print(f"Fetching OpenAlex Author ID for {member['name']}...")
    author_data = get_openalex_author_id(member["name"])

    # Academy fields are identical whether or not OpenAlex returned a match
    record = {
        "name": member["name"],
        "title": member.get("title", None),
        "affiliation": member.get("affiliation", None),
        "membership_type": member.get("membership_type", None),
        "profile_url": member.get("profile_url", None),
    }
    if author_data:
        record["openalex_id"] = author_data["author_id"]
        record["works_count"] = author_data["works_count"]
        record["cited_by_count"] = author_data["cited_by_count"]
    else:
        # No match: keep the same keys so every record has the same shape
        record["openalex_id"] = None
        record["works_count"] = None
        record["cited_by_count"] = None
    results.append(record)

    # Respect API rate limits
    time.sleep(1)  # Adjust delay if needed

# Write the records as indented JSON
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write(json.dumps(results, ensure_ascii=False, indent=4))

print(f"Author IDs saved to '{output_file_path}'.")

# Also export a CSV copy for easier inspection
author_ids_frame = pd.DataFrame(results)
author_ids_frame.to_csv("austrian_academy_author_ids.csv", index=False)
print("Author IDs also saved to 'austrian_academy_author_ids.csv'.")

Fetching OpenAlex Author ID for Rainer Abart...
Fetching OpenAlex Author ID for Karl Acham...
Fetching OpenAlex Author ID for Antal Ádám...
Fetching OpenAlex Author ID for Ludwig Adamovich...
Fetching OpenAlex Author ID for Timon Erik Adolph...
Fetching OpenAlex Author ID for Alícia Adserà...
Fetching OpenAlex Author ID for Adriano Aguzzi...
Fetching OpenAlex Author ID for Josef Aicher...
Fetching OpenAlex Author ID for Martin Aigner...
Fetching OpenAlex Author ID for Luciana Aigner-Foresti...
Fetching OpenAlex Author ID for Brenda Almond...
Fetching OpenAlex Author ID for Michael Alram...
Fetching OpenAlex Author ID for Eva Alram-Stern...
Fetching OpenAlex Author ID for Carlos Alvar Ezquerra...
Fetching OpenAlex Author ID for João Alves...
Fetching OpenAlex Author ID for Michel Amandry...
Fetching OpenAlex Author ID for Petra Amann...
Fetching OpenAlex Author ID for Brigitta Ammann...
Fetching OpenAlex Author ID for Angelika Amon...
Fetching OpenAlex Author ID for Bertil Andersson...


In [None]:
import requests
import json
import time

# Input and output file paths. The input is relative to the working directory,
# matching where the author-ID cell saved it.
input_file = "austrian_academy_author_ids.json"
output_file = "austrian_academy_members_with_indices.json"  # File to save the updated data

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch H-index and I10-index for an author
def fetch_author_metrics(author_id):
    """Fetch the h-index and i10-index of one OpenAlex author.

    Args:
        author_id (str): OpenAlex author identifier appended to
            ``{openalex_base_url}/authors/``.

    Returns:
        tuple: ``(h_index, i10_index)`` read from the response's
        ``summary_stats`` object. Either value is ``None`` when absent.
        ``(None, None)`` is returned, after printing a message, on a network
        error, a non-200 status or an undecodable body.
    """
    url = f"{openalex_base_url}/authors/{author_id}"
    try:
        # Catch network errors here so one bad request cannot abort the
        # caller's loop over all members.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for author {author_id}: {e}")
        return None, None

    if response.status_code != 200:
        print(f"Error fetching data for author {author_id}: {response.status_code}")
        return None, None

    try:
        data = response.json()
    except ValueError as e:
        print(f"Error fetching data for author {author_id}: {e}")
        return None, None

    # "summary_stats" may be missing or null; treat both as "no stats".
    stats = data.get("summary_stats") or {}
    return stats.get("h_index", None), stats.get("i10_index", None)

# Load the existing JSON file. The encoding is explicit so non-ASCII names are
# decoded the same way on every platform instead of using the locale default.
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Process each member and fetch H-index and I10-index
for member in members:
    openalex_ids = member.get("openalex_id")

    # A single ID is handled as a one-element list so both shapes share one path.
    if isinstance(openalex_ids, str):
        openalex_ids = [openalex_ids]

    if not isinstance(openalex_ids, list):
        print(f"No OpenAlex ID for {member['name']}. Skipping...")
        member["h_index"] = None
        member["i10_index"] = None
        continue

    h_indices = []
    i10_indices = []
    for openalex_id in openalex_ids:
        author_id = openalex_id.split("/")[-1]  # Extract author ID
        h_index, i10_index = fetch_author_metrics(author_id)
        if h_index is not None:
            h_indices.append(h_index)
        if i10_index is not None:
            i10_indices.append(i10_index)
        time.sleep(1)  # Respect API rate limits

    # Aggregate indices across IDs by taking the maximum (None if nothing was fetched)
    member["h_index"] = max(h_indices) if h_indices else None
    member["i10_index"] = max(i10_indices) if i10_indices else None

# Save the updated JSON with indices; UTF-8 with readable non-ASCII characters,
# consistent with the files written by the earlier cells.
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(members, outfile, indent=4, ensure_ascii=False)

print(f"H-index and I10-index added and saved to {output_file}")

No OpenAlex ID for Paul Albert Blanz. Skipping...
No OpenAlex ID for Georg Peter Braulik. Skipping...
No OpenAlex ID for Alejandro Raul Burga Ramos. Skipping...
No OpenAlex ID for Igor Bert Dawid. Skipping...
No OpenAlex ID for Ernst Peter Michael Dronke. Skipping...
No OpenAlex ID for Günter Bernhard L. Fettweis. Skipping...
No OpenAlex ID for Monika Gehrig-Merz. Skipping...
No OpenAlex ID for Karl-Heinz Alois Glaßmeier. Skipping...
No OpenAlex ID for Hermann Franz Haupt. Skipping...
No OpenAlex ID for Ivo Ludwig Hofacker. Skipping...
No OpenAlex ID for Reinhard Franz Josef Hüttl. Skipping...
No OpenAlex ID for Stephen Ira Katz. Skipping...
No OpenAlex ID for Anke Rita Kaysser-Pyzalla. Skipping...
No OpenAlex ID for Bernd Ulrich Kluge. Skipping...
No OpenAlex ID for Ernst August Kramer. Skipping...
No OpenAlex ID for Vasileios Lambrinoudakis. Skipping...
No OpenAlex ID for Gene Elden Likens. Skipping...
No OpenAlex ID for Richard Viktor Mattessich. Skipping...
No OpenAlex ID for Georg

In [None]:
import requests
import json
import time
import os

# Input and output file paths. The input is relative to the working directory,
# matching where the indices cell saved it.
input_file = "austrian_academy_members_with_indices.json"
output_file = "austrian_academy_works.json"  # File to save the detailed works data
progress_file = "progress_log2.json"  # Save progress to resume if interrupted

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch all works of one OpenAlex author, following cursor pagination.

    Args:
        author_id (str): OpenAlex author identifier used in the
            ``author.id`` filter of the ``/works`` endpoint.

    Returns:
        list[dict]: Work objects exactly as returned by the API. If a request
        fails, the error is printed and the works collected so far are
        returned, so the list may be incomplete.
    """
    works = []
    # Cursor paging: start with "*" and follow meta.next_cursor until it is
    # null. The previous check for a "next" key in meta never matched, so
    # only the first 200 works were ever fetched.
    cursor = "*"
    page = 1  # used only for error messages
    while cursor:
        try:
            response = requests.get(
                f"{openalex_base_url}/works",
                params={
                    "filter": f"author.id:{author_id}",
                    "per-page": 200,
                    "cursor": cursor,
                },
                timeout=30,
            )
            response.raise_for_status()  # Raise an exception for HTTP errors
            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Error fetching works for author {author_id} on page {page}: {e}")
            break

        results = data.get("results") or []
        works.extend(results)

        cursor = (data.get("meta") or {}).get("next_cursor")
        if not results or not cursor:
            break

        page += 1
        time.sleep(1)  # Respect API rate limits
    return works

# Number of newly fetched authors between two on-disk checkpoints.
CHECKPOINT_EVERY = 25


def _load_json(path, default):
    """Return the parsed JSON content of `path`, or `default` if it does not exist."""
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return default


def _save_checkpoint(works, processed):
    """Write the works file, then the progress log.

    The works are written first so the progress log never lists an author
    whose works are not on disk.
    """
    with open(output_file, "w", encoding="utf-8") as handle:
        json.dump(works, handle, indent=4)
    with open(progress_file, "w", encoding="utf-8") as handle:
        json.dump(processed, handle, indent=4)


# Resume or start fresh. A progress log is only usable together with the works
# saved at the same checkpoint; otherwise already "processed" authors would be
# skipped and their works silently lost.
processed_authors = _load_json(progress_file, {})
if processed_authors and not os.path.exists(output_file):
    print(f"Found {progress_file} but no {output_file}; starting over.")
    processed_authors = {}
all_works = _load_json(output_file, []) if processed_authors else []

# Load the existing JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Fetch works for each author
authors_since_checkpoint = 0
for member in members:
    openalex_ids = member.get("openalex_id")

    # A single ID is handled as a one-element list so both shapes share one path.
    if isinstance(openalex_ids, str):
        openalex_ids = [openalex_ids]

    if not isinstance(openalex_ids, list):
        print(f"No OpenAlex ID for {member['name']}. Skipping...")
        continue

    for openalex_id in openalex_ids:
        author_id = openalex_id.split("/")[-1]  # Extract author ID
        if author_id in processed_authors:
            continue

        print(f"Fetching works for {member['name']} ({author_id})...")
        works = fetch_author_works(author_id)
        all_works.extend(works)
        processed_authors[author_id] = True

        authors_since_checkpoint += 1
        if authors_since_checkpoint >= CHECKPOINT_EVERY:
            _save_checkpoint(all_works, processed_authors)
            authors_since_checkpoint = 0

# Save the works data to a JSON file
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(all_works, outfile, indent=4)

print(f"Works data with full details saved to {output_file}.")

# Reaching this point means every member was visited, so the progress log is
# no longer needed. (Comparing the number of processed author IDs with the
# number of members never matched when some members have no OpenAlex ID.)
if os.path.exists(progress_file):
    os.remove(progress_file)

Fetching works for Rainer Abart (A5011048858)...
Fetching works for Karl Acham (A5033657966)...
Fetching works for Antal Ádám (A5112833742)...
Fetching works for Ludwig Adamovich (A5023385566)...
Fetching works for Timon Erik Adolph (A5036264788)...
Fetching works for Alícia Adserà (A5041559404)...
Fetching works for Adriano Aguzzi (A5026438561)...
Fetching works for Josef Aicher (A5051024997)...
Fetching works for Martin Aigner (A5012929183)...
Fetching works for Luciana Aigner-Foresti (A5002107112)...
Fetching works for Brenda Almond (A5113839426)...
Fetching works for Michael Alram (A5000633744)...
Fetching works for Eva Alram-Stern (A5006887627)...
Fetching works for Carlos Alvar Ezquerra (A5051009435)...
Fetching works for João Alves (A5015547149)...
Fetching works for Michel Amandry (A5001874471)...
Fetching works for Petra Amann (A5071971627)...
Fetching works for Brigitta Ammann (A5073355683)...
Fetching works for Angelika Amon (A5006607755)...
Fetching works for Bertil Anderss

In [None]:
import json

# Input and output file paths. The input is relative to the working directory,
# matching where the indices cell saved it.
input_file = "austrian_academy_members_with_indices.json"  # JSON file with distorted text
output_file = "austrian_academy_members_with_indices_restored.json"  # Restored output file

# Load the distorted JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    data = json.load(infile)

# Decode Unicode escape sequences
def decode_unicode(data):
    """Recursively repair strings whose UTF-8 bytes were decoded as Latin-1.

    Dict values and list items are processed recursively; dict keys are left
    untouched. A string is re-encoded as Latin-1 and decoded as UTF-8; if
    either step fails, the string is returned unchanged. Values of any other
    type are returned as they are.
    """
    if isinstance(data, str):
        try:
            raw_bytes = data.encode('latin1')
            return raw_bytes.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a Latin-1/UTF-8 mix-up: keep the text as it is
            return data

    if isinstance(data, list):
        return [decode_unicode(element) for element in data]

    if isinstance(data, dict):
        return {field: decode_unicode(content) for field, content in data.items()}

    return data

# Repair every string in the loaded structure
restored_data = decode_unicode(data)

# Write the repaired structure as indented JSON with readable non-ASCII text
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(restored_data, ensure_ascii=False, indent=4))

print(f"Restored file saved to {output_file}")

Restored file saved to austrian_academy_members_with_indices_restored.json


In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths. The input is relative to the working directory,
# matching where the restore cell saved it.
input_file = "austrian_academy_members_with_indices_restored.json"
output_json_file = "austrian_academy_works_filtered.json"  # JSON file output
output_csv_file = "austrian_academy_works_filtered.csv"  # CSV file output
# NOTE(review): this is the same file name the earlier works cell uses for its
# progress log; confirm that a leftover log from that cell cannot cause
# authors to be skipped here.
progress_log_file = "progress_log2.json"  # Log file to track progress

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Helpers that trim an OpenAlex work record down to the fields kept in the output
def _id_and_name(entity):
    """Return the ``id`` / ``display_name`` pair of an OpenAlex entity dict."""
    return {
        "id": entity.get("id", ""),
        "display_name": entity.get("display_name", ""),
    }


def _filter_primary_topic(topic):
    """Reduce a ``primary_topic`` object; a missing or null topic becomes ``{}``."""
    if not topic:
        return {}

    def level(key):
        # field / subfield / domain may be absent or null on a topic
        node = topic.get(key)
        return _id_and_name(node) if node else {}

    return {
        "id": topic.get("id", ""),
        "display_name": topic.get("display_name", ""),
        "score": topic.get("score", 0),
        "field": level("field"),
        "subfield": level("subfield"),
        "domain": level("domain"),
    }


def _filter_work(work):
    """Keep only the fields of one work record that the output files use."""
    return {
        "id": work.get("id", ""),
        "title": work.get("title", ""),
        "doi": work.get("doi", ""),
        "publication_year": work.get("publication_year", ""),
        "cited_by_count": work.get("cited_by_count", 0),
        "authorships": [
            {
                # `or {}` / `or []` also cover keys that are present but null
                "author": _id_and_name(auth.get("author") or {}),
                "institutions": [
                    {
                        **_id_and_name(inst),
                        "country_code": inst.get("country_code", ""),
                        "type": inst.get("type", ""),
                    }
                    for inst in (auth.get("institutions") or [])
                    if inst
                ],
            }
            for auth in (work.get("authorships") or [])
            if auth
        ],
        "primary_topic": _filter_primary_topic(work.get("primary_topic")),
        "concepts": [
            {
                "id": concept.get("id", ""),
                "display_name": concept.get("display_name", ""),
                "level": concept.get("level", 0),
                "score": concept.get("score", 0),
            }
            for concept in (work.get("concepts") or [])
            if concept
        ],
        "open_access": work.get("open_access") or {},
        "sustainable_development_goals": [
            {
                "id": sdg.get("id", ""),
                "score": sdg.get("score", 0),
                "display_name": sdg.get("display_name", ""),
            }
            for sdg in (work.get("sustainable_development_goals") or [])
            if sdg
        ],
        "referenced_works": work.get("referenced_works") or [],
    }


# Function to fetch works for an author
def fetch_author_works(author_id):
    """Fetch all works of one OpenAlex author, reduced to the fields we keep.

    Requests ``/works`` filtered by ``author.id`` with 200 records per call
    and follows OpenAlex cursor paging until the result set is exhausted.

    Args:
        author_id: OpenAlex author key such as ``"A5071971627"``.

    Returns:
        A list of filtered work dicts (see ``_filter_work``). When a request
        returns a non-200 status, an error line is printed and the works
        collected up to that point are returned; no exception is raised for
        HTTP error statuses.
    """
    works = []
    # "*" asks OpenAlex for the first page of a cursor-paged result set. Each
    # response carries the cursor of the following page in meta.next_cursor.
    # (Checking for a "next" key in meta never matches, which would end the
    # loop after the first 200 works.)
    cursor = "*"
    while cursor:
        response = requests.get(
            f"{openalex_base_url}/works",
            params={
                "filter": f"author.id:{author_id}",
                "per-page": 200,
                "cursor": cursor,
            },
            timeout=30,  # do not hang forever on a stalled connection
        )
        if response.status_code != 200:
            print(f"Error fetching works for author {author_id}: {response.status_code}")
            break

        data = response.json()
        results = data.get("results") or []  # Handle missing 'results'
        if not results:
            break
        works.extend(_filter_work(work) for work in results if work)

        cursor = (data.get("meta") or {}).get("next_cursor")  # Handle missing 'meta'
        if cursor:
            time.sleep(1)  # Respect API rate limits
    return works

# Load the existing member list; read it explicitly as UTF-8 so accented names
# do not depend on the platform's default encoding
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Load or initialize the progress log (member name -> True/False)
try:
    with open(progress_log_file, "r", encoding="utf-8") as log_file:
        progress_log = json.load(log_file)
except FileNotFoundError:
    progress_log = {}

# Results saved by an earlier (possibly interrupted) run. They are merged with
# this run's results so that skipped members keep their works in the output
# instead of being overwritten by a file that only holds the new members.
try:
    with open(output_json_file, "r", encoding="utf-8") as previous_file:
        previous_results = json.load(previous_file)
except FileNotFoundError:
    previous_results = []
except json.JSONDecodeError:
    print(f"Could not parse {output_json_file}; previously saved works will be fetched again.")
    previous_results = []

results_by_name = {
    entry["name"]: entry
    for entry in previous_results
    if isinstance(entry, dict) and "name" in entry
}

# Fetch works for each author and organize them under authors
try:
    for member in members:
        name = member["name"]
        openalex_ids = member.get("openalex_id")
        all_works = []

        # Skip only when the log says "done" AND the works really are on disk;
        # a log entry without saved works means the earlier run died before
        # the output file was written, so the member has to be fetched again.
        if progress_log.get(name) and name in results_by_name:
            print(f"Skipping {name}, already completed.")
            continue

        print(f"Fetching works for {name}...")

        try:
            # A member may carry one OpenAlex ID (str), several (list), or none
            if isinstance(openalex_ids, list):
                id_list = openalex_ids
            elif isinstance(openalex_ids, str):
                id_list = [openalex_ids]
            else:
                id_list = []

            for openalex_id in id_list:
                author_id = openalex_id.split("/")[-1]  # Extract author ID from the URL form
                all_works.extend(fetch_author_works(author_id))

            results_by_name[name] = {
                "name": name,
                "openalex_id": openalex_ids,
                "works": all_works
            }

            # Mark the author as successfully processed
            progress_log[name] = True

        except Exception as e:
            print(f"Error processing {name}: {e}")
            progress_log[name] = False  # Mark as failed

        # Save progress after each author
        with open(progress_log_file, "w", encoding="utf-8") as log_file:
            json.dump(progress_log, log_file, indent=4)
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so works fetched so far are persisted together with the earlier results.
    authors_with_works = list(results_by_name.values())
    with open(output_json_file, "w", encoding="utf-8") as outfile:
        json.dump(authors_with_works, outfile, indent=4)
    print(f"Filtered works data saved to JSON: {output_json_file}")

# Convert to CSV for easier inspection (one row per author/work pair)
csv_data = []
for author in authors_with_works:
    for work in author["works"]:
        # `or {}` / `or []` guard against fields stored as null
        primary_topic = work.get("primary_topic") or {}
        concepts = work.get("concepts") or []
        sdgs = work.get("sustainable_development_goals") or []
        csv_data.append({
            "author_name": author["name"],
            "author_openalex_id": author["openalex_id"],
            "work_id": work["id"],
            "work_title": work["title"],
            "doi": work["doi"],
            "publication_year": work["publication_year"],
            "cited_by_count": work["cited_by_count"],
            "field": (primary_topic.get("field") or {}).get("display_name", ""),
            "subfield": (primary_topic.get("subfield") or {}).get("display_name", ""),
            "domain": (primary_topic.get("domain") or {}).get("display_name", ""),
            # str.join raises on None, so a null display_name becomes ""
            "concepts": ", ".join(concept.get("display_name") or "" for concept in concepts),
            "open_access_status": (work.get("open_access") or {}).get("oa_status", ""),
            "sustainable_development_goals": ", ".join(sdg.get("display_name") or "" for sdg in sdgs),
            "referenced_works_count": len(work.get("referenced_works") or [])
        })

pd.DataFrame(csv_data).to_csv(output_csv_file, index=False)
print(f"Filtered works data saved to CSV: {output_csv_file}")

Fetching works for Rainer Abart...
Fetching works for Karl Acham...
Fetching works for Antal Ádám...
Fetching works for Ludwig Adamovich...
Fetching works for Timon Erik Adolph...
Fetching works for Alícia Adserà...
Fetching works for Adriano Aguzzi...
Fetching works for Josef Aicher...
Fetching works for Martin Aigner...
Fetching works for Luciana Aigner-Foresti...
Fetching works for Brenda Almond...
Fetching works for Michael Alram...
Fetching works for Eva Alram-Stern...
Fetching works for Carlos Alvar Ezquerra...
Fetching works for João Alves...
Fetching works for Michel Amandry...
Fetching works for Petra Amann...
Fetching works for Brigitta Ammann...
Fetching works for Angelika Amon...
Fetching works for Bertil Andersson...
Fetching works for Bernard Andreae...
Fetching works for Anaïs  Angelo...
Fetching works for Flavio Anselmetti...
Fetching works for Leah Armstrong...
Fetching works for Markus Arndt...
Fetching works for Eduard Arzt...
Fetching works for Alain Aspect...
Fetch

In [None]:
import json

# Source (distorted) and destination (restored) JSON files; adjust the paths to your environment
output_file = "/content/austrian_academy_works_filtered_restored.json"
input_file = "/content/austrian_academy_works_filtered.json"

# Read the whole distorted document as UTF-8 text and parse it
with open(input_file, encoding="utf-8") as infile:
    data = json.loads(infile.read())

# Repair strings whose UTF-8 bytes were read as Latin-1 (mojibake)
def decode_unicode(data):
    """Recursively re-decode every string inside a JSON-like structure.

    Each string is encoded back to bytes as Latin-1 and those bytes are then
    decoded as UTF-8. If either step fails the string is returned unchanged.

    Args:
        data: A dict, list, str, or any other value.

    Returns:
        A new dict/list with the same shape whose string values and items
        have been re-decoded. Dict keys are kept as they are, and values of
        any other type are returned untouched.
    """
    if isinstance(data, str):
        try:
            raw_bytes = data.encode('latin1')
            return raw_bytes.decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a Latin-1/UTF-8 mix-up (or already correct): keep the text as it is
            return data
    if isinstance(data, list):
        return list(map(decode_unicode, data))
    if isinstance(data, dict):
        restored = {}
        for key, value in data.items():
            restored[key] = decode_unicode(value)
        return restored
    return data

# Walk the parsed document and repair every string value
restored_data = decode_unicode(data)

# Serialize with real (non-escaped) characters and write the result as UTF-8
with open(output_file, mode="w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(restored_data, indent=4, ensure_ascii=False))

print(f"Restored file saved to {output_file}")

Restored file saved to /content/austrian_academy_works_filtered_restored.json


In [None]:
import pandas as pd

# Source (distorted) and destination (restored) CSV files; adjust the paths to your environment
output_file = "austrian_academy_works_filtered_restored.csv"
input_file = "/content/austrian_academy_works_filtered.csv"

# Parse the distorted CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=input_file)

# Repair one cell whose UTF-8 bytes were read as Latin-1 (mojibake)
def decode_unicode_string(value):
    """Re-decode a single value if it is a string, otherwise pass it through.

    A string is encoded back to bytes as Latin-1 and those bytes are decoded
    as UTF-8; when either step fails the original string is returned.

    Args:
        value: Any cell value; only ``str`` instances are processed.

    Returns:
        The re-decoded string, or ``value`` itself when it is not a string
        or cannot be re-decoded.
    """
    if not isinstance(value, str):
        return value  # numbers, NaN, None, ... are left alone
    try:
        as_bytes = value.encode('latin1')
        return as_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return value

# Run the repair over every object-dtype (text) column; all other columns are left untouched
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    df[col] = df[col].apply(decode_unicode_string)

# Write the repaired table without the index column
df.to_csv(path_or_buf=output_file, index=False)

print(f"Restored file saved to {output_file}")

Restored file saved to austrian_academy_works_filtered_restored.csv
