In [None]:
pip install requests beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Base URL for the honorary members
base_url = "https://bilimakademisi.org/onursal-uyeler/page/"

# Total number of pages
total_pages = 10  # Update based on the actual number of pages

# Seconds to wait for the server before abandoning a request
request_timeout = 30

# List to store member data
honorary_members = []

# Rows collected so far; used to notice when a later page number serves
# a table whose rows were all collected already
seen_rows = set()

for page in range(1, total_pages + 1):
    print(f"Fetching page {page}...")
    url = f"{base_url}{page}"
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        # A network failure on one page should not abort the whole run
        print(f"Failed to fetch page {page}: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to fetch page {page}. Status code: {response.status_code}")
        continue

    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the table containing members
    table = soup.find("table", class_="tablepress")

    if not table:
        print(f"No table found on page {page}.")
        continue

    # Extract rows from the table; fall back to the table itself when the
    # markup has no explicit <tbody>
    table_body = table.find("tbody")
    if table_body is None:
        table_body = table
    rows = table_body.find_all("tr")

    page_members = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 2:
            continue  # Skip malformed rows

        # Extract data
        link = cols[0].find("a")
        page_members.append({
            "name_and_title": cols[0].get_text(strip=True),
            "affiliation": cols[1].get_text(strip=True),
            # .get() yields None for an <a> without href instead of raising
            "profile_link": link.get("href") if link is not None else None
        })

    # If every row on this page was already collected, the site is serving
    # the same table again; further page numbers would only add duplicates.
    page_keys = {
        (m["name_and_title"], m["affiliation"], m["profile_link"])
        for m in page_members
    }
    if page_members and page_keys <= seen_rows:
        print(f"Page {page} only repeats rows already collected; stopping.")
        break
    seen_rows |= page_keys
    honorary_members.extend(page_members)

    # Respectful scraping
    time.sleep(1)

# Save to CSV and JSON
df = pd.DataFrame(honorary_members)
df.to_csv("honorary_members_ba.csv", index=False, encoding="utf-8")
df.to_json("honorary_members_ba.json", orient="records", indent=4, force_ascii=False)

print("Scraping completed. Data saved to 'honorary_members_ba.csv' and 'honorary_members_ba.json'.")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Scraping completed. Data saved to 'honorary_members_ba.csv' and 'honorary_members_ba.json'.


In [None]:
import requests
from bs4 import BeautifulSoup
import json
import csv

# Function to extract members from a given soup object
def extract_asli_members(soup):
    """Extract member records from the ``tablepress-asliuyeler`` table.

    Args:
        soup: Parsed document exposing ``find`` (a BeautifulSoup object).

    Returns:
        A list of dicts with the keys ``name``, ``profile_url``,
        ``research_area`` and ``university``. Cells that are absent from a
        row yield ``""``. Returns ``[]`` (after printing a notice) when the
        table is not present.
    """
    table = soup.find("table", {"id": "tablepress-asliuyeler"})
    if not table:
        print("No table found with ID 'tablepress-asliuyeler'.")
        return []

    # Fall back to the table itself when the markup has no explicit <tbody>
    table_body = table.find("tbody")
    if table_body is None:
        table_body = table

    members = []
    for row in table_body.find_all("tr"):
        cols = row.find_all("td")
        if not cols:
            # Rows without <td> cells (e.g. header rows) carry no member data
            continue
        link = cols[0].find("a")
        members.append({
            "name": cols[0].get_text(strip=True),
            "profile_url": link.get("href", "") if link is not None else "",
            "research_area": cols[1].get_text(strip=True) if len(cols) > 1 else "",
            "university": cols[2].get_text(strip=True) if len(cols) > 2 else ""
        })
    return members

# Function to fetch a page and parse it with BeautifulSoup
def fetch_page(url, timeout=30):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed document, or ``None`` (after printing the reason) when the
        request fails or the response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch page. Status code: {response.status_code}")
        return None

    # Hand the raw bytes to the parser so the charset is taken from the
    # document itself rather than from the HTTP-header guess behind
    # response.text.
    return BeautifulSoup(response.content, 'html.parser')

# Main function to scrape data across pages
def scrape_asli_members(base_url, pages):
    """Collect member records from up to ``pages`` pages of ``base_url``.

    Pages are requested as ``{base_url}?paged=N`` for N = 1..pages. Scraping
    stops early when a page cannot be fetched, yields no members, or contains
    only records that earlier pages already returned (the server answering
    with the same table regardless of the page number).

    Args:
        base_url: Listing URL without the paging query string.
        pages: Maximum number of pages to request.

    Returns:
        A list of member dicts in the order they were scraped.
    """
    all_members = []
    seen_records = set()
    for page in range(1, pages + 1):
        print(f"Fetching page {page}...")
        url = f"{base_url}?paged={page}"
        soup = fetch_page(url)
        if not soup:
            break
        members = extract_asli_members(soup)
        if not members:
            break

        # Fingerprint each record so a repeated page can be recognised
        page_records = {tuple(sorted(member.items())) for member in members}
        if page_records <= seen_records:
            print(f"Page {page} only repeats members already collected; stopping.")
            break
        seen_records |= page_records
        all_members.extend(members)
    return all_members

# Save results to JSON
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as indented UTF-8 JSON and report it.

    Non-ASCII characters are written as-is rather than as ``\\u`` escapes.
    """
    with open(filename, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=4, ensure_ascii=False)
    message = f"Data saved to {filename}"
    print(message)

# Save results to CSV
def save_to_csv(data, filename):
    """Write member dicts to ``filename`` as UTF-8 CSV with a header row.

    The columns are fixed to name, profile_url, research_area, university.
    """
    columns = ["name", "profile_url", "research_area", "university"]
    with open(filename, 'w', newline='', encoding='utf-8') as sink:
        table = csv.DictWriter(sink, fieldnames=columns)
        table.writeheader()
        for record in data:
            table.writerow(record)
    print(f"Data saved to {filename}")

# Configuration
BASE_URL = "https://bilimakademisi.org/uyeler/"
PAGES = 3  # Number of listing pages to request; adjust as needed

# Run the scraper
if __name__ == "__main__":
    members = scrape_asli_members(BASE_URL, PAGES)
    # Nothing is written when the scrape returned no members
    if members:
        for writer, target in (
            (save_to_json, "asli_members_ba.json"),
            (save_to_csv, "asli_members_ba.csv"),
        ):
            writer(members, target)

Fetching page 1...
Fetching page 2...
Fetching page 3...
Data saved to asli_members_ba.json
Data saved to asli_members_ba.csv


In [None]:
import json

# Load JSON data from a file
def load_json(filename):
    """Return the parsed JSON content of ``filename``.

    Any failure (missing file, invalid JSON, ...) is reported with ``print``
    and an empty list is returned instead.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as source:
            raw_text = source.read()
        parsed = json.loads(raw_text)
    except Exception as e:
        print(f"Error loading {filename}: {e}")
        parsed = []
    return parsed

# Count members and compare
def count_and_compare(asli_file, honorary_file):
    """Print the record count of each JSON file and their sum.

    Both files are read through ``load_json``; nothing is returned.
    """
    # Read both files first, then report
    asli_count, honorary_count = (
        len(load_json(path)) for path in (asli_file, honorary_file)
    )

    print(f"Number of Asli Members: {asli_count}")
    print(f"Number of Honorary Members: {honorary_count}")

    total_count = asli_count + honorary_count
    print(f"Total Members: {total_count}")

# File paths
# Relative to the working directory, which is where the scraping cells write
# these files; an absolute "/content/..." path only resolves on Colab.
asli_file = "asli_members_ba.json"
honorary_file = "honorary_members_ba.json"

# Run the comparison
count_and_compare(asli_file, honorary_file)

Number of Asil Members: 666
Number of Honorary Members: 420
Total Members: 1086


In [None]:
import json

def remove_duplicates_honorary(members):
    """
    Drop repeated honorary members, keeping the first occurrence of each.

    Two records are the same when ``name_and_title``, ``affiliation`` and
    ``profile_link`` all match. The input order is preserved.
    """
    first_seen = {}
    for entry in members:
        key = (entry["name_and_title"], entry["affiliation"], entry["profile_link"])
        # setdefault keeps the earliest record stored under each key
        first_seen.setdefault(key, entry)
    return list(first_seen.values())


def remove_duplicates_asli(members):
    """
    Drop repeated asli members, keeping the first occurrence of each.

    Two records are the same when ``name``, ``profile_url``,
    ``research_area`` and ``university`` all match. The input order is
    preserved.
    """
    fields = ("name", "profile_url", "research_area", "university")
    fingerprints = set()
    kept = []
    for record in members:
        fingerprint = tuple(record[field] for field in fields)
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        kept.append(record)
    return kept


# Load your JSON data here
import json

# Load honorary members
# File handles use their own names (*_fh) so they do not rebind the
# `honorary_file` / `asli_file` path variables defined by the counting cell.
with open("honorary_members_ba.json", "r", encoding="utf-8") as honorary_fh:
    honorary_members = json.load(honorary_fh)

# Load asli members
with open("asli_members_ba.json", "r", encoding="utf-8") as asli_fh:
    asli_members = json.load(asli_fh)

# Remove duplicates
cleaned_honorary_members = remove_duplicates_honorary(honorary_members)
cleaned_asli_members = remove_duplicates_asli(asli_members)

print(f"Cleaned Honorary Members: {len(cleaned_honorary_members)}")
print(f"Cleaned Asli Members: {len(cleaned_asli_members)}")

# Save the cleaned lists back to JSON files
with open("cleaned_honorary_members.json", "w", encoding="utf-8") as honorary_fh:
    json.dump(cleaned_honorary_members, honorary_fh, indent=4, ensure_ascii=False)

with open("cleaned_asli_members.json", "w", encoding="utf-8") as asli_fh:
    json.dump(cleaned_asli_members, asli_fh, indent=4, ensure_ascii=False)

Cleaned Honorary Members: 42
Cleaned Asli Members: 222


In [None]:
import json

# Read the two de-duplicated lists produced earlier
with open("cleaned_asli_members.json", mode="r", encoding="utf-8") as f_asli:
    asli_members = json.load(f_asli)

with open("cleaned_honorary_members.json", mode="r", encoding="utf-8") as f_honorary:
    honorary_members = json.load(f_honorary)

# Asli records first, honorary records after them
all_members = asli_members + honorary_members

# Walk the combined list, keeping the first record seen for each name
seen = set()
duplicates = []
unique_members = []
for member in all_members:
    # Prefer 'name'; fall back to 'name_and_title' when it is missing or blank
    name = member.get("name", "").strip().lower()
    if not name:
        name = member.get("name_and_title", "").strip().lower()

    if name not in seen:
        seen.add(name)
        unique_members.append(member)  # first occurrence goes to the final list
        continue
    duplicates.append(member)  # later occurrences are kept aside for review

# Report what was set aside
if not duplicates:
    print("No duplicate entries found.")
else:
    print("Duplicate Entries Found:")
    for dup in duplicates:
        print(dup)

# Persist the unique records
with open("all_members_ba.json", mode="w", encoding="utf-8") as f_all_members:
    json.dump(unique_members, f_all_members, ensure_ascii=False, indent=4)

print("Combined members saved to 'all_members_ba.json'")

No duplicate entries found.
Combined members saved to 'all_members_ba.json'


In [None]:
import json

# Deceased members whose presence in the combined list is checked below
deceased_names = [
    "Miral Dizdaroğlu",
    "Fuat Keyman",
    "Durmuş Ali Demir",
    "Zafer Toprak",
    "Ayhan Ulubelen",
    "Hamit Fişek",
    "Philip W. Anderson",
    "Rahmi Güven",
    "David Pines",
    "Güven Arsebük",
    "Çiğdem Kağıtçıbaşı",
    "Yücel Kanpolat",
    "Tosun Terzioğlu",
    "Namık Kemal Pak"
]

# Trimmed, lower-cased form used for matching
deceased_names_normalized = [entry.strip().lower() for entry in deceased_names]

# Read the combined member list
with open("all_members_ba.json", mode="r", encoding="utf-8") as f_all_members:
    all_members = json.load(f_all_members)

# Names come from 'name' directly, or from the part of 'name_and_title'
# that precedes the first " -"
plain_names = {
    record.get("name", "").strip().lower()
    for record in all_members
    if "name" in record
}
titled_names = {
    record.get("name_and_title", "").split(" -")[0].strip().lower()
    for record in all_members
    if "name_and_title" in record
}
all_member_names = plain_names | titled_names

# Deceased names that also appear among the members, in list order
matches = [
    candidate
    for candidate in deceased_names_normalized
    if candidate in all_member_names
]

# Report the outcome
if not matches:
    print("No matches found. These deceased members are not in all_members_ba.json.")
else:
    print("The following deceased members are already in all_members_ba.json:")
    for match in matches:
        print(match)

The following deceased members are already in all_members_ba.json:
david pines


In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths
# The combined member list is written to the working directory by the merge
# cell, so it is read from there; "/content/..." only resolves on Colab.
input_file_path = "all_members_ba.json"
output_file_path = "ba_author_ids.json"

# Load all members JSON
with open(input_file_path, 'r', encoding='utf-8') as file:
    all_members = json.load(file)

# OpenAlex API endpoint
openalex_base_url = "https://api.openalex.org/authors"

# Function to query OpenAlex
def get_openalex_author_id(name, timeout=30):
    """Look up ``name`` in the OpenAlex authors search and return the top hit.

    Args:
        name: Author name to search for.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A dict with ``author_id``, ``name``, ``works_count`` and
        ``cited_by_count`` taken from the first search result, or ``None``
        when the name is empty, nothing matches, or the request fails (the
        failure is printed).
    """
    # Without a search term the request would list authors unfiltered and the
    # "first result" would be an unrelated person.
    if not name or not name.strip():
        return None

    try:
        response = requests.get(
            openalex_base_url, params={"search": name}, timeout=timeout
        )
        response.raise_for_status()  # Raise an exception for HTTP errors
        payload = response.json()

        hits = payload.get("results") or []
        if not hits:
            return None  # No match found

        # Take the first match (or apply more filters if needed)
        first_result = hits[0]
        return {
            "author_id": first_result.get("id"),
            "name": first_result.get("display_name"),
            "works_count": first_result.get("works_count", 0),
            "cited_by_count": first_result.get("cited_by_count", 0),
        }
    except Exception as e:
        # Best effort: one failed lookup must not stop the whole batch
        print(f"Error fetching author ID for {name}: {e}")
        return None

# Process each member to fetch OpenAlex Author ID
results = []
for member in all_members:
    # Asli records carry "name"; honorary records carry "name_and_title"
    name = member.get("name") or member.get("name_and_title")
    print(f"Fetching OpenAlex Author ID for {name}...")

    # Fetch author data from OpenAlex (skipped when the record has no name)
    author_data = get_openalex_author_id(name) if name else None

    # Construct the result entry
    result_entry = {
        # Store the resolved name so honorary members are not saved with a
        # null name, which later steps index by
        "name": name,
        "affiliation": member.get("affiliation"),
        "profile_link": member.get("profile_link") or member.get("profile_url"),
        "research_area": member.get("research_area"),
        "university": member.get("university"),
        "openalex_id": author_data["author_id"] if author_data else None,
        "works_count": author_data["works_count"] if author_data else None,
        "cited_by_count": author_data["cited_by_count"] if author_data else None,
    }

    results.append(result_entry)
    # Respect API rate limits
    time.sleep(1)  # Adjust delay if needed

# Save the results to a JSON file
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(results, file, ensure_ascii=False, indent=4)

print(f"Author IDs saved to '{output_file_path}'.")

# Optionally, save to CSV for easier inspection
pd.DataFrame(results).to_csv("ba_author_ids.csv", index=False)
print("Author IDs also saved to 'ba_author_ids.csv'.")

Fetching OpenAlex Author ID for Can Fuat Delale...
Fetching OpenAlex Author ID for Mehmet Özdoğan...
Fetching OpenAlex Author ID for Aslıhan Yener...
Fetching OpenAlex Author ID for M. Ali Alpar...
Fetching OpenAlex Author ID for Ersin Göğüş...
Fetching OpenAlex Author ID for Feryal Özel...
Fetching OpenAlex Author ID for Ethem Alpaydın...
Fetching OpenAlex Author ID for Lale Akarun...
Fetching OpenAlex Author ID for Tamer Özsu...
Fetching OpenAlex Author ID for İzak Benbasat...
Fetching OpenAlex Author ID for Attila Gürsoy...
Fetching OpenAlex Author ID for Pekcan Ungan...
Fetching OpenAlex Author ID for Canan Atılgan...
Fetching OpenAlex Author ID for İvet Bahar...
Fetching OpenAlex Author ID for Mustafa Tekin...
Fetching OpenAlex Author ID for Şermin Genç...
Fetching OpenAlex Author ID for Mehmet Öztürk...
Fetching OpenAlex Author ID for Vasıf Hasırcı...
Fetching OpenAlex Author ID for Berrin Tansel...
Fetching OpenAlex Author ID for Nilsun İnce...
Fetching OpenAlex Author ID for Or

In [None]:
import requests
import json
import time

# Input and output file paths
# Read from the working directory, where the author-ID step writes its
# output; "/content/..." only resolves on Colab.
input_file = "ba_author_ids.json"
output_file = "ba_members_with_indices.json"  # File to save the updated data

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch H-index and I10-index for an author
def fetch_author_metrics(author_id, timeout=30):
    """Fetch ``h_index`` and ``i10_index`` for one OpenAlex author.

    Args:
        author_id: Author identifier appended to ``/authors/``.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        ``(h_index, i10_index)``. Either value is ``None`` when it is missing
        from the response; both are ``None`` (after printing the reason) when
        the request fails, the status is not 200, or the body is not JSON.
    """
    url = f"{openalex_base_url}/authors/{author_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable author must not abort the surrounding loop
        print(f"Error fetching data for author {author_id}: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error fetching data for author {author_id}: {response.status_code}")
        return None, None

    try:
        data = response.json()
    except ValueError as exc:
        print(f"Error fetching data for author {author_id}: {exc}")
        return None, None

    # "summary_stats" may be absent or null; treat both as "no metrics"
    stats = data.get("summary_stats") or {}
    return stats.get("h_index"), stats.get("i10_index")

# Load the existing JSON file
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Process each member and fetch H-index and I10-index
for member in members:
    openalex_ids = member.get("openalex_id")

    # Accept either a single ID string or a list of IDs
    if isinstance(openalex_ids, str):
        id_list = [openalex_ids]
    elif isinstance(openalex_ids, list):
        id_list = openalex_ids
    else:
        print(f"No OpenAlex ID for {member.get('name')}. Skipping...")
        member["h_index"] = None
        member["i10_index"] = None
        continue

    h_indices = []
    i10_indices = []
    for openalex_id in id_list:
        author_id = openalex_id.split("/")[-1]  # Extract author ID
        h_index, i10_index = fetch_author_metrics(author_id)
        if h_index is not None:
            h_indices.append(h_index)
        if i10_index is not None:
            i10_indices.append(i10_index)
        time.sleep(1)  # Respect API rate limits

    # Aggregate indices across IDs by taking the maximum values
    member["h_index"] = max(h_indices) if h_indices else None
    member["i10_index"] = max(i10_indices) if i10_indices else None

# Save the updated JSON with indices
# ensure_ascii=False keeps non-ASCII names readable in the file instead of
# turning them into \uXXXX escapes.
with open(output_file, "w", encoding="utf-8") as outfile:
    json.dump(members, outfile, indent=4, ensure_ascii=False)

print(f"H-index and I10-index added and saved to {output_file}")

No OpenAlex ID for Ayşe Buğra Kavala. Skipping...
No OpenAlex ID for Hakkı Erdal Akalın. Skipping...
No OpenAlex ID for Dame Jocelyn Bell Burnell. Skipping...
No OpenAlex ID for Ayla Zırh Gürsoy. Skipping...
No OpenAlex ID for Avedis Hacınlıyan. Skipping...
No OpenAlex ID for Lord Martin Rees. Skipping...
No OpenAlex ID for İzzettin Silier. Skipping...
No OpenAlex ID for Dame Hellen Wallace. Skipping...
H-index and I10-index added and saved to ba_members_with_indices.json


In [None]:
import json

# Source file to repair and destination for the repaired copy
input_file = "ba_members_with_indices.json"  # Replace with your distorted JSON file path
output_file = "ba_members_with_indices_restored.json"  # Restored output file

# Read the whole source file as UTF-8 text and parse it
with open(input_file, mode="r", encoding="utf-8") as infile:
    data = json.loads(infile.read())

# Decode Unicode escape sequences
def decode_unicode(data):
    """Recursively repair strings that are UTF-8 text mis-read as Latin-1.

    Dict values and list items are processed recursively; dict keys are left
    untouched. A string is re-encoded as Latin-1 and decoded as UTF-8; when
    either step fails the string is returned unchanged. Any other value is
    returned as-is.
    """
    if isinstance(data, str):
        try:
            return data.encode('latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a Latin-1/UTF-8 mix-up; keep the text as it is
            return data
    if isinstance(data, list):
        return [decode_unicode(element) for element in data]
    if isinstance(data, dict):
        return {field: decode_unicode(content) for field, content in data.items()}
    return data

restored_data = decode_unicode(data)

# Write the repaired structure as UTF-8 JSON without ASCII escaping
with open(output_file, mode="w", encoding="utf-8") as outfile:
    json.dump(restored_data, outfile, indent=4, ensure_ascii=False)

completion_note = f"Restored file saved to {output_file}"
print(completion_note)

Restored file saved to ba_members_with_indices_restored.json


In [None]:
import requests
import json
import time
import pandas as pd

# Input and output file paths
# The restored member file is written to the working directory by the
# restore step, so it is read from there; "/content/..." only resolves on
# Colab.
input_file = "ba_members_with_indices_restored.json"
output_json_file = "ba_works_filtered.json"  # JSON file output
output_csv_file = "ba_works_filtered.csv"  # CSV file output
progress_log_file = "progress_log3.json"  # Log file to track progress

# OpenAlex base API URL
openalex_base_url = "https://api.openalex.org"

# Function to fetch works for an author
def _named_entity(source):
    """Return the ``id``/``display_name`` pair of an entity dict, or ``{}`` if absent."""
    if not source:
        return {}
    return {
        "id": source.get("id", ""),
        "display_name": source.get("display_name", "")
    }


def _filter_work(work):
    """Reduce one OpenAlex work record to the fields kept by this notebook."""
    topic = work.get("primary_topic") or {}
    return {
        "id": work.get("id", ""),
        "title": work.get("title", ""),
        "doi": work.get("doi", ""),
        "publication_year": work.get("publication_year", ""),
        "cited_by_count": work.get("cited_by_count", 0),
        "authorships": [
            {
                "author": {
                    "id": (auth.get("author") or {}).get("id", ""),
                    "display_name": (auth.get("author") or {}).get("display_name", "")
                },
                "institutions": [
                    {
                        "id": inst.get("id", ""),
                        "display_name": inst.get("display_name", ""),
                        "country_code": inst.get("country_code", ""),
                        "type": inst.get("type", "")
                    }
                    for inst in (auth.get("institutions") or []) if inst
                ]
            }
            for auth in (work.get("authorships") or []) if auth
        ],
        # An absent or null primary_topic is stored as an empty dict
        "primary_topic": {
            "id": topic.get("id", ""),
            "display_name": topic.get("display_name", ""),
            "score": topic.get("score", 0),
            "field": _named_entity(topic.get("field")),
            "subfield": _named_entity(topic.get("subfield")),
            "domain": _named_entity(topic.get("domain"))
        } if topic else {},
        "concepts": [
            {
                "id": concept.get("id", ""),
                "display_name": concept.get("display_name", ""),
                "level": concept.get("level", 0),
                "score": concept.get("score", 0)
            }
            for concept in (work.get("concepts") or []) if concept
        ],
        "open_access": work.get("open_access") or {},
        "sustainable_development_goals": [
            {
                "id": sdg.get("id", ""),
                "score": sdg.get("score", 0),
                "display_name": sdg.get("display_name", "")
            }
            for sdg in (work.get("sustainable_development_goals") or []) if sdg
        ],
        "referenced_works": work.get("referenced_works") or []
    }


def fetch_author_works(author_id, timeout=30):
    """Fetch every work of one author from OpenAlex, reduced to selected fields.

    Results are walked with cursor paging: the first request sends
    ``cursor=*`` and each following request sends the ``next_cursor`` from
    the previous response's ``meta``, until no cursor or no results come
    back. (Page-number responses carry no "next" key in ``meta``, so testing
    for one ends the loop after the first page.)

    Args:
        author_id: OpenAlex author identifier used in the ``author.id`` filter.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A list of filtered work dicts. When a response has a non-200 status
        the error is printed and the works gathered so far are returned.

    Raises:
        requests.RequestException: if a request itself fails (network error
            or timeout), so the caller can record the author as failed.
    """
    works = []
    cursor = "*"
    while cursor:
        query = {
            "filter": f"author.id:{author_id}",
            "per-page": 200,
            "cursor": cursor,
        }
        response = requests.get(
            f"{openalex_base_url}/works", params=query, timeout=timeout
        )
        if response.status_code != 200:
            print(f"Error fetching works for author {author_id}: {response.status_code}")
            break

        data = response.json()
        page_results = data.get("results") or []
        works.extend(_filter_work(work) for work in page_results)

        cursor = (data.get("meta") or {}).get("next_cursor")
        if not page_results or not cursor:
            break
        time.sleep(1)  # Respect API rate limits
    return works

# Load the existing JSON file
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(input_file, "r", encoding="utf-8") as infile:
    members = json.load(infile)

# Load or initialize the progress log
try:
    with open(progress_log_file, "r", encoding="utf-8") as log_file:
        progress_log = json.load(log_file)
except FileNotFoundError:
    progress_log = {}

# Reload works saved by an earlier run. Authors skipped as "already
# completed" are otherwise missing from the rewritten output file, which
# would discard their works.
try:
    with open(output_json_file, "r", encoding="utf-8") as previous_file:
        works_by_author = {entry["name"]: entry for entry in json.load(previous_file)}
except (FileNotFoundError, json.JSONDecodeError):
    works_by_author = {}

# Fetch works for each author and organize them under authors
try:
    for member in members:
        name = member["name"]
        openalex_ids = member.get("openalex_id")
        all_works = []

        # Trust the log only when the works themselves were actually saved
        if progress_log.get(name) and name in works_by_author:
            print(f"Skipping {name}, already completed.")
            continue

        print(f"Fetching works for {name}...")

        try:
            if isinstance(openalex_ids, list):
                for openalex_id in openalex_ids:
                    author_id = openalex_id.split("/")[-1]  # Extract author ID
                    all_works.extend(fetch_author_works(author_id))
            elif isinstance(openalex_ids, str):
                author_id = openalex_ids.split("/")[-1]
                all_works.extend(fetch_author_works(author_id))

            works_by_author[name] = {
                "name": name,
                "openalex_id": openalex_ids,
                "works": all_works
            }

            # Mark the author as successfully processed
            progress_log[name] = True

        except Exception as e:
            print(f"Error processing {name}: {e}")
            progress_log[name] = False  # Mark as failed

        # Save progress after each author
        with open(progress_log_file, "w", encoding="utf-8") as log_file:
            json.dump(progress_log, log_file, indent=4, ensure_ascii=False)
finally:
    # Save the filtered data to a JSON file, also when the loop is
    # interrupted, so the progress log never claims more than was stored.
    authors_with_works = list(works_by_author.values())
    with open(output_json_file, "w", encoding="utf-8") as outfile:
        json.dump(authors_with_works, outfile, indent=4, ensure_ascii=False)
    print(f"Filtered works data saved to JSON: {output_json_file}")

# Convert to CSV for easier inspection
csv_data = []
for author in authors_with_works:
    for work in author["works"]:
        csv_data.append({
            "author_name": author["name"],
            "author_openalex_id": author["openalex_id"],
            "work_id": work["id"],
            "work_title": work["title"],
            "doi": work["doi"],
            "publication_year": work["publication_year"],
            "cited_by_count": work["cited_by_count"],
            "field": work["primary_topic"].get("field", {}).get("display_name", ""),
            "subfield": work["primary_topic"].get("subfield", {}).get("display_name", ""),
            "domain": work["primary_topic"].get("domain", {}).get("display_name", ""),
            # `or ""` guards against null display names, which str.join rejects
            "concepts": ", ".join([concept["display_name"] or "" for concept in work["concepts"]]),
            "open_access_status": (work.get("open_access") or {}).get("oa_status", ""),
            "sustainable_development_goals": ", ".join([sdg["display_name"] or "" for sdg in work.get("sustainable_development_goals", [])]),
            "referenced_works_count": len(work.get("referenced_works") or [])
        })

pd.DataFrame(csv_data).to_csv(output_csv_file, index=False)
print(f"Filtered works data saved to CSV: {output_csv_file}")

Fetching works for Can Fuat Delale...
Fetching works for Mehmet Özdoğan...
Fetching works for Aslıhan Yener...
Fetching works for M. Ali Alpar...
Fetching works for Ersin Göğüş...
Fetching works for Feryal Özel...
Fetching works for Ethem Alpaydın...
Fetching works for Lale Akarun...
Fetching works for Tamer Özsu...
Fetching works for İzak Benbasat...
Fetching works for Attila Gürsoy...
Fetching works for Pekcan Ungan...
Fetching works for Canan Atılgan...
Fetching works for İvet Bahar...
Fetching works for Mustafa Tekin...
Fetching works for Şermin Genç...
Fetching works for Mehmet Öztürk...
Fetching works for Vasıf Hasırcı...
Fetching works for Berrin Tansel...
Fetching works for Nilsun İnce...
Fetching works for Orhan Yenigün...
Fetching works for Delia Sponza...
Fetching works for Mehmet Kobya...
Fetching works for İdil Arslan Alaton...
Fetching works for Derin Orhon...
Fetching works for İhsan Çalış...
Fetching works for Hüsnü Can Başer...
Fetching works for Sedat Ölçer...
Fetchin

In [None]:
import json

# Input and output file paths
# The works file is written to the working directory by the fetch step, so
# it is read from there; "/content/..." only resolves on Colab.
input_file = "ba_works_filtered.json"  # Replace with your distorted JSON file path
output_file = "ba_works_filtered_restored.json"  # Restored output file

# Load the distorted JSON file
with open(input_file, "r", encoding="utf-8") as infile:
    data = json.load(infile)

# Decode Unicode escape sequences
def decode_unicode(data):
    """Walk ``data`` and repair strings holding UTF-8 bytes read as Latin-1.

    Dicts are rebuilt with the same keys and repaired values, lists are
    rebuilt item by item, and strings go through a Latin-1 encode followed by
    a UTF-8 decode. A string for which that round trip fails, and any value
    of another type, is returned unchanged.
    """
    if isinstance(data, dict):
        repaired = {}
        for key, value in data.items():
            repaired[key] = decode_unicode(value)
        return repaired

    if isinstance(data, list):
        return list(map(decode_unicode, data))

    if not isinstance(data, str):
        return data

    try:
        raw_bytes = data.encode('latin1')
        return raw_bytes.decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The round trip does not apply to this text; leave it alone
        return data

restored_data = decode_unicode(data)

# Write the repaired works as UTF-8 JSON, leaving non-ASCII text unescaped
with open(output_file, mode="w", encoding="utf-8") as outfile:
    json.dump(restored_data, outfile, indent=4, ensure_ascii=False)

completion_note = f"Restored file saved to {output_file}"
print(completion_note)

Restored file saved to ba_works_filtered_restored.json


In [None]:
import pandas as pd

# Input and output file paths
# The CSV is written to the working directory by the fetch step, so it is
# read from there; "/content/..." only resolves on Colab.
input_file = "ba_works_filtered.csv"  # Replace with your distorted CSV file path
output_file = "ba_works_filtered_restored.csv"  # Restored output file

# Load the distorted CSV file
df = pd.read_csv(input_file)

# Decode Unicode escape sequences in strings
def decode_unicode_string(value):
    """Repair one cell value holding UTF-8 bytes that were read as Latin-1.

    Non-string values are returned untouched. A string is encoded as Latin-1
    and decoded as UTF-8; when either step fails the original string is
    returned.
    """
    if not isinstance(value, str):
        return value  # e.g. numbers or missing values
    try:
        repaired = value.encode('latin1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        repaired = value
    return repaired

# Run the repair over every column whose dtype is object
for col in df.columns:
    if df[col].dtype != 'object':
        continue  # leave numeric and other typed columns alone
    df[col] = df[col].apply(decode_unicode_string)

# Write the repaired table without the index column
df.to_csv(output_file, index=False)

completion_note = f"Restored file saved to {output_file}"
print(completion_note)

Restored file saved to ba_works_filtered_restored.csv
