# Figshare test

## Imports

In [11]:
import os
import json
import requests
import time

In [38]:
figshare_access_token = os.getenv("FIGSHARE_ACCESS_TOKEN_POSTER")
figshare_access_token

'29db349255e0da4e20543419db02fb0b89ddb762fbea0396327c71a70d9bc5c9a41a5b98939262bc3fbb7859944af534ad89204fbfb2700ad42c95aac273cd42'

## Test 1: Original code

In [6]:
# Inspired from the example available here https://help.figshare.com/article/how-to-use-the-figshare-api#search-ids
# get info of all the posters (unlike Zenodo, this doesn't return all the metadata for each poster
# so we do that only to get all the ids of the posters)
BASE_URL = "https://api.figshare.com/v2"
results = []

posted_after_list = ["2012-01-01", "2021-01-01"]
posted_before_list = ["2020-12-31", "2024-12-31"]

for posted_after, posted_before in zip(posted_after_list, posted_before_list):
    search_logic = (
        ":item_type:poster AND "
        + ":posted_after:"
        + posted_after
        + " AND :posted_before:"
        + posted_before
    )
    query = '{"search_for": "' + search_logic + '"}'
    y = json.loads(query)

    for j in range(1, 11):
        r = json.loads(
            requests.post(
                BASE_URL + "/articles/search?page_size=1000&page={}".format(j), params=y
            ).content
        )
        if r:
            results.extend(r)
        else:
            break

In [7]:
len(results)

200

In [9]:
results[1]

{'project_id': None,
 'id': 30972252,
 'title': 'Enantioselective\nCobalt-Catalyzed C–H Activation\nand Annulation of Indole-3-carboxamides with Alkynes: Access to Axially\nChiral γ‑Carbolinones',
 'doi': '10.1021/acs.orglett.5c04514.s001',
 'handle': '',
 'url': 'https://api.figshare.com/v2/articles/30972252',
 'published_date': '2025-12-30T18:08:23Z',
 'thumb': 'https://ndownloader.figshare.com/files/60703596/preview/60703596/thumb.png',
 'defined_type': 6,
 'defined_type_name': 'journal contribution',
 'group_id': 2436,
 'url_private_api': 'https://api.figshare.com/v2/account/articles/30972252',
 'url_public_api': 'https://api.figshare.com/v2/articles/30972252',
 'url_private_html': 'https://figshare.com/account/articles/30972252',
 'url_public_html': 'https://acs.figshare.com/articles/journal_contribution/Enantioselective_Cobalt-Catalyzed_C_H_Activation_and_Annulation_of_Indole-3-carboxamides_with_Alkynes_Access_to_Axially_Chiral_Carbolinones/30972252',
 'timeline': {'posted': '202

## Test #2 with changes

#### Token and base url

In [27]:
figshare_access_token = os.getenv("FIGSHARE_ACCESS_TOKEN_POSTER")
BASE_URL = "https://api.figshare.com/v2"

#### Get summary records of all posters

In [18]:
summaries = []

# Date ranges to handle pagination limits
date_ranges = [
    {"after": "2012-01-01", "before": "2020-12-31"},
    {"after": "2021-01-01", "before": "2024-12-31"}
]

for period in date_ranges:
    page = 1
    while True:
        search_query = f":item_type: poster AND :posted_after: {period['after']} AND :posted_before: {period['before']}"
        
        payload = {
            "search_for": search_query,
            "page_size": 1000,
            "page": page
        }
        
        response = requests.post(f"{BASE_URL}/articles/search", json=payload)
        
        if response.status_code != 200:
            print(f"API Error: {response.status_code}")
            break
            
        data = response.json()
        if not data:
            break
            
        summaries.extend(data)
        print(f"Fetched {len(data)} items for period starting {period['after']} (Page {page})")
        
        page += 1
        time.sleep(0.5)

print(f"\nDone! Total record summaries found: {len(summaries)}")
filename = "outputs/repository-records/figshare-summaries.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(summaries, f, ensure_ascii=False, indent=2)

print(f"Success! Saved {len(summaries)} record summaries to {filename}")

Fetched 1000 items for period starting 2012-01-01 (Page 1)
Fetched 1000 items for period starting 2012-01-01 (Page 2)
Fetched 1000 items for period starting 2012-01-01 (Page 3)
Fetched 1000 items for period starting 2012-01-01 (Page 4)
Fetched 1000 items for period starting 2012-01-01 (Page 5)
Fetched 1000 items for period starting 2012-01-01 (Page 6)
Fetched 1000 items for period starting 2012-01-01 (Page 7)
Fetched 1000 items for period starting 2012-01-01 (Page 8)
Fetched 111 items for period starting 2012-01-01 (Page 9)
Fetched 1000 items for period starting 2021-01-01 (Page 1)
Fetched 1000 items for period starting 2021-01-01 (Page 2)
Fetched 1000 items for period starting 2021-01-01 (Page 3)
Fetched 1000 items for period starting 2021-01-01 (Page 4)
Fetched 1000 items for period starting 2021-01-01 (Page 5)
Fetched 1000 items for period starting 2021-01-01 (Page 6)
Fetched 402 items for period starting 2021-01-01 (Page 7)

Done! Total poster found: 14513
Success! Saved 14513 post

#### Verify results (all records are unique and are posters)

In [34]:
# load data
with open("outputs/repository-records/figshare-summaries.json", "r", encoding="utf-8") as f:
    results = json.load(f)

In [35]:
# view a record
results[0]

{'project_id': None,
 'id': 24736404,
 'title': 'Beyond Tradition: Rethinking Early Modern Europe',
 'doi': '10.7274/24736404.v1',
 'handle': '',
 'url': 'https://api.figshare.com/v2/articles/24736404',
 'published_date': '2016-03-31T00:00:00Z',
 'thumb': 'https://s3-eu-west-1.amazonaws.com/figshare-production-eu-nd-previews9481-eu-west-1/43566198/thumb.png',
 'defined_type': 5,
 'defined_type_name': 'poster',
 'group_id': 51396,
 'url_private_api': 'https://api.figshare.com/v2/account/articles/24736404',
 'url_public_api': 'https://api.figshare.com/v2/articles/24736404',
 'url_private_html': 'https://figshare.com/account/articles/24736404',
 'url_public_html': 'https://curate.nd.edu/articles/poster/Beyond_Tradition_Rethinking_Early_Modern_Europe/24736404',
 'timeline': {'posted': '2016-03-31T00:00:00',
  'firstOnline': '2016-03-31T00:00:00'},
 'resource_title': '',
 'resource_doi': '',
 'created_date': '2023-12-09T00:11:47Z',
 'modified_date': '2024-08-30T19:37:01Z'}

In [36]:
# Verifications
total_records = len(results)
ids = []
missing_id_count = 0
total_posters = 0

for item in results:
    id_ = item.get("id")
    resource_type = item.get("defined_type_name")
    
    if id_ is None:
        missing_id_count += 1
    else:
        ids.append(id_)
    
    if resource_type == "poster":
        total_posters += 1

unique_ids_count = len(set(ids))
duplicate_entries = len(ids) - unique_ids_count

print(f"Total records: {total_records}")
print(f"Total records with resource type poster: {total_posters}")
print(f"Unique records: {unique_ids_count}")
print(f"Non poster records: {total_records - total_posters}")
print(f"Records with missing id: {missing_id_count}")
print(f"Duplicate Records (based on id): {duplicate_entries}")

if (
    total_records == total_posters == unique_ids_count
    and (total_records - total_posters) == 0
    and missing_id_count == 0
    and duplicate_entries == 0
):
    print("All records are posters with unique IDs - clear to continue.")
else:
    print("Data sanity check failed - review counts above.")

Total records: 14513
Total records with resource type poster: 14513
Unique records: 14513
Non poster records: 0
Records with missing id: 0
Duplicate Records (based on id): 0
All records are posters with unique IDs - clear to continue.


#### Get full metadata

In [41]:
INPUT_FILE = "outputs/repository-records/figshare-summaries.json"
OUTPUT_FILE = "outputs/repository-records/figshare-metadata.ndjson"

# Load target IDs
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    summaries = json.load(f)
all_ids = set(item["id"] for item in summaries if item.get("id"))

# Check for existing progress if any
processed_ids = set()
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            try:
                record = json.loads(line)
                processed_ids.add(record["id"])
            except json.JSONDecodeError:
                continue
    print(f"{len(processed_ids)} posters records already in local storage")
else:
    print("Starting fresh harvest")

# Determine ids to run for this session
ids_to_fetch = list(all_ids - processed_ids)
total_to_fetch = len(ids_to_fetch)
session_processed_count = 0

print(f"Queueing {total_to_fetch} new posters for metadata harvesting")

headers = {"Authorization": f"token {figshare_access_token}"}

# Harvest metadata (save on the go)
with open(OUTPUT_FILE, "a", encoding="utf-8") as ndjson_file:
    for figshare_id in ids_to_fetch:
        try:
            r = requests.get(f"{BASE_URL}/articles/{figshare_id}", headers=headers)
            
            if r.status_code == 200:
                metadata = r.json()
                ndjson_file.write(json.dumps(metadata) + "\n")
                session_processed_count += 1
                
                if session_processed_count % 100 == 0:
                    print(f"\rProgress: {session_processed_count}/{total_to_fetch} new items saved...", end="", flush=True)
            
            elif r.status_code == 429:
                print("Rate limit reached. Waiting 60s...")
                time.sleep(60)
                
            else:
                print(f"[Warning] ID {figshare_id} returned status {r.status_code}: {r.text[:100]}")
            
            time.sleep(0.5)

        except Exception as e:
            print(f"Skipping ID {figshare_id} due to error: {e}")
            continue

# Summary
total_final_count = len(processed_ids) + session_processed_count
print(f"Run Complete!")
print(f"Posters processed in this run: {session_processed_count}")
print(f"Total posters now in file:     {total_final_count}")

0 posters records already in local storage
Queueing 14513 new posters for metadata harvesting
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>

<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>



KeyboardInterrupt: 

#### Get stats

In [32]:
STATS_URL = "https://stats.figshare.com/total"
INPUT_METADATA_FILE = "outputs/repository-records/figshare-metadata.ndjson"
OUTPUT_ENRICHED_FILE = "outputs/repository-records/figshare.ndjson"

# Load target metadata records to get the IDs of all posters
all_metadata = []
if os.path.exists(INPUT_FILE):
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            all_metadata.append(json.loads(line))
    all_ids = {item["id"] for item in all_metadata}
else:
    print(f"Error: {INPUT_FILE} not found. Run metadata harvest first")
    sys.exit()

# Check for existing progress in the enriched file
processed_ids = set()
if os.path.exists(OUTPUT_FILE):
    with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
        for line in f:
            try:
                record = json.loads(line)
                processed_ids.add(record["id"])
            except json.JSONDecodeError:
                continue
    print(f"{len(processed_ids)} enriched records already output file")
else:
    print("Starting fresh stats harvest")

# Determine records to run for this session
session_processed_count = 0
records_to_process = [m for m in all_metadata if m["id"] not in processed_ids]
total_to_fetch = len(records_to_process)

print(f"Queueing {total_to_fetch} posters for stats enrichment")

headers = {"Authorization": f"token {figshare_access_token}"}

# Harvest stats
with open(OUTPUT_FILE, "a", encoding="utf-8") as ndjson_file:
    for record in records_to_process:
        figshare_id = record["id"]
        try:
            stats = {}
            # Need API endpoints for views and downloads for each ID
            for metric in ["views", "downloads"]:
                url = f"{STATS_URL}/{metric}/article/{figshare_id}"
                r = requests.get(url, headers=headers)
                
                if r.status_code == 200:
                    stats[metric] = r.json().get("totals", 0)
                else:
                    stats[metric] = 0
            
            # Add stats to the existing metadata record
            record["stats"] = stats
            
            # Write to output file
            ndjson_file.write(json.dumps(record) + "\n")
            session_processed_count += 1
            
            if session_processed_count % 100 == 0:
                sys.stdout.write(f"\rProgress: {session_processed_count}/{total_to_fetch} new items saved...", end="", flush=True)
                sys.stdout.flush()
            
            # slight delay to be safe with the API
            time.sleep(0.5)

        except Exception as e:
            print(f"\nSkipping ID {figshare_id} due to error: {e}")
            continue

# Summary
total_final_count = len(processed_ids) + session_processed_count
print(f"\nRun Complete!")
print(f"Posters enriched in this run: {session_processed_count}")
print(f"Total enriched records now in file: {total_final_count}")