In [None]:
# Google Colab notebook that downloads all (approx. 150k) Crossref journal titles for use in the DOME Wizard template, to reduce free-text entry
import requests
import csv
import time

def get_total_journal_count():
    """
    Get the total number of journals available in Crossref

    Sends a single query with rows=0 to the Crossref /journals endpoint so
    that only the result metadata is requested, then reads the
    'total-results' field of the response message.

    Returns:
    Integer representing total journal count (0 if the field is missing), or
    None if the request fails, times out, returns a non-200 status code, or
    the response body is not valid JSON
    """
    base_url = "https://api.crossref.org/journals"
    headers = {
        "User-Agent": "JournalExporter/1.0 (mailto:your-email@example.com)"
    }

    params = {
        "rows": 0  # We only need the metadata, not actual results
    }

    # A timeout keeps the notebook from hanging forever on a stalled
    # connection; network errors are reported the same way as bad statuses.
    try:
        response = requests.get(base_url, params=params, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: request to Crossref failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: API responded with status code {response.status_code}")
        return None

    # The JSON decode errors raised by response.json() are ValueError subclasses.
    try:
        data = response.json()
    except ValueError:
        print("Error: API response was not valid JSON")
        return None

    return data.get('message', {}).get('total-results', 0)

def get_all_journals(rows_per_request=1000, max_requests=None):
    """
    Retrieve journal records from the Crossref API and return them as a list

    Pages through the /journals endpoint using cursor-based pagination,
    starting from cursor "*" and following the 'next-cursor' value of each
    response. Retrieval stops when the request budget is used up, when a
    page comes back empty, when no next cursor is returned, or when a
    request fails; whatever was collected up to that point is returned.

    Parameters:
    rows_per_request: Number of journals to retrieve per API call
    max_requests: Maximum number of API requests to make (None for all journals;
                  falls back to 100 if the total count cannot be determined)

    Returns:
    List of dictionaries containing journal information
    """
    journals = []
    cursor = "*"  # Starting cursor for pagination
    base_url = "https://api.crossref.org/journals"

    # Set up headers with user agent as recommended by Crossref
    headers = {
        "User-Agent": "JournalExporter/1.0 (mailto:your-email@example.com)"
    }

    # Get total number of journals
    total_journals = get_total_journal_count()
    if total_journals is None:
        print("Could not determine total journal count. Using default max_requests.")
    else:
        print(f"Total journals available: {total_journals}")
        if max_requests is None:
            # Calculate required requests to get all journals
            max_requests = (total_journals // rows_per_request) + 1
            print(f"Will need approximately {max_requests} requests to retrieve all journals")

    # Default to 100 if max_requests is still None
    if max_requests is None:
        max_requests = 100
        print(f"Using default maximum of {max_requests} requests")

    request_count = 0
    while request_count < max_requests:
        params = {
            "rows": rows_per_request,
            "cursor": cursor
        }

        print(f"Making request {request_count+1}/{max_requests}...")
        # Break (rather than raise) on failure so journals fetched so far
        # are still returned to the caller.
        try:
            response = requests.get(base_url, params=params, headers=headers, timeout=60)
        except requests.RequestException as exc:
            print(f"Error: request to Crossref failed: {exc}")
            break

        if response.status_code != 200:
            print(f"Error: API responded with status code {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError:
            print("Error: API response was not valid JSON")
            break

        message = data.get('message', {})

        # Extract the items and add to our list
        items = message.get('items', [])
        if not items:
            print("No more journals to retrieve.")
            break

        journals.extend(items)

        # Get next cursor for pagination
        next_cursor = message.get('next-cursor')
        if not next_cursor:
            print("No next cursor available. Finished retrieving journals.")
            break

        cursor = next_cursor
        request_count += 1

        # Progress report. The percentage is only shown when the total is
        # known and non-zero; total_journals is None on the fallback path.
        if total_journals:
            print(f"Retrieved {len(journals)} journals so far ({len(journals)/total_journals*100:.2f}% complete)...")
        else:
            print(f"Retrieved {len(journals)} journals so far...")

        # Sleep to be nice to the API
        time.sleep(1)

    return journals

def save_journals_to_csv(journals, filename="crossref_journals.csv"):
    """
    Save journal data to a CSV file

    Writes a header row followed by one row per journal with the columns
    title, issn and publisher. Multiple ISSNs are joined with ", ". Missing
    or null fields are written as empty strings. The file is overwritten if
    it already exists.

    Parameters:
    journals: List of journal dictionaries from Crossref API
    filename: Name of the output CSV file
    """
    # Define which fields to extract from each journal
    fieldnames = ["title", "issn", "publisher"]

    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for journal in journals:
            # Crossref journal records carry the ISSN list under the
            # upper-case key "ISSN"; the lower-case key is still honoured
            # for callers that already pass it. `or []` also covers null.
            issns = journal.get("ISSN") or journal.get("issn") or []
            writer.writerow({
                "title": journal.get("title") or "",
                "issn": ", ".join(issns),
                "publisher": journal.get("publisher") or "",
            })

    print(f"Successfully saved {len(journals)} journal titles to {filename}")

def main():
    """
    Interactive entry point: fetch journals from Crossref and save them to CSV

    Prints the total journal count, asks whether to retrieve everything or
    only a limited number of requests, retrieves the journals and writes
    them with save_journals_to_csv. Exits early without writing a file if
    nothing was retrieved.
    """
    print("Checking total number of journals in Crossref...")
    total_count = get_total_journal_count()
    if total_count:
        print(f"Found {total_count} total journals available in Crossref")

    # strip() so an answer such as "y " is still recognised
    retrieve_all = input("Retrieve all journals? (y/n, default: n): ").strip().lower() == 'y'

    if retrieve_all:
        print("Starting to retrieve ALL journals from Crossref API...")
        journals = get_all_journals(max_requests=None)  # None means get all
    else:
        raw_limit = input("Enter maximum number of requests (1000 journals per request): ").strip()
        # Empty input selects the default; non-numeric input falls back to
        # the same default instead of aborting with a ValueError.
        try:
            max_requests = int(raw_limit or "100")
        except ValueError:
            print(f"Invalid number {raw_limit!r}; using default of 100 requests.")
            max_requests = 100
        print(f"Starting to retrieve journals from Crossref API (max {max_requests} requests)...")
        journals = get_all_journals(max_requests=max_requests)

    if not journals:
        print("No journals retrieved. Exiting.")
        return

    print(f"Retrieved {len(journals)} journals. Saving to CSV...")
    save_journals_to_csv(journals)
    print("Done!")

# Run the interactive export when this file is executed directly (not imported).
if __name__ == "__main__":
    main()

Checking total number of journals in Crossref...
Found 145416 total journals available in Crossref
Retrieve all journals? (y/n, default: n): y
Starting to retrieve ALL journals from Crossref API...
Total journals available: 145416
Will need approximately 146 requests to retrieve all journals
Making request 1/146...
Retrieved 1000 journals so far (0.69% complete)...
Making request 2/146...
Retrieved 2000 journals so far (1.38% complete)...
Making request 3/146...
Retrieved 3000 journals so far (2.06% complete)...
Making request 4/146...
Retrieved 4000 journals so far (2.75% complete)...
Making request 5/146...
Retrieved 5000 journals so far (3.44% complete)...
Making request 6/146...
Retrieved 6000 journals so far (4.13% complete)...
Making request 7/146...
Retrieved 7000 journals so far (4.81% complete)...
Making request 8/146...
Retrieved 8000 journals so far (5.50% complete)...
Making request 9/146...
Retrieved 9000 journals so far (6.19% complete)...
Making request 10/146...
Retriev