# Scholar.py Citation Counter
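This notebook fetches citation counts for each DOI in your groups, aggregates them by year (2018-2025), and writes summary CSVs. Note: despite the title, the cells below do not call [scholar.py](https://github.com/ckreibich/scholar.py); they query the Semantic Scholar Graph API and use OpenAlex to fill gaps and cross-check the counts (Crossref is queried once, in the source-comparison cell).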

In [3]:
import requests
import pandas as pd
import time
import os
import ast

In [4]:
# Read the topic table; each row carries one group of DOIs in its 'doi_list' column.
df = pd.read_csv('dois_by_topic.csv')
# 'doi_list' is stored as the text form of a Python list, so parse every cell
# back into a real list with ast.literal_eval.
df['doi_list'] = df['doi_list'].map(ast.literal_eval)
groups = list(df['doi_list'])
print(f"Loaded {len(groups)} groups of DOIs.")

Loaded 20 groups of DOIs.


In [5]:
# Fetch yearly citation counts for each DOI in all groups using the Semantic Scholar API and
# save each group's results to a separate CSV. Groups whose CSV already exists are skipped
# (delete the CSV to force a re-fetch), and the loop pauses between lookups.
years = list(range(2018, 2026))
SEMANTIC_SCHOLAR_TIMEOUT_S = 30  # requests applies no timeout by default; a stalled call would block the loop
SEMANTIC_SCHOLAR_PAUSE_S = 1     # wait 1 second between each search
CITATION_COLUMNS = ['doi', 'year', 'citation_count', 'success']


def semantic_scholar_year_counts(doi, years, timeout=SEMANTIC_SCHOLAR_TIMEOUT_S):
    """Count citations of `doi` per citing-paper year via the Semantic Scholar Graph API.

    Parameters:
        doi: DOI string, inserted into the request URL as-is.
        years: iterable of years to report.
        timeout: seconds passed to requests.get.

    Returns:
        A dict mapping every year in `years` to the number of entries in the response's
        'citations' list whose 'year' equals it, or None when the API did not answer
        with HTTP 200 (the status is printed).

    Raises:
        Whatever requests.get / response.json raise (connection errors, timeouts,
        invalid JSON); the caller decides how to record the failure.
    """
    # NOTE(review): the nested 'citations' list may not contain every citing paper - the
    # source-comparison cell in this notebook shows far lower counts than OpenAlex for the
    # same DOI. Confirm against the API documentation before trusting absolute numbers.
    url = f'https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}?fields=year,citationCount,citations.year'
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f'Semantic Scholar returned HTTP {response.status_code} for DOI {doi}')
        return None
    # 'citations' may be missing or null; both are treated as "no citations listed"
    citations = response.json().get('citations') or []
    year_counts = {y: 0 for y in years}
    for c in citations:
        cyear = c.get('year', None)
        if cyear in year_counts:
            year_counts[cyear] += 1
    return year_counts


for group_idx, doi_list in enumerate(groups):
    csv_name = f'group{group_idx}_semanticscholar_citations.csv'
    if os.path.exists(csv_name):
        print(f'Skipping group {group_idx}, {csv_name} already exists.')
        continue
    rows = []
    for doi in doi_list:
        try:
            year_counts = semantic_scholar_year_counts(doi, years)
        except Exception as e:
            # Best effort: one bad lookup must not abort the group, but say why it failed
            print(f'Error with Semantic Scholar for DOI {doi} in group {group_idx}: {e}')
            year_counts = None
        success = year_counts is not None
        # Failed lookups still get one row per year (count None, success False) so that
        # later cells can find them and fill the gap from another source
        for year in years:
            rows.append({
                'doi': doi,
                'year': year,
                'citation_count': year_counts[year] if success else None,
                'success': success,
            })
        time.sleep(SEMANTIC_SCHOLAR_PAUSE_S)
    # Explicit columns keep the CSV header even when the group has no DOIs
    group_df = pd.DataFrame(rows, columns=CITATION_COLUMNS)
    group_df.to_csv(csv_name, index=False)
    print(f'Saved {csv_name}')

Skipping group 0, group0_semanticscholar_citations.csv already exists.
Skipping group 1, group1_semanticscholar_citations.csv already exists.
Skipping group 2, group2_semanticscholar_citations.csv already exists.
Skipping group 3, group3_semanticscholar_citations.csv already exists.
Skipping group 4, group4_semanticscholar_citations.csv already exists.
Skipping group 5, group5_semanticscholar_citations.csv already exists.
Skipping group 6, group6_semanticscholar_citations.csv already exists.
Skipping group 7, group7_semanticscholar_citations.csv already exists.
Skipping group 8, group8_semanticscholar_citations.csv already exists.
Skipping group 9, group9_semanticscholar_citations.csv already exists.
Skipping group 10, group10_semanticscholar_citations.csv already exists.
Skipping group 11, group11_semanticscholar_citations.csv already exists.
Skipping group 12, group12_semanticscholar_citations.csv already exists.
Skipping group 13, group13_semanticscholar_citations.csv already exists

In [13]:
# Sum the citation counts in every group CSV per year and write one summary row per group
years = list(range(2018, 2026))
group_totals = {}
for group_idx in range(20):
    csv_name = f'group{group_idx}_semanticscholar_citations.csv'
    try:
        df = pd.read_csv(csv_name)
        per_year = df.groupby('year')['citation_count'].sum()
        # Years without any row in the CSV count as 0
        totals = [per_year.get(y, 0) for y in years]
    except Exception as e:
        # A group file that cannot be read or aggregated is reported and recorded as all zeros
        print(f'Error loading {csv_name}: {e}')
        totals = [0] * len(years)
    group_totals[group_idx] = totals
summary_df = pd.DataFrame.from_dict(group_totals, orient='index', columns=years).rename_axis('group')
summary_df.to_csv('group_year_citation_totals.csv')
print('Saved group_year_citation_totals.csv')

Saved group_year_citation_totals.csv


In [14]:
# Find top 10 cited DOIs for each group and output a CSV with group, DOI, and citation count
TOP_N = 10
TOP_DOI_COLUMNS = ['group', 'doi', 'total_citations']


def top_cited_dois(citations_df, n=TOP_N):
    """Return the `n` DOIs with the largest citation totals.

    Parameters:
        citations_df: DataFrame with at least the columns 'doi' and 'citation_count'
            (one row per DOI and year, as written by the fetch cell).
        n: number of DOIs to keep.

    Returns:
        A Series indexed by DOI holding the sum of 'citation_count' over all rows of
        that DOI, sorted in descending order and cut to the first `n` entries.
    """
    totals = citations_df.groupby('doi')['citation_count'].sum()
    return totals.sort_values(ascending=False).head(n)


top_dois_rows = []
for group_idx in range(20):
    csv_name = f'group{group_idx}_semanticscholar_citations.csv'
    try:
        group_df = pd.read_csv(csv_name)
        for doi, count in top_cited_dois(group_df).items():
            top_dois_rows.append({'group': group_idx, 'doi': doi, 'total_citations': count})
    except Exception as e:
        # Best effort: report the unreadable group and continue with the others
        print(f'Error loading {csv_name}: {e}')
# Explicit columns keep the CSV header even when no group produced any row;
# without them an empty result is written as a file that read_csv cannot parse
top_dois_df = pd.DataFrame(top_dois_rows, columns=TOP_DOI_COLUMNS)
top_dois_df.to_csv('group_top10_cited_dois.csv', index=False)
print('Saved group_top10_cited_dois.csv')

Saved group_top10_cited_dois.csv


In [8]:
# Fill in citation gaps for DOIs with success=False using OpenAlex, updating yearly citation counts in each group CSV
years = list(range(2018, 2026))
OPENALEX_FILL_TIMEOUT_S = 30  # requests applies no timeout by default; a stalled call would block the loop


def fill_counts_from_openalex(citations_df, doi, year_map, years):
    """Write OpenAlex yearly counts into the rows of `doi`, modifying `citations_df` in place.

    Parameters:
        citations_df: DataFrame with the columns 'doi', 'year', 'citation_count', 'success'.
        doi: the DOI whose rows are updated.
        year_map: dict year -> cited_by_count built from OpenAlex's 'counts_by_year'.
        years: the years to update.

    Every row of `doi` whose year is in `years` gets success=True. Years missing from
    `year_map` are written as 0: OpenAlex documents that 'counts_by_year' leaves out
    years with zero citations, and the source-comparison cell of this notebook applies
    the same `year_map.get(y, 0)` rule. Rows of other DOIs are left untouched.
    """
    doi_rows = citations_df['doi'] == doi
    for year in years:
        idx = doi_rows & (citations_df['year'] == year)
        citations_df.loc[idx, 'citation_count'] = year_map.get(year, 0)
        citations_df.loc[idx, 'success'] = True


for group_idx in range(20):
    csv_name = f'group{group_idx}_semanticscholar_citations.csv'
    try:
        df = pd.read_csv(csv_name)
        # DOIs that have at least one row still marked as failed
        missing = df[df['success'] == False]['doi'].unique()
        for doi in missing:
            url = f'https://api.openalex.org/works/https://doi.org/{doi}'
            try:
                response = requests.get(url, timeout=OPENALEX_FILL_TIMEOUT_S)
                if response.status_code == 200:
                    data = response.json()
                    # OpenAlex provides yearly citation counts in 'counts_by_year';
                    # a missing or null value is treated as "no citations listed"
                    counts_by_year = data.get('counts_by_year') or []
                    year_map = {item['year']: item['cited_by_count'] for item in counts_by_year}
                    fill_counts_from_openalex(df, doi, year_map, years)
                else:
                    print(f'OpenAlex failed for DOI {doi} in group {group_idx}')
            except Exception as e:
                # Best effort: one bad lookup must not abort the group
                print(f'Error with OpenAlex for DOI {doi} in group {group_idx}: {e}')
            time.sleep(1)  # Be polite to the API
        df.to_csv(csv_name, index=False)
        print(f'Updated {csv_name} with OpenAlex data')
    except Exception as e:
        print(f'Error loading {csv_name}: {e}')

OpenAlex failed for DOI 10.1016/j.jadohealth.2024.06.0 in group 0
OpenAlex failed for DOI 10.1016/j.jadohealth.2023 in group 0
OpenAlex failed for DOI 10.1162/imag/a/00037 in group 0
Updated group0_semanticscholar_citations.csv with OpenAlex data
OpenAlex failed for DOI 10.1016/j.jadohealth.2023.06 in group 1
Updated group1_semanticscholar_citations.csv with OpenAlex data
OpenAlex failed for DOI 10.1097/psy.0000000000000 in group 2
OpenAlex failed for DOI 10.1016/j.healthplace.2022.10288 in group 2
Updated group2_semanticscholar_citations.csv with OpenAlex data
OpenAlex failed for DOI 10.1038/s41597-024-03058- in group 3
OpenAlex failed for DOI 10.1016/j.nicl.2024.10360 in group 3
OpenAlex failed for DOI 10.1044/2020/jslhr-20-00305 in group 3
OpenAlex failed for DOI 10.1007/978-3-031-34048-2/22 in group 3
OpenAlex failed for DOI 10.1162/imag/a/00157 in group 3
OpenAlex failed for DOI 10.1162/netn/a/00363 in group 3
OpenAlex failed for DOI 10.1044/2018/jslhr-s-18-0016 in group 3
OpenAle

In [11]:
# Compare citation sources on a single paper: DOI 10.1038/s41586-022-04492-9
doi = '10.1038/s41586-022-04492-9'
years = list(range(2018, 2026))


def _probe_semantic_scholar():
    """Per-year tally of the 'citations' entries in the Semantic Scholar paper record."""
    reply = requests.get(
        f'https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}?fields=year,citationCount,citations.year'
    )
    if reply.status_code != 200:
        return 'No data'
    tally = dict.fromkeys(years, 0)
    for citing in reply.json().get('citations', []):
        citing_year = citing.get('year', None)
        if citing_year in tally:
            tally[citing_year] += 1
    return tally


def _probe_openalex():
    """Per-year counts from OpenAlex 'counts_by_year'; years it does not list become 0."""
    reply = requests.get(f'https://api.openalex.org/works/https://doi.org/{doi}')
    if reply.status_code != 200:
        return 'No data'
    by_year = {
        entry['year']: entry['cited_by_count']
        for entry in reply.json().get('counts_by_year', [])
    }
    return {y: by_year.get(y, 0) for y in years}


def _probe_crossref():
    """Crossref's 'is-referenced-by-count' (total citations only, not per year)."""
    reply = requests.get(f'https://api.crossref.org/works/{doi}')
    if reply.status_code != 200:
        return 'No data'
    return reply.json()['message'].get('is-referenced-by-count', None)


# Run the probes in a fixed order; any exception is recorded as text instead of stopping the cell
results = {}
for tool, probe in (
    ('Semantic Scholar', _probe_semantic_scholar),
    ('OpenAlex', _probe_openalex),
    ('Crossref', _probe_crossref),
):
    try:
        results[tool] = probe()
    except Exception as e:
        results[tool] = f'Error: {e}'

# Print results for comparison
for tool, result in results.items():
    print(f'--- {tool} ---')
    print(result)

--- Semantic Scholar ---
{2018: 0, 2019: 0, 2020: 0, 2021: 1, 2022: 1, 2023: 1, 2024: 1, 2025: 0}
--- OpenAlex ---
{2018: 0, 2019: 0, 2020: 3, 2021: 15, 2022: 307, 2023: 536, 2024: 546, 2025: 386}
--- Crossref ---
1573


In [12]:
# For all groups, fetch OpenAlex yearly citation counts for all DOIs, compare to previous Semantic Scholar numbers, update with OpenAlex if different, and create a list of switched DOIs
years = list(range(2018, 2026))
OPENALEX_COMPARE_TIMEOUT_S = 30  # requests applies no timeout by default; a stalled call would block the loop
SWITCHED_COLUMNS = ['group', 'doi']


def switch_to_openalex_counts(citations_df, doi, year_map, years):
    """Replace stored yearly counts of `doi` with OpenAlex counts where the two differ.

    Parameters:
        citations_df: DataFrame with the columns 'doi', 'year', 'citation_count', 'success';
            modified in place.
        doi: the DOI whose rows are compared.
        year_map: dict year -> cited_by_count built from OpenAlex's 'counts_by_year'.
        years: the years to compare.

    Returns:
        True if at least one row was overwritten, otherwise False.

    A year is only touched when OpenAlex lists a value for it, a row for that
    (doi, year) exists, and the value differs from the stored one (a stored NaN always
    differs). Overwritten rows are also marked success=True.
    """
    switched = False
    doi_rows = citations_df['doi'] == doi
    for year in years:
        openalex_count = year_map.get(year, None)
        if openalex_count is None:
            continue
        idx = doi_rows & (citations_df['year'] == year)
        if not idx.any():
            # Nothing to overwrite, so this year must not count as a switch
            continue
        # The first matching row is the reference value for the comparison
        semantic_count = citations_df.loc[idx, 'citation_count'].values[0]
        # Only switch if OpenAlex has a value and it's different from Semantic Scholar
        if openalex_count != semantic_count:
            citations_df.loc[idx, 'citation_count'] = openalex_count
            citations_df.loc[idx, 'success'] = True
            switched = True
    return switched


switched_dois = []
for group_idx in range(20):
    csv_name = f'group{group_idx}_semanticscholar_citations.csv'
    try:
        df = pd.read_csv(csv_name)
        for doi in df['doi'].unique():
            url = f'https://api.openalex.org/works/https://doi.org/{doi}'
            try:
                response = requests.get(url, timeout=OPENALEX_COMPARE_TIMEOUT_S)
                if response.status_code == 200:
                    data = response.json()
                    # A missing or null 'counts_by_year' is treated as "no years listed"
                    counts_by_year = data.get('counts_by_year') or []
                    year_map = {item['year']: item['cited_by_count'] for item in counts_by_year}
                    if switch_to_openalex_counts(df, doi, year_map, years):
                        switched_dois.append({'group': group_idx, 'doi': doi})
                else:
                    print(f'OpenAlex failed for DOI {doi} in group {group_idx}')
            except Exception as e:
                # Best effort: one bad lookup must not abort the group
                print(f'Error with OpenAlex for DOI {doi} in group {group_idx}: {e}')
            time.sleep(1)  # Be polite to the API
        df.to_csv(csv_name, index=False)
        print(f'Updated {csv_name} with OpenAlex data')
    except Exception as e:
        print(f'Error loading {csv_name}: {e}')
# Explicit columns keep the CSV header even when no DOI was switched
switched_df = pd.DataFrame(switched_dois, columns=SWITCHED_COLUMNS)
switched_df.to_csv('switched_dois_openalex.csv', index=False)
print('Saved switched_dois_openalex.csv')

OpenAlex failed for DOI 10.1016/j.jadohealth.2024.06.0 in group 0
OpenAlex failed for DOI 10.1016/j.jadohealth.2023 in group 0
OpenAlex failed for DOI 10.1016/j.jadohealth.2023 in group 0
OpenAlex failed for DOI 10.1162/imag/a/00037 in group 0
OpenAlex failed for DOI 10.1162/imag/a/00037 in group 0
Updated group0_semanticscholar_citations.csv with OpenAlex data
Updated group0_semanticscholar_citations.csv with OpenAlex data
OpenAlex failed for DOI 10.1016/j.jadohealth.2023.06 in group 1
OpenAlex failed for DOI 10.1016/j.jadohealth.2023.06 in group 1
Updated group1_semanticscholar_citations.csv with OpenAlex data
Updated group1_semanticscholar_citations.csv with OpenAlex data
OpenAlex failed for DOI 10.1097/psy.0000000000000 in group 2
OpenAlex failed for DOI 10.1097/psy.0000000000000 in group 2
OpenAlex failed for DOI 10.1016/j.healthplace.2022.10288 in group 2
OpenAlex failed for DOI 10.1016/j.healthplace.2022.10288 in group 2
Updated group2_semanticscholar_citations.csv with OpenAlex