<div align="center" style="border:solid 1px gray;">
    <a href="https://openalex.org/">
        <img src="../../resources/img/OpenAlex-banner.png" alt="OpenAlex banner" width="300">
    </a>
</div>

# Calculate the h-index for a given author

<div style='background:#e7edf7'>
    In this notebook we will query the OpenAlex API to determine:
    <blockquote>
        <b><i>What's the h-index for a given author?</i></b>
    </blockquote>
    To get to the bottom of this, we will use the following API features:
    <a href="https://docs.openalex.org/api/get-lists-of-entities#filter">filtering</a>,
    <a href="https://docs.openalex.org/api/get-lists-of-entities/sort-entity-lists">sorting</a>, and
    <a href="https://docs.openalex.org/api#paging">paging</a>.
</div>
<br>

.....



---

In [1]:
# input: ORCID iD of the author to analyse, written as a full https://orcid.org/... URL
# (this exact string is interpolated into the `author.orcid` filter of the query URL)
orcid = "https://orcid.org/0000-0003-1613-5981"

In [2]:
def build_author_works_url(orcid):
    """Return the OpenAlex `works` query URL for one author, most-cited works first.

    Parameters
    ----------
    orcid : str
        Author identifier; it is inserted verbatim after ``author.orcid:`` in
        the ``filter`` query parameter (no escaping is applied).

    Returns
    -------
    str
        URL of the form
        ``https://api.openalex.org/works?filter=author.orcid:<orcid>&sort=cited_by_count:desc``.
    """
    resource = 'works'

    # Only one filter today; kept as a list so more can be comma-joined later.
    filter_clause = ','.join([f'author.orcid:{orcid}'])

    # Descending citation count, so the result list is already ranked.
    ordering = 'cited_by_count:desc'

    return 'https://api.openalex.org/' + resource + '?filter=' + filter_clause + '&sort=' + ordering

# build the query URL for the chosen author and print it so it can be inspected or opened in a browser
author_works_url = build_author_works_url(orcid)
print(author_works_url)

https://api.openalex.org/works?filter=author.orcid:https://orcid.org/0000-0003-1613-5981&sort=cited_by_count:desc


In [4]:
import requests

def get_all_citations(works_url, timeout=30):
    """Collect the ``cited_by_count`` of every work returned by a works query.

    Pages through the result set with cursor paging: the first request uses
    ``cursor=*`` and each following request uses the ``meta.next_cursor`` value
    of the previous response, until that value is empty.

    Parameters
    ----------
    works_url : str
        Works-list URL that already contains a query string; ``&cursor=<value>``
        is appended to it for every request.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    list
        The ``cited_by_count`` value of every work, in the order the API
        returned them (so sorted only if ``works_url`` requests a sort).

    Raises
    ------
    requests.HTTPError
        If any page request returns a 4xx/5xx status. Raised instead of the
        opaque ``KeyError`` that indexing an error payload would produce.
    """
    citation_counts = []

    # loop through pages, starting from the initial cursor '*'
    cursor = '*'
    while cursor:
        # Plain concatenation rather than str.format: a literal '{' or '}' in
        # works_url would otherwise be parsed as a format field and raise.
        url = f'{works_url}&cursor={cursor}'

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        page_with_results = response.json()

        # extract the citation count from every work on this page
        citation_counts.extend(
            work['cited_by_count'] for work in page_with_results['results']
        )

        # advance to the next page; an empty/None cursor ends the loop
        cursor = page_with_results['meta']['next_cursor']

    return citation_counts

# fetch every page for the author, then print the counts in the order the API returned them
citation_counts = get_all_citations(author_works_url)
print("complete list of sorted citation counts:\n" + ', '.join(map(str, citation_counts)))

complete list of sorted citation counts:
604, 409, 307, 276, 164, 162, 110, 100, 70, 69, 36, 34, 25, 25, 23, 23, 22, 20, 18, 12, 11, 9, 9, 6, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0


-----
## 2. Calculate h-index

In [5]:
# modified binary search (iterative form)
def calculate_hirsch_index(sorted_citations):
    """Return the h-index for a list of citation counts sorted in descending order.

    The h-index is the largest rank ``r`` (1-based) such that the work at rank
    ``r`` has at least ``r`` citations. Because the input is expected to be
    non-increasing, the predicate ``citations[i] >= i + 1`` flips from true to
    false exactly once, and a binary search finds the last index where it holds.

    Parameters
    ----------
    sorted_citations : list of int
        Citation counts, highest first. The result is only meaningful when the
        list really is sorted in descending order; this is not checked.

    Returns
    -------
    int
        The h-index; ``0`` for an empty list or when the top entry is ``0``.
    """
    # edge case: nothing published, or nothing cited at all
    if not sorted_citations or sorted_citations[0] == 0:
        return 0

    # invariant: the predicate holds at index `lo` (index 0 has >= 1 citation)
    lo, hi = 0, len(sorted_citations) - 1
    while lo < hi:
        # round the midpoint up so that `lo = probe` always makes progress
        probe = (lo + hi + 1) // 2
        if sorted_citations[probe] >= probe + 1:
            lo = probe
        else:
            hi = probe - 1

    # convert the 0-based index of the last qualifying work to a 1-based rank
    return lo + 1

# compute the h-index from the descending citation counts and print it
hindex = calculate_hirsch_index(citation_counts)
print(hindex)

18


In [6]:
import pandas as pd

def visualize_hirsch_index(citation_counts):
    # create table with columns citations, rank, rank<=citations?
    df = pd.DataFrame(citation_counts, columns =['citations'])
    df.insert(0, 'rank', range(1, 1 + len(df)))
    df['rank<=citations?'] = (df['rank'] <= df['citations'])

    # highlight row and hindex
    def highlight_hindex_row(s, hindex):
        if s['rank'] < hindex:
            return [''] + [''] + ['background-color: lightgreen;']
        if s['rank'] == hindex:
            return ['border-radius: 50%;background-color: pink;border-bottom: 2px solid black;'] \
            + ['border-bottom: 2px solid black;'] \
            + ['background-color: lightgreen;border-bottom: 2px solid black;']
        #else: 
        return [''] + [''] + ['background-color: gold;']

    # style table: center columns, hide index, highlight rows
    df_styled = df.style.hide(axis="index") \
                      .set_properties(**{'text-align': 'center'}) \
                      .apply(highlight_hindex_row, hindex=hindex, axis=1)

    return df_styled

# build the styled rank/citations table and render it as rich notebook output
viz_df = visualize_hirsch_index(citation_counts)
display(viz_df)

rank,citations,rank<=citations?
1,604,True
2,409,True
3,307,True
4,276,True
5,164,True
6,162,True
7,110,True
8,100,True
9,70,True
10,69,True


-----
Happy exploring! 😎