In [1]:
from dataclasses import dataclass

import pandas as pd
from dataclasses_json import DataClassJsonMixin
from tqdm import tqdm

from rs_graph.data import load_rs_graph_repos_dataset

# Load the two inputs used by the rest of the notebook.
# - joss_author_details: read from a parquet file in the working directory;
#   later cells read each row's "doi" and "authors" values.
# - repos_dataset: returned by rs_graph's loader; later cells match rows on its
#   "doi" column and read its "repo" column.
joss_author_details = pd.read_parquet("joss-author-details.parquet")
repos_dataset = load_rs_graph_repos_dataset()

In [2]:
# Create a lookup table for each author


@dataclass
class Contribution(DataClassJsonMixin):
    """One author's contribution record: a repository, a DOI, and an order index.

    Instances are JSON-serializable through ``DataClassJsonMixin``.
    """

    # Value of the "repo" column from the repos-dataset row matching the DOI.
    # The rendered output shows repository URLs (GitHub, Bitbucket).
    repo: str
    # DOI used to match the author row against the repos dataset.
    doi: str
    # Copied from the author's "author_order_index" entry.
    # NOTE(review): presumably the author's position in the author list —
    # confirm whether it is 0- or 1-based.
    author_order_index: int


@dataclass
class AuthorshipDetails(DataClassJsonMixin):
    """Aggregated details for one author across every row they appear in.

    Instances are JSON-serializable through ``DataClassJsonMixin``.
    """

    # Display name: the longest string seen among the author's name and aliases.
    name: str
    # Every name and alias string seen for this author.
    aliases: set[str]
    # Copied from the author's "h_index" entry when the author is first seen.
    # NOTE(review): annotated as int, but the rendered table shows float values
    # (e.g. 4.0, 15.0) — confirm whether this should be float / optional.
    h_index: int
    # One Contribution per (row, author) occurrence, in iteration order.
    contributions: list[Contribution]


# Build the per-author lookup table: author_id -> AuthorshipDetails.
#
# Each row of joss_author_details carries a "doi" and a list of author records
# ("author_id", "name", "aliases", "h_index", "author_order_index"). Every
# author record is folded into a single AuthorshipDetails entry that collects
# all known aliases and one Contribution per row the author appears in.

# Resolve DOI -> repo once instead of boolean-scanning repos_dataset for every
# row. drop_duplicates(keep="first") keeps the first row per DOI, which is the
# same row a per-row `.loc[...].iloc[0]` lookup would select.
repo_by_doi: dict[str, str] = (
    repos_dataset.drop_duplicates(subset="doi", keep="first")
    .set_index("doi")["repo"]
    .to_dict()
)

all_author_details: dict[str, AuthorshipDetails] = {}
for _, author_row in tqdm(
    joss_author_details.iterrows(),
    desc="rows",
    # iterrows() is a generator; pass the length so tqdm can show progress/ETA
    total=len(joss_author_details),
):
    # Get DOI so we don't have to do a lot of getitems
    doi = author_row["doi"]

    # Fail with an actionable message when the row has no matching repository
    if doi not in repo_by_doi:
        raise KeyError(f"DOI {doi!r} has no matching row in repos_dataset")
    repo = repo_by_doi[doi]

    # Iter each author
    for author_details in author_row["authors"]:
        a_id = author_details["author_id"]
        reported_name = author_details["name"]

        # "aliases" may be None; normalise to a list so iteration order is
        # exactly the order stored in the data
        raw_aliases = author_details["aliases"]
        reported_aliases = list(raw_aliases) if raw_aliases is not None else []

        # Get the existing entry or create an empty one for a new author
        author = all_author_details.get(a_id)
        if author is None:
            author = AuthorshipDetails(
                name=reported_name,
                aliases=set(),
                h_index=author_details["h_index"],
                contributions=[],
            )
            all_author_details[a_id] = author

        # Always record every alias plus the name as written on this row
        author.aliases.update(reported_aliases)
        author.aliases.add(reported_name)

        # Use the longest known name as the canonical name. Only names from
        # this row can beat the current one (it is already the longest of all
        # earlier names), and they are scanned in data order rather than set
        # order so that ties in length resolve identically on every run.
        for candidate in (reported_name, *reported_aliases):
            if len(candidate) > len(author.name):
                author.name = candidate

        # Add the contribution for this row
        author.contributions.append(
            Contribution(
                repo=repo,
                doi=doi,
                author_order_index=author_details["author_order_index"],
            )
        )

# Flatten the lookup table into a dataframe: one row per author, with the
# author id as the first column followed by the AuthorshipDetails fields.
all_author_details_df = pd.DataFrame(
    data=[
        {"author_id": key} | entry.to_dict()
        for key, entry in all_author_details.items()
    ]
)
all_author_details_df  # noqa: B018

rows: 2108it [00:00, 2357.38it/s]


Unnamed: 0,author_id,name,aliases,h_index,contributions
0,116825899,David J. Schodt,"{David Schodt, David J Schodt, D. Schodt, D Sc...",4.0,[Contribution(repo='https://github.com/LidkeLa...
1,1711910,Michael J. Wester,"{M. J. Wester, Michael J. Wester, Michael West...",15.0,[Contribution(repo='https://github.com/LidkeLa...
2,25182872,Mohamadreza Fazel,"{Mohamad Fazel, M. R. Fazel, M. Fazel, Mohamad...",8.0,[Contribution(repo='https://github.com/LidkeLa...
3,2219330607,Sajjad Khan,{Sajjad Khan},1.0,[Contribution(repo='https://github.com/LidkeLa...
4,1401727991,Hanieh Mazloom-Farsibaf,{Hanieh Mazloom-Farsibaf},4.0,[Contribution(repo='https://github.com/LidkeLa...
...,...,...,...,...,...
6235,47036435,Daniel Steven Katz,"{D. S. Katz, Daniel Steven Katz, Daniel S Katz...",37.0,[Contribution(repo='https://github.com/applica...
6236,1678982,André Merzky,"{Andre Merzky, André Merzky}",23.0,[Contribution(repo='https://github.com/applica...
6237,1840916,Matteo Turilli,"{Matteo Turilli, M. Turilli}",18.0,[Contribution(repo='https://github.com/applica...
6238,1693678,Shantenu Jha,"{S. Jha, Shantenu Jha, S Jha}",32.0,[Contribution(repo='https://github.com/applica...


In [3]:
# Spot-check the contributions of a few authors. The sample is seeded so the
# displayed rows are the same on every Restart & Run All.
all_author_details_df.sample(3, random_state=0).contributions.tolist()

[[Contribution(repo='https://github.com/rob-luke/AuditoryStimuli.jl', doi='10.21105/joss.03613', author_order_index=1)],
 [Contribution(repo='https://bitbucket.org/dolfin-adjoint/pyadjoint', doi='10.21105/joss.01292', author_order_index=2),
  Contribution(repo='https://bitbucket.org/meg/cbcbeat', doi='10.21105/joss.00224', author_order_index=3)],
 [Contribution(repo='https://github.com/SCIInstitute/PFEIFER', doi='10.21105/joss.00472', author_order_index=2)]]