In [1]:
from dataclasses import dataclass

import pandas as pd
from dataclasses_json import DataClassJsonMixin
from tqdm import tqdm

from rs_graph.data import load_rs_graph_repos_dataset

# Load and process data
joss_author_details = pd.read_parquet("joss-author-details.parquet")
repos_dataset = load_rs_graph_repos_dataset()

In [51]:
joss_author_details

Unnamed: 0,corpus_id,url,doi,title,authors,embedding,citation_count
0,263639713,https://www.semanticscholar.org/paper/f14ca30d...,10.21105/joss.05563,SMITE: Single Molecule Imaging Toolbox Extraor...,"[{'aliases': ['D Schodt', 'David J Schodt', 'D...","{'model': 'specter_v1', 'vector': [0.331548929...",0
1,263617614,https://www.semanticscholar.org/paper/060e9e8a...,10.21105/joss.05443,"Rosalution: Supporting data accessibility, int...","[{'aliases': ['Angelina Uno-antonison'], 'auth...","{'model': 'specter_v1', 'vector': [-3.99032521...",0
2,253097959,https://www.semanticscholar.org/paper/88c2dab5...,10.21105/joss.05115,The Generalized Green's function Cluster Expan...,"[{'aliases': ['Matthew R Carbone'], 'author_id...","{'model': 'specter_v1', 'vector': [-6.97825527...",1
3,263617304,https://www.semanticscholar.org/paper/1fd3ceda...,10.21105/joss.05713,PyDGN: a Python Library for Flexible and Repro...,"[{'aliases': None, 'author_id': '41216883', 'a...","{'model': 'specter_v1', 'vector': [-5.51957702...",0
4,263279828,https://www.semanticscholar.org/paper/3b3de94e...,10.21105/joss.05136,Trash AI: A Web GUI for Serverless Computer Vi...,"[{'aliases': ['W. Cowger', 'Win C. Cowger', 'W...","{'model': 'specter_v1', 'vector': [-3.50501632...",0
...,...,...,...,...,...,...,...
2103,54166243,https://www.semanticscholar.org/paper/42cf63b7...,10.21105/joss.00018,Xenomapper: Mapping reads in a mixed species c...,"[{'aliases': ['M. J. Wakefield', 'Matthew Wake...",,15
2104,54169333,https://www.semanticscholar.org/paper/15139721...,10.21105/joss.00021,pyuca: a Python implementation of the Unicode ...,"[{'aliases': ['James Tauber'], 'author_id': '5...",,1
2105,54165652,https://www.semanticscholar.org/paper/434d3a72...,10.21105/joss.00012,mst_clustering: Clustering via Euclidean Minim...,"[{'aliases': ['J. T. Vanderplas', 'Jacob Vande...",,8
2106,53304838,https://www.semanticscholar.org/paper/264fb001...,10.21105/joss.00011,Carl: a Likelihood-free Inference Toolbox,"[{'aliases': ['G. Louppe'], 'author_id': '1881...",,16


In [2]:
# Create a look up table for each author


@dataclass
class Contribution(DataClassJsonMixin):
    repo: str
    doi: str
    author_order_index: int


@dataclass
class AuthorshipDetails(DataClassJsonMixin):
    name: str
    aliases: set[str]
    h_index: int
    contributions: list[Contribution]


all_author_details: dict[str, AuthorshipDetails] = {}
for _, author_row in tqdm(joss_author_details.iterrows(), desc="rows"):
    # Get DOI so we don't have to do a lot of getitems
    doi = author_row["doi"]

    # Get matching row in repos dataset
    repo_row = repos_dataset.loc[repos_dataset.doi == doi].iloc[0]

    # Iter each author
    for author_details in author_row["authors"]:
        a_id = author_details["author_id"]

        # Add new author
        if a_id not in all_author_details:
            # Get longest name in aliases to use as "cannonical name"
            cannonical_name = author_details["name"]
            if author_details["aliases"] is not None:
                for alias in author_details["aliases"]:
                    if len(alias) > len(cannonical_name):
                        cannonical_name = alias

            # Convert aliases to set and "original author name"
            aliases = (
                set(author_details["aliases"])
                if author_details["aliases"] is not None
                else set()
            )
            aliases.add(author_details["name"])

            # Add new author
            all_author_details[a_id] = AuthorshipDetails(
                name=cannonical_name,
                aliases=aliases,
                h_index=author_details["h_index"],
                contributions=[
                    Contribution(
                        repo=repo_row["repo"],
                        doi=doi,
                        author_order_index=author_details["author_order_index"],
                    )
                ],
            )

        # Update existing author
        else:
            # Get existing author
            existing_author_details = all_author_details[a_id]

            # Always add new aliases
            if author_details["aliases"] is not None:
                existing_author_details.aliases.update(author_details["aliases"])

            # Always add possibly new name to aliases
            existing_author_details.aliases.add(author_details["name"])

            # Update cannonical name if need be
            for alias in existing_author_details.aliases:
                if len(alias) > len(existing_author_details.name):
                    existing_author_details.name = alias

            # Add new contribution
            existing_author_details.contributions.append(
                Contribution(
                    repo=repo_row["repo"],
                    doi=doi,
                    author_order_index=author_details["author_order_index"],
                )
            )

# Convert to dataframe
all_author_details_df = pd.DataFrame(
    [
        {
            "author_id": author_id,
            **author_details.to_dict(),
        }
        for author_id, author_details in all_author_details.items()
    ]
)
all_author_details_df  # noqa: B018

rows: 2108it [00:00, 2546.71it/s]


Unnamed: 0,author_id,name,aliases,h_index,contributions
0,116825899,David J. Schodt,"[D. Schodt, David J Schodt, D Schodt, David J....",4.0,"[{'repo': 'https://github.com/LidkeLab/smite',..."
1,1711910,Michael J. Wester,"[Michael J. Wester, M J Wester, M. Wester, Mic...",15.0,"[{'repo': 'https://github.com/LidkeLab/smite',..."
2,25182872,Mohamadreza Fazel,"[M. R. Fazel, Mohamadreza Fazel, Mohamad Fazel...",8.0,"[{'repo': 'https://github.com/LidkeLab/smite',..."
3,2219330607,Sajjad Khan,[Sajjad Khan],1.0,"[{'repo': 'https://github.com/LidkeLab/smite',..."
4,1401727991,Hanieh Mazloom-Farsibaf,[Hanieh Mazloom-Farsibaf],4.0,"[{'repo': 'https://github.com/LidkeLab/smite',..."
...,...,...,...,...,...
6235,47036435,Daniel Steven Katz,"[D. Katz, D. S. Katz, Daniel Katz, D.s. Katz, ...",37.0,[{'repo': 'https://github.com/applicationskele...
6236,1678982,André Merzky,"[André Merzky, Andre Merzky]",23.0,[{'repo': 'https://github.com/applicationskele...
6237,1840916,Matteo Turilli,"[Matteo Turilli, M. Turilli]",18.0,[{'repo': 'https://github.com/applicationskele...
6238,1693678,Shantenu Jha,"[S Jha, S. Jha, Shantenu Jha]",32.0,[{'repo': 'https://github.com/applicationskele...


In [6]:
ex = all_author_details_df.iloc[0]
ex

author_id                                                116825899
name                                               David J. Schodt
aliases          [D. Schodt, David J Schodt, D Schodt, David J....
h_index                                                        4.0
contributions    [{'repo': 'https://github.com/LidkeLab/smite',...
Name: 0, dtype: object

In [8]:
ex.contributions[0]["repo"]

'https://github.com/LidkeLab/smite'

In [19]:
from dotenv import load_dotenv
from ghapi.all import GhApi, paged
load_dotenv()

api = GhApi()

In [15]:
from parse import search

parsed_github = search("https://github.com/{owner}/{repo}/", ex.contributions[0]["repo"] + "/")
parsed_github

<Result () {'owner': 'LidkeLab', 'repo': 'smite'}>

In [17]:
parsed_github.named["owner"]

'LidkeLab'

In [21]:
paged_contributors = list(paged(
    api.repos.list_contributors,
    owner=parsed_github.named["owner"],
    repo=parsed_github.named["repo"],
))
paged_contributors

[(#8) [{'login': 'MJWester', 'id': 14239744, 'node_id': 'MDQ6VXNlcjE0MjM5NzQ0', 'avatar_url': 'https://avatars.githubusercontent.com/u/14239744?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/MJWester', 'html_url': 'https://github.com/MJWester', 'followers_url': 'https://api.github.com/users/MJWester/followers', 'following_url': 'https://api.github.com/users/MJWester/following{/other_user}', 'gists_url': 'https://api.github.com/users/MJWester/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/MJWester/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/MJWester/subscriptions', 'organizations_url': 'https://api.github.com/users/MJWester/orgs', 'repos_url': 'https://api.github.com/users/MJWester/repos', 'events_url': 'https://api.github.com/users/MJWester/events{/privacy}', 'received_events_url': 'https://api.github.com/users/MJWester/received_events', 'type': 'User', 'site_admin': False, 'contributions': 619},{'login': 'dschodt', 'id': 3

In [41]:
@dataclass
class ContributorInfo(DataClassJsonMixin):
    username: str
    name: str | None
    company: str | None
    email: str | None
    location: str | None
    bio: str | None

contributor_infos = {}
contributor_index = 0
for page in paged_contributors:
    for contrib in page:
        user_info = api.users.get_by_username(contrib["login"])
        contributor_infos[contributor_index] = ContributorInfo(
            username=contrib["login"],
            name=user_info["name"],
            company=user_info["company"],
            email=user_info["email"],
            location=user_info["location"],
            bio=user_info["bio"],
        ).to_dict()
        contributor_index += 1

contributor_infos

{0: {'username': 'MJWester',
  'name': 'Michael Wester',
  'company': 'University of New Mexico',
  'email': None,
  'location': 'Albuquerque, New Mexico',
  'bio': None},
 1: {'username': 'dschodt',
  'name': 'David J. Schodt',
  'company': None,
  'email': None,
  'location': None,
  'bio': None},
 2: {'username': 'MJWester1',
  'name': 'Michael J. Wester',
  'company': 'University of New Mexico',
  'email': None,
  'location': 'Albuquerque, New Mexico, USA',
  'bio': None},
 3: {'username': 'kalidke',
  'name': 'Keith A. Lidke',
  'company': None,
  'email': None,
  'location': None,
  'bio': None},
 4: {'username': 'sajjad88',
  'name': 'Sajjad Khan',
  'company': 'University of New Mexico (UNM)',
  'email': None,
  'location': 'Albuquerque, New Mexico',
  'bio': 'Ph.D. candidate: Nanoscience & Microsystems Engineering, University of New Mexico, Albuquerque, NM, USA.\r\n\r\nResearch Assistant Optical Nanoscopy Lab, PAIS, UNM.'},
 5: {'username': 'spallikkuth',
  'name': 'Sandeep Pa

In [45]:
import dedupe

variables = [
    {"field": f, "type": "String", "has missing": True}
    for f in contributor_infos[0].keys()
]
variables

[{'field': 'username', 'type': 'String', 'has missing': True},
 {'field': 'name', 'type': 'String', 'has missing': True},
 {'field': 'company', 'type': 'String', 'has missing': True},
 {'field': 'email', 'type': 'String', 'has missing': True},
 {'field': 'location', 'type': 'String', 'has missing': True},
 {'field': 'bio', 'type': 'String', 'has missing': True}]

In [46]:
deduper = dedupe.Dedupe(variables)
deduper

<dedupe.api.Dedupe at 0x7f9926b21c90>

In [47]:
deduper.prepare_training(contributor_infos, sample_size=5)

In [50]:
dedupe.console_label(deduper)

username : MJWester1
name : Michael J. Wester
company : University of New Mexico
email : None
location : Albuquerque, New Mexico, USA
bio : None

username : Erburns
name : None
company : None
email : None
location : None
bio : None

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


 n


username : MJWester
name : Michael Wester
company : University of New Mexico
email : None
location : Albuquerque, New Mexico
bio : None

username : Erburns
name : None
company : None
email : None
location : None
bio : None

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : sajjad88
name : Sajjad Khan
company : University of New Mexico (UNM)
email : None
location : Albuquerque, New Mexico
bio : Ph.D. candidate: Nanoscience & Microsystems Engineering, University of New Mexico, Albuquerque, NM, USA.

Research Assistant Optical Nanoscopy Lab, PAIS, UNM.

username : spallikkuth
name : Sandeep Pallikkuth
company : University of New Mexico
email : None
location : Albuquerque, NM
bio : None

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : kalidke
name : Keith A. Lidke
company : None
email : None
location : None
bio : None

username : spallikkuth
name : Sandeep Pallikkuth
company : University of New Mexico
email : None
location : Albuquerque, NM
bio : None

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : MJWester
name : Michael Wester
company : University of New Mexico
email : None
location : Albuquerque, New Mexico
bio : None

username : MJWester1
name : Michael J. Wester
company : University of New Mexico
email : None
location : Albuquerque, New Mexico, USA
bio : None

0/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 y


username : sajjad88
name : Sajjad Khan
company : University of New Mexico (UNM)
email : None
location : Albuquerque, New Mexico
bio : Ph.D. candidate: Nanoscience & Microsystems Engineering, University of New Mexico, Albuquerque, NM, USA.

Research Assistant Optical Nanoscopy Lab, PAIS, UNM.

username : Erburns
name : None
company : None
email : None
location : None
bio : None

1/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : kalidke
name : Keith A. Lidke
company : None
email : None
location : None
bio : None

username : kiwibogo
name : Sheng Liu
company : None
email : kiwibogo@gmail.com
location : None
bio : None

1/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : kiwibogo
name : Sheng Liu
company : None
email : kiwibogo@gmail.com
location : None
bio : None

username : Erburns
name : None
company : None
email : None
location : None
bio : None

1/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : kalidke
name : Keith A. Lidke
company : None
email : None
location : None
bio : None

username : Erburns
name : None
company : None
email : None
location : None
bio : None

1/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : dschodt
name : David J. Schodt
company : None
email : None
location : None
bio : None

username : Erburns
name : None
company : None
email : None
location : None
bio : None

1/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : MJWester
name : Michael Wester
company : University of New Mexico
email : None
location : Albuquerque, New Mexico
bio : None

username : spallikkuth
name : Sandeep Pallikkuth
company : University of New Mexico
email : None
location : Albuquerque, NM
bio : None

1/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : MJWester1
name : Michael J. Wester
company : University of New Mexico
email : None
location : Albuquerque, New Mexico, USA
bio : None

username : kalidke
name : Keith A. Lidke
company : None
email : None
location : None
bio : None

1/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


 n


username : spallikkuth
name : Sandeep Pallikkuth
company : University of New Mexico
email : None
location : Albuquerque, NM
bio : None

username : kiwibogo
name : Sheng Liu
company : None
email : kiwibogo@gmail.com
location : None
bio : None

1/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


KeyboardInterrupt: Interrupted by user

In [52]:
from rs_graph.data import load_rs_graph_repos_dataset
load_rs_graph_repos_dataset()

Unnamed: 0,repo,title,doi,published_date
0,https://github.com/LidkeLab/smite,SMITE: Single Molecule Imaging Toolbox Extraor...,10.21105/joss.05563,2023-10-02T07:49:41.094Z
1,https://github.com/uab-cgds-worthey/rosalution,"Rosalution: Supporting data accessibility, int...",10.21105/joss.05443,2023-10-01T20:53:57.824Z
2,https://github.com/matthewcarbone/GGCE,The Generalized Green's function Cluster Expan...,10.21105/joss.05115,2023-10-01T20:29:44.672Z
3,https://github.com/diningphil/PyDGN/,PyDGN: a Python Library for Flexible and Repro...,10.21105/joss.05713,2023-10-01T12:42:45.520Z
4,https://github.com/code4sac/trash-ai,Trash AI: A Web GUI for Serverless Computer Vi...,10.21105/joss.05136,2023-09-29T15:46:58.364Z
...,...,...,...,...
2733,https://github.com/JeanElsner/panda-py,Taming the Panda with Python: A powerful duo f...,10.1016/j.softx.2023.101532,2023-12-31
2734,https://github.com/alejandrofdez-us/similarity-ts,SimilarityTS: Toolkit for the evaluation of si...,10.1016/j.softx.2023.101527,2023-12-31
2735,https://github.com/eol017/CNV-Z,CNV-Z; a new tool for detecting copy number va...,10.1016/j.softx.2023.101530,2023-09-22
2736,https://github.com/saadism777/Dental-Loop-Real...,Dental Loop FLT: Facial landmark tracking,10.1016/j.softx.2023.101531,2023-12-31
