In [1]:
import pandas as pd

# Load the three annotated sample files
initial_set = pd.read_csv("cc-na-initial-set.csv")
eva_set = pd.read_csv("cc-na-eva.csv")

# For this file, every row is marked as a non-bot account and any
# "Unnamed:*" columns are discarded
nic_set = pd.read_csv("cc-na-nic.csv").assign(is_bot_account="no")
nic_set = nic_set.loc[:, ~nic_set.columns.str.startswith("Unnamed:")]

# Stack the three sets into one frame with a fresh 0..n-1 index
combined_set = pd.concat([initial_set, eva_set, nic_set], ignore_index=True)
combined_set

Unnamed: 0,document_id,document_doi,repository_id,repository_url,developer_account_id,developer_account_url,developer_contribution_url,n_commits,n_additions,n_deletions,total_repo_commits,total_repo_additions,total_repo_deletions,contrib_type,is_actually_author,is_bot_account,notes
0,107848,https://doi.org/10.1088/1748-0221/18/11/P11028,104357,https://github.com/dnicotra/trackhhl,131471,https://github.com/dmark04,https://github.com/dnicotra/trackhhl/commits?a...,0.0,0.0,0.0,11.0,924.0,712.0,other,no,no,
1,44180,https://doi.org/10.1371/journal.pone.0263125,42221,https://github.com/lukemelas/efficientnet-pytorch,71221,https://github.com/robotrapta,https://github.com/lukemelas/efficientnet-pyto...,1.0,1.0,1.0,120.0,8902.0,3609.0,doc,unclear,no,
2,33751,https://doi.org/10.29012/jpc.870,32267,https://github.com/microsoft/prv_accountant,36697,https://github.com/s-zanella,https://github.com/microsoft/prv_accountant/co...,1.0,1.0,1.0,35.0,3604.0,883.0,code,no,no,
3,56650,https://doi.org/10.1007/s41095-021-0229-5,54205,https://github.com/menghaoguo/pct,2123,https://github.com/Menghao999,https://github.com/menghaoguo/pct/commits?auth...,1.0,11.0,1.0,30.0,1201.0,65.0,code,yes,no,
4,21394,https://doi.org/10.1109/TPAMI.2017.2769085,20529,https://github.com/hbilen/dynamic-image-nets,42202,https://github.com/Blssel,https://github.com/hbilen/dynamic-image-nets/c...,1.0,2.0,2.0,1403.0,163258.0,86480.0,other,no,no,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,41129,https://doi.org/10.1038/s41598-021-87709-7,39281,https://github.com/gcampede/terrorism-metagraphs,67511,https://github.com/gcampede,https://github.com/gcampede/terrorism-metagrap...,17.0,30595.0,29.0,17.0,30595.0,29.0,code,yes,no,First author repo owner
196,61912,https://doi.org/10.1145/3511808.3557275,59280,https://github.com/intellabs/nlp-architect,90822,https://github.com/chinnikrishna2231,https://github.com/intellabs/nlp-architect/com...,4.0,1479.0,125.0,451.0,601777.0,111623.0,code,no,no,
197,53557,https://doi.org/10.1109/ACCESS.2021.3112879,51256,https://github.com/gtlidar/tamp-manipulation,81979,https://github.com/edrumwri,https://github.com/gtlidar/tamp-manipulation/c...,176.0,0.0,0.0,25004.0,0.0,0.0,code,no,no,Alt branch
198,155656,https://doi.org/10.1613/jair.1.13326,151946,https://github.com/kebaek/minigrid,166678,https://github.com/YX-S-Z,https://github.com/kebaek/minigrid/commits?aut...,6.0,40.0,13.0,41.0,10517.0,1970.0,code,yes,no,First author


In [2]:
# Bot vs. non-bot accounts: absolute counts, then share of all rows in percent
bot_count = combined_set["is_bot_account"].value_counts(normalize=False)
bot_pct = combined_set["is_bot_account"].value_counts(normalize=True).mul(100)
(bot_count, bot_pct)

(is_bot_account
 no     198
 yes      2
 Name: count, dtype: int64,
 is_bot_account
 no     99.0
 yes     1.0
 Name: proportion, dtype: float64)

In [3]:
# Distribution of the is_actually_author label (authors we may have missed):
# absolute counts, then share of all rows in percent
author_cls_counts = combined_set["is_actually_author"].value_counts(normalize=False)
author_cls_pct = (
    combined_set["is_actually_author"].value_counts(normalize=True).mul(100)
)
(author_cls_counts, author_cls_pct)

(is_actually_author
 no         78
 unclear    61
 yes        61
 Name: count, dtype: int64,
 is_actually_author
 no         39.0
 unclear    30.5
 yes        30.5
 Name: proportion, dtype: float64)

In [4]:
# Exclude bot accounts, then count contribution types (contrib_type)
# within each author class; combinations that never occur show as 0
(
    combined_set[combined_set["is_bot_account"].eq("no")]
    .groupby("is_actually_author")["contrib_type"]
    .value_counts()
    .unstack()
    .fillna(0)
    .astype(int)
)

contrib_type,code,doc,other
is_actually_author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,59,13,4
unclear,50,8,3
yes,49,12,0


In [5]:
# Same breakdown as percentages: share of each contribution type
# within every author class (bot accounts excluded)
(
    combined_set[combined_set["is_bot_account"].eq("no")]
    .groupby("is_actually_author")["contrib_type"]
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    .mul(100)
)

contrib_type,code,doc,other
is_actually_author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,77.631579,17.105263,5.263158
unclear,81.967213,13.114754,4.918033
yes,80.327869,19.672131,0.0


In [6]:
def _get_stats_for_author_cls(
    data: pd.DataFrame, author_cls: str | list[str], repo_owner_filter: str = "all"
) -> pd.DataFrame:
    # Get repo owner from repository_url
    data["repo_owner"] = data["repository_url"].str.split("/").str[3].str.lower()
    # Get contributor name from developer_account_url
    data["contributor_name"] = (
        data["developer_account_url"].str.split("/").str[3].str.lower()
    )

    # Handle ignore_repo_owners
    if repo_owner_filter == "none":
        # Filter out rows where repo_owner == contributor_name
        data = data.loc[data["repo_owner"] != data["contributor_name"]].copy()
    elif repo_owner_filter == "only":
        # Filter to only rows where repo_owner == contributor_name
        data = data.loc[data["repo_owner"] == data["contributor_name"]].copy()
    # If repo_owner_filter is "all", do nothing

    # Filter to author cls
    if isinstance(author_cls, str):
        # If author_cls is a string, filter to that single class
        data = data.loc[data["is_actually_author"] == author_cls].copy()
    elif isinstance(author_cls, list):
        # If author_cls is a list, filter to those classes
        data = data.loc[data["is_actually_author"].isin(author_cls)].copy()

    # Scope down to just is_actually_author == author_cls
    # And which contribution type is "code"
    # Take the quantiles, mean, and std of their commits (n_commits)
    # divided by total commits (total_repo_commits)
    commit_stats = (
        data.loc[
            (data["is_bot_account"] == "no")
            & (data["contrib_type"] == "code")
        ]
        .dropna(subset=["n_commits", "total_repo_commits"])
        .apply(lambda x: x["n_commits"] / x["total_repo_commits"], axis=1)
        .describe()
        .round(3)
    )

    # Same thing but for n_additions and total_repo_additions
    addition_stats = (
        data.loc[
            (data["is_bot_account"] == "no")
            & (data["contrib_type"] == "code")
            & (data["total_repo_additions"] > 0)
        ]
        .dropna(subset=["n_additions", "total_repo_additions"])
        .apply(lambda x: x["n_additions"] / x["total_repo_additions"], axis=1)
        .describe()
        .round(3)
    )

    # Same thing but with n_deletions and total_repo_deletions
    deletion_stats = (
        data.loc[
            (data["is_bot_account"] == "no")
            & (data["contrib_type"] == "code")
            & (data["total_repo_deletions"] > 0)
        ]
        .dropna(subset=["n_deletions", "total_repo_deletions"])
        .apply(lambda x: x["n_deletions"] / x["total_repo_deletions"], axis=1)
        .describe()
        .round(3)
    )

    # Combine n_additions and n_deletions into a single column
    # Combined total additions and deletions into a single column
    data["n_additions_deletions"] = data["n_additions"].fillna(0) + data[
        "n_deletions"
    ].fillna(0)
    data["total_repo_additions_deletions"] = data["total_repo_additions"].fillna(
        0
    ) + data["total_repo_deletions"].fillna(0)
    # Calculate the ratio of additions and deletions to total repo additions and deletions
    abs_stats = (
        data.loc[
            (data["is_bot_account"] == "no")
            & (data["contrib_type"] == "code")
            & (data["total_repo_additions_deletions"] > 0)
        ]
        .dropna(subset=["n_additions_deletions", "total_repo_additions_deletions"])
        .apply(
            lambda x: x["n_additions_deletions"] / x["total_repo_additions_deletions"],
            axis=1,
        )
        .describe()
        .round(3)
    )

    # Create frame
    stats_frame = pd.DataFrame(
        {
            "commit_stats": commit_stats,
            "addition_stats": addition_stats,
            "deletion_stats": deletion_stats,
            "abs_stats": abs_stats,
        }
    ).T

    return stats_frame


# Contribution shares for non-bot code contributors labeled as not being authors ("no")
_get_stats_for_author_cls(data=combined_set, author_cls="no")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,53.0,0.178,0.307,0.001,0.007,0.029,0.107,1.0
addition_stats,51.0,0.227,0.38,0.0,0.001,0.007,0.192,1.0
deletion_stats,51.0,0.193,0.358,0.0,0.0,0.006,0.149,1.0
abs_stats,51.0,0.222,0.379,0.0,0.001,0.01,0.144,1.0


In [7]:
# Contribution shares for non-bot code contributors whose authorship is "unclear"
_get_stats_for_author_cls(data=combined_set, author_cls="unclear")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,50.0,0.747,0.366,0.004,0.505,1.0,1.0,1.0
addition_stats,50.0,0.722,0.42,0.0,0.31,1.0,1.0,1.0
deletion_stats,45.0,0.737,0.417,0.0,0.722,1.0,1.0,1.0
abs_stats,50.0,0.733,0.404,0.0,0.417,1.0,1.0,1.0


In [8]:
# My takeaway here is that at least 25% of the 30% are making sizable contributions.
# Reasoning: the 75th percentile is ~10% of total commits and of absolute additions/deletions
# (presumably the "no" table above: 75% is 0.107 for commits, 0.144 for abs), so the top 25%
# contribute at least that much.
# Is that _worthy_ of authorship? Who knows, but 10% of total commits/additions/deletions is decent.
# That is still ~7.5% of all papers.
# NOTE(review): "the 30%" is not derived in this notebook (the "no" class is 39% of the
# rows above) -- confirm where that figure comes from.

# Given that the 25th percentile of unclear authors is also pretty high but not 100%,
# it's likely that _some_ of those are also true "not authors".
0.25 * 30

7.5

In [9]:
# Keep only rows where the contributor is the repo owner, within the "no" class.
# In this sample that is just a handful of rows, each covering a large share of the repo's activity.
_get_stats_for_author_cls(
    data=combined_set, author_cls="no", repo_owner_filter="only"
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,4.0,0.804,0.275,0.417,0.704,0.9,1.0,1.0
addition_stats,4.0,0.956,0.084,0.83,0.953,0.997,1.0,1.0
deletion_stats,4.0,0.915,0.141,0.706,0.891,0.976,1.0,1.0
abs_stats,4.0,0.953,0.087,0.822,0.948,0.995,1.0,1.0


In [10]:
# If we ignore repo owners, we can see the stats are a bit more distributed.
# I would argue that it's maybe somewhere between 25% and 50% of unclear, non-repo-owner authors.
_get_stats_for_author_cls(
    data=combined_set, author_cls="unclear", repo_owner_filter="none"
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,20.0,0.454,0.399,0.004,0.041,0.346,0.841,1.0
addition_stats,20.0,0.38,0.443,0.0,0.016,0.094,0.917,1.0
deletion_stats,20.0,0.486,0.492,0.0,0.003,0.431,0.999,1.0
abs_stats,20.0,0.392,0.44,0.0,0.011,0.127,0.919,1.0


In [11]:
# Conversely, keep only repo owners within the "unclear" class.
# In this sample their shares sit at or near 100% of the repo's activity.
_get_stats_for_author_cls(
    data=combined_set, author_cls="unclear", repo_owner_filter="only"
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,30.0,0.943,0.152,0.444,1.0,1.0,1.0,1.0
addition_stats,30.0,0.949,0.187,0.016,1.0,1.0,1.0,1.0
deletion_stats,25.0,0.938,0.18,0.158,1.0,1.0,1.0,1.0
abs_stats,30.0,0.961,0.124,0.407,1.0,1.0,1.0,1.0


In [12]:
# I think an estimate that 10% of all papers have a code-contributing non-author is reasonable.
# Down from 30% to 10% -- to me that is still a lot, especially considering we are mostly dealing
# with analysis code. There are likely papers which rely on RSE-developed tools that exist in
# separate repos and are not captured here.

In [13]:
# Contribution shares for the "unclear" and "no" classes combined
_get_stats_for_author_cls(data=combined_set, author_cls=["unclear", "no"])

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,103.0,0.455,0.44,0.001,0.019,0.317,1.0,1.0
addition_stats,101.0,0.472,0.469,0.0,0.004,0.243,1.0,1.0
deletion_stats,96.0,0.448,0.472,0.0,0.002,0.149,1.0,1.0
abs_stats,101.0,0.475,0.467,0.0,0.004,0.338,1.0,1.0


In [14]:
# "unclear" and "no" combined, excluding rows where the contributor is the repo owner
_get_stats_for_author_cls(
    data=combined_set, author_cls=["unclear", "no"], repo_owner_filter="none"
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,69.0,0.222,0.332,0.001,0.008,0.048,0.317,1.0
addition_stats,67.0,0.229,0.375,0.0,0.001,0.013,0.272,1.0
deletion_stats,67.0,0.237,0.398,0.0,0.0,0.006,0.174,1.0
abs_stats,67.0,0.229,0.375,0.0,0.001,0.013,0.259,1.0


In [15]:
# "unclear" and "no" combined, keeping only rows where the contributor is the repo owner
_get_stats_for_author_cls(
    data=combined_set, author_cls=["unclear", "no"], repo_owner_filter="only"
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
commit_stats,34.0,0.926,0.171,0.417,1.0,1.0,1.0,1.0
addition_stats,34.0,0.95,0.177,0.016,1.0,1.0,1.0,1.0
deletion_stats,29.0,0.935,0.174,0.158,1.0,1.0,1.0,1.0
abs_stats,34.0,0.96,0.119,0.407,1.0,1.0,1.0,1.0
