In [1]:
import polars as pl

# Location of the parquet dataset holding the pair-count estimates.
estimates_source = "author-developer-discovery-estimates/"

# Read the estimates into a polars DataFrame and show it as the cell output.
estimates = pl.read_parquet(estimates_source)
estimates

article_repo_timedelta_allowance,total_pair_combinations,possible_pairs_given_filter
str,i64,i64
"""1 year""",10800,1453
"""1 year""",16,0
"""1 year""",4326,294
"""1 year""",85,33
"""1 year""",64,10
…,…,…
"""5 years""",6,4
"""5 years""",4428,2912
"""5 years""",0,0
"""5 years""",294,241


In [2]:
def _summary_exprs(column):
    """Return mean, std and median aggregations of `column`.

    Each expression is aliased as `<stat>_<column>`, in the order
    mean, std, median.
    """
    values = pl.col(column)
    return [
        values.mean().alias(f"mean_{column}"),
        values.std().alias(f"std_{column}"),
        values.median().alias(f"median_{column}"),
    ]


# Fraction of all pair combinations that survive the filter. One is added to
# both numerator and denominator so the ratio stays defined for rows where
# total_pair_combinations is 0.
smoothing = pl.lit(1)
pair_reduction = (
    (pl.col("possible_pairs_given_filter") + smoothing)
    / (pl.col("total_pair_combinations") + smoothing)
).alias("pair_reduction")

# Summarise per time-delta allowance, smallest mean reduction ratio first.
averages = (
    estimates
    .with_columns(pair_reduction)
    .group_by("article_repo_timedelta_allowance")
    .agg(
        *_summary_exprs("possible_pairs_given_filter"),
        *_summary_exprs("pair_reduction"),
    )
    .sort("mean_pair_reduction")
)

averages

article_repo_timedelta_allowance,mean_possible_pairs_given_filter,std_possible_pairs_given_filter,median_possible_pairs_given_filter,mean_pair_reduction,std_pair_reduction,median_pair_reduction
str,f64,f64,f64,f64,f64,f64
"""180 days""",404.290581,1759.362339,75.0,0.124123,0.140653,0.089649
"""1 year""",808.654618,3448.479735,150.0,0.212692,0.150461,0.180374
"""2 years""",1545.967871,6646.029118,304.5,0.377723,0.18085,0.34472
"""3 years""",2226.64,9635.312405,439.5,0.523464,0.198011,0.495882
"""4 years""",2828.85,12514.966314,554.5,0.63967,0.201813,0.635308
"""5 years""",3349.812,15149.035802,649.0,0.72727,0.192802,0.739393


In [3]:
# Estimate the time cost (in seconds) of processing one article-repository pair.
#
# For each possible pair, we need:
#   - one GitHub API call per contributor (assume 2 contributors on average)
#   - one GitHub API call to get the README
#   - one GitHub API call to get the repository tree
#   - one GitHub API call to get the commits count
# Deliberately not counted:
#   - the GitHub repository-languages call (only needed for successful pairs)
#   - the SemanticScholar updated-metadata call (not needed at all)
#   - the OpenAlex per-author calls, ~5 authors per article
#     (only needed for successful pairs)

# Assumed cost of a single call to each API, in seconds.
GITHUB_SECONDS_PER_CALL = 0.9
SEMANTIC_SCHOLAR_SECONDS_PER_CALL = 0.1
OPEN_ALEX_SECONDS_PER_CALL = 0.6

# Calls needed per pair: contributors + README + tree + commits count = 5.
AVG_CONTRIBUTORS_PER_PAIR = 2
GITHUB_CALLS_PER_PAIR = AVG_CONTRIBUTORS_PER_PAIR + 3
SEMANTIC_SCHOLAR_CALLS_PER_PAIR = 0
OPEN_ALEX_CALLS_PER_PAIR = 0

# Extra time per pair to account for the predictive model usage, in seconds.
# (The value is 0.3 seconds; an earlier comment said "a second", but 0.3 is
# what the recorded outputs were computed with.)
MODEL_SECONDS_PER_PAIR = 0.3

# We run a lot of this in parallel, so the sequential per-pair cost is divided
# by this factor. It is highly dependent on the number of GitHub API keys we
# have access to.
# NOTE(review): this was originally described as "~6 article-repository pairs
# per second", but the code divides the per-pair cost by 6 (a 6x speed-up),
# which gives 0.8 seconds per pair -- confirm which was intended.
PARALLEL_SPEEDUP_FACTOR = 6

# Sequential API cost of a single pair.
conservative_time_estimate_per_pair = (
    (GITHUB_CALLS_PER_PAIR * GITHUB_SECONDS_PER_CALL)
    + (SEMANTIC_SCHOLAR_CALLS_PER_PAIR * SEMANTIC_SCHOLAR_SECONDS_PER_CALL)
    + (OPEN_ALEX_CALLS_PER_PAIR * OPEN_ALEX_SECONDS_PER_CALL)
)

# Add the predictive model overhead.
conservative_time_estimate_per_pair += MODEL_SECONDS_PER_PAIR

# Scale down for parallel execution.
conservative_time_estimate_per_pair /= PARALLEL_SPEEDUP_FACTOR

# Number of author-developer pairs the per-pair estimates are scaled up to.
N_AUTHOR_DEVELOPER_PAIRS = 120_000

# The same chain of derived columns is produced for both summary statistics.
summary_stats = ("mean", "median")

# Each stage reads columns created by the previous stage, so the stages are
# separate with_columns calls rather than one combined call.
averages = (
    averages
    # Time estimate in seconds: per-pair cost times the number of possible pairs.
    # This is all per-author-developer pair.
    .with_columns(
        *[
            (
                pl.lit(conservative_time_estimate_per_pair)
                * pl.col(f"{stat}_possible_pairs_given_filter")
            ).alias(f"{stat}_conservative_time_estimate_seconds")
            for stat in summary_stats
        ]
    )
    # Let's get this in minutes.
    .with_columns(
        *[
            (
                pl.col(f"{stat}_conservative_time_estimate_seconds") / 60
            ).alias(f"{stat}_conservative_time_estimate_minutes")
            for stat in summary_stats
        ]
    )
    # Finally, multiply these estimates by the number of author-developer pairs.
    .with_columns(
        *[
            (
                pl.lit(N_AUTHOR_DEVELOPER_PAIRS)
                * pl.col(f"{stat}_conservative_time_estimate_minutes")
            ).alias(f"{stat}_total_time_estimate_minutes")
            for stat in summary_stats
        ]
    )
    # Divide by 60 to get hours.
    .with_columns(
        *[
            (
                pl.col(f"{stat}_total_time_estimate_minutes") / 60
            ).alias(f"{stat}_total_time_estimate_hours")
            for stat in summary_stats
        ]
    )
    # Divide by 24 to get days.
    .with_columns(
        *[
            (
                pl.col(f"{stat}_total_time_estimate_hours") / 24
            ).alias(f"{stat}_total_time_estimate_days")
            for stat in summary_stats
        ]
    )
)

averages

article_repo_timedelta_allowance,mean_possible_pairs_given_filter,std_possible_pairs_given_filter,median_possible_pairs_given_filter,mean_pair_reduction,std_pair_reduction,median_pair_reduction,mean_conservative_time_estimate_seconds,median_conservative_time_estimate_seconds,mean_conservative_time_estimate_minutes,median_conservative_time_estimate_minutes,mean_total_time_estimate_minutes,median_total_time_estimate_minutes,mean_total_time_estimate_hours,median_total_time_estimate_hours,mean_total_time_estimate_days,median_total_time_estimate_days
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""180 days""",404.290581,1759.362339,75.0,0.124123,0.140653,0.089649,323.432465,60.0,5.390541,1.0,646864.92986,120000.0,10781.082164,2000.0,449.211757,83.333333
"""1 year""",808.654618,3448.479735,150.0,0.212692,0.150461,0.180374,646.923695,120.0,10.782062,2.0,1293800.0,240000.0,21564.123159,4000.0,898.505132,166.666667
"""2 years""",1545.967871,6646.029118,304.5,0.377723,0.18085,0.34472,1236.774297,243.6,20.612905,4.06,2473500.0,487200.0,41225.809906,8120.0,1717.742079,338.333333
"""3 years""",2226.64,9635.312405,439.5,0.523464,0.198011,0.495882,1781.312,351.6,29.688533,5.86,3562600.0,703200.0,59377.066667,11720.0,2474.044444,488.333333
"""4 years""",2828.85,12514.966314,554.5,0.63967,0.201813,0.635308,2263.08,443.6,37.718,7.393333,4526160.0,887200.0,75436.0,14786.666667,3143.166667,616.111111
"""5 years""",3349.812,15149.035802,649.0,0.72727,0.192802,0.739393,2679.8496,519.2,44.66416,8.653333,5359700.0,1038400.0,89328.32,17306.666667,3722.013333,721.111111
