In [1]:
import re
import textstat
import pandas as pd
import statsmodels.api as sm
from collections import Counter
from scipy.stats import mannwhitneyu

In [2]:
df = pd.read_json("data/pandas.json", convert_dates=True).sort_values(by="closed_at")
contrib_cnt = Counter(df.resolver)
gfi, ngfi = df[df.resolver_commit_num <= 1], df[df.resolver_commit_num > 1]
len(df), len(gfi), len(ngfi)

(7619, 1622, 5997)

In [3]:
def count_code_snippets(s: str) -> int:
    p = re.compile(r"```.+?```", flags=re.S)
    if s is None:
        return 0
    return len(p.findall(s))


def delete_code_snippets(s: str) -> str:
    if s is None:
        return ""
    p = re.compile(r"```.+?```", flags=re.S)
    s = p.sub("", s)
    # return " ".join(s.split())
    return s


def count_urls(s: str) -> int:
    if s is None:
        return 0
    p = re.compile(r"http[:/\w\.]+")
    lst = list(
        filter(  # do not count images, this will be done in count_imgs()
            lambda s2: not (
                s2.endswith("jpg") or s2.endswith("jpeg") or s2.endswith("png")
            ),
            p.findall(s),
        )
    )
    return len(lst)


def delete_urls(s: str) -> str:
    if s == None:
        return ""
    p = re.compile(r"http[:/\w\.]+")
    s = p.sub("", s)
    # return " ".join(s.split())
    return s


def count_imgs(s: str) -> int:
    if s is None:
        return 0
    p = re.compile(r"http[:/\w\.]+")
    lst = list(
        filter(
            lambda s2: s2.endswith("jpg") or s2.endswith("jpeg") or s2.endswith("png"),
            p.findall(s),
        )
    )
    return len(lst)


def count_text_len(s: str) -> int:
    if s == None:
        return 0
    return len(s.split())

gfi = gfi.drop_duplicates("resolver", keep="first")
gfi["clean_body"] = gfi.body.map(lambda s: delete_code_snippets(delete_code_snippets(s)))
gfi["len_title"] = gfi.title.map(count_text_len)
gfi["len_body"] = gfi.clean_body.map(count_text_len)
gfi["n_code_snips"] = gfi.body.map(count_code_snippets)
gfi["n_urls"] = gfi.body.map(count_urls)
gfi["n_imgs"] = gfi.body.map(count_imgs)
gfi["coleman_liau_index"] = gfi.clean_body.map(textstat.coleman_liau_index)
gfi["flesch_reading_ease"] = gfi.clean_body.map(textstat.flesch_reading_ease)
gfi["flesch_kincaid_grade"] = gfi.clean_body.map(textstat.flesch_kincaid_grade)
gfi["automated_readability_index"] = gfi.clean_body.map(textstat.automated_readability_index)
gfi["n_comments"] = gfi.comments.map(len)
gfi["n_events"] = gfi.events.map(len)

otc_gfi = gfi[gfi.resolver.map(lambda r: contrib_cnt[r] == 1)]
notc_gfi = gfi[gfi.resolver.map(lambda r: contrib_cnt[r] > 1)]
len(otc_gfi), len(notc_gfi)

(894, 449)

In [4]:
metrics = [
    "len_title", "len_body", 
    #"n_code_snips", "n_urls", "n_imgs", 
    "coleman_liau_index", 
    #"flesch_reading_ease", "flesch_kincaid_grade", "automated_readability_index", 
    "n_comments", "n_events"
]
otc_gfi[metrics].describe()

Unnamed: 0,len_title,len_body,coleman_liau_index,n_comments,n_events
count,894.0,894.0,894.0,894.0,894.0
mean,8.014541,132.934004,14.641588,4.712528,17.699105
std,3.215794,124.287916,16.362245,4.990149,12.61596
min,2.0,0.0,-22.21,0.0,0.0
25%,6.0,43.0,9.96,2.0,10.0
50%,8.0,102.5,12.18,3.0,15.0
75%,10.0,191.0,15.3525,6.0,22.0
max,28.0,1630.0,346.4,61.0,140.0


In [5]:
notc_gfi[metrics].describe()

Unnamed: 0,len_title,len_body,coleman_liau_index,n_comments,n_events
count,449.0,449.0,449.0,449.0,449.0
mean,8.182628,126.563474,13.58167,4.563474,17.5902
std,3.196337,101.613374,17.022839,5.00991,12.456227
min,1.0,0.0,-33.81,0.0,1.0
25%,6.0,41.0,9.6,1.0,9.0
50%,8.0,108.0,11.81,3.0,15.0
75%,10.0,189.0,14.03,6.0,22.0
max,21.0,543.0,296.8,34.0,89.0


In [6]:
for metric in metrics:
    print(f"{metric:30} {mannwhitneyu(notc_gfi[metric], otc_gfi[metric])}")

len_title                      MannwhitneyuResult(statistic=208163.0, pvalue=0.26348717274588085)
len_body                       MannwhitneyuResult(statistic=197727.5, pvalue=0.6572580367520774)
coleman_liau_index             MannwhitneyuResult(statistic=183658.0, pvalue=0.011020454270767263)
n_comments                     MannwhitneyuResult(statistic=192339.5, pvalue=0.20967242028056132)
n_events                       MannwhitneyuResult(statistic=197661.0, pvalue=0.6498720012686057)


In [7]:
list(map(
    lambda x: (x[0], x[1] / len(otc_gfi)), 
    sorted(Counter(sum(otc_gfi.labels, [])).items(), key=lambda x: -x[1])[0:20]
))

[('good first issue', 0.31543624161073824),
 ('Docs', 0.2796420581655481),
 ('Bug', 0.2695749440715884),
 ('Effort Low', 0.23937360178970918),
 ('Difficulty Novice', 0.1331096196868009),
 ('Reshaping', 0.0894854586129754),
 ('Needs Tests', 0.0738255033557047),
 ('Difficulty Intermediate', 0.07158836689038031),
 ('Error Reporting', 0.06935123042505593),
 ('Enhancement', 0.06599552572706935),
 ('Timeseries', 0.05704697986577181),
 ('Indexing', 0.05480984340044743),
 ('Groupby', 0.053691275167785234),
 ('Visualization', 0.0447427293064877),
 ('Regression', 0.039149888143176735),
 ('Dtypes', 0.039149888143176735),
 ('API Design', 0.039149888143176735),
 ('Effort Medium', 0.03803131991051454),
 ('MultiIndex', 0.03467561521252797),
 ('Testing', 0.03355704697986577)]

In [8]:
list(map(
    lambda x: (x[0], x[1] / len(notc_gfi)), 
    sorted(Counter(sum(notc_gfi.labels, [])).items(), key=lambda x: -x[1])[0:20]
))

[('Bug', 0.32516703786191536),
 ('good first issue', 0.3028953229398664),
 ('Effort Low', 0.23608017817371937),
 ('Docs', 0.18262806236080179),
 ('Difficulty Novice', 0.12694877505567928),
 ('Needs Tests', 0.111358574610245),
 ('Reshaping', 0.0957683741648107),
 ('Difficulty Intermediate', 0.0801781737193764),
 ('Groupby', 0.0645879732739421),
 ('Timeseries', 0.05790645879732739),
 ('Indexing', 0.053452115812917596),
 ('Error Reporting', 0.051224944320712694),
 ('MultiIndex', 0.04899777282850779),
 ('Enhancement', 0.04899777282850779),
 ('API Design', 0.042316258351893093),
 ('Effort Medium', 0.0400890868596882),
 ('Categorical', 0.0400890868596882),
 ('Dtypes', 0.0378619153674833),
 ('Numeric', 0.0378619153674833),
 ('Testing', 0.0378619153674833)]

In [9]:
y = gfi.resolver.map(lambda r: contrib_cnt[r] > 1)
X = gfi[metrics]
log_reg = sm.Logit(y, X).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.640918
         Iterations 6


0,1,2,3
Dep. Variable:,resolver,No. Observations:,1343.0
Model:,Logit,Df Residuals:,1338.0
Method:,MLE,Df Model:,4.0
Date:,"Sat, 21 May 2022",Pseudo R-squ.:,-0.005835
Time:,20:35:56,Log-Likelihood:,-860.75
converged:,True,LL-Null:,-855.76
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
len_title,-0.0255,0.014,-1.801,0.072,-0.053,0.002
len_body,-0.0009,0.001,-1.677,0.094,-0.002,0.000
coleman_liau_index,-0.0127,0.005,-2.317,0.021,-0.023,-0.002
n_comments,-0.0058,0.023,-0.259,0.796,-0.050,0.038
n_events,-0.0057,0.009,-0.669,0.503,-0.023,0.011
