In [1]:
import re
import nltk
import textstat
import pandas as pd
import statsmodels.api as sm
from collections import Counter
from scipy.stats import mannwhitneyu

nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /home/heh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/heh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
df = pd.read_json("data/pandas.json", convert_dates=True).sort_values(by="closed_at")
contrib_cnt = Counter(df.resolver)
gfi, ngfi = df[df.resolver_commit_num <= 1], df[df.resolver_commit_num > 1]
len(df), len(gfi), len(ngfi)

(7619, 1622, 5997)

In [3]:
def count_code_snippets(s: str) -> int:
    p = re.compile(r"```.+?```", flags=re.S)
    if s is None:
        return 0
    return len(p.findall(s))


def delete_code_snippets(s: str) -> str:
    if s is None:
        return ""
    p = re.compile(r"```.+?```", flags=re.S)
    s = p.sub("", s)
    # return " ".join(s.split())
    return s


def count_urls(s: str) -> int:
    if s is None:
        return 0
    p = re.compile(r"http[:/\w\.]+")
    lst = list(
        filter(  # do not count images, this will be done in count_imgs()
            lambda s2: not (
                s2.endswith("jpg") or s2.endswith("jpeg") or s2.endswith("png")
            ),
            p.findall(s),
        )
    )
    return len(lst)


def delete_urls(s: str) -> str:
    if s == None:
        return ""
    p = re.compile(r"http[:/\w\.]+")
    s = p.sub("", s)
    # return " ".join(s.split())
    return s


def count_imgs(s: str) -> int:
    if s is None:
        return 0
    p = re.compile(r"http[:/\w\.]+")
    lst = list(
        filter(
            lambda s2: s2.endswith("jpg") or s2.endswith("jpeg") or s2.endswith("png"),
            p.findall(s),
        )
    )
    return len(lst)


def count_text_len(s: str) -> int:
    if s == None:
        return 0
    return len(s.split())


def get_categorized_labels(labels: list) -> dict:
    keyword_rules = {
        "bug": ["bug"],
        "feature": ["feature"],
        "test": ["test", "testing"],
        "build": ["ci", "build"],
        "doc": ["doc", "document", "documentation"],
        "coding": ["code", "coding", "program", "programming"],
        "enhance": ["enhance", "enhancement"],
        "gfi": [
            "easy",
            "starter",
            "newbie",
            "beginner",
            "starter",
            "minor",
            "novice",
            ("good", "first"),
            ("low", "fruit"),
            ("effort", "low"),
            ("first", "time"),
            ("first", "timer"),
            ("first", "pr"),
            ("up", "for", "grab"),
        ],
        "medium": ["medium", "intermediate"],
        "major": [
            "important",
            "major",
            "breaking",
            "difficult",
            "hard",
            "core",
            "serious",
            ("priority", "p1"),
            ("priority", "high"),
            ("priority", "critical"),
        ],
        "triaged": [
            "triaged",
            "triage",
            "progress",
            "haspr",
            "fixed",
            "wontfix",
            ("ha", "pr"),
            ("ha", "fix"),
        ],
        "untriaged": [
            "untriaged",
            ("need", "triage"),
            ("needed", "triage"),
            ("no", "triage"),
        ],
    }

    label_cat = Counter()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    for k in keyword_rules:
        label_cat[k] = 0
    for label in labels:
        words = re.compile(r"\w+").findall(label.lower().replace("_", " "))
        words = [lemmatizer.lemmatize(w) for w in words]
        for cat, rules in keyword_rules.items():
            match = 0
            for rule in rules:
                if isinstance(rule, (tuple, list)):
                    if all(word in words for word in rule):
                        match = 1
                elif rule in words:
                    match = 1
                elif any(rule in w for w in words):
                    match = 1
            label_cat[cat] += match
    return label_cat

In [4]:

gfi = gfi.drop_duplicates("resolver", keep="first")
gfi["clean_body"] = gfi.body.map(lambda s: delete_code_snippets(delete_code_snippets(s)))
gfi["len_title"] = gfi.title.map(count_text_len)
gfi["len_body"] = gfi.clean_body.map(count_text_len)
gfi["n_code_snips"] = gfi.body.map(count_code_snippets)
gfi["n_urls"] = gfi.body.map(count_urls)
gfi["n_imgs"] = gfi.body.map(count_imgs)
gfi["coleman_liau_index"] = gfi.clean_body.map(textstat.coleman_liau_index)
gfi["flesch_reading_ease"] = gfi.clean_body.map(textstat.flesch_reading_ease)
gfi["flesch_kincaid_grade"] = gfi.clean_body.map(textstat.flesch_kincaid_grade)
gfi["automated_readability_index"] = gfi.clean_body.map(textstat.automated_readability_index)
gfi["n_comments"] = gfi.comments.map(len)
gfi["n_events"] = gfi.events.map(len)
gfi["label_categories"] = gfi.labels.map(get_categorized_labels)

otc_gfi = gfi[gfi.resolver.map(lambda r: contrib_cnt[r] == 1)]
notc_gfi = gfi[gfi.resolver.map(lambda r: contrib_cnt[r] > 1)]
len(otc_gfi), len(notc_gfi)

(894, 449)

In [5]:
metrics = [
    "len_title", "len_body", 
    #"n_code_snips", "n_urls", "n_imgs", 
    "coleman_liau_index", 
    #"flesch_reading_ease", "flesch_kincaid_grade", "automated_readability_index", 
    "n_comments", "n_events",
    "additions", "deletions", "changed_files"
]
otc_gfi[metrics].describe()

Unnamed: 0,len_title,len_body,coleman_liau_index,n_comments,n_events,additions,deletions,changed_files
count,894.0,894.0,894.0,894.0,894.0,894.0,894.0,894.0
mean,8.014541,132.934004,14.641588,4.712528,17.699105,48.021253,13.878076,3.376957
std,3.215794,124.287916,16.362245,4.990149,12.61596,132.381251,61.732796,5.334706
min,2.0,0.0,-22.21,0.0,0.0,0.0,0.0,1.0
25%,6.0,43.0,9.96,2.0,10.0,9.0,1.0,1.0
50%,8.0,102.5,12.18,3.0,15.0,21.0,2.0,3.0
75%,10.0,191.0,15.3525,6.0,22.0,45.0,8.0,4.0
max,28.0,1630.0,346.4,61.0,140.0,2858.0,1316.0,85.0


In [6]:
notc_gfi[metrics].describe()

Unnamed: 0,len_title,len_body,coleman_liau_index,n_comments,n_events,additions,deletions,changed_files
count,449.0,449.0,449.0,449.0,449.0,449.0,449.0,449.0
mean,8.182628,126.563474,13.58167,4.563474,17.5902,74.247216,33.100223,3.714922
std,3.196337,101.613374,17.022839,5.00991,12.456227,206.005199,163.040793,4.000421
min,1.0,0.0,-33.81,0.0,1.0,0.0,0.0,0.0
25%,6.0,41.0,9.6,1.0,9.0,12.0,1.0,1.0
50%,8.0,108.0,11.81,3.0,15.0,24.0,4.0,3.0
75%,10.0,189.0,14.03,6.0,22.0,62.0,17.0,4.0
max,21.0,543.0,296.8,34.0,89.0,2991.0,2874.0,35.0


In [7]:
for metric in metrics:
    print(f"{metric:30} {mannwhitneyu(notc_gfi[metric], otc_gfi[metric])}")

len_title                      MannwhitneyuResult(statistic=208163.0, pvalue=0.26348717274588085)
len_body                       MannwhitneyuResult(statistic=197727.5, pvalue=0.6572580367520774)
coleman_liau_index             MannwhitneyuResult(statistic=183658.0, pvalue=0.011020454270767263)
n_comments                     MannwhitneyuResult(statistic=192339.5, pvalue=0.20967242028056132)
n_events                       MannwhitneyuResult(statistic=197661.0, pvalue=0.6498720012686057)
additions                      MannwhitneyuResult(statistic=223575.0, pvalue=0.000645216417732718)
deletions                      MannwhitneyuResult(statistic=221862.0, pvalue=0.001474818323657494)
changed_files                  MannwhitneyuResult(statistic=215987.5, pvalue=0.01817801692724078)


In [8]:
list(map(
    lambda x: (x[0], x[1] / len(otc_gfi)), 
    sorted(Counter(sum(otc_gfi.labels, [])).items(), key=lambda x: -x[1])[0:20]
))

[('good first issue', 0.31543624161073824),
 ('Docs', 0.2796420581655481),
 ('Bug', 0.2695749440715884),
 ('Effort Low', 0.23937360178970918),
 ('Difficulty Novice', 0.1331096196868009),
 ('Reshaping', 0.0894854586129754),
 ('Needs Tests', 0.0738255033557047),
 ('Difficulty Intermediate', 0.07158836689038031),
 ('Error Reporting', 0.06935123042505593),
 ('Enhancement', 0.06599552572706935),
 ('Timeseries', 0.05704697986577181),
 ('Indexing', 0.05480984340044743),
 ('Groupby', 0.053691275167785234),
 ('Visualization', 0.0447427293064877),
 ('Regression', 0.039149888143176735),
 ('Dtypes', 0.039149888143176735),
 ('API Design', 0.039149888143176735),
 ('Effort Medium', 0.03803131991051454),
 ('MultiIndex', 0.03467561521252797),
 ('Testing', 0.03355704697986577)]

In [9]:
list(map(
    lambda x: (x[0], x[1] / len(notc_gfi)), 
    sorted(Counter(sum(notc_gfi.labels, [])).items(), key=lambda x: -x[1])[0:20]
))

[('Bug', 0.32516703786191536),
 ('good first issue', 0.3028953229398664),
 ('Effort Low', 0.23608017817371937),
 ('Docs', 0.18262806236080179),
 ('Difficulty Novice', 0.12694877505567928),
 ('Needs Tests', 0.111358574610245),
 ('Reshaping', 0.0957683741648107),
 ('Difficulty Intermediate', 0.0801781737193764),
 ('Groupby', 0.0645879732739421),
 ('Timeseries', 0.05790645879732739),
 ('Indexing', 0.053452115812917596),
 ('Error Reporting', 0.051224944320712694),
 ('MultiIndex', 0.04899777282850779),
 ('Enhancement', 0.04899777282850779),
 ('API Design', 0.042316258351893093),
 ('Effort Medium', 0.0400890868596882),
 ('Categorical', 0.0400890868596882),
 ('Dtypes', 0.0378619153674833),
 ('Numeric', 0.0378619153674833),
 ('Testing', 0.0378619153674833)]

In [10]:
counter = Counter()
for cat in otc_gfi.label_categories:
    counter.update(cat)
total = sum(counter.values())
for k, v in counter.items():
    counter[k] = v / total
counter

Counter({'bug': 0.1497824735860783,
         'feature': 0.0,
         'test': 0.05966438781852082,
         'build': 0.014916096954630205,
         'doc': 0.15537600994406464,
         'coding': 0.00435052827843381,
         'enhance': 0.036668738346799255,
         'gfi': 0.3927905531385954,
         'medium': 0.061528899937849595,
         'major': 0.11622125543816035,
         'triaged': 0.00435052827843381,
         'untriaged': 0.00435052827843381})

In [11]:
counter = Counter()
for cat in notc_gfi.label_categories:
    counter.update(cat)
total = sum(counter.values())
for k, v in counter.items():
    counter[k] = v / total
counter

Counter({'bug': 0.18341708542713567,
         'feature': 0.0,
         'test': 0.08417085427135679,
         'build': 0.016331658291457288,
         'doc': 0.10301507537688442,
         'coding': 0.001256281407035176,
         'enhance': 0.02763819095477387,
         'gfi': 0.38944723618090454,
         'medium': 0.06909547738693467,
         'major': 0.11809045226130653,
         'triaged': 0.0037688442211055275,
         'untriaged': 0.0037688442211055275})

In [12]:
y = gfi.resolver.map(lambda r: contrib_cnt[r] > 1)
X = gfi[metrics]
log_reg = sm.Logit(y, X).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.637311
         Iterations 6


0,1,2,3
Dep. Variable:,resolver,No. Observations:,1343.0
Model:,Logit,Df Residuals:,1335.0
Method:,MLE,Df Model:,7.0
Date:,"Sat, 21 May 2022",Pseudo R-squ.:,-0.0001736
Time:,21:11:05,Log-Likelihood:,-855.91
converged:,True,LL-Null:,-855.76
Covariance Type:,nonrobust,LLR p-value:,1.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
len_title,-0.0239,0.014,-1.667,0.095,-0.052,0.004
len_body,-0.0008,0.001,-1.510,0.131,-0.002,0.000
coleman_liau_index,-0.0129,0.006,-2.320,0.020,-0.024,-0.002
n_comments,-0.0086,0.023,-0.377,0.706,-0.053,0.036
n_events,-0.0067,0.009,-0.769,0.442,-0.024,0.010
additions,0.0004,0.001,0.635,0.526,-0.001,0.002
deletions,0.0022,0.001,1.852,0.064,-0.000,0.004
changed_files,-0.0149,0.015,-1.011,0.312,-0.044,0.014
