In [1]:
# Add the parent directory (resolved relative to the kernel's working
# directory) to the module search path.
# NOTE(review): presumably this is for importing project-local modules one
# level up; none are imported in the visible cells, so confirm it is needed.
import sys
sys.path.append("../")

In [13]:
from tqdm import tqdm_notebook

import os
import re
import orjson
import github
import typing
import jsonlines

import pandas as pd
import numpy as np

In [3]:
# Alias for a decoded JSON object: a dict with string keys and arbitrary values.
JSONType = typing.Dict[str, typing.Any]

## Load a data batch to analyze

In [4]:
def iterate_lines(file_path: str) -> typing.Generator[str, None, None]:
    """Lazily yield the lines of a text file, one at a time.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's default encoding, so non-ASCII text is read the same way on
    every machine.

    Args:
        file_path: Path of the text file to read.

    Yields:
        Each line of the file in order, including its trailing newline when
        the line has one.
    """
    with open(file_path, mode="r", encoding="utf-8") as ptr:
        # A text file object is already a line iterator, so delegate to it
        # rather than polling readline() until it returns an empty string.
        yield from ptr

In [5]:
# Input file read by the cells below; each line is parsed as one JSON object.
# The path is relative to the kernel's working directory.
file_path = "../tmp/raw_dataset.jsonl"

In [6]:
def remove_diffs_from_line(line: str) -> JSONType:
    """Decode one JSON line and blank out the diff of every modification.

    Args:
        line: A single line of text holding one JSON object that has a
            "modifications" list of objects.

    Returns:
        The decoded object, with the "diff" entry of each modification set
        to None.
    """
    commit = orjson.loads(line)
    for entry in commit["modifications"]:
        entry["diff"] = None
    return commit

In [7]:
# Preview the first record of the file with its diffs blanked out, to see
# which fields a record carries.
remove_diffs_from_line(next(iterate_lines(file_path)))

{'sha': '74a04f01f662b4611d9aed44a77041a982b8766c',
 'repository': 'facebookresearch@pythia',
 'message': 'Initial commit',
 'modifications': [{'added': 9,
   'removed': 0,
   'change_type': 'ADD',
   'diff': None,
   'filename': '.gitignore',
   'old_path': None,
   'new_path': '.gitignore'},
  {'added': 71,
   'removed': 0,
   'change_type': 'ADD',
   'diff': None,
   'filename': 'Dockerfile',
   'old_path': None,
   'new_path': 'Dockerfile'},
  {'added': 30,
   'removed': 0,
   'change_type': 'ADD',
   'diff': None,
   'filename': 'LICENSE',
   'old_path': None,
   'new_path': 'LICENSE'},
  {'added': 269,
   'removed': 0,
   'change_type': 'ADD',
   'diff': None,
   'filename': 'README.md',
   'old_path': None,
   'new_path': 'README.md'},
  {'added': 1,
   'removed': 0,
   'change_type': 'ADD',
   'diff': None,
   'filename': 'best_model_predict_test.json',
   'old_path': None,
   'new_path': 'best_model/best_model_predict_test.json'},
  {'added': 86,
   'removed': 0,
   'change_ty

In [8]:
def filter_obj(obj: JSONType) -> JSONType:
    """Reduce a decoded record to the three fields used by the analysis.

    Args:
        obj: Decoded record with "sha", "message" and "modifications" keys.

    Returns:
        A new dict holding the record's "sha" and "message" unchanged, plus
        "modifications" replaced by the length of the original list.
    """
    summary = {key: obj[key] for key in ("sha", "message")}
    summary["modifications"] = len(obj["modifications"])
    return summary

def remove_diffs_from_file(
    file_path: str,
    output_path: typing.Optional[str] = None,
    max_len: typing.Optional[int] = 100_000,
) -> typing.List[JSONType]:
    """Load up to ``max_len`` slimmed-down records from a JSON Lines file.

    Each line is decoded with ``remove_diffs_from_line`` and then reduced by
    ``filter_obj``; the results are collected in file order.

    Args:
        file_path: Path of the JSON Lines file to read.
        output_path: Accepted for backward compatibility but currently
            unused; nothing is written to disk.
        max_len: Maximum number of records to return. ``None`` reads the
            whole file; zero or a negative value returns an empty list.

    Returns:
        A list of at most ``max_len`` filtered records.
    """
    buffer: typing.List[JSONType] = []
    if max_len is not None and max_len <= 0:
        return buffer

    lines = iterate_lines(file_path)
    try:
        for line in lines:
            buffer.append(filter_obj(remove_diffs_from_line(line)))
            if max_len is not None and len(buffer) >= max_len:
                break
    finally:
        # Closing the generator releases the underlying file right away when
        # we stop early, instead of waiting for garbage collection.
        lines.close()
    return buffer

In [9]:
# Load at most 100,000 filtered records into memory.
# NOTE(review): outputs further down show row labels up to ~999,997, which
# suggests those cells were last run against a larger load; confirm the
# intended cap and re-run the notebook top to bottom.
data = remove_diffs_from_file(file_path, max_len=100_000)

In [10]:
# One row per loaded record: sha, message, and the number of entries in its
# modifications list.
df = pd.DataFrame(data)
df

Unnamed: 0,sha,message,modifications
0,74a04f01f662b4611d9aed44a77041a982b8766c,Initial commit,87
1,af0a1b48fd77d6581231fa1d7d1e935b4d039f15,Update README.md,1
2,7d0548bb561c96920d17767c38edded301c81d39,add acknowledgements,1
3,4520567675b4ece4fe2f0457aa0def752c23fdd4,adding code_of_conduct and contributing,3
4,beb0ae07e487b86ff8bf9d24d67123a408b2a378,allow to load pretrained multiple GPU model un...,1
...,...,...,...
99995,ae97ca1dc83848e40b0609a49eee0eb92e43570f,Use update_one instead of replace_one in mongo...,1
99996,0259ca814dbee9d5a7cf2930b610264e96d1d6d8,Add stalebot integration\n\nAs some issue tend...,1
99997,4d47de6f362ebb9d745475e84ee0a2d22b3a5b0c,Add reference to incense in README,3
99998,a6996a201a96c7d7c4aa84cf9af77f5fbfde9fb3,fix exception with compiled pybind11 extension...,1


## Modification count statistics

In [11]:
# Mean, median and maximum number of modifications per record.
(
    df["modifications"].mean(),
    df["modifications"].median(),
    df["modifications"].max(),
)

(2.7774, 1.0, 18337)

In [37]:
# Selected quantiles of the per-record modification count.
tuple(
    df.modifications.quantile(level)
    for level in (0.15, 0.25, 0.50, 0.75, 0.95)
)

(0.0, 1.0, 1.0, 2.0, 7.0)

In [38]:
# Show the record(s) with the largest modification count. The maximum is
# computed from the loaded data instead of being hard-coded (previously
# 77983), so the cell still finds a row when a different number of records
# is loaded.
df[df.modifications == df.modifications.max()]

Unnamed: 0,sha,message,modifications
509774,c2d6d44438b5d23393afd13ddd80cd933cc00099,Massive rework of the whole repository,77983


## Explore message insights

In [14]:
# Records whose message mentions "merge" anywhere, in any letter case.
df.loc[df["message"].str.contains("merge", flags=re.IGNORECASE, regex=True)]

Unnamed: 0,sha,message,modifications
14,49bf5f677c36baedb4d8fa53f665bd97dbaeb1b8,Merge pull request #7 from meetshah1995/master...,0
18,e942e94fae8ef3fe5d93f4bb3d25daf6b9e0eaa2,Merge pull request #9 from Cadene/master\n\nFi...,0
25,5ad666f417684f8f1280029ac46e19dc8b5ddb0a,Merge pull request #39 from HuaizhengZhang/pat...,0
364,c8ef41ef4723f371e9c8759f0fb396a507737098,Merge remote-tracking branch 'origin/master'\n...,0
394,b31cb7d03f687021c9ea334f35acdcb6d34305d5,Merge pull request #1 from Radagaisus/patch-1\...,0
...,...,...,...
99974,6faaa8bc08fe8f0f8fe31cdfecb42b4e2af6c24c,Merge pull request #367 from thequilo/improved...,0
99976,c8e570fed38dce37580a12bfec4ca2e4ddf0ccb7,Merge pull request #372 from onurgu/master\n\n...,0
99982,b738fa6222661d109c340ca115c5a89e00557020,Merge pull request #373 from bosr/feat/use-col...,0
99983,94785b9592b16e8498212ad9cfa0e05d49a1819d,Merge pull request #374 from brickerino/master...,0


In [20]:
# Rows 30-59 (by position) of the records whose message starts with "bump",
# in any letter case.
df.loc[df["message"].str.contains("^bump", flags=re.IGNORECASE, regex=True)].iloc[30:60]

Unnamed: 0,sha,message,modifications
3352,acb11bd45fc153ea37278efb7683555df17f4215,Bumped version to dev0,1
3398,e91c46bd16c67b740ff17ec1d5eb8c3ba3611d4e,Bumped CHANGELOG,1
3402,5da6f63ceaac29896f3d54004462ab23b1ef4e16,Bumped CHANGELOG with recent changes,1
3443,357c790844379f54ac133b8b47e90daf6b34327e,Bumped version to dev0,1
3450,f9712a4f0f21fb71c83097a73845dfabfadb42ce,Bumped CHANGELOG and AUTHORS,2
3489,dbf19845a2bfd4f869cd7d17a7b8bcb63afd7a7e,"Bumped CHANGELOG, added Aleš to AUTHORS. Thanks!",2
3540,605a1b8b42658a27e8381c8ff9b9d2225e3d39d2,Bumped CHANGELOG,1
3632,c87e7c6274da5eff5a0283b28570a78cc9975610,Bumped changelog,1
3638,1745e1bd287b65d7d734b925660647533971193b,Bumped required package jsonfield to version 0...,1
3676,346bdf5d71608142b5f0d7185790e4429e01e322,Bumping changelog and version number for release.,2


In [78]:
# Messages that are just "initial" followed by at most three more words
# (any letter case). The pattern is a raw string so "\s" and "\w" are not
# treated as (invalid) string escapes, and the group is non-capturing so
# pandas does not warn about unused match groups.
df[df.message.str.contains(r"^initial(?:\s+\w+){0,3}$", flags=re.IGNORECASE, regex=True)]

Unnamed: 0,sha,message,modifications
0,74a04f01f662b4611d9aed44a77041a982b8766c,Initial commit,87
329,bfb28da372a8fd8fc1b93378c8530472cc0ca483,Initial commit,3
429,d46ec12f47e75ab7a34410d37d63906c99fe3b8f,Initial commit,3
634,a9102f42db200ae3d21486c2a97b98338ee65b2b,initial version of reactive,2
2320,83b1847647c08d089a477fd8441cc57737292e12,initial commit,6
...,...,...,...
992361,eb082e22b8431ff9e2b6ac3c776a1f83e7150453,initial arista module import,4
993129,58680f38c3642c9c495bdf2809c370c5f647e1f3,Initial bigip_node version,1
993560,7821b24144e6709b1febd3986a5f1a47c8898854,initial ejabberd_user module,1
994632,ccf4199ac8b453227e6ea73f5465ce69782ceab0,Initial commit of rax_dns_record,1


In [77]:
# Messages that are just "update" followed by at most three more words
# (any letter case). The pattern is a raw string so "\s" and "\w" are not
# treated as (invalid) string escapes, and the group is non-capturing so
# pandas does not warn about unused match groups.
df[df.message.str.contains(r"^update(?:\s+\w+){0,3}$", flags=re.IGNORECASE, regex=True)]

Unnamed: 0,sha,message,modifications
19,18cd8b9c23db229d8e0e55b0a6c2d337cb66d2e3,Update download paths,1
129,543e5ad91bb5e51573f33eb7154883fb128a8379,Update configs,4
202,efb74335645b78e49b6d09813f292e6828235b9e,Update COC and Contributing,2
232,41c9b8b11844037bcd825b863fc42150aad06354,Update quickstart documentation,1
415,4db83916751aed54efa7a8f6d8264e8e893560b9,update readme,1
...,...,...,...
999781,97438f003909361d21a12a4187bbceb50fe1387a,Update the modules,3
999797,019f74dcedc598c31cb7109d82768e617115a232,Update modules,4
999820,9db17afc855532b87675eaeee5f0b032d8e92041,Update modules,4
999898,dd484e82111e2856af0dbcf6c48f2385743cbd63,Update modules,3


In [81]:
# Mean modification count over the frame currently held in `df`.
df.modifications.mean()

3.496965

In [136]:
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Drop records whose message is a merge or is at most three words long.

    Two filters are applied to the ``message`` column, in order:

    * messages containing "merge" anywhere, in any letter case, are removed;
    * messages made of at most three words (runs of word characters
      separated by non-word characters), including empty messages, are
      removed.

    Args:
        df: Frame with a string ``message`` column.

    Returns:
        The remaining rows with their original index labels; the input
        frame is not modified.
    """
    without_merges = df[~df.message.str.contains("merge", flags=re.IGNORECASE, regex=True)]
    # Raw string keeps "\W"/"\w" from being read as (invalid) string escapes;
    # non-capturing groups avoid pandas' "pattern has match groups" warning.
    is_short = without_merges.message.str.contains(
        r"^\W*(?:\w+(?:\W+|$)){0,3}$", regex=True
    )
    return without_merges[~is_short]

In [137]:
# Preview the frame after filtering; the result is only displayed, not
# assigned, so `df` itself still holds every record.
clean_dataset(df)

Unnamed: 0,sha,message,modifications
3,4520567675b4ece4fe2f0457aa0def752c23fdd4,adding code_of_conduct and contributing,3
4,beb0ae07e487b86ff8bf9d24d67123a408b2a378,allow to load pretrained multiple GPU model un...,1
5,bf29c7cd19aaf290f4efdcd70b8c61c68987dd43,add standalone image vqa nb,3
11,c862901d805bcca2dd18d6765ac73afcd9591797,"reformating the detectron feature extractor, f...",1
12,01e40789b4326be1b4335e81cf20b72f922fc0db,removing duplicating detectron feature extractor.,1
...,...,...,...
999992,2e929cf0cee26addecff9b129d1c6b82f6ea5cd8,Fixes bug #10281 - Trailing zeros were truncat...,1
999993,2dc910bd1486ab27bb3f7cf9c52f3dec75448ce5,Fixes bug #10225 - Runtime error\n\nPrevents a...,1
999994,b9bfb22a9dab2592c16388bc70db320bda7ff209,fix https transport bug,1
999997,b9aa8f07bebcdb7a448154a7c773ef173ab3c73f,skips certain ssh errors and attempts to conti...,1


In [138]:
# Records whose message mentions "fix" anywhere, in any letter case.
df.loc[df["message"].str.contains("fix", flags=re.IGNORECASE, regex=True)]

Unnamed: 0,sha,message,modifications
16,9599ee6508727fa67beb14c5c74ecf38d9bf291c,Fix one_stage_run_model args order,1
18,e942e94fae8ef3fe5d93f4bb3d25daf6b9e0eaa2,Merge pull request #9 from Cadene/master\n\nFi...,0
44,14d066d2b0f4cf205056af51cbad1420b3525855,Updates for package init and other syntax fixes,18
49,fbbfb42c5264bd716a6883b0022218ef8b24daf4,Config related changes and soem fixes,6
50,54f43f1c9ccf1c45abb7d28472752350d86964fa,Bug fixes for variety of syntax error and more...,13
...,...,...,...
999991,a14248ffe1ec7a7cf8623732ebc24e680883fa20,Fixes bug #10281 - Trailing zeros were truncat...,1
999992,2e929cf0cee26addecff9b129d1c6b82f6ea5cd8,Fixes bug #10281 - Trailing zeros were truncat...,1
999993,2dc910bd1486ab27bb3f7cf9c52f3dec75448ce5,Fixes bug #10225 - Runtime error\n\nPrevents a...,1
999994,b9bfb22a9dab2592c16388bc70db320bda7ff209,fix https transport bug,1


In [161]:
# First 40 records whose message contains a lone digit, i.e. a digit with no
# other digit directly before or after it. The pattern is a raw string so
# "\d" is not treated as an (invalid) string escape, and the groups are
# non-capturing so pandas does not warn about unused match groups.
df[df.message.str.contains(r"(?:[^\d]|^)\d(?:[^\d]|$)", flags=re.IGNORECASE, regex=True)][:40]

  return func(self, *args, **kwargs)


Unnamed: 0,sha,message,modifications
14,49bf5f677c36baedb4d8fa53f665bd97dbaeb1b8,Merge pull request #7 from meetshah1995/master...,0
18,e942e94fae8ef3fe5d93f4bb3d25daf6b9e0eaa2,Merge pull request #9 from Cadene/master\n\nFi...,0
25,5ad666f417684f8f1280029ac46e19dc8b5ddb0a,Merge pull request #39 from HuaizhengZhang/pat...,0
33,5714ac561bfe7def6535ce35e365131ebd174a55,Add tasks folder with vqa2 task built on base ...,4
72,cd323fc59e694b6f27a3cbe81cd03e7e2805b30c,Add best config for VQA2,1
76,75f8694219c60dfd59e481df1d526ab3efa372e9,"Revert imdb builder, add metrics info to vqa2 ...",3
86,1451d9a5e0823de5ab16ba4999d3fdc55ec32ba3,Update registry to register metrics and suppor...,1
123,56117ea05028cb075ef6e70780426621efc07439,End 2 End working vizwiz model and add model r...,9
144,eb64a23f951829a4260c9a461a1de7d7ab643b5c,Update losses and metrics to include info\n\n-...,2
154,5c11419fdfb9ab87e19e002fbc58ffff75b7d585,Add vqa2 ocr dataset and builder,2
