In [3]:
import sys
sys.path.append("../")

In [4]:
from src.download_commit_data import Config
from tqdm import tqdm_notebook

import os
import github
import typing
import jsonlines

import pandas as pd
import numpy as np

In [5]:
with jsonlines.open("../data/repositories/top1000_page1.jsonl") as reader:
    top1000_page1 = [obj for obj in reader.read()]

In [6]:
with jsonlines.open("../data/repositories/top1000_page2.jsonl") as reader:
    top1000_page2 = [obj for obj in reader.read()]

In [7]:
df1 = pd.DataFrame(top1000_page1)

In [8]:
df2 = pd.DataFrame(top1000_page2)

In [9]:
df = pd.concat([df1, df2])
df = df.drop_duplicates()

In [10]:
df[:10]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
0,donnemartin/system-design-primer,Python,275,75863,75863,12336,4330,False,False
1,vinta/awesome-python,Python,1399,75128,75128,14768,5316,False,False
2,public-apis/public-apis,Python,2129,64406,64406,7197,2715,False,False
3,tensorflow/models,Python,3936,58983,58983,37248,523487,False,False
4,ytdl-org/youtube-dl,Python,17388,57370,57370,9954,57035,False,False
5,nvbn/thefuck,Python,1547,50122,50122,2506,2788,False,False
6,pallets/flask,Python,3776,47344,47344,13032,7321,False,False
7,keras-team/keras,Python,5341,45176,45176,17170,13322,False,False
8,django/django,Python,27513,45043,45043,19418,190821,False,False
9,jakubroztocil/httpie,Python,1112,43661,43661,2866,5563,False,False


In [11]:
def basic_stats(field, title):
    mean = field.mean()
    _min, _max = field.min(), field.max()
    percentile_0_25 = field.quantile(0.25)
    percentile_0_50 = field.quantile(0.50)
    percentile_0_95 = field.quantile(0.95)
    print(f"""Basic stats for {title}:
    mean={mean:0.2f}
    min={_min:0.2f}
    max={_max:0.2f}
    25_percentile={percentile_0_25:0.2f}
    50_percentile={percentile_0_50:0.2f}
    95_percentile={percentile_0_95:0.2f}
    """)

In [12]:
basic_stats(df.stargazers_count, "stargazers_count")

Basic stats for stargazers_count:
    mean=3962.99
    min=1132.00
    max=75863.00
    25_percentile=1488.00
    50_percentile=2216.00
    95_percentile=11777.80
    


In [13]:
basic_stats(df.commits, "commits")

Basic stats for commits:
    mean=1902.02
    min=1.00
    max=169340.00
    25_percentile=119.00
    50_percentile=421.00
    95_percentile=8320.20
    


In [14]:
basic_stats(df.forks_count, "forks_count")

Basic stats for forks_count:
    mean=870.44
    min=28.00
    max=37248.00
    25_percentile=259.00
    50_percentile=453.00
    95_percentile=2663.60
    


In [15]:
basic_stats(df["size"], "size")

Basic stats for size:
    mean=35081.73
    min=7.00
    max=2741739.00
    25_percentile=935.00
    50_percentile=4313.00
    95_percentile=150054.20
    


In [16]:
df[df.fork == True]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork


In [17]:
df[df.archived == True]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
60,reddit-archive/reddit,Python,7956,15148,15148,2783,40093,True,False
171,eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/eeeeeeeeeeeee...,Python,415,7485,7485,672,1236,True,False
186,openai/universe,Python,210,7136,7136,856,1616,True,False
233,p-e-w/maybe,Python,78,6471,6471,174,140,True,False
330,xchaoinfo/fuck-login,Python,117,5356,5356,2005,316,True,False
353,yahoo/open_nsfw,Python,13,5099,5099,971,21634,True,False
440,facebookarchive/augmented-traffic-control,Python,237,4167,4167,580,12087,True,False
654,not-kennethreitz/osx-gcc-installer,Python,54,3089,3089,217,494,True,False
704,byt3bl33d3r/MITMf,Python,467,2878,2878,899,1375,True,False
710,nryoung/algorithms,Python,279,2865,2865,721,1866,True,False


In [18]:
df.sort_values("commits", ascending=True)[:50]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
16,shadowsocks/shadowsocks,Python,1,31774,31774,19454,891,False,False
128,ziggear/shadowsocks,Python,2,1880,1880,2511,273,False,False
774,joshua-wu/deepfakes_faceswap,Python,4,2704,2704,960,9,False,False
859,openai/evolution-strategies-starter,Python,4,1165,1165,226,44,False,False
577,llSourcell/YOLO_Object_Detection,Python,4,1390,1390,682,2264,False,False
495,angusshire/greenhat,Python,4,1431,1431,175,1127,False,False
728,llSourcell/learn_math_fast,Python,5,2822,2822,550,44,False,False
259,princeton-vl/CornerNet,Python,5,1770,1770,378,39,False,False
818,wepe/O2O-Coupon-Usage-Forecast,Python,5,1199,1199,785,1338,False,False
545,openai/finetune-transformer-lm,Python,6,1419,1419,368,423329,False,False


In [61]:
df.sort_values("commits", ascending=True)[:50]

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
16,shadowsocks/shadowsocks,Python,1,31774,31774,19454,891,False,False
128,ziggear/shadowsocks,Python,2,1880,1880,2511,273,False,False
774,joshua-wu/deepfakes_faceswap,Python,4,2704,2704,960,9,False,False
859,openai/evolution-strategies-starter,Python,4,1165,1165,226,44,False,False
577,llSourcell/YOLO_Object_Detection,Python,4,1390,1390,682,2264,False,False
495,angusshire/greenhat,Python,4,1431,1431,175,1127,False,False
728,llSourcell/learn_math_fast,Python,5,2822,2822,550,44,False,False
259,princeton-vl/CornerNet,Python,5,1770,1770,378,39,False,False
818,wepe/O2O-Coupon-Usage-Forecast,Python,5,1199,1199,785,1338,False,False
545,openai/finetune-transformer-lm,Python,6,1419,1419,368,423329,False,False


In [64]:
exp1 = df.sort_values("size", ascending=True)[:50][-20:]
exp1

Unnamed: 0,full_name,language,commits,stargazers_count,watchers_count,forks_count,size,archived,fork
725,kragniz/json-sempai,Python,86,1252,1252,54,39,False,False
190,careercup/CtCI-6th-Edition-Python,Python,56,1797,1797,749,40,False,False
614,eth0izzle/bucket-stream,Python,35,1372,1372,157,41,False,False
356,matthewearl/deep-anpr,Python,40,1625,1625,672,42,False,False
32,joshnewlan/say_what,Python,18,2063,2063,124,43,False,False
134,arc90/git-sweep,Python,18,1870,1870,105,43,False,False
728,llSourcell/learn_math_fast,Python,5,2822,2822,550,44,False,False
859,openai/evolution-strategies-starter,Python,4,1165,1165,226,44,False,False
297,fchollet/deep-learning-models,Python,22,5714,5714,1879,45,False,False
617,spyoungtech/grequests,Python,75,3213,3213,297,45,False,False


In [65]:
def x1(s):
    s = s.replace("/", "_")
    return s

In [68]:
for l in list(map(lambda x: f"https://github.com/{x}.git", exp1.full_name.values.tolist())):
    print(l)

https://github.com/kragniz/json-sempai.git
https://github.com/careercup/CtCI-6th-Edition-Python.git
https://github.com/eth0izzle/bucket-stream.git
https://github.com/matthewearl/deep-anpr.git
https://github.com/joshnewlan/say_what.git
https://github.com/arc90/git-sweep.git
https://github.com/llSourcell/learn_math_fast.git
https://github.com/openai/evolution-strategies-starter.git
https://github.com/fchollet/deep-learning-models.git
https://github.com/spyoungtech/grequests.git
https://github.com/edc/bass.git
https://github.com/meijieru/crnn.pytorch.git
https://github.com/vulnersCom/getsploit.git
https://github.com/floydhub/dl-docker.git
https://github.com/bitly/data_hacks.git
https://github.com/kevinburke/hamms.git
https://github.com/jlsutherland/doc2text.git
https://github.com/reinderien/mimic.git
https://github.com/danijar/handout.git
https://github.com/guillaumegenthial/sequence_tagging.git


In [60]:
list(map(lambda x: f"/tmp/repositories/{x1(x)}", exp1.full_name.values.tolist()))

['/tmp/repositories/spyder-ide_spyder',
 '/tmp/repositories/cloudera_hue',
 '/tmp/repositories/translate_pootle',
 '/tmp/repositories/divio_django-cms',
 '/tmp/repositories/stamparm_maltrail',
 '/tmp/repositories/trustedsec_social-engineer-toolkit',
 '/tmp/repositories/matrix-org_synapse',
 '/tmp/repositories/Tribler_tribler',
 '/tmp/repositories/cupy_cupy',
 '/tmp/repositories/conda_conda',
 '/tmp/repositories/Komodo_KomodoEdit',
 '/tmp/repositories/wbond_package_control_channel',
 '/tmp/repositories/openshift_openshift-ansible',
 '/tmp/repositories/mirumee_saleor',
 '/tmp/repositories/nltk_nltk',
 '/tmp/repositories/readthedocs_readthedocs.org',
 '/tmp/repositories/RasaHQ_rasa',
 '/tmp/repositories/biopython_biopython',
 '/tmp/repositories/spack_spack',
 '/tmp/repositories/pritunl_pritunl']

In [50]:
"/".replace()

TypeError: replace() takes no keyword arguments

In [None]:
git clone https://github.com/ishepard/pydriller.git /tmp/repositories/