This notebook searches GitHub to create our dataset:

 - 100 projects, 10 for each of the most popular 10 programming languages
 

In [1]:
import requests
import json
import datetime

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Let pandas display up to 500 rows before truncating a frame's output.
pd.set_option('display.max_rows', 500)

# Language Selection

The languages selected are the top 10 technologies from the 2019 Stack Overflow survey.

In [3]:
# Languages to search for; each entry is interpolated into the GitHub search
# string as `language:<entry>`.
languages = [
    "javascript",
    "typescript",
    "java",
    "python",
    "php",
    "c++",
    "c#",
    "c",
    "ruby",
    "go"
]
# Number of candidate repositories requested per language. This is larger than
# the 10 projects that select_projects() keeps per language, so that discarded
# candidates can be replaced by the next most-starred ones.
projects_per_lang = 35

# GitHub Search

To search GitHub, we need an API token, which can be created in the [Developer settings](https://github.com/settings/tokens). 
The token is read from a local file that is private and therefore not versioned.

In [4]:
# Read the API token from the private, unversioned `token` file.
# Plain file I/O is used instead of the `!cat` shell escape so the cell is valid
# Python and does not depend on a `cat` executable being available.
with open('token') as token_file:
    # Keep `token` a list of lines, which is the shape `!cat token` produced.
    token = token_file.read().splitlines()

if not token or not token[0].strip():
    raise ValueError("The 'token' file is empty; it must contain a GitHub API token")

# Authorization header sent with every GraphQL request.
headers = {'Authorization': f'Bearer {token[0]}'}

GitHub GraphQL query to search repositories:

In [5]:
# GraphQL document posted to the GitHub API. It takes two variables: `$query`
# (the search string) and `$qnt` (how many repositories to return). The aliases
# `name`, `stars`, `commits`, `last_commit` and `tags` are the keys that
# process_language() later reads from each returned node.
query = '''
    query($qnt: Int!, $query: String!) {
        search(
            query: $query,
            type: REPOSITORY,
            first: $qnt
        ) {
            repositoryCount
            nodes {
                ... on Repository {
                    name: nameWithOwner
                    description
                    url
                    stars: stargazers { totalCount }
                    issues{ totalCount }
                    releases { totalCount }
                    commits: object(expression: "HEAD") {
                        ... on Commit { 
                            last_commit: committedDate
                            history{ totalCount }
                        }
                    }
                    tags: refs(refPrefix: "refs/tags/") { totalCount }
                }
            }
        }
    }
'''

In [6]:
def process_language(lang, data):
    """Create a pandas DataFrame from a GitHub search response.

    Parameters
    ----------
    lang : str
        Language label stored in the ``lang`` column of every row.
    data : dict
        Decoded GraphQL response; repositories are read from
        ``data['data']['search']['nodes']``.

    Returns
    -------
    pandas.DataFrame
        One row per repository. Every ``{'totalCount': n}`` object is replaced
        by ``n``, and the ``commits`` object is split into ``commits`` (size of
        the history) and ``last_commit`` (date of the HEAD commit).

    Notes
    -----
    ``data`` is left untouched, so the function can be called repeatedly on the
    same response. Repositories whose ``commits`` object is null get ``None``
    for both ``commits`` and ``last_commit`` instead of raising.
    """
    records = []
    for node in data['data']['search']['nodes']:
        if not node:
            # Nothing to flatten for a null or empty search result.
            continue

        # Work on a shallow copy so the caller's response is never modified.
        record = dict(node)
        record['lang'] = lang

        # Flatten the HEAD commit information; it may be null.
        head = record.get('commits') or {}
        record['last_commit'] = head.get('last_commit')
        record['commits'] = head.get('history')

        # Replace every {'totalCount': n} wrapper by the bare count. Only
        # existing keys are reassigned, so iterating over items() is safe.
        for key, value in record.items():
            if isinstance(value, dict) and 'totalCount' in value:
                record[key] = value['totalCount']

        records.append(record)

    return pd.DataFrame(records)


In [7]:
print(f'Searching GitHub on {datetime.date.today()}')

# Column order of the resulting frame.
columns = ['name', 'lang', 'stars', 'commits', 'issues', 'tags', 'releases', 'last_commit', 'description', 'url']

# Collect one frame per language and concatenate once at the end instead of
# growing a DataFrame inside the loop.
language_frames = []
for lang in languages:
    print(f'Searching lang {lang}')
    response = requests.post('https://api.github.com/graphql', headers=headers, json={
        'query': query,
        'variables': { 'query': f'language:{lang} stars:>5000 sort:stars-desc', 'qnt': projects_per_lang }
    })
    if not response.ok:
        # Report the failure: a silently missing language would bias the dataset.
        print(f' - request failed for {lang}: HTTP {response.status_code}')
        continue

    data = response.json()
    errors = data.get('errors')
    if errors:
        # GraphQL can report errors in the body of an otherwise successful response.
        print(f' - GitHub reported errors for {lang}: {errors}')
    if not (data.get('data') or {}).get('search'):
        print(f' - no search results for {lang}, skipping')
        continue

    language_frames.append(process_language(lang, data))

if language_frames:
    # The per-language row index (0..n-1) is kept, as before.
    github_projects = pd.concat(language_frames).reindex(columns=columns)
else:
    github_projects = pd.DataFrame(columns=columns)

Searching GitHub on 2020-10-10
Searching lang javascript
Searching lang typescript
Searching lang java
Searching lang python
Searching lang php
Searching lang c++
Searching lang c#
Searching lang c
Searching lang ruby
Searching lang go


In [8]:
# Give the columns proper dtypes: one timestamp column, the counts as numbers.
github_projects['last_commit'] = pd.to_datetime(github_projects['last_commit'])
for count_column in ('stars', 'commits', 'issues', 'releases', 'tags'):
    github_projects[count_column] = pd.to_numeric(github_projects[count_column])

In [9]:
# Preview the first rows of the raw search results.
github_projects.head(5)

Unnamed: 0,name,lang,stars,commits,issues,tags,releases,last_commit,description,url
0,freeCodeCamp/freeCodeCamp,javascript,315467,26316,14827,0,0,2020-10-10 18:25:34+00:00,freeCodeCamp.org's open source codebase and cu...,https://github.com/freeCodeCamp/freeCodeCamp
1,vuejs/vue,javascript,173633,3127,9211,250,208,2020-10-05 14:57:24+00:00,"🖖 Vue.js is a progressive, incrementally-adopt...",https://github.com/vuejs/vue
2,facebook/react,javascript,157151,13618,9635,131,90,2020-10-09 13:29:30+00:00,"A declarative, efficient, and flexible JavaScr...",https://github.com/facebook/react
3,twbs/bootstrap,javascript,144624,20098,19637,62,62,2020-10-07 14:03:53+00:00,"The most popular HTML, CSS, and JavaScript fra...",https://github.com/twbs/bootstrap
4,airbnb/javascript,javascript,100327,1835,1030,94,0,2020-10-06 14:36:29+00:00,JavaScript Style Guide,https://github.com/airbnb/javascript


In [10]:
# (rows, columns) of the raw search results.
github_projects.shape

(350, 10)

# Project filtering

In [11]:
def select_projects(projects):
    """Return the currently selected projects: up to 10 per language.

    For every language, the 10 most-starred projects that are not discarded are
    flagged in the ``selected`` column of ``projects`` (the frame is modified in
    place; the flag is never cleared). The result is a copy of the rows that
    are flagged as selected and are not discarded.
    """
    for _, lang_projects in projects.groupby("lang"):
        not_discarded = lang_projects[lang_projects.discarded == False]
        top_names = not_discarded.nlargest(n=10, columns="stars").name
        projects.loc[projects.name.isin(top_names), 'selected'] = True

    is_kept = (projects.selected == True) & (projects.discarded == False)
    return projects[is_kept].copy()

In [12]:
def discard_projects(projects, discard, reason):
    """Mark the projects listed in ``discard`` as discarded, in place.

    Parameters
    ----------
    projects : pandas.DataFrame
        Frame with ``name`` and ``discarded`` columns; it is modified in place.
    discard : pandas.DataFrame
        Projects to discard; only its ``name`` column is read.
    reason : str
        Label written to the ``discarded`` column of the matching rows.
    """
    to_discard = projects.name.isin(discard.name)
    if not to_discard.any():
        return

    # `discarded` starts out as a boolean column. Writing a string label into
    # it relies on silent upcasting, which pandas has deprecated, so convert
    # the column to object dtype explicitly before the first label is stored.
    if projects['discarded'].dtype != object:
        projects['discarded'] = projects['discarded'].astype(object)

    projects.loc[to_discard, 'discarded'] = reason

In [13]:
def remove_inactive_projects(projects, age):
    """Split projects into recently active and inactive ones.

    A project is kept when its ``last_commit`` is strictly later than ``age``.
    Returns ``[kept, discarded, number_of_discarded]``.
    """
    is_recent = projects.last_commit > age
    kept = projects[is_recent]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]
    

In [14]:
def remove_small_projects(projects, min_commits):
    """Split projects by commit count.

    A project is kept when it has at least ``min_commits`` commits.
    Returns ``[kept, discarded, number_of_discarded]``.
    """
    is_large_enough = projects.commits >= min_commits
    kept = projects[is_large_enough]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]

In [15]:
def remove_big_projects(projects, max_percent_commits):
    """Split projects by their share of the total number of commits.

    Each project's share of the summed ``commits`` is stored in a
    ``perc_commits`` column that is added to ``projects`` in place. A project
    is kept when its share is at most ``max_percent_commits``.
    Returns ``[kept, discarded, number_of_discarded]``.
    """
    total_commits = projects.commits.sum()
    projects['perc_commits'] = projects.commits / total_commits

    within_limit = projects.perc_commits <= max_percent_commits
    kept = projects[within_limit]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]

In [16]:
def remove_non_software_projects(projects):
    """Remove non software projects.

    The repositories in the hand-curated list below get ``software = False`` in
    ``projects`` (modified in place). A project is kept when its ``software``
    flag is ``True``.

    Returns ``[kept, discarded, number_of_discarded]``.
    """
    non_software = [
        # c#
        'dotnet/AspNetCore.Docs',
        # go
        'avelino/awesome-go',
        'astaxie/build-web-application-with-golang',
        # java
        'CyC2018/CS-Notes',
        'Snailclimb/JavaGuide',
        'iluwatar/java-design-patterns',
        'eugenp/tutorials',
        # javascript
        'freeCodeCamp/freeCodeCamp',
        '30-seconds/30-seconds-of-code',
        # python
        'public-apis/public-apis',
        'tensorflow/models',
        # ruby
        'freeCodeCamp/devdocs',
        # typescript
        'DefinitelyTyped/DefinitelyTyped',
    ]
    projects.loc[projects.name.isin(non_software), 'software'] = False

    selected_projects = projects[projects.software == True]
    discarded_projects = projects[~projects.name.isin(selected_projects.name)]
    return [selected_projects, discarded_projects, discarded_projects.shape[0]]

In [17]:
def remove_shared_repo_projects(projects):
    """Remove shared repo projects.

    The repositories in the hand-curated list below get ``shared = True`` in
    ``projects`` (modified in place). A project is kept when its ``shared``
    flag is ``False``.

    Returns ``[kept, discarded, number_of_discarded]``.
    """
    shared_repos = [
        # c
        'curl/curl',
        'git/git',
        'libuv/libuv',
        'openssl/openssl',
        'redis/redis',
        'bilibili/ijkplayer',
        'facebook/zstd',
        # c#
        'dotnet-architecture/eShopOnContainers',
        'dotnet/aspnetcore',
        'dotnet/roslyn',
        'PowerShell/PowerShell',
        'AvaloniaUI/Avalonia',
        'OpenRA/OpenRA',
        'Unity-Technologies/ml-agents',
        # c++
        'BVLC/caffe',
        'grpc/grpc',
        'rethinkdb/rethinkdb',
        # go
        'ethereum/go-ethereum',
        'moby/moby',
        'golang/go',
        'pingcap/tidb',
        'syncthing/syncthing',
        # java
        'google/guava',
        'ReactiveX/RxJava',
        'zxing/zxing',
        # javascript
        'facebook/create-react-app',
        'mui-org/material-ui',
        # php
        'symfony/symfony',
        # python
        'scikit-learn/scikit-learn',
        # ruby
        'fastlane/fastlane',
        'rails/rails',
        'rapid7/metasploit-framework',
        'jekyll/jekyll',
        'ruby/ruby',
        'elastic/logstash',
        # typescript
        'angular/angular',
        'denoland/deno',
        'microsoft/vscode',
        'pixijs/pixi.js',
    ]
    projects.loc[projects.name.isin(shared_repos), 'shared'] = True

    selected_projects = projects[projects.shared == False]
    discarded_projects = projects[~projects.name.isin(selected_projects.name)]
    return [selected_projects, discarded_projects, discarded_projects.shape[0]]

In [18]:
def remove_few_releasesprojects(projects):
    """Remove the projects flagged by hand as having too few releases.

    The listed repositories get ``few_releases = True`` in ``projects``
    (modified in place); projects whose flag is ``False`` are kept.
    Returns ``[kept, discarded, number_of_discarded]``.
    """
    flagged_names = (
        'minio/minio',         # go
        'x64dbg/x64dbg',       # c++
        'huginn/huginn',       # ruby
        'forem/forem',         # ruby
        '0xd4d/dnSpy',         # c#
        'coolsnowwolf/lede',   # c
    )
    projects.loc[projects.name.isin(flagged_names), 'few_releases'] = True

    kept = projects[projects.few_releases == False]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]

In [19]:
def remove_date_release_projects(projects):
    """Remove the projects flagged by hand in the ``date_releases`` column.

    The listed repository gets ``date_releases = True`` in ``projects``
    (modified in place); projects whose flag is ``False`` are kept.
    Returns ``[kept, discarded, number_of_discarded]``.
    """
    # python
    is_flagged = projects.name == 'ytdl-org/youtube-dl'
    projects.loc[is_flagged, 'date_releases'] = True

    kept = projects[projects.date_releases == False]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]


## Discard projects

 - Inactive
 - Too small
 - Too big
 - Non software
 - Shared repo
 - Few releases
 - Date releases


In [20]:
# Reference date: midnight (UTC) of the day the notebook is run.
# Timestamp.now(tz='UTC') is timezone-aware from the start. Localizing the
# result of pd.to_datetime('now') would label a local wall-clock time as UTC on
# pandas versions where 'now' is parsed as local time.
today = pd.Timestamp.now(tz='UTC').normalize()
# Activity window of 6 x 30 days.
delta = pd.to_timedelta(30*6, unit='d')

# Projects must have a last commit later than this to count as active.
age = today - delta
# Minimum number of commits a project needs to be kept.
min_commits = 2000
# Largest share of the selected projects' total commits one project may hold.
max_percent_commits = 0.05

In [21]:
def scan(all_projects, fnc, label, **params):
    """Apply one filter repeatedly until it discards nothing more.

    On every pass the current selection is recomputed, ``fnc`` is called with it
    as ``projects`` (plus any extra ``params``), and the projects it reports as
    discarded are marked with ``label`` in ``all_projects``. Discarding frees
    slots in the selection, so the filter is re-run until a pass discards none.

    Returns the total number of projects discarded by this filter.
    """
    total_discarded = 0
    while True:
        params['projects'] = select_projects(all_projects)
        _, discarded, n_discarded = fnc(**params)
        discard_projects(all_projects, discarded, label)
        total_discarded += n_discarded
        if n_discarded == 0:
            break
        print(f' - discarded {n_discarded} - {label}')
    return total_discarded

In [30]:
# Working copy of the search results plus the bookkeeping columns the filters use.
all_projects = github_projects.copy()
# `discarded` holds False or the reason label (a string), so it is created as
# an object column rather than a boolean one.
all_projects["discarded"] = False
all_projects["discarded"] = all_projects["discarded"].astype(object)
all_projects["software"] = True
all_projects["shared"] = False
all_projects["few_releases"] = False
all_projects["date_releases"] = False

# Run every filter until a complete round discards nothing: discarding a
# project in one filter promotes new candidates that earlier filters have not
# seen yet. The counter is not called `round`, to avoid shadowing the builtin.
round_number = 1
fully_converged = False
while not fully_converged:
    cnt_discarded = 0

    print(f'Round {round_number}')
    cnt_discarded += scan(all_projects, remove_inactive_projects, 'inactivity', age=age)
    cnt_discarded += scan(all_projects, remove_small_projects, 'small', min_commits=min_commits)
    cnt_discarded += scan(all_projects, remove_non_software_projects, 'non_software')
    cnt_discarded += scan(all_projects, remove_big_projects, 'big', max_percent_commits=max_percent_commits)
    cnt_discarded += scan(all_projects, remove_shared_repo_projects, 'shared')
    cnt_discarded += scan(all_projects, remove_few_releasesprojects, 'few_releases')
    cnt_discarded += scan(all_projects, remove_date_release_projects, 'date_releases')

    # Selection after this round; the value from the last round is the final one.
    projects = select_projects(all_projects)

    fully_converged = (cnt_discarded == 0)
    print(f'Discarded {cnt_discarded} projects\n')
    round_number += 1
    

Round 1
 - discarded 2 - inactivity
 - discarded 25 - small
 - discarded 9 - small
 - discarded 9 - non_software
 - discarded 1 - big
 - discarded 2 - big
 - discarded 2 - big
 - discarded 2 - big
 - discarded 25 - shared
 - discarded 5 - shared
 - discarded 3 - shared
 - discarded 4 - few_releases
 - discarded 1 - date_releases
Discarded 90 projects

Round 2
 - discarded 6 - inactivity
 - discarded 1 - inactivity
 - discarded 1 - inactivity
 - discarded 13 - small
 - discarded 5 - small
 - discarded 3 - small
 - discarded 1 - small
 - discarded 1 - small
 - discarded 1 - small
 - discarded 3 - non_software
 - discarded 3 - big
 - discarded 6 - shared
 - discarded 2 - few_releases
Discarded 46 projects

Round 3
 - discarded 2 - inactivity
 - discarded 3 - small
 - discarded 1 - small
Discarded 6 projects

Round 4
Discarded 0 projects



In [31]:
# Discarded projects, grouped by reason and ordered by commit count (largest first).
(
    all_projects
    .loc[all_projects.discarded != False, ['name', 'discarded', 'commits', 'tags', 'last_commit']]
    .sort_values(['discarded', 'commits'], ascending=[True, False])
)

Unnamed: 0,name,discarded,commits,tags,last_commit
0,torvalds/linux,big,951249,669,2020-10-10 01:05:12+00:00
19,mono/mono,big,122183,466,2020-10-10 09:13:19+00:00
5,php/php-src,big,121215,1075,2020-10-10 17:20:13+00:00
13,Homebrew/homebrew-cask,big,117397,114,2020-10-10 17:04:23+00:00
3,apple/swift,big,112435,1698,2020-10-10 17:54:41+00:00
23,python/cpython,big,108207,448,2020-10-10 19:23:42+00:00
8,gitlabhq/gitlabhq,big,104618,1253,2020-10-10 12:08:27+00:00
9,FFmpeg/FFmpeg,big,99522,335,2020-10-10 13:37:54+00:00
0,tensorflow/tensorflow,big,97309,121,2020-10-10 19:18:19+00:00
1,kubernetes/kubernetes,big,94709,707,2020-10-10 05:22:46+00:00


In [32]:
# Number of discarded projects per discard reason.
all_projects[all_projects.discarded != False]['discarded'].value_counts()

small            62
shared           39
non_software     12
inactivity       12
big              10
few_releases      6
date_releases     1
Name: discarded, dtype: int64

In [33]:
# Total number of discarded projects, over all reasons.
all_projects[all_projects.discarded != False]['discarded'].value_counts().sum()

142

## Project list for manual inspection

In [34]:
# Print the selected projects language by language for manual inspection.
# Grouping by the plain column name (not a one-element list) keeps `lang` a
# string; recent pandas versions yield a 1-tuple key for list groupers, which
# would print as "('c',)".
for lang, data in projects.groupby("lang"):
    print(f'language: {lang}')
    print(f" - {'stars':6} {'name':25}  {'description'}")
    for project in data.itertuples():
        print(f" - {project.stars:6} {project.name:25}: {project.description}")
    print("\n")


language: c
 - stars  name                       description
 -  48821 netdata/netdata          : Real-time performance monitoring, done right! https://www.netdata.cloud
 -  22526 obsproject/obs-studio    : OBS Studio - Free and open source software for live streaming and screen recording
 -  20985 ggreer/the_silver_searcher: A code-searching tool similar to ack, but faster.
 -  19085 tmux/tmux                : tmux source code
 -  13860 taosdata/TDengine        : An open-source big data platform designed and optimized for the Internet of Things (IoT).
 -  13242 radareorg/radare2        : UNIX-like reverse engineering framework and command-line toolset
 -  13130 mpv-player/mpv           : 🎥 Command line video player
 -  12745 nginx/nginx              : An official read-only mirror of http://hg.nginx.org/nginx/ which is updated hourly. Pull requests on GitHub cannot be accepted and will be automatically closed. The proper way to submit changes to nginx is via the nginx development maili

# Projects Summary

In [40]:
# Index the selected projects by repository name. The guard makes the cell safe
# to re-run: once `name` is the index it is no longer a column, and calling
# set_index('name') again would raise a KeyError.
if projects.index.name != 'name':
    projects = projects.set_index('name')
projects.head(5)

Unnamed: 0_level_0,lang,stars,commits,issues,tags,releases,last_commit,description,url,discarded,software,shared,few_releases,date_releases,selected
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
vuejs/vue,javascript,173633,3127,9211,250,208,2020-10-05 14:57:24+00:00,"🖖 Vue.js is a progressive, incrementally-adopt...",https://github.com/vuejs/vue,False,True,False,False,False,True
facebook/react,javascript,157151,13618,9635,131,90,2020-10-09 13:29:30+00:00,"A declarative, efficient, and flexible JavaScr...",https://github.com/facebook/react,False,True,False,False,False,True
twbs/bootstrap,javascript,144624,20098,19637,62,62,2020-10-07 14:03:53+00:00,"The most popular HTML, CSS, and JavaScript fra...",https://github.com/twbs/bootstrap,False,True,False,False,False,True
d3/d3,javascript,93988,4279,2056,267,159,2020-09-23 16:56:59+00:00,"Bring data to life with SVG, Canvas and HTML. ...",https://github.com/d3/d3,False,True,False,False,False,True
facebook/react-native,javascript,90624,21174,20262,348,148,2020-10-10 09:51:59+00:00,A framework for building native apps with React.,https://github.com/facebook/react-native,False,True,False,False,False,True


In [41]:
# Summary statistics of the numeric columns of the selected projects.
projects.describe()

Unnamed: 0,stars,commits,issues,tags,releases
count,100.0,100.0,100.0,100.0,100.0
mean,36133.63,13047.54,6735.85,188.41,107.05
std,28690.667876,12243.8419,6692.668913,199.726737,146.495518
min,7388.0,2030.0,0.0,12.0,0.0
25%,18357.25,4260.75,2211.5,58.5,23.0
50%,30953.0,8362.5,4189.5,130.5,60.5
75%,45140.75,20134.0,9035.0,217.5,145.0
max,173633.0,55176.0,28232.0,1064.0,829.0


In [42]:
# Persist the raw search results (every candidate, before filtering) as a
# pickle and as CSV.
# NOTE(review): pandas infers compression from the file extension by default,
# so the pickle is presumably written zip-compressed — confirm if it matters.
github_projects.to_pickle('github_projects.zip')
github_projects.to_csv('github_projects.csv')

In [43]:

# Persist the final selection of projects as a pickle and as CSV.
projects.to_pickle('projects.zip')
projects.to_csv('projects.csv')