Build project dataset
 - 100 projects, 10 for each of the most popular 10 programming languages
 

In [1]:
import requests
import json
import datetime

import pandas as pd

# Language Selection

The language selection comes from the top 10 technologies in the 2019 Stack Overflow Developer Survey.

In [2]:
# Languages to sample. Each entry is interpolated into the GitHub search
# string as `language:<entry>` by the search cell.
languages = [
    "javascript",
    "typescript",
    "java",
    "python",
    "php",
    "c++",
    "c#",
    "c",
    "ruby",
    "go"
]
# Number of candidate repositories requested per language. This is larger than
# the 10 projects per language kept by select_projects, which leaves
# replacements available when candidates are discarded by the filters.
projects_per_lang = 30

# GitHub Search

To search GitHub, we need a personal access token, which can be generated in the [Developer settings](https://github.com/settings/tokens).
The `token` file read below is private and therefore not versioned.

In [3]:
# Read the private GitHub token with plain Python instead of the `!cat` shell
# escape: this also works outside IPython and on systems without `cat`, and a
# missing file raises FileNotFoundError instead of silently sending the shell's
# error message as the token.
# `token` stays a list of lines, exactly like the shell capture produced.
with open('token') as token_file:
    token = token_file.read().splitlines()
headers = {'Authorization': f'Bearer {token[0]}'}

GitHub GraphQL query to search repositories:

In [84]:
# GraphQL search query sent to the GitHub API by the search cell.
# Variables: $query is the search string and $qnt the number of results.
# For each repository it requests the name (nameWithOwner), description, url,
# and the total counts of stargazers, issues, releases and tags, plus the
# date of the HEAD commit and the number of commits in its history.
# The aliases (name, stars, commits, tags, last_commit) are the keys that
# process_language reads from the response.
query = '''
    query($qnt: Int!, $query: String!) {
        search(
            query: $query,
            type: REPOSITORY,
            first: $qnt
        ) {
            repositoryCount
            nodes {
                ... on Repository {
                    name: nameWithOwner
                    description
                    url
                    stars: stargazers { totalCount }
                    issues{ totalCount }
                    releases { totalCount }
                    commits: object(expression: "HEAD") {
                        ... on Commit { 
                            last_commit: committedDate
                            history{ totalCount }
                        }
                    }
                    tags: refs(refPrefix: "refs/tags/") { totalCount }
                }
            }
        }
    }
'''

In [103]:
def process_language(lang, data):
    """Create a pandas DataFrame from a GitHub search response.

    Parameters
    ----------
    lang : str
        Language the search was run for; stored in the ``lang`` column.
    data : dict
        Decoded GraphQL response; the repositories are read from
        ``data['data']['search']['nodes']``.

    Returns
    -------
    pandas.DataFrame
        One row per repository. Every ``{'totalCount': n}`` sub-object is
        flattened to ``n``, ``commits`` holds the number of commits reachable
        from HEAD and ``last_commit`` the HEAD commit date.

    Notes
    -----
    * ``data`` is not modified; rows are built from copies of the nodes.
    * A repository without a HEAD commit (``commits`` is null in the
      response) gets ``commits = 0`` and ``last_commit = None`` instead of
      raising a ``TypeError``.
    """
    records = []
    for node in data['data']['search']['nodes']:
        # Flatten {'totalCount': n} -> n while copying, so the input stays intact
        record = {
            key: value['totalCount']
            if isinstance(value, dict) and 'totalCount' in value
            else value
            for key, value in node.items()
        }
        record['lang'] = lang

        # `commits` is null for a repository that has no HEAD commit
        head = node.get('commits') or {}
        record['last_commit'] = head.get('last_commit')
        record['commits'] = (head.get('history') or {}).get('totalCount', 0)
        records.append(record)

    return pd.DataFrame(records)


In [104]:
print(f'Searching GitHub on {datetime.date.today()}')
project_columns = ['name', 'lang', 'stars', 'commits', 'issues', 'tags', 'releases', 'last_commit', 'description', 'url']

# Collect one frame per language and concatenate once at the end, instead of
# growing the result with pd.concat inside the loop.
language_frames = []
for lang in languages:
    print(f'Searching lang {lang}')
    response = requests.post('https://api.github.com/graphql', headers=headers, timeout=60, json={
        'query': query,
        'variables': { 'query': f'language:{lang} sort:stars-desc', 'qnt': projects_per_lang }
    })
    # Fail loudly: silently skipping a failed request would leave a language
    # out of the dataset without any visible sign.
    response.raise_for_status()
    data = response.json()
    # GraphQL reports query problems in an `errors` field of the response body
    if data.get('errors'):
        raise RuntimeError(f'GitHub search for {lang} failed: {data["errors"]}')
    language_frames.append(process_language(lang, data))

if language_frames:
    github_projects = pd.concat(language_frames)
    # Expected columns first (in a fixed order), any unexpected ones after
    extra_columns = [col for col in github_projects.columns if col not in project_columns]
    github_projects = github_projects.reindex(columns=project_columns + extra_columns)
else:
    github_projects = pd.DataFrame(columns=project_columns)

Searching GitHub on 2020-07-25
Searching lang javascript
Searching lang typescript
Searching lang java
Searching lang python
Searching lang php
Searching lang c++
Searching lang c#
Searching lang c
Searching lang ruby
Searching lang go


In [131]:
# Parse the commit timestamps and store every counter column as integers.
github_projects['last_commit'] = pd.to_datetime(github_projects['last_commit'])
for count_column in ('stars', 'commits', 'issues', 'releases', 'tags'):
    github_projects[count_column] = github_projects[count_column].astype(int)

In [132]:
# Preview the first rows of the search result.
github_projects.head()

Unnamed: 0,name,lang,stars,commits,issues,tags,releases,last_commit,description,url
0,freeCodeCamp/freeCodeCamp,javascript,312879,25886,14620,0,0,2020-07-24 12:08:37+00:00,freeCodeCamp.org's open source codebase and cu...,https://github.com/freeCodeCamp/freeCodeCamp
1,vuejs/vue,javascript,168654,3104,9091,249,208,2020-07-01 09:11:46+00:00,"🖖 Vue.js is a progressive, incrementally-adopt...",https://github.com/vuejs/vue
2,facebook/react,javascript,152833,13426,9390,131,90,2020-07-25 11:32:21+00:00,"A declarative, efficient, and flexible JavaScr...",https://github.com/facebook/react
3,twbs/bootstrap,javascript,143082,19914,19463,59,59,2020-07-21 17:17:28+00:00,"The most popular HTML, CSS, and JavaScript fra...",https://github.com/twbs/bootstrap
4,airbnb/javascript,javascript,98099,1828,1006,94,0,2020-06-23 05:55:52+00:00,JavaScript Style Guide,https://github.com/airbnb/javascript


In [133]:
# (rows, columns) of the search result, before any filtering.
github_projects.shape

(300, 10)

# Project filtering

In [134]:
def select_projects(projects, n=10):
    """Return the current selection: the top ``n`` projects per language.

    For every language, the ``n`` non-discarded projects with the most stars
    are flagged in the ``selected`` column of ``projects``. The flag is never
    cleared, so a project that was selected and later discarded is excluded
    by the ``discarded`` filter on return, while the next best project of its
    language gets flagged on the following call.

    Parameters
    ----------
    projects : pandas.DataFrame
        Needs the columns ``name``, ``lang``, ``stars`` and ``discarded``
        (``False`` or a discard reason). Modified in place: the ``selected``
        column is added or updated.
    n : int, default 10
        Number of projects to keep per language.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows that are selected and not discarded.
    """
    # Make sure the column exists even when there is nothing to select
    if 'selected' not in projects.columns:
        projects['selected'] = False

    for _, lang_projects in projects.groupby("lang"):
        candidates = lang_projects[lang_projects.discarded == False]
        top = candidates.nlargest(n=n, columns="stars")
        projects.loc[projects.name.isin(top.name), 'selected'] = True

    return projects[(projects.selected == True) & (projects.discarded == False)].copy()

In [135]:
def discard_projects(projects, discard, reason):
    """Mark every project listed in ``discard`` as discarded.

    Parameters
    ----------
    projects : pandas.DataFrame
        Frame with the columns ``name`` and ``discarded``; modified in place.
    discard : pandas.DataFrame
        Projects to discard; rows are matched on the ``name`` column.
    reason : str
        Value written to ``discarded`` for the matched rows.
    """
    # The column starts as booleans but has to hold reason strings too.
    # Casting it to object explicitly avoids relying on pandas silently
    # upcasting a bool column on assignment (deprecated since pandas 2.1).
    if projects['discarded'].dtype != object:
        projects['discarded'] = projects['discarded'].astype(object)

    to_discard = projects['name'].isin(discard['name'])
    projects.loc[to_discard, 'discarded'] = reason

In [136]:
def remove_inactive_projects(projects, age):
    """Split projects by activity.

    A project is kept when its ``last_commit`` is later than ``age``.
    Returns ``[kept, dropped, number_of_dropped]``, where ``dropped`` holds
    the projects whose name is not among the kept ones.
    """
    recently_active = projects.last_commit > age
    kept = projects[recently_active]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]
    

In [137]:
def remove_small_projects(projects, min_commits):
    """Split projects by size.

    A project is kept when it has at least ``min_commits`` commits.
    Returns ``[kept, dropped, number_of_dropped]``, where ``dropped`` holds
    the projects whose name is not among the kept ones.
    """
    big_enough = projects.commits >= min_commits
    kept = projects[big_enough]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]

In [138]:
def remove_big_projects(projects, max_percent_commits):
    """Split projects by their share of the total number of commits.

    A project is kept when its commits divided by the commits of all given
    projects is at most ``max_percent_commits``. The share is stored in a
    ``perc_commits`` column written onto ``projects`` itself.
    Returns ``[kept, dropped, number_of_dropped]``, where ``dropped`` holds
    the projects whose name is not among the kept ones.
    """
    total_commits = projects['commits'].sum()
    projects['perc_commits'] = projects['commits'] / total_commits

    within_limit = projects['perc_commits'] <= max_percent_commits
    kept = projects[within_limit]
    dropped = projects[~projects.name.isin(kept.name)]
    return [kept, dropped, len(dropped)]

In [139]:
# Repositories identified by manual inspection as not being software projects.
_NON_SOFTWARE_PROJECTS = (
    # go:
    'avelino/awesome-go',
    'astaxie/build-web-application-with-golang',
    # java:
    'CyC2018/CS-Notes',
    'Snailclimb/JavaGuide',
    'iluwatar/java-design-patterns',
    # javascript:
    'freeCodeCamp/freeCodeCamp',
    '30-seconds/30-seconds-of-code',
    # python:
    'public-apis/public-apis',
    'tensorflow/models',
    # ruby:
    'freeCodeCamp/devdocs',
)


def remove_non_software_projects(projects, non_software=None):
    """Remove non software projects.

    Parameters
    ----------
    projects : pandas.DataFrame
        Needs a ``name`` column. Modified in place: the ``software`` column
        is set to ``False`` for the listed projects; it is created with
        ``True`` for every row when missing.
    non_software : iterable of str, optional
        Names (``owner/repo``) to treat as non software. Defaults to the
        manually curated ``_NON_SOFTWARE_PROJECTS`` list.

    Returns
    -------
    list
        ``[kept, dropped, number_of_dropped]``, where ``dropped`` holds the
        projects whose name is not among the kept ones.
    """
    if non_software is None:
        non_software = _NON_SOFTWARE_PROJECTS

    # Without this, a frame lacking the column would end up with NaN for every
    # unlisted project and all of them would count as discarded.
    if 'software' not in projects.columns:
        projects['software'] = True

    projects.loc[projects.name.isin(list(non_software)), 'software'] = False

    selected_projects = projects[projects.software == True]
    discarded_projects = projects[~projects.name.isin(selected_projects.name)]
    return [selected_projects, discarded_projects, discarded_projects.shape[0]]

## Discard projects

 - Inactive
 - Too small
 - Too big
 - Non software


In [140]:
# Current UTC date at midnight. Timestamp.now(tz='UTC') is timezone-aware from
# the start; the previous to_datetime('now').tz_localize('UTC') stamped a naive
# "now" as UTC, and whether that naive value is local or UTC time depends on
# the pandas version.
today = pd.Timestamp.now(tz='UTC').normalize()
# Activity window: roughly six months (6 x 30 days)
delta = pd.to_timedelta(30*6, unit='d')

# Projects whose last commit is not later than this moment count as inactive
age = today - delta
# Minimum number of commits a project needs to be kept
min_commits = 2000
# Maximum share of the selected projects' total commits one project may hold
max_percent_commits = 0.05

In [141]:
# Work on a copy so the raw search result stays untouched.
all_projects = github_projects.copy()
# `discarded` holds either False or a reason string, so keep it as object
# dtype instead of relying on pandas upcasting a bool column later.
all_projects["discarded"] = False
all_projects["discarded"] = all_projects["discarded"].astype(object)
all_projects["software"] = True

# Filters in the order they are applied; each entry is (reason, filter), and
# every filter returns [kept, dropped, number_of_dropped].
discard_filters = [
    ('inactive', lambda selection: remove_inactive_projects(selection, age)),
    ('small', lambda selection: remove_small_projects(selection, min_commits)),
    ('big', lambda selection: remove_big_projects(selection, max_percent_commits)),
    ('non_software', lambda selection: remove_non_software_projects(selection)),
]

# Discarding a project lets the next best one of its language into the
# selection, and that one may fail a filter too. So every filter is repeated
# until it drops nothing, and the whole sequence is repeated until a full
# pass drops nothing.
fully_converged = False
while not fully_converged:
    cnt_discarded = 0

    for reason, apply_filter in discard_filters:
        converged = False
        while not converged:
            projects = select_projects(all_projects)
            _kept, dropped, n_dropped = apply_filter(projects)
            discard_projects(all_projects, dropped, reason)
            converged = (n_dropped == 0)
            cnt_discarded += n_dropped

    fully_converged = (cnt_discarded == 0)
    print(f'Discarded {cnt_discarded} projects')


# Final selection, without the bookkeeping columns used by the filtering
projects = select_projects(all_projects).drop(columns=['discarded', 'software', 'selected'])

Discarded 56 projects
Discarded 10 projects
Discarded 1 projects
Discarded 0 projects


In [142]:
# Number of discarded projects per discard reason
all_projects.loc[all_projects['discarded'] != False].groupby('discarded')['name'].count()

discarded
big              8
inactive         4
non_software     9
small           46
Name: name, dtype: int64

In [143]:
# Total number of discarded projects, over all discard reasons
all_projects.loc[all_projects['discarded'] != False].groupby('discarded')['name'].count().sum()

67

## Project list for manual inspection

In [144]:
# Group by the plain column name rather than a one-element list: with a list,
# pandas >= 2.0 yields the group key as a 1-tuple and the header would read
# "language: ('c',)".
for lang, data in projects.groupby("lang"):
    print(f'language: {lang}')
    print(f" - {'stars':6} {'name':25}  {'description'}")
    for project in data.itertuples():
        print(f" - {project.stars:6} {project.name:25}: {project.description}")
    print("\n")


language: c
 - stars  name                       description
 -  47694 netdata/netdata          : Real-time performance monitoring, done right! https://www.netdata.cloud
 -  44306 redis/redis              : Redis is an in-memory database that persists on disk. The data model is key-value, but many different kind of values are supported: Strings, Lists, Sets, Sorted Sets, Hashes, Streams, HyperLogLogs, Bitmaps.
 -  33436 git/git                  : Git Source Code Mirror - This is a publish-only repository and all pull requests are ignored. Please follow Documentation/SubmittingPatches procedure for any of your improvements.
 -  20863 obsproject/obs-studio    : OBS Studio - Free and open source software for live streaming and screen recording
 -  20632 ggreer/the_silver_searcher: A code-searching tool similar to ack, but faster.
 -  18449 tmux/tmux                : tmux source code
 -  17675 curl/curl                : A command line tool and library for transferring data with URL syntax,

# Project info

In [150]:
# Average only the numeric metrics. pandas >= 2.0 no longer drops the text and
# datetime columns silently; aggregating the whole frame raises there.
projects.groupby('lang')[['stars', 'commits', 'issues', 'tags', 'releases']].mean()

Unnamed: 0_level_0,stars,commits,issues,tags,releases
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c,24635.7,18062.9,3003.7,216.8,28.5
c#,14271.0,14889.5,7380.4,64.2,46.1
c++,42427.0,16033.2,8127.6,157.5,81.6
go,39973.6,14497.0,9239.5,240.2,149.7
java,37679.2,13107.3,8437.9,123.2,63.2
javascript,97969.9,15028.8,10812.0,261.7,128.1
php,23632.5,16920.0,4324.9,263.0,83.2
python,45574.6,17683.7,8960.0,287.0,109.1
ruby,27698.7,24407.6,4972.8,316.1,87.0
typescript,56844.8,21431.2,21130.8,287.6,166.5


In [151]:
# Standard deviation of the numeric metrics only. pandas >= 2.0 no longer drops
# the text and datetime columns silently; aggregating the whole frame raises there.
projects.groupby('lang')[['stars', 'commits', 'issues', 'tags', 'releases']].std()

Unnamed: 0_level_0,stars,commits,issues,tags,releases
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c,12593.248756,17176.293634,2746.911116,217.791848,37.924633
c#,3907.607623,20713.242443,7876.873988,45.183084,29.531339
c++,15920.778373,11931.555676,7110.362754,297.044796,178.978708
go,15218.886864,14767.901875,11650.271845,111.040333,89.532179
java,8103.647105,16749.056975,8672.7012,87.571939,68.813435
javascript,41327.487201,11249.771808,5438.347584,158.264933,77.809525
php,13461.235219,17215.832945,5468.828809,232.159428,102.311941
python,10302.625169,15535.806155,9610.680055,326.672109,200.93805
ruby,9765.722469,25339.712768,4243.63803,346.763913,139.136224
typescript,19465.538659,19616.088866,27050.194108,237.174666,168.963671


In [145]:
# Sum only the numeric metrics. pandas >= 2.0 no longer drops the text and
# datetime columns silently, so summing the whole frame fails or concatenates strings.
summary = projects.groupby('lang')[['stars', 'commits', 'issues', 'tags', 'releases']].sum()
# Share of all selected commits contributed by each language
summary['per_commits'] = summary.commits / summary.commits.sum()

In [146]:
# Languages ordered by their share of all commits, largest first.
summary.sort_values('per_commits', ascending=False)

Unnamed: 0_level_0,stars,commits,issues,tags,releases,per_commits
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ruby,276987,244076,49728,3161,870,0.141854
typescript,568448,214312,211308,2876,1665,0.124556
c,246357,180629,30037,2168,285,0.10498
python,455746,176837,89600,2870,1091,0.102776
php,236325,169200,43249,2630,832,0.098337
c++,424270,160332,81276,1575,816,0.093183
javascript,979699,150288,108120,2617,1281,0.087346
c#,142710,148895,73804,642,461,0.086536
go,399736,144970,92395,2402,1497,0.084255
java,376792,131073,84379,1232,632,0.076178


In [149]:
# Descriptive statistics over the per-language totals.
summary.describe()

Unnamed: 0,stars,commits,issues,tags,releases,per_commits
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,410707.0,172061.2,86389.6,2217.3,943.0,0.1
std,235523.202569,34380.143329,50213.518537,815.27324,442.233724,0.019981
min,142710.0,131073.0,30037.0,642.0,285.0,0.076178
25%,254014.5,149243.25,55747.0,1723.25,678.0,0.086738
50%,388264.0,164766.0,82827.5,2509.5,851.0,0.09576
75%,447877.0,179681.0,91696.25,2810.0,1233.5,0.104429
max,979699.0,244076.0,211308.0,3161.0,1665.0,0.141854
