# GitHub Filter

This notebook filters the projects based on GitHub metadata.

This notebook filters the projects based on the following criteria:

 - are primarily developed in one of the top 10 languages from the Stack Overflow survey
 - have at least 10 tags

The results are stored in `data/01_github_project_results.json`, and the projects are then cloned using `src/02_github_clone.sh`.


In [1]:
import json
import os

In [2]:
from util import DATA_PATH

In [3]:
# Load the GitHub project search results.
# The file is read explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
with open(DATA_PATH / '00_github_project_search-results.json', 'r', encoding='utf-8') as infile:
    projects = json.load(infile)

print("GitHub Projects: {:,}".format(len(projects)))

GitHub Projects: 4,870


In [4]:
# Languages accepted by the filter, stored lower-cased so that the
# comparison against each project's top language is case-insensitive.
top_langs = {
    name.lower()
    for name in (
        'JavaScript', 'Python', 'Java', 'TypeScript', 'C#',
        'C++', 'PHP', 'C', 'Go', 'Kotlin',
    )
}
top_langs

{'c',
 'c#',
 'c++',
 'go',
 'java',
 'javascript',
 'kotlin',
 'php',
 'python',
 'typescript'}

In [5]:
# Apply filters
def project_criteria(project, min_tags=10, languages=None):
    """Decide whether a GitHub project passes the selection filters.

    A project is kept when it has at least ``min_tags`` tags and its top
    language is one of ``languages`` (compared case-insensitively).

    Args:
        project: Mapping describing one project. Only
            ``project['tags']['totalCount']`` and
            ``project['topLanguage']['edges'][0]['node']['name']`` are read.
        min_tags: Minimum number of tags required; projects with fewer tags
            offer fewer potential releases. Defaults to 10.
        languages: Iterable of accepted language names. When omitted, the
            notebook-level ``top_langs`` set is used.

    Returns:
        bool: ``True`` if the project meets every criterion, else ``False``.
    """
    if languages is None:
        # Default to the notebook-level set, which is already lower-cased.
        accepted = top_langs
    else:
        accepted = {lang.lower() for lang in languages}

    # Too few tags, i.e. fewer potential releases.
    if project['tags']['totalCount'] < min_tags:
        return False

    # No language reported for the project.
    language_edges = project['topLanguage']['edges']
    if not language_edges:
        return False

    # Only the first (top) language decides whether the project is accepted.
    return language_edges[0]['node']['name'].lower() in accepted
        
# Keep only the projects that satisfy every check in project_criteria.
selected_projects = [candidate for candidate in projects if project_criteria(candidate)]

In [6]:
# Number of projects that passed the filters.
selected_count = len(selected_projects)
print("Selected projects: {:,}".format(selected_count))

Selected projects: 2,772


In [7]:
# Total number of tags summed over all selected projects.
tag_total = sum(entry['tags']['totalCount'] for entry in selected_projects)
print("Tags: {:,}".format(tag_total))

Tags: 608,257


In [8]:
# Total commit-history count summed over all selected projects.
commit_total = sum(
    entry['commits']['history']['totalCount'] for entry in selected_projects
)
print("HEAD commits: {:,}".format(commit_total))

HEAD commits: 21,317,678


In [9]:
# Persist the full metadata of the selected projects as JSON.
results_json_path = DATA_PATH / '01_github_project_results.json'
with open(results_json_path, 'w') as outfile:
    json.dump(selected_projects, outfile)

In [10]:
# Write each selected project's `nameWithOwner` value, one per line.
# NOTE(review): no trailing newline is written after the last name;
# confirm that whatever consumes this file handles that.
with open(DATA_PATH / '01_github_project_results.txt', 'w') as outfile:
    name_lines = '\n'.join(entry['nameWithOwner'] for entry in selected_projects)
    outfile.write(name_lines)