# Select Project Corpus

This notebook searches GitHub for projects that are suitable for this experiment.

Search github for projects

In [23]:
from github import Github
import pprint
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# Parameters

In [24]:
# Languages whose most-starred repositories are searched. Each entry is
# interpolated into a "language:<name>" search qualifier further down.
languages = [
    "javascript", "typescript",
    "java", "python", "php",
    "c++", "c#",
    # "shell",  (currently left out of the corpus)
    "c", "ruby", "go",
]

# How many search results are collected for every language.
num_projects_per_language = 20

## Connect to github

In [25]:
# Read the access token from the local file "token" with plain Python instead
# of the IPython-only `!cat` shell escape, so the cell does not depend on an
# external `cat` binary. `token` stays a list of lines, as `!cat` produced.
with open("token") as token_file:
    token = token_file.read().splitlines()

# Authenticate the PyGithub client with the first line of the token file.
github = Github(token[0])

## Search for repositories

In [26]:
# Collect one metadata record per repository, for the most-starred
# repositories of every configured language.
projects_data = []
for lang in languages:
    # Results are requested sorted by star count, highest first.
    repos = github.search_repositories(query=f"language:{lang}", sort="stars", order="desc")
    # Slice the paginated result instead of indexing repos[i]: a search that
    # returns fewer than num_projects_per_language hits then just yields fewer
    # records rather than raising IndexError.
    for repo in repos[:num_projects_per_language]:
        projects_data.append({
            "name": repo.full_name,
            "description": repo.description,
            "stars": repo.stargazers_count,
            "url": repo.url,
            "git_url": repo.git_url,
            "language": repo.language,
            "created_at": repo.created_at,
            # NOTE(review): "ncreated_at" looks like a typo that duplicates
            # "created_at"; kept so the resulting columns stay unchanged for
            # anything that reads the exported files — confirm and remove.
            "ncreated_at": repo.created_at,
            "num_tags": repo.get_tags().totalCount,
            "num_releases": repo.get_releases().totalCount,
        })


In [42]:
# Build the project table from the collected records, keyed by the repository
# full name, with every project initially flagged as a software project.
projects = (
    pd.DataFrame(projects_data)
    .set_index(keys=["name"])
    .assign(software_project=True)
)

Exclude **non-software** projects

In [46]:
# Repositories flagged as not being software projects.
non_software_projects = [
    "bayandin/awesome-awesomeness",
    "tensorflow/models",
    "TheAlgorithms/Python",
    "vinta/awesome-python",
    "public-apis/public-apis",
    "donnemartin/system-design-primer",
    "danielmiessler/SecLists",
    "domnikl/DesignPatternsPHP",
    "laravel/framework",
    "airbnb/javascript",
    "kdn251/interviews",
    "MisterBooo/LeetCodeAnimation",
    "iluwatar/java-design-patterns",
    "Snailclimb/JavaGuide",
    "CyC2018/CS-Notes",
    "doocs/advanced-java",
    "avelino/awesome-go",
    "josephmisiti/awesome-machine-learning",
    "astaxie/build-web-application-with-golang",
    "SamyPesse/How-to-Make-a-Computer-Operating-System",
    "julycoding/The-Art-Of-Programming-By-July",
    "cfenollosa/os-tutorial",
    "trekhleb/javascript-algorithms",
]

# Large projects
large_projects = [
    "torvalds/linux",
]

# Unknown projects
unknown_projects = [
    "macrozheng/mall",  # annotated "japanese" by the author — NOTE(review): confirm
]

excluded_projects = non_software_projects + large_projects + unknown_projects

# Flag the excluded repositories through a boolean mask. Assigning with
# `.loc[name, column] = False` for a name that is not in the index appends a
# new, otherwise empty row instead of failing; the misspelled
# "Falsemacrozheng/mall" entry did exactly that, and so would any listed
# repository that is missing from the current search results.
projects.loc[projects.index.isin(excluded_projects), "software_project"] = False


Select the 10 most-starred software projects per language

In [47]:
# Flag the 10 most-starred software projects of each language as selected and
# print them as a per-language listing.
projects["selected"] = False
# Group by the column name rather than a one-element list so the group key is
# the plain language string on every pandas version; with a list key, pandas
# 2.0 and later yield 1-tuples, which would print as "('C',)".
for lang, data in projects.groupby("language"):
    print(f"Language: {lang}")
    # Explicit comparison (not bare boolean indexing) so the filter works the
    # same whether the flag column holds bool or object values.
    software_projects = data[data["software_project"] == True]
    top_stars = software_projects.nlargest(n=10, columns="stars")
    projects.loc[projects.index.isin(top_stars.index), "selected"] = True
    for name, project in top_stars.iterrows():
        print(f" - {name:30} {project.stars:6} {project.description}")
    print("\n")


Language: C
 - netdata/netdata                44046.0 Real-time performance monitoring, done right! https://my-netdata.io/
 - antirez/redis                  40702.0 Redis is an in-memory database that persists on disk. The data model is key-value, but many different kind of values are supported: Strings, Lists, Sets, Sorted Sets, Hashes, Streams, HyperLogLogs, Bitmaps.
 - git/git                        30978.0 Git Source Code Mirror - This is a publish-only repository and all pull requests are ignored. Please follow Documentation/SubmittingPatches procedure for any of your improvements.
 - php/php-src                    26186.0 The PHP Interpreter
 - bilibili/ijkplayer             25585.0 Android/iOS video player based on FFmpeg n3.4, with MediaCodec, VideoToolbox support.
 - Genymobile/scrcpy              24354.0 Display and control your Android device
 - wg/wrk                         23065.0 Modern HTTP benchmarking tool
 - ggreer/the_silver_searcher     19311.0 A code-searching too

In [48]:
# Keep only the rows flagged as selected; the copy detaches the result from
# the full table before it is modified further.
is_selected = projects["selected"] == True
projects = projects.loc[is_selected].copy()

In [49]:
# Remove the two helper columns in one call. errors="ignore" keeps the cell
# re-runnable: the original in-place drops raise KeyError once the columns
# are already gone.
projects = projects.drop(columns=["selected", "software_project"], errors="ignore")

### Generate script to clone repositories

In [50]:
# One guarded command per project: clone the mirror only when the target
# "<name>.git" directory does not exist yet.
clone_commands = [
    f"[[ -d {name}.git ]] || git clone --mirror --bare {git_url} {name}.git\n"
    for name, git_url in projects["git_url"].items()
]
with open("clone-urls.sh", "w") as script_file:
    script_file.writelines(clone_commands)

## Generate script to update repositories

In [51]:
# One guarded command per project: fetch inside "<name>.git" only when that
# directory already exists. Only the index labels (repository names) are used.
with open("update-urls.sh", "w") as script_file:
    for name in projects.index:
        fetch_command = f"[[ ! -d {name}.git ]] || (cd {name}.git; git fetch; cd -)\n"
        script_file.write(fetch_command)

### Generate input for localhub

In [52]:
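# Example localhub invocation, kept for reference (not executed by this notebook);
# it reads the repos-urls.txt file written by the next cell:
# ~/dev/localhub/localhub.sh  -b /usr/local/repos/ . < ~/papers/2019-mining/experiment/repos-urls.txt # rel-maintenance/experiment/repos-urls.txt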

In [53]:
# Write the git URL of every selected project, one per line, then report how
# many projects were written.
with open("repos-urls.txt", "w") as url_file:
    url_file.writelines(f"{git_url}\n" for git_url in projects["git_url"])
i = len(projects)
print(f"{i} projects")

100 projects


In [54]:
# Persist the selected projects as a pickle; with compression="infer" (the
# pandas default) the format is chosen from the ".zip" file extension.
projects.to_pickle("projects.zip", compression="infer")

In [55]:
# Export the same table as JSON, keyed by the index (repository name).
projects.to_json(path_or_buf="projects.json", orient="index")

In [56]:
# Total number of tags across the selected projects.
projects["num_tags"].sum()

19915.0

In [57]:
# Total number of releases across the selected projects.
projects["num_releases"].sum()

6436.0