# Select Project Corpus

This notebook searches GitHub for projects that are suitable for this experiment.

Search github for projects

In [3]:
from github import Github
import pprint
import json
import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

# Parameters

In [2]:
# Languages to query, one GitHub search per entry.
# "shell" is deliberately absent: it was disabled in the original list.
languages = "javascript typescript java python php c++ c# c ruby go".split()

# Number of projects to keep for each language.
num_projects_per_language = 10
# Number of search results to inspect for each language (10 more than are
# kept; presumably headroom for projects that get excluded later).
num_projects_per_language_to_search = 10 + num_projects_per_language
# A repository counts as active when its latest commit is on or after this date.
active_project_date = datetime.datetime(year=2019, month=10, day=12)

## Connect to github

In [None]:
# Read the access token from the local file "token" with plain Python rather
# than the IPython-only `!cat` capture: this works outside IPython and on
# systems without `cat`, and a missing file raises FileNotFoundError instead
# of (possibly) handing the shell's error text to Github() as the token.
with open("token") as token_file:
    # Kept as a list of lines, like the value the shell capture produced.
    token = token_file.read().splitlines()

# Only the first line is used; surrounding whitespace is not part of the token.
github = Github(token[0].strip())

## Search for repositories

In [None]:
def _as_naive_utc(moment):
    """Return `moment` as a timezone-naive datetime expressed in UTC.

    Naive values are returned unchanged. Timezone-aware values are converted
    to UTC and stripped of their tzinfo, so that naive and aware datetimes
    can be compared without raising TypeError.
    """
    if moment.tzinfo is None:
        return moment
    return moment.astimezone(datetime.timezone.utc).replace(tzinfo=None)


# Collect one record per repository: the most-starred results for each language.
activity_cutoff = _as_naive_utc(active_project_date)
projects_data = []
for lang in languages:
    repos = github.search_repositories(query=f"language:{lang}", sort="stars", order="desc")
    # zip() with the range first stops after the requested number of results
    # and also ends cleanly when the search returns fewer than that, where
    # indexing repos[i] would raise.
    for _, repo in zip(range(num_projects_per_language_to_search), repos):
        print(repo)
        project_data = {
            "name": repo.full_name,
            "description": repo.description,
            "stars": repo.stargazers_count,
            "url": repo.url,
            "git_url": repo.git_url,
            "language": repo.language,
            "created_at": repo.created_at,
        }

        # The first commit returned is treated as the latest one; a repository
        # whose commit listing is empty is simply considered inactive.
        last_commit = next(iter(repo.get_commits()), None)
        active_project = False
        if last_commit is not None:
            # Depending on the PyGithub version the date may be naive or
            # timezone-aware, so normalise it before comparing to the cutoff.
            last_commit_date = _as_naive_utc(last_commit.commit.committer.date)
            active_project = last_commit_date >= activity_cutoff
        project_data["active"] = active_project
        projects_data.append(project_data)

In [None]:
# Build the project table in a single expression, indexed by the "name" field.
projects = pd.DataFrame(projects_data).set_index(keys=["name"])

Exclude **non-software** projects

In [None]:
# Start from the defaults: every row is a software project and none is discarded.
for flag_column, default_value in (("software_project", True), ("discard", False)):
    projects[flag_column] = default_value

In [None]:
# Repositories flagged by hand as not being software projects.
non_software_projects = [
    "bayandin/awesome-awesomeness",
    "tensorflow/models",
    "TheAlgorithms/Python",
    "vinta/awesome-python",
    "public-apis/public-apis",
    "donnemartin/system-design-primer",
    "danielmiessler/SecLists",
    "domnikl/DesignPatternsPHP",
    "laravel/framework",
    "airbnb/javascript",
    "kdn251/interviews",
    "MisterBooo/LeetCodeAnimation",
    "iluwatar/java-design-patterns",
    "Snailclimb/JavaGuide",
    "CyC2018/CS-Notes",
    "doocs/advanced-java",
    "avelino/awesome-go",
    "josephmisiti/awesome-machine-learning",
    "astaxie/build-web-application-with-golang",
    "SamyPesse/How-to-Make-a-Computer-Operating-System",
    "julycoding/The-Art-Of-Programming-By-July",
    "cfenollosa/os-tutorial",
    "trekhleb/javascript-algorithms",
    "freeCodeCamp/devdocs",
]

# Repositories discarded for other reasons.
discarded_projects = [
    # big projects
    "torvalds/linux",
    # Unknown projects
    "macrozheng/mall",  # japanese
]

for flag_column, flag_value, flagged_names in (
    ("software_project", False, non_software_projects),
    ("discard", True, discarded_projects),
):
    # A boolean mask only touches rows that exist. Assigning through
    # projects.loc[name, column] with a name that is absent from the index
    # would silently append an all-NaN row and upcast the column dtypes.
    projects.loc[projects.index.isin(flagged_names), flag_column] = flag_value

    # Report listed names that the current search did not return.
    absent_names = sorted(set(flagged_names) - set(projects.index))
    if absent_names:
        print(f"Not in the search results, ignored for '{flag_column}': {', '.join(absent_names)}")

### Select top 10 star projects per language

In [None]:
# Flag, for every language, the most-starred projects that are active software
# projects and have not been discarded.
projects["selected"] = False
# Group by the plain column name: with a one-element list, recent pandas
# versions yield 1-tuples as group keys, which would print as "('Java',)".
for lang, data in projects.groupby("language"):
    print(f"Language: {lang}")
    eligible = data[(data.software_project == True) & (data.active == True) & (data.discard == False)]
    top_stars = eligible.nlargest(n=num_projects_per_language, columns="stars")
    projects.loc[projects.index.isin(top_stars.index), "selected"] = True
    for name, project in top_stars.iterrows():
        print(f" - {name:30} {project.stars:6} {project.description}")
    print("\n")


### Discard reason


In [None]:
# For each language, walk the projects by descending stars until the required
# number of selected projects has been seen, listing every project that was
# passed over on the way and counting why (inactive, non-software, other).
other = 0
nom_software = 0
inactive = 0
# Group by the plain column name: with a one-element list, recent pandas
# versions yield 1-tuples as group keys, which would print as "('Java',)".
for lang, data in projects.groupby("language"):
    print(f"Language: {lang}")
    selected_seen = 0

    lang_other = 0
    lang_nom_software = 0
    lang_inactive = 0

    for name, project in data.sort_values(by="stars", ascending=False).iterrows():
        # Nothing below the cut-off is counted or printed, so stop there.
        if selected_seen >= num_projects_per_language:
            break

        # Only the first matching reason is counted for a project.
        if not project.active:
            lang_inactive += 1
        elif not project.software_project:
            lang_nom_software += 1
        elif project.discard:
            lang_other += 1

        if project.selected:
            selected_seen += 1
        else:
            active_flag = "A" if project.active else "IN"
            software_flag = "S" if project.software_project else "NS"
            discard_flag = "OT" if project.discard else ""
            # The first field is the (always blank here) "selected" marker column.
            print(f" {'':1} {name:50} {project.stars:6} {active_flag:2} {software_flag:2} {discard_flag:2}")

    print(f"Inactive: {lang_inactive:2} Non Software: {lang_nom_software:2} Other: {lang_other:2} ")
    print("\n")
    other += lang_other
    nom_software += lang_nom_software
    inactive += lang_inactive

print(f"---\nInactive: {inactive:2} Non Software: {nom_software:2} Other: {other:2} ")

### Generate the final dataset

In [None]:
# Keep only the rows flagged as selected, as an independent copy.
is_selected = projects["selected"].eq(True)
projects = projects.loc[is_selected].copy()

In [None]:
# Remove the helper columns in place, one at a time.
for helper_column in ("selected", "software_project", "discard"):
    projects.drop(columns=helper_column, inplace=True)

### Generate script to clone repositories

In [None]:
# Write a shell script that mirror-clones every project that has not been
# cloned yet into "<name>.git".
with open("clone-urls.sh", "w") as script_file:
    for name, project in projects.iterrows():
        clone_url = project["git_url"]
        # GitHub no longer serves the unauthenticated git:// protocol, so the
        # same repository is cloned over https:// instead.
        if clone_url.startswith("git://"):
            clone_url = "https://" + clone_url[len("git://"):]
        # POSIX `[ ... ]` instead of the bash-only `[[ ... ]]`, so the script
        # also runs under plain sh.
        script_file.write(f"[ -d {name}.git ] || git clone --mirror --bare {clone_url} {name}.git\n")

## Generate script to update repositories

In [None]:
# Write a shell script that fetches updates for every project that has
# already been cloned into "<name>.git".
with open("update-urls.sh", "w") as script_file:
    for name, project in projects.iterrows():
        # POSIX `[ ... ]` instead of the bash-only `[[ ... ]]`, so the script
        # also runs under plain sh. `&&` keeps `git fetch` from running in
        # the wrong directory if `cd` fails; the subshell already restores
        # the working directory, so no `cd -` is needed.
        script_file.write(f"[ ! -d {name}.git ] || (cd {name}.git && git fetch)\n")

### Generate input for localhub

In [None]:
# ~/dev/localhub/localhub.sh  -b /usr/local/repos/ . < ~/papers/2019-mining/experiment/repos-urls.txt # rel-maintenance/experiment/repos-urls.txt

In [None]:
# Write one git URL per line and report how many projects were written.
with open("repos-urls.txt", "w") as url_file:
    url_file.writelines(f"{git_url}\n" for git_url in projects["git_url"])
i = len(projects)
print(f"{i} projects")

In [None]:
# Persist the final table as a pickle; with the default settings pandas
# infers zip compression from the ".zip" extension.
pickle_path = "projects.zip"
projects.to_pickle(pickle_path)

In [None]:
# Also export the table as JSON, as one object keyed by the index (project name).
json_path = "projects.json"
projects.to_json(json_path, orient="index")