In [None]:
# pip install github
# pip install tqdm
# pip install python-dotenv
# pip install gitpython


In [None]:
# import github
import pandas as pd
import csv
from tqdm import tqdm
import time
import os
from dotenv import load_dotenv
import git
import subprocess
from datetime import datetime
load_dotenv()
tqdm.pandas()

import logging

# Log line layout: time, level and message. Every %(field) placeholder needs a
# conversion type (the trailing "s"); without it the handler cannot render the
# record and prints a "--- Logging error ---" traceback instead of the message.
LOG_FORMAT = '%(asctime)s %(levelname)s:%(message)s'

logger = logging.getLogger("ETL")
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG, datefmt='%H:%M:%S')
logger.debug("ETL logging!")

In [None]:
# GitHub credentials are read from the process environment (load_dotenv() is
# called in the imports cell); each value is None when its variable is unset.
GITHUB_USERNAME, GITHUB_TOKEN = (
    os.environ.get(variable) for variable in ("GITHUB_USERNAME", "GITHUB_TOKEN")
)

## GitHub Client setup

In [None]:
# GitHub API client built from the credentials above; get_info_from_gh() uses
# it to look up repositories. The top-level `await` relies on the notebook
# kernel's support for awaiting directly in a cell.
# NOTE(review): `import github` is commented out in the imports cell, so this
# line raises NameError on a fresh kernel until that import is re-enabled.
client = await github.GHClient(username=GITHUB_USERNAME, token=GITHUB_TOKEN)

## F-Droid apps list (GitHub-hosted apps only)

In [None]:
# Load the F-Droid app list and keep only the rows that have a value in the
# `source_code` column.
apps_list = (
    pd.read_csv("data/fdroid_apps_list_complete.csv")
    .dropna(subset=["source_code"])
)


In [None]:
# Boolean mask: True for rows whose source URL contains the text "github".
gh_filter = apps_list["source_code"].str.contains("github")
# Subset of the app list selected by that mask.
github_apps_list = apps_list.loc[gh_filter]

In [None]:
def extract_repo_owner(url: str) -> dict:
    """Extract the repository name and owner from a GitHub repository URL.

    The owner and repository are taken from the two path segments that follow
    the ``github.com`` host, so URLs that point deeper into a repository
    (``.../owner/repo/tree/master``) still resolve to ``owner``/``repo``.
    When no ``github.com`` segment is found, the last two non-empty segments
    are used instead. Empty segments (trailing or doubled slashes), a query
    string, a fragment and a trailing ``.git`` are ignored.

    Args:
        url: Repository URL, e.g. ``https://github.com/owner/repo``.

    Returns:
        A dict with the keys ``"repo"`` and ``"owner"``.

    Raises:
        ValueError: If fewer than two path segments can be found in ``url``.
    """
    # Drop any fragment / query string before looking at the path segments.
    location = url.strip().split("#", 1)[0].split("?", 1)[0]
    # Filtering out empty segments handles "https://", trailing and doubled slashes.
    segments = [segment for segment in location.split("/") if segment]

    host_index = next(
        (
            index
            for index, segment in enumerate(segments)
            if segment.lower() in ("github.com", "www.github.com")
        ),
        None,
    )
    if host_index is not None and len(segments) >= host_index + 3:
        owner, repo = segments[host_index + 1], segments[host_index + 2]
    elif len(segments) >= 2:
        # No recognisable host: fall back to the trailing "<owner>/<repo>" pair.
        owner, repo = segments[-2], segments[-1]
    else:
        raise ValueError(f"cannot extract owner and repository from {url!r}")

    # Clone-style URLs end in ".git", which is not part of the repository name.
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    return {
        "repo": repo,
        "owner": owner
    }

class DummyRepoInfo:
    """Placeholder repository record whose attributes are all ``None``.

    get_info_from_gh() substitutes an instance of this class when the
    repository lookup raises ``github.RepositoryNotFound``, so the same
    attribute reads work for missing repositories.
    """

    def __init__(self) -> None:
        # Attribute names are assigned in this fixed order.
        for attribute in (
            "stargazers_count",
            "language",
            "archived",
            "disabled",
            "updated_at",
            "clone_url",
            "forks",
            "is_fork",
        ):
            setattr(self, attribute, None)

In [None]:
# Second name for the same DataFrame object as `github_apps_list` (no copy is
# made). The name is rebound further down, when the enriched CSV is read back.
github_apps_list_complete = github_apps_list

In [None]:
async def get_info_from_gh(
    df: pd.DataFrame,
    start_after: str = None,
    output_path: str = "data/github_apps_list_complete.csv",
    delay: float = 0.8,
):
    """Look up each app's repository on GitHub and append the result to a CSV.

    For every row of ``df`` the repository named by ``row["source_code"]`` is
    fetched through the module-level ``client``; the row is extended with the
    repository's stars, primary language, archived/disabled flags, last update,
    clone URL, fork count and fork flag, then appended to ``output_path``.
    Rows are written one at a time so an interrupted run keeps what it fetched.

    Args:
        df: Apps to process; needs the columns ``package`` and ``source_code``.
        start_after: Package name to resume after. Rows up to and including the
            one whose ``package`` equals this value are skipped. ``None`` (or
            an empty string) processes every row.
        output_path: CSV file that rows are appended to.
        delay: Seconds to wait before each GitHub request.

    Notes:
        The header row is written only when ``output_path`` is missing or
        empty, so resumed or repeated runs do not insert a second header.
        NOTE(review): this relies on the names ``client`` and ``github`` at
        module level; ``import github`` is commented out in the imports cell.
    """
    import asyncio  # local import: the notebook's import cell does not provide it

    should_skip = bool(start_after)
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        if should_skip:
            print("skipping", row["package"])
            # Compare this row's package (not the whole column) with the marker.
            if row["package"] == start_after:
                should_skip = False
            continue
        row_dict = row.to_dict()
        # Non-blocking pause between requests; time.sleep() would stall the event loop.
        await asyncio.sleep(delay)
        try:
            repo_info = await client.get_repo(**extract_repo_owner(row["source_code"]))
        except github.RepositoryNotFound:
            # Missing repository: keep the row, with every GitHub field set to None.
            repo_info = DummyRepoInfo()
        row_dict["stars"] = repo_info.stargazers_count
        row_dict["primary_language"] = repo_info.language
        row_dict["is_archived"] = repo_info.archived
        row_dict["is_disabled"] = repo_info.disabled
        row_dict["updated_at"] = repo_info.updated_at
        row_dict["clone_url"] = repo_info.clone_url
        row_dict["forks"] = repo_info.forks
        row_dict["is_fork"] = repo_info.is_fork
        # Save to csv
        needs_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
        # newline='' is what the csv module expects from files it writes to.
        with open(output_path, 'a', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=row_dict.keys())
            if needs_header:
                writer.writeheader()
            writer.writerow(row_dict)


In [None]:
# await get_info_from_gh(github_apps_list) # uncomment for ingestion

In [None]:
# Read back the CSV that get_info_from_gh() appends to; from here on
# `github_apps_list_complete` refers to the data stored on disk.
github_apps_list_complete = pd.read_csv("data/github_apps_list_complete.csv")

## Basic stats

#### Number of projects by language

In [None]:
# Count of projects per primary language; head() limits the display to the
# first rows of that count table.
(
    github_apps_list_complete["primary_language"]
    .value_counts()
    .head()
)

#### Average number of stars by language

In [None]:
# Mean star count of the projects in each primary language, highest first.
(
    github_apps_list_complete
    .groupby(["primary_language"])["stars"]
    .mean()
    .sort_values(ascending=False)
)

#### Total number of forks by language

In [None]:
# Summed fork count of the projects in each primary language, highest first.
(
    github_apps_list_complete
    .groupby(["primary_language"])["forks"]
    .sum()
    .sort_values(ascending=False)
)

#### Java projects with most stars

In [None]:
# Mask of the rows whose primary language is exactly "Java".
java_filter = github_apps_list_complete["primary_language"].eq("Java").fillna(False)
# Rank the Java projects by stars, keep the first (highest-starred) row per
# clone URL, and retain the top 20.
java_github_top20 = (
    github_apps_list_complete.loc[java_filter]
    .sort_values("stars", ascending=False)
    .drop_duplicates(subset=["clone_url"])
    .head(20)
)
java_github_top20

#### Clone repos

In [None]:
# Clone every selected repository into REPOS_DIR. The cell is safe to re-run:
# the target directory is created when missing, and repositories that already
# have a checkout are skipped instead of making `git clone` fail.
REPOS_DIR = "../repos/"
os.makedirs(REPOS_DIR, exist_ok=True)
for _, repo in java_github_top20.iterrows():
    clone_url = repo["clone_url"]
    if not isinstance(clone_url, str) or not clone_url:
        # Rows without a usable clone URL cannot be cloned.
        continue
    # `git clone` names the checkout after the last URL segment, minus ".git".
    repo_dir_name = clone_url.rstrip("/").rsplit("/", 1)[-1]
    if repo_dir_name.endswith(".git"):
        repo_dir_name = repo_dir_name[: -len(".git")]
    if os.path.isdir(os.path.join(REPOS_DIR, repo_dir_name)):
        logger.info(f"already cloned {clone_url}")
        continue
    git.Git(REPOS_DIR).clone(clone_url)

#### Run BOHR

In [None]:
# Text file holding one already-processed .java path per line; it is read by
# load_checkpoints() and appended to by save_checkpoint().
CHECKPOINTS_PATH = "data/bohr_checkpoints.txt"
def has_java_files(list_of_filenames):
    """Return True when at least one name in `list_of_filenames` ends with ".java".

    Args:
        list_of_filenames: Iterable of file-name strings.

    Returns:
        bool: False for an empty iterable or when no name matches.
    """
    # any() stops at the first match instead of building a throwaway list.
    return any(name.endswith(".java") for name in list_of_filenames)
def timestamp():
    """Return the current time as whole milliseconds since the Unix epoch (int)."""
    epoch_seconds = datetime.now().timestamp()
    # Floor-divide by 1 to drop the fractional milliseconds before converting.
    whole_millis = epoch_seconds * 1000 // 1
    return int(whole_millis)
def load_checkpoints():
    """Return the checkpoint entries stored in CHECKPOINTS_PATH.

    Returns:
        list[str]: One whitespace-stripped entry per line of the file, in file
        order; an empty list when the file does not exist.
    """
    if not os.path.exists(CHECKPOINTS_PATH):
        return []
    with open(CHECKPOINTS_PATH, 'r') as checkpoint_file:
        return [entry.strip() for entry in checkpoint_file.readlines()]
def save_checkpoint(path):
    """Append `path` as a new line to CHECKPOINTS_PATH and log that it was saved."""
    with open(CHECKPOINTS_PATH, 'a') as sink:
        sink.write("".join((path, "\n")))
    logger.info(f"saved {path}")

def run_bohr(repo_path, file_name):
    """Run ``bohr.jar`` on every ``.java`` file under `repo_path`, with checkpoints.

    Files whose path is already listed in the checkpoint file are skipped. A
    file is added to the checkpoint file only when the ``java`` process exits
    with status 0, so failed runs are retried on the next call instead of
    being recorded as done.

    Args:
        repo_path: Root directory that is walked recursively.
        file_name: Prefix of the last command-line argument; a millisecond
            timestamp is appended for each file (presumably the report output
            name — confirm against bohr.jar's usage).
    """
    # A set keeps the "already processed?" check cheap for large checkpoint files.
    checkpoints = set(load_checkpoints())
    for workdir, _, files in os.walk(repo_path):
        for file in files:
            if not file.endswith(".java"):
                continue
            # Keep the "<dir>/<file>" form: it must match entries already on disk.
            java_file_path = f"{workdir}/{file}"
            if java_file_path in checkpoints:
                logger.info(f"skipping {java_file_path}")
                continue
            ms = timestamp()
            exit_code = subprocess.call(['java', '-jar', 'bohr.jar', java_file_path, f"{file_name}_{ms}"])
            if exit_code != 0:
                # Do not checkpoint failures, otherwise they would never be retried.
                logger.warning(f"bohr exited with status {exit_code} for {java_file_path}")
                continue
            save_checkpoint(java_file_path)


In [None]:
def checkpoints_run_bohr(repo_path: str, csv_path: str = "data/java_files.csv"):
    """Append `repo_path` as one row to a single-column (``path``) CSV file.

    Args:
        repo_path: Value written to the ``path`` column.
        csv_path: CSV file to append to; its directory must already exist.

    The header row is written when the file is missing or empty, so a
    pre-created empty file still ends up with a header.
    """
    row_dict = {"path": repo_path}
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    # Save to csv; newline='' is what the csv module expects from files it writes to.
    with open(csv_path, 'a', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=row_dict.keys())
        if needs_header:
            writer.writeheader()
        writer.writerow(row_dict)

In [None]:
%%capture mylogs
logger.info("Started!")
workdir, repo_list, _ = next(os.walk("../repos"))
for repo in repo_list:
    repo_path = f"{os.path.abspath(workdir)}/{repo}"
    run_bohr(repo_path, f"./bohr_reports/{repo}/{repo}")
logger.info("Finished!")
with open('bohr_run.txt') as f:
    f.write(mylogs.stdout)