In [None]:
from datetime import datetime
import os
import subprocess

import logging
# Log line layout: wall-clock time, then "LEVEL:message".
# Every %(field) placeholder needs a trailing conversion type ("s"). Without
# it, %-formatting of the record raises ValueError and each log call is
# reported as a logging error instead of being emitted.
LOG_FORMAT = '%(asctime)s %(levelname)s:%(message)s'
LOG_DATE_FORMAT = '%H:%M:%S'

logger = logging.getLogger("ETL")
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG, datefmt=LOG_DATE_FORMAT)

In [None]:
# Text file that records, one entry per line, the paths of Java files that
# have already been handled. The path is relative, so it is resolved against
# the current working directory.
CHECKPOINTS_PATH = "data/bohr_checkpoints.txt"
def has_java_files(list_of_filenames):
    """Return True if at least one name in *list_of_filenames* ends with ".java".

    Args:
        list_of_filenames: Iterable of file-name strings.

    Returns:
        bool: True when any name has the ".java" suffix, False otherwise
        (including for an empty iterable).
    """
    # any() stops at the first match instead of building a throwaway list
    # of every matching name just to test whether it is empty.
    return any(name.endswith(".java") for name in list_of_filenames)
def timestamp():
    """Return the current time as whole milliseconds since the Unix epoch.

    Returns:
        int: ``datetime.now().timestamp()`` scaled to milliseconds and
        floored to an integer.
    """
    seconds_since_epoch = datetime.now().timestamp()
    # Floor-divide by 1 to drop the sub-millisecond fraction before converting.
    whole_millis = (seconds_since_epoch * 1000) // 1
    return int(whole_millis)
def load_checkpoints(checkpoints_path=None):
    """Read the recorded checkpoint entries from the checkpoint file.

    Args:
        checkpoints_path: Optional path of the checkpoint file to read.
            When omitted, the module-level ``CHECKPOINTS_PATH`` is used.

    Returns:
        list[str]: One whitespace-stripped entry per non-blank line, in file
        order. An empty list when the file does not exist.
    """
    if checkpoints_path is None:
        checkpoints_path = CHECKPOINTS_PATH
    # Open directly instead of checking os.path.exists() first: the file could
    # disappear between the check and the open.
    try:
        with open(checkpoints_path, 'r') as checkpoint_file:
            stripped = (line.strip() for line in checkpoint_file)
            # Drop blank lines so stray newlines never become "" entries.
            return [entry for entry in stripped if entry]
    except FileNotFoundError:
        # No checkpoint file yet: nothing has been recorded.
        return []
def save_checkpoint(path, checkpoints_path=None):
    """Append *path* as one line to the checkpoint file and log that it was saved.

    Args:
        path: Entry to record (a string); a newline is appended to it.
        checkpoints_path: Optional path of the checkpoint file to append to.
            When omitted, the module-level ``CHECKPOINTS_PATH`` is used.
    """
    if checkpoints_path is None:
        checkpoints_path = CHECKPOINTS_PATH
    # Opening in append mode creates the file but not its directory, so make
    # sure the parent exists; otherwise the very first save would fail.
    parent_dir = os.path.dirname(checkpoints_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(checkpoints_path, 'a') as checkpoint_file:
        checkpoint_file.write(path + "\n")
    logger.info("saved %s", path)

def run_bohr(repo_path, file_name):
    """Run ``bohr.jar`` on every not-yet-checkpointed ``.java`` file under *repo_path*.

    Walks *repo_path* recursively with ``os.walk``. Each Java file whose path
    is not already listed by ``load_checkpoints()`` is passed to
    ``java -jar bohr.jar`` together with a second argument of the form
    ``<file_name>_<timestamp-in-ms>``. Files for which the command exits with
    status 0 are recorded via ``save_checkpoint`` so later runs skip them.

    Args:
        repo_path: Root directory to search for ``.java`` files.
        file_name: Prefix of the second argument handed to bohr.jar
            (presumably an output name -- confirm against bohr's CLI).

    Raises:
        OSError: If the ``java`` executable cannot be started.
    """
    # A set makes each "already processed?" check O(1) instead of a list scan.
    checkpoints = set(load_checkpoints())
    for workdir, _subdirs, files in os.walk(repo_path):
        for name in files:
            if not name.endswith(".java"):
                continue
            # Keep the literal "/" join: checkpoint entries are compared as
            # exact strings, so the path spelling must match earlier runs.
            java_file_path = f"{workdir}/{name}"
            if java_file_path in checkpoints:
                logger.info("skipping %s", java_file_path)
                continue
            ms = timestamp()
            exit_code = subprocess.call(
                ['java', '-jar', 'bohr.jar', java_file_path, f"{file_name}_{ms}"]
            )
            if exit_code != 0:
                # Do not checkpoint a failed run, otherwise the file would be
                # skipped forever without ever having been analysed.
                logger.error(
                    "bohr exited with status %s for %s; not checkpointing",
                    exit_code,
                    java_file_path,
                )
                continue
            save_checkpoint(java_file_path)