In [1]:
import logging
import os
import shutil
import subprocess
from subprocess import PIPE, STDOUT

In [18]:
pip install charset-normalizer


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
from sqlalchemy.exc import IntegrityError
from bs4 import UnicodeDammit

In [2]:
# Send every record at DEBUG level and above to HISTORYlistener.log. Each line carries a
# timestamp with milliseconds, the level, and the emitting module and function.
logging.basicConfig(
    filename="HISTORYlistener.log",
    level=logging.DEBUG,
    format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Root logger; the helper functions and cells below log through this object.
logger = logging.getLogger()

In [3]:
# Text file with one already-processed git directory per line.
CHECKPOINTS_PATH = "checkpoints_ck.txt"
# Root of the local AOSP checkout. The AOSP_SYNC_PATH environment variable overrides the
# historical default so the notebook is not tied to one machine. Joining with "" guarantees
# the trailing separator that gen_aosp_folder relies on when it concatenates paths.
AOSP_SYNC_PATH = os.path.join(
    os.environ.get("AOSP_SYNC_PATH", "/mnt/4846A54B46A53A98/AOSP/"), ""
)
# Marker that precedes the checkout-relative part of each normalised git directory.
AOSP_ROOT_MATCHER = "<AOSP_ROOT>/"

In [29]:
def load_checkpoints():
    """Return the whitespace-stripped lines of the checkpoint file.

    Returns:
        A list with one entry per line of CHECKPOINTS_PATH, or an empty list
        when that file does not exist.
    """
    if not os.path.exists(CHECKPOINTS_PATH):
        return []
    with open(CHECKPOINTS_PATH, "r") as handle:
        return [entry.strip() for entry in handle.readlines()]


def save_checkpoint(path):
    """Append `path` as a new line of the checkpoint file and log that it was saved.

    Args:
        path: The entry to record (a string; it is concatenated with a newline).
    """
    with open(CHECKPOINTS_PATH, mode="a") as sink:
        sink.writelines((path + "\n",))
    logger.info(f"saved checkpoint on {path}")

def copy_results(destination_folder):
    """Copy the four CSV files found in the current directory into `destination_folder`.

    The files copied are class.csv, field.csv, method.csv and variable.csv.

    Args:
        destination_folder: Directory that receives the files; created (with
            parents) when missing.

    Raises:
        OSError: If the directory cannot be created or a file cannot be copied
            (for example because a CSV file is missing from the current directory).
    """
    GENERATED_FILES = ['class.csv', 'field.csv', 'method.csv', 'variable.csv']
    # Create the target once, up front. exist_ok avoids the check-then-create race
    # and the repeated existence test on every iteration.
    os.makedirs(destination_folder, exist_ok=True)
    for file in GENERATED_FILES:
        src = f"./{file}"
        destination_file = destination_folder + f"/{file}"
        shutil.copy(src, destination_file)
        logger.info(f"Copied {src} into {destination_file}")
def log_subprocess_output(pipe):
    """Log every line read from `pipe` at INFO level until it yields b''.

    Args:
        pipe: Object with a ``readline()`` method returning bytes, such as the
            stdout of a subprocess opened in binary mode.
    """
    while True:
        chunk = pipe.readline()
        if chunk == b'':
            # An empty bytes object is the end-of-stream sentinel.
            break
        logger.info('got line from subprocess: %r', chunk)
def gen_output_folder(git_dir_norm: str):
    """Map a normalised git directory to its folder under ./ck_output/aosp/.

    Everything after the last AOSP_ROOT_MATCHER in `git_dir_norm` is kept; when
    the marker is absent the whole string is used.
    """
    _, _, relative = git_dir_norm.rpartition(AOSP_ROOT_MATCHER)
    return f"./ck_output/aosp/{relative}"
def gen_aosp_folder(git_dir_norm: str):
    """Map a normalised git directory to its location under AOSP_SYNC_PATH.

    Everything after the last AOSP_ROOT_MATCHER in `git_dir_norm` is appended to
    AOSP_SYNC_PATH; when the marker is absent the whole string is appended.
    """
    _, _, relative = git_dir_norm.rpartition(AOSP_ROOT_MATCHER)
    return f"{AOSP_SYNC_PATH}{relative}"
def load_txt(path: str):
    """Read the text file at `path` and return its lines with surrounding whitespace stripped.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(path, "r") as source:
        return list(map(str.strip, source))
def get_loc(java_file_path):
    """Return the number of newline-separated segments in the file at `java_file_path`.

    The file is read as bytes, its encoding is taken from UnicodeDammit's
    `original_encoding` (falling back to utf-8 when that is None), and the
    decoded text is split on "\\n". A file ending in a newline therefore counts
    one extra (empty) segment, and an empty file counts as 1.

    Args:
        java_file_path: Path of the file to measure.

    Returns:
        int: ``len(text.split("\\n"))`` of the decoded content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # get encoding
    logger.info(f"Processing encoding on {java_file_path}")
    with open(java_file_path, "rb") as java_file:
        java_file_bytes = java_file.read()

    enc = UnicodeDammit(java_file_bytes).original_encoding
    if enc is None:
        logger.critical(f"Could not get encoding for file {java_file_path}, got None.")
        logger.warning("Trying utf-8")
        enc = 'utf-8'
    else:
        logger.info(f"Detected encoding: {enc}")

    # errors="replace": only line breaks are counted, so an undecodable byte must not
    # raise UnicodeDecodeError and abort the caller's whole batch. Replacement
    # characters do not affect the newline count.
    buffer = java_file_bytes.decode(encoding=enc, errors="replace")
    return len(buffer.split('\n'))

In [5]:
# Location of the CK jar, relative to the notebook's working directory.
CK_JAR_PATH = "../tools/ck.jar"
def run_ck(java_file_path):
    """Run the CK jar on one normalised git directory and store its results.

    Steps: resolve the directory inside the local checkout, run
    ``java -jar CK_JAR_PATH <dir>`` while logging its combined stdout/stderr,
    copy the generated CSV files to the matching output folder and record a
    checkpoint for `java_file_path`.

    Args:
        java_file_path: Normalised git directory entry; despite the name it is
            passed to gen_aosp_folder/gen_output_folder as a directory.

    Raises:
        RuntimeError: If the java process exits with a non-zero return code.
    """
    # Use the argument, not the global `dir`. The global only existed while the run loop
    # was iterating; anywhere else it resolves to the builtin dir() function.
    java_dir_path = gen_aosp_folder(java_file_path)
    process = subprocess.Popen(["java", "-jar", CK_JAR_PATH, java_dir_path], stdout=PIPE, stderr=STDOUT)
    # Drain the pipe before wait() so the child never blocks on a full pipe buffer.
    with process.stdout:
        log_subprocess_output(process.stdout)
    if process.wait() != 0:
        logger.critical(f"Error executing {java_file_path}")
        error_msg = "Return code is not zero, error in execution of ck. Check logs."
        logger.critical(error_msg)
        raise RuntimeError(error_msg)
    ck_output = gen_output_folder(java_file_path)
    copy_results(ck_output)
    save_checkpoint(java_file_path)

## RUN CK

In [6]:
# Run CK on every listed git directory that has no checkpoint yet.
checkpoints = load_checkpoints()
git_dirs_norm = load_txt("./git_dirs_NORM.txt")
# Set for O(1) membership tests; `checkpoints` itself stays a list.
_checkpointed = set(checkpoints)
files_to_run = (file for file in git_dirs_norm if file not in _checkpointed)
# NOTE: `dir` shadows the builtin; the name is kept because other cells may read this global.
for dir in files_to_run:
    try:
        run_ck(dir)
    except Exception:
        # `Exception`, not a bare except: Ctrl-C / kernel interrupt must stop the loop
        # instead of being recorded as a CK failure for the current directory.
        with open("./checkpoints_ck_failed.txt", 'a') as txt_file:
            txt_file.write(dir + '\n')
        # Failed directories are checkpointed too, so a re-run does not retry them.
        save_checkpoint(dir)
        # logger.exception keeps the traceback in the log file.
        logger.exception(f"Failed to run ck on {dir}. Skipping to the next.")
        print(f"Failed to run ck on {dir}. Skipping to the next.")

## Aggregate into SQL

In [7]:
from sqlalchemy import create_engine, select
from sqlalchemy import ForeignKey
from sqlalchemy import String, Integer, Double, Float, Boolean
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import Session
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
from typing import List

import pandas as pd

##### Build models

In [8]:
class Base(DeclarativeBase):
    """Declarative base for the models stored in the CK output database."""

class AOSPBase(DeclarativeBase):
    """Declarative base for the models that map the AOSP AoC dataset database."""

class Project(Base):
    """A project, identified by its name, that owns files and AoC reports."""
    __tablename__ = "projects"
    # The name doubles as the primary key; File.project_name and AoCReport.project_name
    # reference it.
    name: Mapped[str] = mapped_column(String(), nullable=False, unique=True, primary_key=True)
    files: Mapped[List["File"]] = relationship(back_populates="project")
    # AoCReport.project declares back_populates="aoc_reports"; naming the reverse side here
    # keeps both ends of the relationship synchronised in memory.
    aoc_reports: Mapped[List["AoCReport"]] = relationship(back_populates="project")

class File(Base):
    """A source file belonging to a project, keyed by its path.

    `loc` stores the line count recorded for the file.
    """
    __tablename__ = "files"
    path: Mapped[str] = mapped_column(String(), nullable=False, unique=True, primary_key=True)
    loc: Mapped[int] = mapped_column(Integer())
    classes: Mapped[List["Klass"]] = relationship(back_populates="file")
    project: Mapped[Project] = relationship(back_populates="files")
    # Foreign key to the owning project's name.
    project_name: Mapped[str] = mapped_column(ForeignKey("projects.name"))

class Klass(Base):
    """Class-level metrics row, stored in the "classes" table.

    Most attributes carry the same name as the metric column they are filled
    from; all metric columns are nullable.
    """
    __tablename__ = "classes"
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    file: Mapped[File] = relationship(back_populates="classes")
    methods: Mapped[List["Method"]] = relationship(back_populates="class_")
    # Foreign key to the file that declares the class.
    file_path: Mapped[str] = mapped_column(ForeignKey("files.path"))
    name: Mapped[str] = mapped_column(String(), nullable=False)
    # Python attribute `type_`, database column "type".
    type_: Mapped[str] = mapped_column(String(), nullable=False, name="type")
    cbo: Mapped[int] = mapped_column(Integer(), nullable=True)
    cboModified: Mapped[int] = mapped_column(Integer(), nullable=True)
    fanin: Mapped[int] = mapped_column(Integer(), nullable=True)
    fanout: Mapped[int] = mapped_column(Integer(), nullable=True)
    wmc: Mapped[int] = mapped_column(Integer(), nullable=True)
    dit: Mapped[int] = mapped_column(Integer(), nullable=True)
    noc: Mapped[int] = mapped_column(Integer(), nullable=True)
    rfc: Mapped[int] = mapped_column(Integer(), nullable=True)
    lcom: Mapped[int] = mapped_column(Integer(), nullable=True)
    lcom_normalized: Mapped[float] = mapped_column(Double(), nullable=True)
    tcc: Mapped[float] = mapped_column(Double(), nullable=True)
    lcc: Mapped[float] = mapped_column(Double(), nullable=True)
    totalMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    staticMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    publicMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    privateMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    protectedMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    defaultMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    visibleMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    abstractMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    finalMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    synchronizedMethodsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    totalFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    staticFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    publicFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    privateFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    protectedFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    defaultFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    finalFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    synchronizedFieldsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    nosi: Mapped[int] = mapped_column(Integer(), nullable=True)
    loc: Mapped[int] = mapped_column(Integer(), nullable=True)
    returnQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    loopQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    comparisonsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    tryCatchQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    parenthesizedExpsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    stringLiteralsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    numbersQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    assignmentsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    mathOperationsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    variablesQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    maxNestedBlocksQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    anonymousClassesQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    innerClassesQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    lambdasQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    uniqueWordsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    modifiers: Mapped[int] = mapped_column(Integer(), nullable=True)
    logStatementsQty: Mapped[int] = mapped_column(Integer(), nullable=True)

class Method(Base):
    """Method-level metrics row, stored in the "methods" table.

    Each row references both the file and the class it belongs to; all metric
    columns are nullable.
    """
    __tablename__ = "methods"
    # Note the trailing underscore: the attribute (and column) is `id_`, unlike Klass.id.
    id_: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    file: Mapped[File] = relationship()
    class_: Mapped[Klass] = relationship(back_populates="methods")
    file_path: Mapped[str] = mapped_column(ForeignKey("files.path"))
    class_id: Mapped[int] = mapped_column(ForeignKey("classes.id"))
    name: Mapped[str] = mapped_column(String(), nullable=False)
    constructor: Mapped[bool] = mapped_column(Boolean(), nullable=False)
    line: Mapped[int] = mapped_column(Integer(), nullable=True)
    cbo: Mapped[int] = mapped_column(Integer(), nullable=True)
    cboModified: Mapped[int] = mapped_column(Integer(), nullable=True)
    fanin: Mapped[int] = mapped_column(Integer(), nullable=True)
    fanout: Mapped[int] = mapped_column(Integer(), nullable=True)
    wmc: Mapped[int] = mapped_column(Integer(), nullable=True)
    rfc: Mapped[int] = mapped_column(Integer(), nullable=True)
    loc: Mapped[int] = mapped_column(Integer(), nullable=True)
    returnsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    variablesQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    parametersQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    methodsInvokedQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    methodsInvokedLocalQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    methodsInvokedIndirectLocalQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    loopQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    comparisonsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    tryCatchQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    parenthesizedExpsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    stringLiteralsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    numbersQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    assignmentsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    mathOperationsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    maxNestedBlocksQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    anonymousClassesQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    innerClassesQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    lambdasQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    uniqueWordsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    modifiers: Mapped[int] = mapped_column(Integer(), nullable=True)
    logStatementsQty: Mapped[int] = mapped_column(Integer(), nullable=True)
    hasJavaDoc: Mapped[bool] = mapped_column(Boolean(), nullable=True)


In [9]:
class AOSPProject(AOSPBase):
    """Project row of the AoC dataset, stored in the "aosp_projects" table."""
    __tablename__ = "aosp_projects"
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    name: Mapped[str] = mapped_column(String(), nullable=False)
    description: Mapped[str] = mapped_column(String(), nullable=True)
    package: Mapped[str] = mapped_column(String(), nullable=False)
    category: Mapped[str] = mapped_column(String(), nullable=False)
    # Reverse side of AOSPAoCReport.project.
    aoc_reports: Mapped[List["AOSPAoCReport"]] = relationship(back_populates="project")

class AOSPAoCReport(AOSPBase):
    """AoC report row of the AoC dataset, stored in the "aosp_aoc_reports" table."""
    __tablename__ = "aosp_aoc_reports"
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    project_id: Mapped[int] = mapped_column(ForeignKey("aosp_projects.id"))
    project: Mapped["AOSPProject"] = relationship(back_populates="aoc_reports")
    line: Mapped[int] = mapped_column(Integer())
    snippet: Mapped[str] = mapped_column(String(), nullable=False)
    # Python attribute `class_`, database column "class".
    class_: Mapped[str] = mapped_column(String(), nullable=False, name="class")
    aoc: Mapped[str] = mapped_column(String(), nullable=False)
    path: Mapped[str] = mapped_column(String(), nullable=True)
    commit: Mapped[str] = mapped_column(String(), nullable=True)
    # NOTE(review): annotated as int but declared as a String column — confirm which one
    # matches the dataset before relying on numeric comparisons.
    loc: Mapped[int] = mapped_column(String(), nullable=True)

class AoCReport(Base):
    """AoC report row in the CK output database, stored in the "aoc_reports" table.

    Carries the same report fields as AOSPAoCReport but references the project
    by name instead of by numeric id.
    """
    __tablename__ = "aoc_reports"
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    project_name: Mapped[str] = mapped_column(ForeignKey("projects.name"))
    project: Mapped["Project"] = relationship(back_populates="aoc_reports")
    line: Mapped[int] = mapped_column(Integer())
    snippet: Mapped[str] = mapped_column(String(), nullable=False)
    class_name: Mapped[str] = mapped_column(String(), nullable=True)
    aoc: Mapped[str] = mapped_column(String(), nullable=False)
    path: Mapped[str] = mapped_column(String(), nullable=True)
    commit: Mapped[str] = mapped_column(String(), nullable=True)
    # NOTE(review): annotated as int but declared as a String column — confirm which one
    # is intended.
    loc: Mapped[int] = mapped_column(String(), nullable=True)

In [10]:
# SQLite database that receives the CK results (the tables declared on `Base`).
db = create_engine("sqlite+pysqlite:///aosp_ck_output.db")
# SQLite database with the AoC dataset, queried through the `AOSPBase` models.
aosp_aoc_db = create_engine("sqlite+pysqlite:///aosp_dataset.db")

In [11]:

# Create the tables declared on `Base` in the CK output database.
Base.metadata.create_all(db)

In [12]:
def query_class(class_name, file_name) -> Klass:
    """Look up the stored class with the given name declared in the given file.

    An exact match on name and file path is tried first. Only when that finds
    nothing does the lookup fall back to the looser LIKE match
    (``%class_name%`` on the name, `file_name` as a LIKE pattern on the path).
    The fallback alone can pick the wrong row when one class name contains
    another, e.g. an outer class and one of its inner classes in the same file.

    Args:
        class_name: Class name to look for.
        file_name: Path of the file that declares the class.

    Returns:
        The first matching ``Klass`` (detached from its session once this
        function returns), or None when nothing matches.
    """
    with Session(db) as session:
        exact_stmt = select(Klass).where(
            (Klass.name == class_name) & (Klass.file_path == file_name)
        )
        exact_match = session.scalar(exact_stmt)
        if exact_match is not None:
            return exact_match
        # Fallback keeps the original, looser matching rule.
        stmt = select(Klass).where(Klass.name.like(f"%{class_name}%") & Klass.file_path.like(file_name))
        return session.scalar(stmt)

#### Query from AOC dataset

In [13]:
def get_reports_by_project(project: AOSPProject) -> List[AOSPAoCReport]:
    """Return every AoC report of the dataset that belongs to `project`.

    The rows are materialised into a list while the session is still open, so
    the return value matches the annotation and can be iterated more than once.

    Args:
        project: Dataset project whose ``id`` selects the reports.

    Returns:
        A (possibly empty) list of ``AOSPAoCReport`` objects, detached from the
        session once this function returns.
    """
    stmt = select(AOSPAoCReport).where(AOSPAoCReport.project_id == project.id)
    with Session(aosp_aoc_db) as session:
        return list(session.scalars(stmt).all())
def get_aosp_project_by_name(name: str) -> AOSPProject:
    """Return the first dataset project whose name matches `name`, or None.

    `name` is used as a SQL LIKE pattern, so ``%`` and ``_`` inside it act as
    wildcards.
    """
    lookup = select(AOSPProject).where(AOSPProject.name.like(name)).limit(1)
    with Session(aosp_aoc_db) as session:
        return session.scalars(lookup).first()

In [14]:
def create_classes_sqlalchemy(df: pd.DataFrame) -> List[Klass]:
    """Build one unsaved ``Klass`` per row of a class-metrics frame.

    Args:
        df: Frame with the columns "file", "class", "type", "lcom*" and one
            column per metric attribute of ``Klass``.

    Returns:
        The ``Klass`` objects, in row order. Nothing is written to the database.

    Raises:
        KeyError: If an expected column is missing from `df`.
    """
    # Klass attribute -> column, for the columns whose header differs from the attribute.
    renamed = {
        "file_path": "file",
        "name": "class",
        "type_": "type",
        "lcom_normalized": "lcom*",
    }
    # Columns copied to the attribute of the same name.
    same_named = (
        "cbo", "cboModified", "fanin", "fanout", "wmc", "dit", "noc", "rfc",
        "lcom", "tcc", "lcc",
        "totalMethodsQty", "staticMethodsQty", "publicMethodsQty",
        "privateMethodsQty", "protectedMethodsQty", "defaultMethodsQty",
        "visibleMethodsQty", "abstractMethodsQty", "finalMethodsQty",
        "synchronizedMethodsQty",
        "totalFieldsQty", "staticFieldsQty", "publicFieldsQty",
        "privateFieldsQty", "protectedFieldsQty", "defaultFieldsQty",
        "finalFieldsQty", "synchronizedFieldsQty",
        "nosi", "loc", "returnQty", "loopQty", "comparisonsQty", "tryCatchQty",
        "parenthesizedExpsQty", "stringLiteralsQty", "numbersQty",
        "assignmentsQty", "mathOperationsQty", "variablesQty",
        "maxNestedBlocksQty", "anonymousClassesQty", "innerClassesQty",
        "lambdasQty", "uniqueWordsQty", "modifiers", "logStatementsQty",
    )
    column_of = dict(renamed, **{metric: metric for metric in same_named})
    return [
        Klass(**{attribute: row[column] for attribute, column in column_of.items()})
        for _, row in df.iterrows()
    ]
def create_methods_sqlalchemy(df: pd.DataFrame, file_path: str) -> List[Method]:
    """Build one unsaved ``Method`` per row of a method-metrics frame.

    The owning class of each row is resolved with ``query_class``. Lookups are
    memoised per (class, file) pair, so a class with many methods costs one
    database query instead of one per method.

    Args:
        df: Frame with the columns "file", "class", "method" and one column per
            metric attribute of ``Method``.
        file_path: Unused; kept so existing callers keep working.

    Returns:
        The ``Method`` objects, in row order. Nothing is written to the database.

    Raises:
        KeyError: If an expected column is missing from `df`.
        AttributeError: If ``query_class`` finds no class for a row (it then
            returns None, which has no ``id``).
    """
    # Columns copied to the Method attribute of the same name.
    same_named = (
        "constructor", "line", "cbo", "cboModified", "fanin", "fanout", "wmc",
        "rfc", "loc", "returnsQty", "variablesQty", "parametersQty",
        "methodsInvokedQty", "methodsInvokedLocalQty",
        "methodsInvokedIndirectLocalQty", "loopQty", "comparisonsQty",
        "tryCatchQty", "parenthesizedExpsQty", "stringLiteralsQty", "numbersQty",
        "assignmentsQty", "mathOperationsQty", "maxNestedBlocksQty",
        "anonymousClassesQty", "innerClassesQty", "lambdasQty", "uniqueWordsQty",
        "modifiers", "logStatementsQty", "hasJavaDoc",
    )
    class_ids = {}  # (class name, file) -> Klass.id, filled on first use
    methods = []
    for _, row in df.iterrows():
        key = (row["class"], row["file"])
        if key not in class_ids:
            class_: Klass = query_class(row["class"], row["file"])
            class_ids[key] = class_.id
        m = Method(
            file_path=row["file"],
            class_id=class_ids[key],
            name=row["method"],
            **{metric: row[metric] for metric in same_named},
        )
        methods.append(m)
    return methods

##### Persist Projects

In [15]:
# Store a Project, its classes and its methods for every git directory whose CK run succeeded.
git_dirs_norm = load_txt("./git_dirs_NORM.txt")
# The failure list is only written when a CK run fails, so the file may not exist at all.
failed_git_dirs = (
    load_txt("./checkpoints_ck_failed.txt")
    if os.path.exists("./checkpoints_ck_failed.txt")
    else []
)
_failed_lookup = set(failed_git_dirs)  # O(1) membership tests
files_to_run = (f for f in git_dirs_norm if f not in _failed_lookup)
logger.info("Start persisting on sqlite")
for path in files_to_run:
    logger.info(f"Processing {path}")
    report_path = gen_output_folder(path)
    aosp_file_path = gen_aosp_folder(path)
    project_name = path.split(AOSP_ROOT_MATCHER)[-1]
    df_class = pd.read_csv(report_path + "/class.csv")
    df_method = pd.read_csv(report_path + "/method.csv")
    if (df_class.empty and df_method.empty):
        logger.info(f"project in path {path} does not have java files. Skipping.")
        continue
    proj = Project(name=project_name)
    # Persist Classes & Methods
    classes = create_classes_sqlalchemy(df_class)
    logger.info("Saving class and project to database")
    with Session(db) as session:
        session.add(proj)
        session.add_all(classes)
        session.commit()
    # Classes must be committed first: create_methods_sqlalchemy reads their ids back
    # from the database through query_class.
    methods = create_methods_sqlalchemy(df_method, aosp_file_path)
    logger.info("Saving methods to database")
    with Session(db) as session:
        session.add_all(methods)
        session.commit()
logger.info("Done")

##### Persist files

In [30]:
# Store one File row (path, line count, project) for every .java file of every listed
# git directory. The CK failure list is deliberately not consulted here.
git_dirs_norm = load_txt("./git_dirs_NORM.txt")
logger.info('Start persisting "FILES" on sqlite')
for path in git_dirs_norm:
    aosp_file_path = gen_aosp_folder(path)
    project_name = path.split(AOSP_ROOT_MATCHER)[-1]
    logger.info(f"\tProject: {project_name}")
    logger.info(f"\tRoot path: {aosp_file_path}")
    file_entries = []
    for curpath, _, files in os.walk(aosp_file_path):
        logger.info(f"Adding java files on {curpath}")
        for java_file in files:
            if not java_file.endswith(".java"):
                continue
            # Build the path once; it is both the primary key and the file that is measured.
            java_path = f"{curpath}/{java_file}"
            file_entries.append(
                File(
                    path=java_path,
                    loc=get_loc(java_path),
                    project_name=project_name,
                )
            )
    logger.info("Saving to database...")
    try:
        with Session(db) as session:
            session.add_all(file_entries)
            session.commit()
    except IntegrityError as integrity_error:
        # Usually means the files are already in the database. The commit failed, so
        # nothing from this project's batch was stored; record it rather than pass silently.
        logger.warning(
            f"Files of project {project_name} were not saved (integrity error, "
            f"probably already in the database): {integrity_error}"
        )


### Merge datasets

In [31]:
# Copy the AoC reports of every listed project from the AoC dataset into the CK output
# database, re-keyed by project name.
logger.info("Start merging")
git_dirs_norm = load_txt("./git_dirs_NORM.txt")
for path in git_dirs_norm:
    logger.info(f"On path {path}")
    project_name = path.rpartition(AOSP_ROOT_MATCHER)[2]
    aosp_project: AOSPProject = get_aosp_project_by_name(project_name)
    if not aosp_project:
        # No project of that name in the dataset, hence no reports to copy.
        logger.error(f"No AOCS found for project {project_name}, skipping.")
        continue
    aocs: List[AOSPAoCReport] = get_reports_by_project(aosp_project)
    aocs_transformed = []
    for aoc in aocs:
        report = AoCReport(
            project_name=project_name,
            line=aoc.line,
            snippet=aoc.snippet,
            class_name=aoc.class_,
            aoc=aoc.aoc,
            path=aoc.path,
            commit=aoc.commit,
            loc=aoc.loc,
        )
        aocs_transformed.append(report)
    with Session(db) as session:
        logger.info("Saving to database...")
        session.add_all(aocs_transformed)
        session.commit()
