In [1]:
import pandas as pd
import enum
import os
from typing import List
from typing import Optional
from sqlalchemy import create_engine, select
from sqlalchemy import ForeignKey
from sqlalchemy import String, Integer
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped
from sqlalchemy.orm import Session
from sqlalchemy.orm import mapped_column
from sqlalchemy.orm import relationship
import javalang
import logging

In [2]:
# Send every record (DEBUG and above) to a log file, stamped with a
# millisecond-resolution time plus the emitting module and function.
logging.basicConfig(
    level=logging.DEBUG,
    filename="HISTORYlistener.log",
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
)

# Root logger used by the helper functions in this notebook. Its level is
# also set explicitly, because basicConfig() does nothing when the root
# logger already has handlers installed.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [3]:
def get_project_name(csv_report_path: str) -> str:
    """Derive a project name from the path of a report CSV file.

    Everything from the first ``.csv`` onwards is discarded, and then only
    the text following the last ``bohr_report_aosp`` marker is kept. When a
    marker is absent, the corresponding trimming step leaves the text as is.

    Args:
        csv_report_path: Path (or bare file name) of the report.

    Returns:
        The remaining part of the path after both trimming steps.
    """
    before_extension, _, _ = csv_report_path.partition(".csv")
    _, _, after_marker = before_extension.rpartition("bohr_report_aosp")
    return after_marker

## Build Models

In [4]:
class Base(DeclarativeBase):
    """Declarative base class shared by every ORM model in this notebook."""


class AOSPProject(Base):
    """A project row in the ``aosp_projects`` table.

    Each project owns zero or more ``AOSPAoCReport`` rows, reachable through
    ``aoc_reports``.
    """

    __tablename__ = "aosp_projects"

    # Surrogate primary key generated by the database.
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    name: Mapped[str] = mapped_column(String(), nullable=False)
    # Nullable column, so the Python-side type is Optional[str].
    description: Mapped[Optional[str]] = mapped_column(String(), nullable=True)
    package: Mapped[str] = mapped_column(String(), nullable=False)
    category: Mapped[str] = mapped_column(String(), nullable=False)
    # Reverse side of AOSPAoCReport.project.
    aoc_reports: Mapped[List["AOSPAoCReport"]] = relationship(back_populates="project")


class AOSPAoCReport(Base):
    """A single reported AoC occurrence, stored in ``aosp_aoc_reports``.

    Every row belongs to exactly one ``AOSPProject`` through ``project_id``.
    """

    __tablename__ = "aosp_aoc_reports"

    # Surrogate primary key generated by the database.
    id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
    project_id: Mapped[int] = mapped_column(ForeignKey("aosp_projects.id"))
    # Reverse side of AOSPProject.aoc_reports.
    project: Mapped["AOSPProject"] = relationship(back_populates="aoc_reports")
    line: Mapped[int] = mapped_column(Integer())
    snippet: Mapped[str] = mapped_column(String(), nullable=False)
    # "class" is a Python keyword, so the attribute is class_ while the
    # database column keeps the name "class".
    class_: Mapped[str] = mapped_column(String(), nullable=False, name="class")
    aoc: Mapped[str] = mapped_column(String(), nullable=False)
    # The remaining columns are nullable, hence the Optional annotations.
    path: Mapped[Optional[str]] = mapped_column(String(), nullable=True)
    commit: Mapped[Optional[str]] = mapped_column(String(), nullable=True)
    # Integer column to match the Mapped[int] annotation (it was declared as
    # String()). NOTE(review): a table created before this change keeps its
    # old column type until it is recreated.
    loc: Mapped[Optional[int]] = mapped_column(Integer(), nullable=True)

## Get packages from AOC

In [5]:
def get_repo_name_from_report_name(csv_report_path: str) -> str:
    """Return the project name recorded in a report CSV file.

    All rows of a report carry the same ``project_name``, so only the first
    data row is parsed instead of loading the whole file.

    Args:
        csv_report_path: Path of the report CSV file.

    Returns:
        The ``project_name`` value of the first data row.

    Raises:
        KeyError: If the file has no data rows or no ``project_name`` column.
    """
    first_row = pd.read_csv(csv_report_path, nrows=1)
    # .at is the direct scalar accessor; it avoids the chained .loc[0][...].
    return first_row.at[0, "project_name"]
def get_aocs_repos_list(reports_path: str) -> List[str]:
    """List the CSV report files located directly inside a directory.

    Sub-directories are not descended into, and directory entries are never
    returned even when their name ends with ``.csv``.

    Args:
        reports_path: Directory to scan.

    Returns:
        File names (not full paths) ending in ``.csv``, sorted so that the
        result does not depend on the file system's listing order.

    Raises:
        FileNotFoundError: If ``reports_path`` does not exist.
        NotADirectoryError: If ``reports_path`` is not a directory.
    """
    # os.scandir reports a missing directory as FileNotFoundError, instead of
    # the bare StopIteration that next(os.walk(...)) would leak.
    with os.scandir(reports_path) as entries:
        return sorted(
            entry.name
            for entry in entries
            if entry.name.endswith(".csv") and not entry.is_dir()
        )
def get_files_with_aoc(report_path: str) -> List[str]:
    """Return the distinct values of a report's ``filename`` column.

    Args:
        report_path: Path of the report CSV file.

    Returns:
        The ``filename`` values with repeats removed, in order of first
        appearance.
    """
    report = pd.read_csv(report_path)
    distinct_filenames = report["filename"].drop_duplicates()
    return distinct_filenames.to_list()
def get_package_from_java_file(java_file_path: str) -> str:
    """Return the package name declared in a Java source file.

    This is a best-effort lookup: if the file cannot be read or parsed, or
    no package name can be taken from the parse result, the failure is
    logged and an empty string is returned.

    Args:
        java_file_path: Path of the Java source file.

    Returns:
        The package name, or ``""`` when it could not be determined.

    Raises:
        OSError: If the file cannot be opened (opening happens outside the
            best-effort section).
    """
    with open(java_file_path, "r") as java_file:
        try:
            tree = javalang.parse.parse(java_file.read())
            package_name = tree.package.name
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit still propagate; exc_info records the actual cause.
            logger.error(
                f"Could not get package name from {java_file_path}",
                exc_info=True,
            )
            package_name = ""
    return package_name

### Create connection engine

In [6]:
# SQLite database file addressed by a relative path, opened through the
# pysqlite (standard-library sqlite3) driver.
DATABASE_URL = "sqlite+pysqlite:///aosp_dataset.db"
engine = create_engine(DATABASE_URL)

# Create the tables of every model registered on Base.
Base.metadata.create_all(engine)

### Persist projects

In [7]:
# Directory that holds the summary CSV reports (presumably one per AOSP
# repository — confirm against how the reports are generated).
REPORTS_SUMMARY_PATH = "./reports/summary/"
# File names (not full paths) of the CSV reports found directly in that directory.
list_repos_reports = get_aocs_repos_list(REPORTS_SUMMARY_PATH)

In [9]:

# Build one AOSPProject per report file and store the new ones.
# Projects whose name is already in the database are skipped, so re-running
# this cell does not insert duplicate rows.
projects: List[AOSPProject] = []
with Session(engine) as session:
    known_names = set(session.scalars(select(AOSPProject.name)))
    for report_file in list_repos_reports:
        name = get_repo_name_from_report_name(
            os.path.join(REPORTS_SUMMARY_PATH, report_file)
        )
        if name in known_names:
            continue
        known_names.add(name)
        description = f"The {name} repository in the Android Open Source Project (AOSP)"
        projects.append(
            AOSPProject(
                name=name,
                description=description,
                package=name,  # a repo can have multiple root packages...
                category="AOSP",  # all AOSP repos will have the same AOSP category.
            )
        )
    session.add_all(projects)
    session.commit()


### Persist AOCs

In [None]:
def query_project(project_name: str) -> Optional["AOSPProject"]:
    """Look up a project by its exact name.

    Args:
        project_name: Name to search for in ``aosp_projects.name``.

    Returns:
        The first matching ``AOSPProject``, or ``None`` when there is none.
        The session is closed before returning, so the instance is detached:
        attributes that were already loaded (such as ``id``) stay readable.
    """
    with Session(engine) as session:
        # Equality instead of LIKE: with LIKE, "_" and "%" inside the name
        # act as wildcards and could match a different project.
        stmt = select(AOSPProject).where(AOSPProject.name == project_name)
        return session.scalar(stmt)

In [None]:
# Store the AoC rows of every report, one transaction per report file.
# NOTE(review): re-running this cell inserts the same rows again.
for repo in list_repos_reports:
    df_aocs = pd.read_csv(os.path.join(REPORTS_SUMMARY_PATH, repo))

    # Resolve each distinct project name once, instead of issuing one query
    # per CSV row.
    project_ids = {}
    for project_name in df_aocs["project_name"].unique():
        project = query_project(project_name)
        if project is None:
            raise LookupError(
                f"Project {project_name!r} referenced by report {repo!r} "
                "is not in the database; persist the projects first."
            )
        project_ids[project_name] = project.id

    aocs = [
        AOSPAoCReport(
            project_id=project_ids[row["project_name"]],
            line=row["Line"],
            snippet=row["Snippet"],
            class_=row["Class"],
            aoc=row["Atom"],
            path=row["filename"],
            commit=row["commit"],
            loc=row["loc"],
        )
        for _, row in df_aocs.iterrows()
    ]
    with Session(engine) as session:
        session.add_all(aocs)
        session.commit()
