In [None]:
import os
import sqlite3
import warnings
from typing import List, Tuple, Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Connect to the dataset

In [None]:
# Shared SQLite connection used by every query helper in this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
db = sqlite3.connect("aosp_acs.db")

Queries

In [None]:
def cursor_to_dataframe(cursor):
    """Materialise every remaining row of an executed cursor into a DataFrame.

    Args:
        cursor: An executed cursor (e.g. the result of ``db.execute(...)``) whose
            ``description`` lists the result columns and which yields one row
            per iteration.

    Returns:
        A DataFrame with one column per entry of ``cursor.description`` (in that
        order), one row per fetched record and a fresh 0..n-1 index. When the
        query produced no rows the frame is empty but still carries the columns.
        Column dtypes are inferred by pandas from the fetched values.
    """
    columns = [desc[0] for desc in cursor.description]
    # Build the frame in a single pass. Concatenating one-row frames inside a
    # loop re-copied the accumulated data on every iteration (quadratic) and
    # needed a FutureWarning filter because of the empty seed frame.
    return pd.DataFrame(list(cursor), columns=columns)

In [None]:
def acs_in_a_project(project_name) -> pd.DataFrame:
    """Count AC reports per (file, AC type) for the files of one project.

    Returns a DataFrame with the columns ``ac_count``, ``path``, ``ac`` and
    ``loc``. Each row is one ``path``/``ac`` group of ``ac_reports``, restricted
    to paths that ``files`` lists under ``project_name``.
    """
    query = "SELECT count(id) as ac_count,path,ac,loc FROM ac_reports WHERE path IN (SELECT path FROM files WHERE files.project_name = ?) GROUP BY path,ac"
    params = (project_name,)
    return cursor_to_dataframe(db.execute(query, params))

def ck_in_a_project(project_name) -> pd.DataFrame:
    """Fetch the ``classes`` rows of one project with metric columns cast to float.

    Rows are selected from ``classes`` where ``file_path`` is one of the paths
    that ``files`` lists under ``project_name``. Every column named in
    ``metric_columns`` is converted to ``float``; other columns are left as
    returned by ``cursor_to_dataframe``.
    """
    metric_columns = (
        'cbo', 'cboModified', 'fanin', 'fanout', 'wmc', 'dit', 'noc', 'rfc',
        'lcom', 'lcom_normalized', 'tcc', 'lcc',
        'totalMethodsQty', 'staticMethodsQty', 'publicMethodsQty',
        'privateMethodsQty', 'protectedMethodsQty', 'defaultMethodsQty',
        'visibleMethodsQty', 'abstractMethodsQty', 'finalMethodsQty',
        'synchronizedMethodsQty',
        'totalFieldsQty', 'staticFieldsQty', 'publicFieldsQty',
        'privateFieldsQty', 'protectedFieldsQty', 'defaultFieldsQty',
        'finalFieldsQty', 'synchronizedFieldsQty',
        'nosi', 'loc', 'returnQty', 'loopQty', 'comparisonsQty', 'tryCatchQty',
        'parenthesizedExpsQty', 'stringLiteralsQty', 'numbersQty',
        'assignmentsQty', 'mathOperationsQty', 'variablesQty',
        'maxNestedBlocksQty', 'anonymousClassesQty', 'innerClassesQty',
        'lambdasQty', 'uniqueWordsQty', 'modifiers', 'logStatementsQty',
    )
    rows = db.execute(
        "SELECT * from classes WHERE file_path IN (SELECT path FROM files WHERE files.project_name = ?)",
        (project_name,),
    )
    classes = cursor_to_dataframe(rows)
    # A single dict-based cast converts all metric columns at once.
    return classes.astype({column: float for column in metric_columns})


"""Prevalence report of AC in a project"""
def report_types_of_aocs(project_name) -> pd.DataFrame:
    """Prevalence report of AC in a project.

    Args:
        project_name: Project to report on. It is matched with SQL ``LIKE``, so
            ``%`` and ``_`` in the value act as wildcards.

    Returns:
        A DataFrame with the columns ``ac`` and ``qty`` (number of reports of
        that AC type), ordered by ``qty`` descending.
    """
    # Bind the value instead of interpolating it into the SQL text: the f-string
    # version broke on names containing a quote and allowed SQL injection.
    sql = "SELECT ac, count(id) as qty FROM ac_reports WHERE project_name LIKE ? GROUP BY ac ORDER BY qty DESC"
    cursor = db.execute(sql, (project_name,))
    return cursor_to_dataframe(cursor)

def files_in_a_project(project_name) -> pd.DataFrame:
    """Return every row of ``files`` whose ``project_name`` equals the argument."""
    query = "SELECT * FROM files WHERE project_name = ?"
    matching_files = db.execute(query, (project_name,))
    return cursor_to_dataframe(matching_files)

def loc_of_file(file_name) -> int:
    """Look up the ``loc`` value stored in ``files`` for the given path.

    Only the first matching row is read. ``StopIteration`` propagates when no
    row matches ``file_name``.
    """
    rows = db.execute("SELECT loc FROM files WHERE path = ?", (file_name,))
    first_row = next(rows)
    loc_value = first_row[0]
    return int(loc_value)


Get the list of core apps

In [None]:
# Collect the distinct project names of all files stored under /AOSP/packages/apps.
cursor = db.execute("SELECT * FROM files WHERE path LIKE '/AOSP/packages/apps%';")
apps_set = set()
# The unpacking relies on `files` rows having exactly three columns, in the
# order (path, loc, project_name).
# NOTE(review): the loop variables stay bound at notebook scope after the loop.
# A later cell indexes apps_data[project_name] without assigning project_name
# itself, so it appears to rely on the value left behind here — confirm before
# refactoring this loop.
for path,loc, project_name in cursor:
    apps_set.add(project_name)

In [None]:
# Load the CK metrics, AC counts and file listing of every core app up front,
# keyed by project name.
apps_data: Dict[str, Dict[str, pd.DataFrame]] = {}
for project in apps_set:
    ck_frame = ck_in_a_project(project)
    acs_frame = acs_in_a_project(project)
    files_frame = files_in_a_project(project)
    apps_data[project] = {"ck": ck_frame, "acs": acs_frame, "files": files_frame}

print("Projects processed: ", len(apps_data))

Summary statistics of the CK metrics: mean, median, mode and 90th percentile

In [None]:
def process_ck(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every column of ``df`` into a single-row table of statistics.

    For each column the null entries are dropped and four values are computed:
    ``<col>__mean``, ``<col>__median``, ``<col>__mode`` and ``<col>__90_perc``.
    When a column has several modes the first one returned by ``Series.mode``
    is reported; a column with no non-null values gets NaN as its mode.

    Columns for which any statistic cannot be computed (e.g. text columns) are
    reported on stdout and left out entirely.

    Args:
        df: Frame whose columns should be summarised.

    Returns:
        A one-row DataFrame holding the statistics of all summarised columns, or
        an empty DataFrame when no column could be summarised.
    """
    stats = {}
    for col in df.columns:
        nonnull = df[col].dropna()
        try:
            modes = nonnull.mode(dropna=True)
            # Compute all four values before storing any of them, so a column
            # that fails part-way contributes nothing.
            col_stats = {
                f"{col}__mean": nonnull.mean(skipna=True),
                f"{col}__median": nonnull.median(skipna=True),
                f"{col}__mode": modes.iloc[0] if not modes.empty else float("nan"),
                f"{col}__90_perc": nonnull.quantile(.90),
            }
        except Exception as e:
            print("Exception: ", e)
            print(f"Skipping column {col} of type {nonnull.dtype}")
            continue
        stats.update(col_stats)
    # Assigning scalars column-by-column to an empty DataFrame creates zero-row
    # columns, and a later Series assignment then resizes the frame and turns
    # the earlier values into NaN. Building the row once avoids that.
    return pd.DataFrame([stats]) if stats else pd.DataFrame()

In [None]:
# Total number of "Logic as Control Flow" ACs in one project.
# NOTE: `project_name` is not assigned in this cell; it must already be bound
# by an earlier cell when this runs.
acs_of_project = apps_data[project_name]["acs"]
acs_of_project.loc[acs_of_project["ac"] == "Logic as Control Flow", "ac_count"].sum()

In [None]:
def countacs(project_name: str) -> pd.DataFrame:
    """Build a one-row summary of code size versus AC count for a project.

    Reads the preloaded frames in ``apps_data[project_name]`` and returns a
    DataFrame with the columns ``loc_sum`` (sum of ``loc`` over the project's
    files), ``ac_count`` (sum of ``ac_count`` over its AC rows) and
    ``loc_by_acs`` (``loc_sum / ac_count``, or ``-1`` when ``ac_count`` is not
    positive).
    """
    project_frames = apps_data[project_name]
    total_loc = project_frames["files"]["loc"].sum()
    total_acs = project_frames["acs"]["ac_count"].sum()
    if total_acs > 0:
        loc_per_ac = total_loc / total_acs
    else:
        loc_per_ac = -1
    summary_row = (total_loc, total_acs, loc_per_ac)
    return pd.DataFrame([summary_row], columns=["loc_sum", "ac_count", "loc_by_acs"])



# One summary row per project, stacked into a single table.
# The per-project frames are gathered in a list and concatenated once;
# concatenating inside the loop re-copies the accumulated table on every
# iteration.
apps_ac_frames = []
for project_name in apps_set:
    apps_ac_map = countacs(project_name).assign(project_name=[project_name])
    apps_ac_frames.append(apps_ac_map)
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df_apps_ac: pd.DataFrame = (
    pd.concat(apps_ac_frames, ignore_index=True) if apps_ac_frames else pd.DataFrame()
)


In [None]:
# Show the per-project summary table (loc_sum, ac_count, loc_by_acs, project_name).
df_apps_ac

In [None]:
# Bar chart of lines of code per AC, one bar per project, smallest ratio first.
df_loc_by_acs = df_apps_ac.sort_values("loc_by_acs", ascending=True)

fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(df_loc_by_acs["project_name"], df_loc_by_acs["loc_by_acs"], width=0.9)
ax.set_ylabel("Lines of code per AC")
ax.set_xlabel("Project Name")

# Project names are long, so draw the tick labels vertically.
plt.xticks(rotation=90)
plt.show()

In [None]:
def plot_ac_report_bar(project_name):
    """Save a donut chart of the AC-type shares of one project.

    The chart is written to ``imgs/prevalence_<project_name>.png`` (with ``/``
    in the project name replaced by ``_``); the ``imgs`` directory is created
    when missing. Nothing is displayed or returned, and the figure is closed
    even when rendering fails.

    Args:
        project_name: Project passed to ``report_types_of_aocs``; also used in
            the chart title and the output file name.
    """
    df_report = report_types_of_aocs(project_name)
    # Share of each AC type rounded to two decimals; the label is "<ac> <share>".
    df_report = df_report.assign(perc=(df_report["qty"] / df_report["qty"].sum()).astype(float).round(2))
    df_report = df_report.assign(label=df_report["ac"] + " " + df_report["perc"].astype(str))

    fig, ax = plt.subplots(figsize=(12, 5))
    try:
        wedges, _ = ax.pie(df_report["perc"], wedgeprops=dict(width=0.5), startangle=0)

        # Callout styling adapted from
        # https://matplotlib.org/stable/gallery/pie_and_polar_charts/pie_and_donut_labels.html
        bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
        kw = dict(arrowprops=dict(arrowstyle="-"),
                  bbox=bbox_props, zorder=0, va="center")
        epsilon = .00001  # nudges the arrow anchor slightly off the unit circle

        # Pair wedges and labels by position so the loop does not depend on the
        # index labels of df_report.
        for wedge, label in zip(wedges, df_report["label"]):
            ang = (wedge.theta2 - wedge.theta1)/2. + wedge.theta1
            if ang == 180:
                # presumably avoids a degenerate connector at exactly 180° — TODO confirm
                ang = 179
            y = np.sin(np.deg2rad(ang))
            x = np.cos(np.deg2rad(ang))
            # A dict lookup on sign(x) raised KeyError for x == 0; treat that
            # case like the right-hand side.
            horizontalalignment = "right" if x < 0 else "left"
            connectionstyle = f"angle,angleA=0,angleB={ang}"
            kw["arrowprops"].update({"connectionstyle": connectionstyle})
            ax.annotate(label, xy=(x + epsilon, y + epsilon), xytext=(1.35*np.sign(x), 1.4*y),
                        horizontalalignment=horizontalalignment, **kw)
        ax.set_title(f"Prevalence of {project_name}")

        # savefig does not create missing directories.
        os.makedirs("imgs", exist_ok=True)
        fig.savefig(f"imgs/prevalence_{project_name.replace('/', '_')}.png")
    finally:
        # Close this specific figure even on failure so a batch run that skips
        # broken projects does not accumulate open figures.
        plt.close(fig)



In [None]:
for project_name in apps_set:
    try:
        plot_ac_report_bar(project_name)
    except Exception as exc:
        # Keep going so one bad project does not abort the batch, but say why it
        # failed. Catching Exception instead of using a bare except lets
        # KeyboardInterrupt stop the loop.
        print(f"Failed to render graph for {project_name}: {exc!r}")

### CellBroadcastReceiver analysis

In [None]:
CELLBROADCAST_PROJECT = "packages/apps/CellBroadcastReceiver"

# Per-file CK metrics: classes that share a file are averaged. `path` mirrors
# `file_path` so the frame can be joined with the other tables, and `loc` is
# dropped because the value from the `files` table is used instead.
df_ck_cellbroadcastreceiver = (
    ck_in_a_project(CELLBROADCAST_PROJECT)
    .groupby("file_path")
    .mean(numeric_only=True)
    .reset_index()
    .assign(
        file_path=lambda ck: ck["file_path"].astype("str"),
        path=lambda ck: ck["file_path"],
    )
    .drop(columns=["loc"])
)

# Built with assign() rather than by writing into a column of a sliced frame.
df_files_cellbroadcastreceiver = (
    files_in_a_project(CELLBROADCAST_PROJECT)[["path", "loc"]]
    .assign(path=lambda files: files["path"].astype(str))
)

# acs_in_a_project returns one row per (path, ac). Sum the counts per file
# before joining; merging the raw rows repeated every file once per AC type and
# correlated per-type counts instead of the file's total.
df_ac_cellbroadcastreceiver = (
    acs_in_a_project(CELLBROADCAST_PROJECT)
    .astype({"ac_count": float})
    .groupby("path", as_index=False)["ac_count"]
    .sum()
)

# Keep only files that have complete CK metrics, then attach the AC totals;
# files without any AC report get a count of 0.
pd_merged = (
    pd.merge(df_files_cellbroadcastreceiver, df_ck_cellbroadcastreceiver, how="left", on="path")
    .dropna()
    .merge(df_ac_cellbroadcastreceiver, how="left", on="path")
    .assign(ac_count=lambda merged: merged["ac_count"].astype(float).fillna(0).astype(int))
    .astype({"loc": int, "wmc": int})
    .drop(columns=["file_path"])
)
pd_merged


In [None]:
# Pearson correlation of every numeric column with ac_count, strongest first.
(
    pd_merged
    .corr(method="pearson", numeric_only=True)["ac_count"]
    .sort_values(ascending=False)
)