In [16]:
import logging
import os
import pandas as pd
from enum import Enum
from pandas.errors import EmptyDataError
logger = logging.getLogger("ETL")
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.DEBUG, datefmt='%H:%M:%S')
logger.debug("ETL logging!")

15:16:18 DEBUG:sETL logging!


In [2]:
BOHR_REPORTS_PATH = "./bohr_reports_from_jupyter/"
BOHR_REPORTS_FOLDERS = next(os.walk(BOHR_REPORTS_PATH))[1]

In [3]:
def load_and_merge(path: str, file_list):
    folder_name = path.split("/")[-1]
    df_aggregate = pd.DataFrame()
    for file in file_list:
        try:
            file_to_read = f"{path}/{file}"
            df_load = pd.read_csv(file_to_read)
        except EmptyDataError:
            logger.error(f"Empty data error while reading {file_to_read}")
            continue
        if df_aggregate.empty:
            df_aggregate = df_load
        else:
            df_aggregate = pd.concat([df_aggregate, df_load], ignore_index=True)
    # filter for duplicates
    DUP_COLUMNS = ["Class", "Atom", "Line"]
    df_filtered = df_aggregate.drop_duplicates()
    filtered = df_aggregate.shape[0] - df_filtered.shape[0]
    logger.info(f"{folder_name}: Removed {filtered} lines.")
    return df_filtered


In [61]:
for folder in BOHR_REPORTS_FOLDERS:
    print(f"{BOHR_REPORTS_PATH}{folder}")
    path = f"{BOHR_REPORTS_PATH}{folder}"
    reports = next(os.walk(path))[2] # files only
    reports = [report for report in reports if report.endswith(".csv")] # only csvs
    df = load_and_merge(path, reports)
    df.to_csv(f"{path}/../{folder}_consolidated.csv", index=False)

./bohr_reports_from_jupyter/AmazeFileManager


15:42:35 INFO:sAmazeFileManager: Removed 781 lines.


./bohr_reports_from_jupyter/OsmAnd


15:42:41 INFO:sOsmAnd: Removed 12149 lines.


./bohr_reports_from_jupyter/pixel-dungeon


15:42:42 INFO:spixel-dungeon: Removed 1172 lines.


./bohr_reports_from_jupyter/AntennaPod


15:42:43 INFO:sAntennaPod: Removed 1074 lines.


./bohr_reports_from_jupyter/Infinity-For-Reddit


15:42:44 INFO:sInfinity-For-Reddit: Removed 1799 lines.


./bohr_reports_from_jupyter/NewPipe


15:42:44 INFO:sNewPipe: Removed 1122 lines.
15:42:44 ERROR:sEmpty data error while reading ./bohr_reports_from_jupyter/.ipynb_checkpoints/.ipynb_checkpoints_consolidated.csv
15:42:44 INFO:s.ipynb_checkpoints: Removed 0 lines.


./bohr_reports_from_jupyter/.ipynb_checkpoints
./bohr_reports_from_jupyter/android


15:42:45 INFO:sandroid: Removed 1114 lines.


./bohr_reports_from_jupyter/VirtualXposed


15:42:46 INFO:sVirtualXposed: Removed 633 lines.


./bohr_reports_from_jupyter/PocketHub


15:42:47 INFO:sPocketHub: Removed 123 lines.


./bohr_reports_from_jupyter/Aegis


15:42:47 INFO:sAegis: Removed 290 lines.


./bohr_reports_from_jupyter/NekoX


15:42:56 INFO:sNekoX: Removed 48955 lines.


./bohr_reports_from_jupyter/Mindustry


15:42:58 INFO:sMindustry: Removed 5945 lines.


./bohr_reports_from_jupyter/OpenHub


15:42:58 INFO:sOpenHub: Removed 552 lines.


./bohr_reports_from_jupyter/bitcoin-wallet


15:42:59 INFO:sbitcoin-wallet: Removed 397 lines.


./bohr_reports_from_jupyter/Launcher3


15:42:59 INFO:sLauncher3: Removed 1675 lines.


./bohr_reports_from_jupyter/termux-app


15:43:00 INFO:stermux-app: Removed 692 lines.
15:43:00 INFO:sXposedInstaller: Removed 92 lines.
15:43:00 INFO:sMifareClassicTool: Removed 192 lines.


./bohr_reports_from_jupyter/XposedInstaller
./bohr_reports_from_jupyter/MifareClassicTool
./bohr_reports_from_jupyter/zxing


15:43:01 INFO:szxing: Removed 1495 lines.


./bohr_reports_from_jupyter/AppManager


15:43:02 INFO:sAppManager: Removed 1927 lines.


In [38]:
class AoC(Enum):
    IOP = "Infix Operator Precedence"
    POSTINCDEC = "Post Increment Decrement"
    PREINCDEC = "Pre Increment Decrement"
    CO = "Conditional Operator"
    AL = "Arithmetic as Logic"
    LCF = "Logic as Control Flow"
    RV = "Repurposed Variables"
    CLE = "Change of Literal Encoding"
    OCB = "Omitted Curly Braces"
    TC = "Type Conversion"


class AoCItem:
    def __init__(
        self, path: str, class_: str, atom: str, snippet: str, line: int
    ) -> None:
        self.path = path
        self.class_ = class_
        self.atom = AoC(atom)
        self.snippet = snippet
        self.line = line

    def __eq__(self, value: object) -> bool:
        if not isinstance(value, AoCItem):
            return False
        rhs: AoCItem = value
        same_class = self.class_ == rhs.class_
        same_line = self.line == rhs.line
        same_atom = self.atom == rhs.atom

        return same_class and same_line and same_atom

    def __str__(self) -> str:
        return f"AoCItem.{self.atom}, {self.class_} @ {self.line}"

    def __repr__(self) -> str:
        return f"AoCItem({self.path}, {self.class_}, {self.atom}, {self.snippet}, {self.line})"
