Skip to content

Commit

Permalink
qc: add report module
Browse files Browse the repository at this point in the history
  • Loading branch information
abhidg committed Oct 17, 2023
1 parent a129b0a commit 6412927
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 70 deletions.
18 changes: 16 additions & 2 deletions adtl/qc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""
Quality Control module for ADTL
"""
import copy
import json
import functools
from typing import List, Union
from pathlib import Path
from typing import TypedDict, Dict, List, Any, Optional
from typing import List, Union, TypedDict, Any, Optional, Dict

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -35,6 +36,7 @@ class WorkUnitResult(TypedDict):
file: str
rows_success: int
rows_fail: int
rows: int
ratio_success: float
rows_fail_idx: List[int]
success: bool
Expand Down Expand Up @@ -81,6 +83,7 @@ def wrapper(df, **kwargs):
return dict(
rows_success=int(rows_success),
rows_fail=int(rows_fail),
rows=int(rows_success) + int(rows_fail),
ratio_success=ratio_success,
success=bool(ratio_success >= mostly),
mostly=mostly,
Expand All @@ -104,6 +107,17 @@ def schema(schema_path: Union[str, Path], pattern: str = "*.csv"):
pass

Check warning on line 107 in adtl/qc/__init__.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/__init__.py#L107

Added line #L107 was not covered by tests


def get_result_from_insertion(data: Dict[str, Any]) -> WorkUnitResult:
    """Rebuild an in-memory result from its stored (database row) form.

    The input is deep-copied first, so ``data`` itself is never modified.
    A non-empty ``fail_data`` value is parsed as JSON and loaded into a
    ``pandas.DataFrame``; a non-empty ``rows_fail_idx`` value is split on
    commas and every piece converted to ``int``.  Empty values for either
    key are returned unchanged.
    """
    restored: Dict[str, Any] = copy.deepcopy(data)  # type: ignore

    serialised_rows = restored["fail_data"]
    if serialised_rows:
        restored["fail_data"] = pd.DataFrame(json.loads(serialised_rows))

    serialised_idx = restored["rows_fail_idx"]
    if serialised_idx:
        # Go through float() so index text such as "3.0" is accepted as well.
        restored["rows_fail_idx"] = list(
            map(lambda piece: int(float(piece)), str(serialised_idx).split(","))
        )
    return restored

Check warning on line 118 in adtl/qc/__init__.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/__init__.py#L118

Added line #L118 was not covered by tests


def main(args=None):
from .runner import _main

Check warning on line 122 in adtl/qc/__init__.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/__init__.py#L122

Added line #L122 was not covered by tests

Expand Down
161 changes: 161 additions & 0 deletions adtl/qc/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
"""
Quality Control module for ADTL, report submodule
"""

import json
import sqlite3
from string import Template
from pathlib import Path
from typing import List, Any, Dict
from functools import partial

import pandas as pd

from . import get_result_from_insertion, WorkUnitResult, Rule

RULES_SUBFOLDER = "r"
DATASET_SUBFOLDER = "d"
TEMPLATES = Path(__file__).parent / "templates"
STYLE = TEMPLATES / "style.css"
INDEX = "index.html"


def render_result(result: Dict[str, Any], show_rule: bool = False) -> str:
    """Render one stored result as an HTML ``<li>`` element.

    Args:
        result: Result row.  Must provide ``rule``, ``dataset``, ``file``,
            ``rows_fail``, ``rows``, ``success`` and ``fail_data``.
            ``fail_data`` may be empty/``None``, a JSON string holding a
            list of records, or an already built ``pandas.DataFrame``.
        show_rule: When true, the entry starts with a link to the rule page.

    Returns:
        A single, properly closed ``<li>...</li>`` fragment.  Failing
        results show ``[rows_fail / rows]``; successful ones show a check
        mark.  When there are failed rows they are appended inside a
        collapsible ``<details>`` element.
    """
    import html  # local import keeps this block self-contained

    # Work on a shallow copy so the caller's dictionary is left untouched.
    fields = dict(result)
    fields["rule_str"] = (
        '<a href="../r/{0}.html">{0}</a>, '.format(fields["rule"])
        if show_rule
        else ""
    )

    if fields["success"] != 1:
        item = "<li><tt>[{rows_fail} / {rows}]</tt> {rule_str}{dataset} / {file}"
    else:
        # No closing tag here: it is appended exactly once below.
        item = "<li>✔ {rule_str}{dataset} / {file}"
    item = item.format(**fields)

    # A DataFrame has no unambiguous truth value, so normalise the stored
    # value first and test emptiness explicitly with ``.empty``.
    raw_fail_data = fields["fail_data"]
    if isinstance(raw_fail_data, pd.DataFrame):
        fail_data = raw_fail_data
    elif raw_fail_data:
        fail_data = pd.DataFrame(json.loads(raw_fail_data))
    else:
        fail_data = None

    if fail_data is None or fail_data.empty:
        return item + "</li>"

    # The failed rows come straight from data files, so escape them before
    # embedding the text in the page.
    return item + """
<details>
<summary>Failed rows</summary>
<pre>{log}</pre>
</details></li>""".format(
        log=html.escape(str(fail_data))
    )

Check warning on line 48 in adtl/qc/report.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/report.py#L47-L48

Added lines #L47 - L48 were not covered by tests


def render_results_by_rule(
    results: List[WorkUnitResult], rules: List[Rule]
) -> Dict[str, str]:
    """Render one HTML page per rule that appears in ``results``.

    Args:
        results: Result rows; each must carry the name of its rule under
            the ``rule`` key.
        rules: Rule records providing ``name``, ``description`` and
            ``long_description``.

    Returns:
        Mapping of rule name to the rendered ``rule.html`` page.  Each page
        lists only the results produced by that rule.  Keys follow the
        order in which rules first appear in ``results``.

    Raises:
        IndexError: If a result names a rule that is not in ``rules``.
    """
    if not results:
        return {}

    # First definition wins when the same rule name is listed twice.
    rules_by_name: Dict[str, Any] = {}
    for rule in rules:
        rules_by_name.setdefault(rule["name"], rule)

    # Group rendered entries by rule so a page shows only its own results.
    rendered_by_rule: Dict[str, List[str]] = {}
    for result in results:
        rendered_by_rule.setdefault(result["rule"], []).append(
            render_result(result)
        )

    # The template does not change between rules, so read it once.
    template = Template((TEMPLATES / "rule.html").read_text())

    out: Dict[str, str] = {}
    for rule_name, rendered in rendered_by_rule.items():
        if rule_name not in rules_by_name:
            raise IndexError(f"result refers to unknown rule {rule_name!r}")
        rule = rules_by_name[rule_name]
        long_description = (
            '<p class="long_description">\n' + rule["long_description"] + "</p>"
            if rule["long_description"]
            else ""
        )
        out[rule_name] = template.substitute(
            dict(
                name=rule["name"],
                description=rule["description"],
                long_description=long_description,
                results="\n".join(rendered),
            )
        )
    return out

Check warning on line 72 in adtl/qc/report.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/report.py#L72

Added line #L72 was not covered by tests


def render_results_by_dataset(
    results: List[WorkUnitResult], datasets: List[str]
) -> Dict[str, str]:
    """Render one HTML page per dataset name.

    For every name in ``datasets`` the results whose ``dataset`` field
    equals that name are rendered (each with a link to its rule) and
    substituted into the ``dataset.html`` template.  The returned mapping
    goes from dataset name to page content.
    """
    render_with_rule = partial(render_result, show_rule=True)
    template_path = TEMPLATES / "dataset.html"

    pages = {}
    for name in datasets:
        entries = [render_with_rule(r) for r in results if r["dataset"] == name]
        pages[name] = Template(template_path.read_text()).substitute(
            dataset=name, results="\n".join(entries)
        )
    return pages

Check warning on line 90 in adtl/qc/report.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/report.py#L90

Added line #L90 was not covered by tests


def render_index(rules: List[Dict[str, Any]], datasets: List[str]) -> str:
    """Render the report's landing page.

    Builds one list entry per dataset (linking to ``d/<dataset>.html``)
    and one per rule (linking to ``r/<name>.html``, labelled with the
    rule's description), then substitutes both lists into the
    ``index.html`` template.
    """
    dataset_links = [
        '<li><a href="d/{0}.html">{0}</a></li>'.format(name) for name in datasets
    ]
    rule_links = [
        '<li><a href="r/{}.html">{}</a></li>'.format(
            rule["name"], rule["description"]
        )
        for rule in rules
    ]
    page = Template((TEMPLATES / "index.html").read_text())
    return page.substitute(
        dataset_index="\n".join(dataset_links),
        rule_index="\n".join(rule_links),
    )


def read_sql(
    conn: sqlite3.Connection, sql: str, columns: List[str]
) -> List[Dict[str, Any]]:
    """Run ``sql`` on ``conn`` and return each row as a dictionary.

    Row values are paired positionally with ``columns``; pairing stops at
    the shorter of the two, as with ``zip``.
    """
    records: List[Dict[str, Any]] = []
    for row in conn.cursor().execute(sql).fetchall():
        records.append(dict(zip(columns, row)))
    return records

Check warning on line 111 in adtl/qc/report.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/report.py#L109-L111

Added lines #L109 - L111 were not covered by tests


def make_report(store_database: str, output_folder: Path = Path("qc_report")):
    """Write the static HTML report for a results database.

    Args:
        store_database: Path of the SQLite database holding the ``results``
            and ``rules`` tables.
        output_folder: Folder that receives ``style.css``, the index page
            and the per-rule / per-dataset sub-folders.  It is created,
            together with any missing parents, when it does not exist.

    The function prints one line for every rule and dataset page written.
    """
    rules_folder = output_folder / RULES_SUBFOLDER
    datasets_folder = output_folder / DATASET_SUBFOLDER
    for folder in (output_folder, rules_folder, datasets_folder):
        folder.mkdir(parents=True, exist_ok=True)

    # Read everything up front and always release the connection, even
    # when one of the queries fails.
    conn = sqlite3.connect(store_database)
    try:
        dataset_rows = read_sql(
            conn, "SELECT DISTINCT dataset FROM results", ["dataset"]
        )
        rules = read_sql(
            conn,
            "SELECT name, description, long_description FROM rules",
            ["name", "description", "long_description"],
        )
        results = read_sql(
            conn,
            "SELECT * from results",
            [
                "rule",
                "dataset",
                "file",
                "rows_success",
                "rows_fail",
                "rows",
                "ratio_success",
                "rows_fail_idx",
                "success",
                "mostly",
                "fail_data",
            ],
        )
    finally:
        conn.close()

    # Rows without a dataset name are grouped under a placeholder label.
    datasets = [row["dataset"] or "_unlabelled" for row in dataset_rows]

    (output_folder / "style.css").write_text(STYLE.read_text())
    (output_folder / INDEX).write_text(render_index(rules, datasets))

    results_by_rule = render_results_by_rule(results, rules)
    results_by_dataset = render_results_by_dataset(results, datasets)
    for rule, page in results_by_rule.items():
        (rules_folder / (rule + ".html")).write_text(page)
        print(f"wrote {RULES_SUBFOLDER}/{rule}.html")
    for dataset in datasets:
        (datasets_folder / (dataset + ".html")).write_text(
            results_by_dataset[dataset]
        )
        print(f"wrote {DATASET_SUBFOLDER}/{dataset}.html")

Check warning on line 157 in adtl/qc/report.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/report.py#L157

Added line #L157 was not covered by tests


# When run as a script, build the report from "adtl-qc.db" using the default
# output folder of make_report.
if __name__ == "__main__":
    make_report("adtl-qc.db")
31 changes: 16 additions & 15 deletions adtl/qc/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pandas as pd

from . import Dataset, Rule, WorkUnit, WorkUnitResult
from .report import make_report

DEFAULT_PATTERN = "*.csv"

Expand All @@ -26,6 +27,7 @@
file TEXT,
rows_success INTEGER,
rows_fail INTEGER,
rows INTEGER,
ratio_success REAL,
rows_fail_idx TEXT,
success INTEGER,
Expand All @@ -34,14 +36,14 @@
)"""

DDL_RULES = """CREATE TABLE IF NOT EXISTS rules (
rule TEXT,
name TEXT,
description TEXT,
long_description TEXT
)"""

INSERT_RESULTS = """INSERT INTO results VALUES (
:rule, :dataset, :file, :rows_success,
:rows_fail, :ratio_success, :rows_fail_idx,
:rows_fail, :rows, :ratio_success, :rows_fail_idx,
:success, :mostly, :fail_data
)"""

Expand All @@ -59,7 +61,10 @@ def collect_datasets(
folders = defaultdict(list)
for f in files:
folders[f.parent.stem].append(f)
return [Dataset(folder=folder, files=folders[folder]) for folder in folders]
return [
Dataset(folder=folder if folder else "_unlabelled", files=folders[folder])
for folder in folders
]


def collect_rules(root: Path = Path("qc")) -> List[Rule]:
Expand Down Expand Up @@ -111,12 +116,12 @@ def collect_work_units(datasets: List[Dataset], rules: List[Rule]) -> List[WorkU
def prepare_result_for_insertion(work_unit_result: WorkUnitResult) -> Dict[str, Any]:
result: Dict[str, Any] = copy.deepcopy(work_unit_result) # type: ignore
result["fail_data"] = (
None
""
if result["fail_data"].empty
else json.dumps(result["fail_data"].to_dict(orient="records"))
)
result["rows_fail_idx"] = (
None
""
if not result["rows_fail_idx"]
else ",".join(map(str, result["rows_fail_idx"]))
)
Expand All @@ -125,15 +130,6 @@ def prepare_result_for_insertion(work_unit_result: WorkUnitResult) -> Dict[str,
return result


def get_result_from_insertion(data: Dict[str, Any]) -> WorkUnitResult:
result: Dict[str, Any] = copy.deepcopy(data) # type: ignore
if result["fail_data"]:
result["fail_data"] = pd.DataFrame(json.loads(result["fail_data"]))
if result["rows_fail_idx"]:
result["rows_fail_idx"] = [int(x) for x in result["rows_fail_idx"].split(",")]
return result


def process_work_unit(unit: WorkUnit, save_db: Optional[str] = None) -> WorkUnitResult:
rule = unit["rule"]
module = importlib.import_module(rule["module"])
Expand All @@ -158,6 +154,7 @@ def start(
rules_path: Path = Path("qc"),
data_file_formats: List[str] = ["csv"],
store_database: Optional[str] = None,
disable_report: bool = False,
) -> List[WorkUnitResult]:
rules = collect_rules(rules_path)
datasets = collect_datasets(data_path, data_file_formats)
Expand All @@ -174,7 +171,10 @@ def start(

pool = multiprocessing.Pool()
process_work_unit_db = functools.partial(process_work_unit, save_db=store_database)
return pool.map(process_work_unit_db, work_units)
res = pool.map(process_work_unit_db, work_units)
if store_database and not disable_report:
make_report(store_database)
return res

Check warning on line 177 in adtl/qc/runner.py

View check run for this annotation

Codecov / codecov/patch

adtl/qc/runner.py#L172-L177

Added lines #L172 - L177 were not covered by tests


def _main(args=None):
Expand All @@ -198,4 +198,5 @@ def _main(args=None):
Path(args.rule_root),
data_file_formats=args.format.split(","),
store_database=args.database,
disable_report=args.no_report,
)
31 changes: 7 additions & 24 deletions adtl/qc/templates/dataset.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,45 +3,28 @@

<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>✱ dataset - {{ dataset }}</title>
<link rel="stylesheet" href="style.css">
<title>✱ dataset - $dataset</title>
<link rel="stylesheet" href="../style.css">
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>

<body>

<nav class="breadcrumb">
<a class="breadcrumb-item" href="/"></a> |
<a class="breadcrumb-item" href="../index.html"></a> |
updated {{ run_date }}
</nav>

<main>
<h1>{{ dataset }}</h1>
<h2>Files</h2>
<ul class="files">
{{ #files }}
<li><tt>{{ hash }}</tt> {{ file }}</li>
{{ /files }}
</ul>

<h1>$dataset</h1>

<h2>Triggered rules</h2>
<ul class="rules">
{{#rules}}
<li><tt>[{{ success }} / {{ total }}]</tt> {{ rule }} - {{ file }}

{{ #log }}
<details>
<summary>Failed rows</summary>
<pre>{{ log }}</p>
</details>
{{ /log }}
</li>
{{/rules}}
<ul class="results">
$results
</ul>


</main>
</body>

</html>
</html>
12 changes: 4 additions & 8 deletions adtl/qc/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,19 @@
<h1>✱ adtl-qc report</h1>

<p class="runinfo">
Run on {{ date }}
Updated on {{ date }}
</p>
<h2>Datasets</h2>
<ul id="datasets">
{{#datasets}}
<li><a href="/d/{{ name }}.html">{{ name }}</a></li>
{{/datasets}}
$dataset_index
</ul>
</body>

</html>

<h2>Rules</h2>
<ul id="datasets">
{{#rules}}
<li><a href="/r/{{ rule }}.html">{{ description }}</a></li>
{{/rules}}
<ul id="rules">
$rule_index
</ul>


Expand Down

0 comments on commit 6412927

Please sign in to comment.