# CC certificate id evaluation
This notebook can be used to evaluate our heuristics for certificate id assignment
and canonicalization.

It looks at several aspects & issues:

1. Certificates with no id assigned.
2. Duplicate certificate id assignments (when two certificates get the same ID assigned).
3. Certificates that have the same certification report document (an issue of the input data that we get
   that explains some of the duplicate certificate id assignments).
4. Compares a random sample of certificates with assigned ground truth ID.
5. Compares a random sample of certificates with assigned ground truth references and context.

In [None]:
from sec_certs.dataset import CCDataset
from sec_certs.cert_rules import cc_rules
import csv
import pandas as pd

In [None]:
# Count the schemes that have cert-id rules and the total number of rules
# across all of them, then emit both as LaTeX macros.
cert_id_rules = cc_rules["cc_cert_id"]
num_schemes = len(cert_id_rules)
num_scheme_rules = sum(map(len, cert_id_rules.values()))
print(f"\\newcommand{{\\numccschemes}}{{{num_schemes}}}")
print(f"\\newcommand{{\\numccschemeidrules}}{{{num_scheme_rules}}}")

In [None]:
# Load the CC dataset from its JSON serialization. The path is relative to the
# notebook's working directory (presumably a 2022-10-09 snapshot, judging by
# the directory name — confirm).
dset = CCDataset.from_json("cc_09_10_2022/cc.json")

In [None]:
# Number of certificates for which the heuristics produced a (truthy) cert id.
num_ids = sum(1 for cert in dset if cert.heuristics.cert_id)
print(f"\\newcommand{{\\numcccerts}}{{{len(dset)}}}")
print(f"\\newcommand{{\\numccids}}{{{num_ids}}}")

## 1. Certificates with no id

Here we report the number of certificates in our dataset that we have no certificate ID for.

In [None]:
# (digest, scheme) pairs of certificates without an assigned cert id, in
# dataset iteration order.
missing_id = [
    (cert.dgst, cert.scheme)
    for cert in dset
    if not cert.heuristics.cert_id
]
# Just the digests, for set comparisons in later cells.
missing_id_dgsts = {dgst for dgst, _ in missing_id}
pd.DataFrame(missing_id, columns=["id", "scheme"])

### Check manually evaluated missing IDs


In [None]:
# Compare the manually evaluated list of certificates without an id against
# the set computed from the dataset: equality first, then each one-sided
# difference.
missing_manual = pd.read_csv("../../data/cert_id_eval/missing_ids.csv")
manual_missing_dgsts = set(missing_manual.id)
print(manual_missing_dgsts == missing_id_dgsts)
print(manual_missing_dgsts - missing_id_dgsts)
print(missing_id_dgsts - manual_missing_dgsts)

In [None]:
# Rows in the manual evaluation, split by whether a cert id was filled in
# manually (non-null `cert_id`, reported as "fixable") or not ("unfixable").
num_missing_manual = len(missing_manual)
num_missing_manual_fixable = missing_manual.cert_id.notna().sum()
num_missing_manual_unfixable = num_missing_manual - num_missing_manual_fixable
print(f"\\newcommand{{\\numccmissingid}}{{{num_missing_manual}}}")
print(f"\\newcommand{{\\numccmissingidfixable}}{{{num_missing_manual_fixable}}}")
print(f"\\newcommand{{\\numccmissingidunfixable}}{{{num_missing_manual_unfixable}}}")

In [None]:
# Reason breakdown for rows where no cert id was filled in manually.
missing_manual.loc[missing_manual.cert_id.isna(), "reason"].value_counts()

In [None]:
# Reason breakdown for rows where a cert id was filled in manually.
missing_manual.loc[missing_manual.cert_id.notna(), "reason"].value_counts()

## 2. Duplicate certificate id assignment

Here we report the number of certificates in our dataset that have a duplicate certificate
ID assigned.

In [None]:
# Group certificate digests by their assigned cert id (certificates whose id
# is None are left out).
id_mapping = {}
for cert in dset:
    assigned_id = cert.heuristics.cert_id
    if assigned_id is None:
        continue
    id_mapping.setdefault(assigned_id, []).append(cert.dgst)

# Collect the digests of every certificate that shares a non-empty cert id
# with at least one other certificate, printing each colliding group.
duplicate_id_dgsts = set()
for cert_id, dgsts in id_mapping.items():
    if not cert_id or len(dgsts) <= 1:
        continue
    print(cert_id, dgsts)
    duplicate_id_dgsts.update(dgsts)

## 3. Duplicate report documents

Some certificates have erroneously uploaded certification reports. Here we check the report
hashes and list such duplicates in the input data.

In [None]:
# Group certificate digests by the hash of their report PDF (certificates
# without a report hash are left out).
duplicate_docs = {}

for cert in dset:
    report_hash = cert.state.report_pdf_hash
    if report_hash is not None:
        duplicate_docs.setdefault(report_hash, []).append(cert.dgst)

# Digests of all certificates whose report document is shared with another
# certificate. The loop variable is deliberately not called `hash`, so the
# builtin stays usable in the rest of the notebook.
duplicate_doc_dgsts = set()
for report_hash, entries in duplicate_docs.items():
    if len(entries) > 1:
        print(report_hash, entries)
        duplicate_doc_dgsts.update(entries)

# Split the duplicate-id certificates into those that also share a report
# document (input data issue) and those that do not (our issue).
duplicate_ids_due_doc = duplicate_doc_dgsts.intersection(duplicate_id_dgsts)
duplicate_ids_issue = duplicate_id_dgsts.difference(duplicate_doc_dgsts)

The following prints the certificates with duplicate certificate IDs that are not due to input data (and are really our problem).

In [None]:
# Print each certificate digest together with its (duplicated) cert id.
# The loop variable is named `dgst` rather than `id` so that the builtin
# `id` is not shadowed in the notebook's global namespace.
for dgst in duplicate_ids_issue:
    print(dgst, dset[dgst].heuristics.cert_id)

In [None]:
# Emit LaTeX macros with the sizes of: all certificates sharing a cert id,
# the subset that also shares a report document hash, and the remainder that
# is not explained by a shared report document.
print(f"\\newcommand{{\\numccduplicateid}}{{{len(duplicate_id_dgsts)}}}")
print(f"\\newcommand{{\\numccduplicateidcolission}}{{{len(duplicate_ids_due_doc)}}}")
print(f"\\newcommand{{\\numccduplicateidissue}}{{{len(duplicate_ids_issue)}}}")

### Check manually evaluated duplicates

In [None]:
# Compare the manually evaluated duplicates against the set computed from the
# dataset: equality first, then each one-sided difference.
duplicate_manual = pd.read_csv("../../data/cert_id_eval/duplicate_ids.csv")
manual_duplicate_dgsts = set(duplicate_manual.id)
# The comparison must be printed explicitly: a bare expression that is not the
# last statement of a cell is evaluated and silently discarded.
print(manual_duplicate_dgsts == duplicate_id_dgsts)
print(manual_duplicate_dgsts.difference(duplicate_id_dgsts))
print(set(duplicate_id_dgsts).difference(manual_duplicate_dgsts))

In [None]:
# Reason breakdown for manually evaluated duplicates whose result is "tp".
duplicate_manual.loc[duplicate_manual.result == "tp", "reason"].value_counts()

In [None]:
# Reason breakdown for manually evaluated duplicates whose result is "fp".
duplicate_manual.loc[duplicate_manual.result == "fp", "reason"].value_counts()

The following cell lists those duplicates that were fixed by changes since manual analysis.

## 4. Manually assigned ground truth comparison (cert_id)

In [None]:
# Compare the cert ids assigned by the heuristics against truth.csv.
# Rows whose digest is not in the dataset are skipped. Mismatches are printed
# and bucketed by the row's "possible" flag ("y" -> possible, anything else ->
# impossible).
correct = set()
possible = set()
impossible = set()
with open("../../data/cert_id_eval/truth.csv", "r") as f:
    for row in csv.DictReader(f):
        dgst = row["id"]
        if dgst not in dset.certs:
            continue
        assigned_id = dset[dgst].heuristics.cert_id
        if row["cert_id"] == assigned_id:
            correct.add(dgst)
            continue
        print(dgst, row["cert_id"], assigned_id, row["source"], row["possible"])
        bucket = possible if row["possible"] == "y" else impossible
        bucket.add(dgst)
print(len(correct), len(possible), len(impossible))

In [None]:
# Compare the cert ids assigned by the heuristics against random.csv.
# Every row's digest is looked up in the dataset directly; mismatches are
# printed and collected in `incorrect`.
correct = set()
incorrect = set()
with open("../../data/cert_id_eval/random.csv", "r") as f:
    for row in csv.DictReader(f):
        dgst = row["id"]
        assigned_id = dset[dgst].heuristics.cert_id
        if row["cert_id"] == assigned_id:
            correct.add(dgst)
        else:
            print(dgst, row["cert_id"], assigned_id)
            incorrect.add(dgst)
print(len(correct), len(incorrect))

In [None]:
# Emit LaTeX macros with the size of the evaluated sample and its split into
# correct and incorrect id assignments, using the current `correct` and
# `incorrect` sets (in top-to-bottom run order, those built from random.csv).
print(f"\\newcommand{{\\numccideval}}{{{len(correct) + len(incorrect)}}}")
print(f"\\newcommand{{\\numccidevalcorrect}}{{{len(correct)}}}")
print(f"\\newcommand{{\\numccidevalincorrect}}{{{len(incorrect)}}}")

## 5. Manually assigned ground truth comparison (references)

In [None]:
# Load the manually annotated references. The cells below read its `id`,
# `reason` and `location` columns.
manual_references = pd.read_csv("../../data/cert_id_eval/random_references.csv")

In [None]:
# Summarize why certificates are referenced, ignoring rows whose reason is
# "self".
not_self = manual_references.reason != "self"
print("The referenced cert is a...")
print(manual_references[not_self].reason.value_counts())
print("... in the current cert.")
print("Total refs:", sum(not_self))

In [None]:
# Emit LaTeX macros for the reference evaluation: number of distinct `id`
# values, number of rows whose reason is not "self", and one count per
# reason label.
print(f"\\newcommand{{\\numCcRefEval}}{{{manual_references.id.nunique()}}}")
print(f"\\newcommand{{\\numCcRefEvalNotSelf}}{{{sum(manual_references.reason != 'self')}}}")
print(f"\\newcommand{{\\numCcRefEvalComponent}}{{{sum(manual_references.reason == 'component used')}}}")
print(f"\\newcommand{{\\numCcRefEvalRecertification}}{{{sum(manual_references.reason == 'basis of recertification')}}}")
print(f"\\newcommand{{\\numCcRefEvalUsedEval}}{{{sum(manual_references.reason == 'basis of eval')}}}")
print(f"\\newcommand{{\\numCcRefEvalIsUsed}}{{{sum(manual_references.reason == 'basis for')}}}")
print(f"\\newcommand{{\\numCcRefEvalPrevVersion}}{{{sum(manual_references.reason == 'previous version')}}}")

# Non-"self" rows split by the `location` value ("report" vs. "target").
print(f"\\newcommand{{\\numCcRefEvalInReport}}{{{sum((manual_references.location == 'report') & (manual_references.reason != 'self'))}}}")
print(f"\\newcommand{{\\numCcRefEvalInTarget}}{{{sum((manual_references.location == 'target') & (manual_references.reason != 'self'))}}}")