# Scheme data matching evaluation
This notebook evaluates how well the data extracted from the individual scheme websites is matched to the certificate data from the Common Criteria portal (commoncriteriaportal.org).

In [None]:
import pandas as pd
from pprint import pprint
from tqdm.auto import trange, tqdm

from sec_certs.dataset import CCDataset, CCSchemeDataset
from sec_certs.model import CCSchemeMatcher
from sec_certs.sample.cc_certificate_id import canonicalize
from sec_certs.sample.cc_scheme import CCScheme, EntryType
from sec_certs.configuration import config
from sec_certs.dataset.auxiliary_dataset_handling import CCSchemeDatasetHandler

In [None]:
# Load the certificate dataset from a JSON file located two directories above this notebook.
dset = CCDataset.from_json("../../dset.json")

In [None]:
# Load a previously saved scheme dataset from JSON.
schemes = CCSchemeDataset.from_json("../../schemes_new.json")
# Alternative, kept commented out: build the dataset via `CCSchemeDataset.from_web`
# and save it to the same path that is loaded above.
#schemes = CCSchemeDataset.from_web(enhanced=True)
#schemes.to_json("../../schemes_new.json")

In [None]:
# Make the dataset's scheme handler use the scheme data loaded above.
dset.aux_handlers[CCSchemeDatasetHandler].dset = schemes

# How many certificates carried scheme data before the recomputation.
count_was = sum(cert.heuristics.scheme_data is not None for cert in dset)

# Stash the previous match on each certificate and clear the slot so that the
# recomputation below starts from a clean state.
for cert in dset:
    cert.heuristics.old_scheme_data, cert.heuristics.scheme_data = cert.heuristics.scheme_data, None
dset._compute_scheme_data()

# How many certificates carry scheme data after the recomputation.
count_is = sum(cert.heuristics.scheme_data is not None for cert in dset)
print(count_was, count_is)

In [None]:
def build_df(dset):
    """
    Build a DataFrame with one row per certificate, pairing certificate attributes
    with fields pulled out of the scheme data matched to it.

    Rows are produced in the iteration order of ``dset``.

    :param dset: Iterable of certificates exposing ``scheme``, ``name``, ``manufacturer``,
                 ``status``, ``not_valid_before``, ``heuristics.cert_id`` and
                 ``heuristics.scheme_data`` (a dict-like entry or ``None``).
    :return: DataFrame with the columns ``scheme``, ``name``, ``vendor``, ``status``,
             ``cert_id``, ``cert_date``, ``scheme_data``, ``scheme_cert_id``,
             ``scheme_cert_id_canonical`` and ``scheme_cert_date``.
    """

    def scheme_cert_id(data):
        """Return the entry's top-level ``cert_id``, falling back to the one under ``enhanced``."""
        if not data:
            return None
        # "enhanced" may be missing or explicitly None; normalise it so the fallback
        # lookup cannot fail with AttributeError.
        enhanced = data.get("enhanced") or {}
        return data.get("cert_id") or enhanced.get("cert_id")

    def try_canonicalize(cert_id, scheme):
        """Canonicalize ``cert_id`` for ``scheme``; return ``None`` if that fails."""
        try:
            return canonicalize(cert_id, scheme)
        except Exception:
            # Best effort: ids that cannot be canonicalized (including a missing id)
            # become None. KeyboardInterrupt/SystemExit are deliberately not swallowed.
            return None

    def get_from_entry(entry, *keys: str):
        """Return the first truthy value among ``keys``, preferring the ``enhanced`` sub-entry."""
        if e := entry.get("enhanced"):
            for key in keys:
                if val := e.get(key):
                    return val
        for key in keys:
            if val := entry.get(key):
                return val
        return None

    df = pd.DataFrame(
        [
            (
                cert.scheme,
                cert.name,
                cert.manufacturer,
                cert.status,
                cert.heuristics.cert_id,
                cert.not_valid_before,
                cert.heuristics.scheme_data,
            )
            for cert in dset
        ],
        columns=["scheme", "name", "vendor", "status", "cert_id", "cert_date", "scheme_data"],
    )
    df["scheme_cert_id"] = df["scheme_data"].map(scheme_cert_id)
    # Pair the two columns positionally instead of using DataFrame.apply(axis=1):
    # this also works when the frame has no rows and avoids building a Series per row.
    df["scheme_cert_id_canonical"] = [
        try_canonicalize(cert_id, scheme)
        for cert_id, scheme in zip(df["scheme_cert_id"], df["scheme"])
    ]
    df["scheme_cert_date"] = df["scheme_data"].map(
        lambda data: get_from_entry(data, "certification_date") if data else None
    )
    return df

In [None]:
# Tabulate the dataset in its current state (one row per certificate).
df = build_df(dset)

## Evaluate all schemes

Let's look at how the matching threshold setting (`config.cc_matching_threshold`) changes the match rate for each scheme.

In [None]:
# Sweep the matching threshold from 100 down to 0 in steps of 10. For every threshold the
# scheme data is recomputed from scratch, and the per-scheme match rate and a DataFrame
# snapshot of the result are recorded.
#
# NOTE: once the sweep finishes, `dset` holds the scheme data computed for the LAST
# threshold of the sweep; only the config value is restored. Use `dfs[threshold]` to look
# at the result for a particular threshold.
original_threshold = config.cc_matching_threshold
thresholds = list(range(100, -10, -10))
rates = {}  # country -> list of match rates (in %), ordered like `thresholds`
dfs = {}  # threshold -> DataFrame produced by build_df for that threshold
try:
    for threshold in tqdm(thresholds):
        config.cc_matching_threshold = threshold
        # Clear earlier matches so that the recomputation starts from a clean state.
        for cert in dset:
            cert.heuristics.scheme_data = None
        dset._compute_scheme_data()
        count = sum(cert.heuristics.scheme_data is not None for cert in dset)
        print(f"Threshold: {threshold}")
        print(f"Assigned count: {count}")
        df = build_df(dset)
        dfs[threshold] = df
        for scheme in schemes:
            country = scheme.country
            total = df[df["scheme"] == country]
            assigned = total[total["scheme_data"].notnull()]
            # A scheme without any certificate in the dataset gets a rate of 0.
            rate = len(assigned) / len(total) * 100 if len(total) != 0 else 0
            rates.setdefault(country, []).append(rate)

            print(f"{country}: {len(assigned)} assigned out of {len(total)} -> {rate:.1f}%")
            total_active = total[total["status"] == "active"]
            assigned_active = assigned[assigned["status"] == "active"]
            print(f"\t- active: {len(assigned_active)} out of {len(total_active)}, entries: {len(scheme.lists.get(EntryType.Certified, []))}")
            total_archived = total[total["status"] == "archived"]
            assigned_archived = assigned[assigned["status"] == "archived"]
            print(f"\t- archived: {len(assigned_archived)} out of {len(total_archived)}, entries: {len(scheme.lists.get(EntryType.Archived, []))}")
        print()
finally:
    # Restore the configured threshold even if the sweep is interrupted or fails.
    config.cc_matching_threshold = original_threshold

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from itertools import cycle

# Plot the match rate of every scheme against the threshold. The four line styles are
# reused cyclically so that neighbouring schemes are easier to tell apart.
lines = ["-", "--", "-.", ":"]
linecycler = cycle(lines)

fig, ax = plt.subplots(figsize=(12, 4))
# `schemes` comes first in zip so that exactly one style is drawn per plotted scheme.
for scheme, line_style in zip(schemes, linecycler):
    country = scheme.country
    ax.plot(thresholds, rates[country], line_style, label=country)
# Put the legend outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.04, 1), loc="upper left");

## Evaluate a scheme

In [None]:
# Choose the scheme and the threshold to inspect, and switch `df` to the snapshot
# recorded for that threshold.
scheme = "DE"
threshold = 70
df = dfs[threshold]
# Show up to 10 random rows of the chosen scheme. The sample size is capped because
# DataFrame.sample raises ValueError when asked for more rows than are available.
df[df["scheme"] == scheme].pipe(lambda rows: rows.sample(min(10, len(rows))))

In [None]:
# All "Certified" entries of the chosen scheme as a table. `.get(..., [])` matches how the
# entry lists are read elsewhere in this notebook and yields an empty table, rather than a
# KeyError, for a scheme that has no Certified list.
un_df = pd.DataFrame(schemes[scheme].lists.get(EntryType.Certified, []))
un_df

In [None]:
# Try to match what is still unmatched at the selected threshold: active certificates of
# the scheme without scheme data against the Certified entries that no certificate uses.
sd = list(df["scheme_data"])
# Select the certificates from the same snapshot (`df`) that `sd` comes from. After the
# threshold sweep the live `dset` holds the result for the last threshold of the sweep,
# not the one selected here. Rows of `df` are in the iteration order of `dset` (that is
# how build_df creates them), so the two can be paired positionally.
unmatched_certs = [
    cert
    for cert, unmatched in zip(dset, df["scheme_data"].isnull())
    if unmatched and cert.scheme == scheme and cert.status == "active"
]
# `.get(..., [])` matches how the entry lists are read elsewhere in this notebook.
unmatched_entries = [entry for entry in schemes[scheme].lists.get(EntryType.Certified, []) if entry not in sd]
matches = CCSchemeMatcher.match_all(unmatched_entries, scheme, unmatched_certs)
matches

In [None]:
# Table of the unmatched certificates, using the first five fields of each `pandas_tuple`.
pd.DataFrame([cert.pandas_tuple[:5] for cert in unmatched_certs])

In [None]:
# Table of the scheme entries that are not assigned to any certificate.
pd.DataFrame(unmatched_entries)