## Manual CPE matching evaluation

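This notebook assists with the manual evaluation of CPE matching: it prepares a sample of certificates as input for Label Studio and, once they are labeled, loads the exported annotations and summarizes the results.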

In [None]:
from sec_certs.dataset import CCDataset
import pandas as pd
import json

## Prepare the input data for label studio

In [None]:
# Load the full dataset and flatten it to a DataFrame; its index is matched against certificate digests below.
# NOTE(review): absolute, machine-specific path -- point it to your local copy of the dataset.
dset = CCDataset.from_json("/Users/adam/phd/projects/certificates/sec-certs/datasets/cc_04_10_2022/cc.json")
df = dset.to_pandas()

# Digests (column `dgst`) of the certificates selected for the manual evaluation.
eval_digests = set(pd.read_csv("./../../data/cpe_eval/random.csv")["dgst"])

# `.copy()` yields an independent frame, so adding a column below does not write into a slice of `df`
# (which would raise pandas' SettingWithCopyWarning).
eval_certs = df.loc[df.index.isin(eval_digests)].copy()

# It may be handy to display max number of assigned cpe matches here.
# Entries without a length (e.g. a missing value) count as 0 matches instead of making `len` raise.
eval_certs["n_cpes"] = eval_certs.cpe_matches.map(lambda matches: len(matches) if hasattr(matches, "__len__") else 0)
max_n_cpes = eval_certs.n_cpes.max()
print(f"Max number of CPE matches per evaluated certificate: {max_n_cpes}")

# Now you may want to adjust the key `cpe_n_max_matches` config in sec_certs/config/settings.yml according to max_n_cpes
# This helps to avoid clutter in label studio interface

# Build the digest lookup once; the original rebuilt a list and scanned it linearly for every certificate.
eval_cert_digests = set(eval_certs.index)

# NOTE: this overwrites `dset.certs` in place, so from here on `dset` holds only the evaluated certificates.
dset.certs = {x.dgst: x for x in dset if x.dgst in eval_cert_digests}
dset.to_label_studio_json("./label_studio_input_data.json")

*Now import the generated `label_studio_input_data.json` into Label Studio and label it.*

## Load the data from label studio and show the results

In [None]:
# Load the annotations exported from Label Studio.
# NOTE(review): absolute, machine-specific path -- point it to your own export.
with open("/Users/adam/Downloads/cpe_hundred.json", "r") as handle:
    data = json.load(handle)


def _count_flagged(annotation):
    """Return the number of options selected in a sample's `verified_cpe_match` field.

    The field is absent (``None`` here) when nothing was selected and is a plain string when
    exactly one option was selected. For several selections the export is assumed to use the
    shape ``{"choices": [...]}`` -- NOTE(review): confirm against your Label Studio export.
    Any other sized container falls back to its length.
    """
    if annotation is None:
        return 0
    if isinstance(annotation, str):
        return 1
    if isinstance(annotation, dict) and "choices" in annotation:
        # `len()` of the wrapping dict would always be 1, regardless of how many options were selected.
        return len(annotation["choices"])
    return len(annotation)


def _percentage(part, total):
    """Return `part` as a percentage of `total`, or 0.0 when `total` is zero."""
    return 100 * part / total if total else 0.0


# One `[n_cpe_matches, n_wrong]` pair per certificate:
#   n_cpe_matches ... candidate CPE matches shown to the annotator (padding options excluded),
#   n_wrong ......... candidates the annotator flagged via `verified_cpe_match`.
results = []
for sample in data:
    n_cpe_matches = sum(1 for key, value in sample.items() if "option_" in key and value != "No good match")
    n_wrong = _count_flagged(sample.get("verified_cpe_match"))
    results.append([n_cpe_matches, n_wrong])

# The flagged matches are a subset of the candidates, so the correct ones are the remainder.
# (Previously every candidate was counted as correct and the wrong ones were added on top of the total.)
n_candidates = [n_matches for n_matches, _ in results]
wrong = [n_wrong for _, n_wrong in results]
correct = [n_matches - n_wrong for n_matches, n_wrong in results]
completely_right = [x == 0 for x in wrong]

print(f"Evaluated {sum(n_candidates)} CPE matches in {len(results)} certificates")
print(f"In total, {sum(correct)} ({_percentage(sum(correct), sum(n_candidates)):.2f}%) are correct (precision of the positive class).")
print(f"Also, {sum(completely_right)} ({_percentage(sum(completely_right), len(n_candidates)):.2f}%) certificates have perfect matches.")