In [1]:
from sec_certs.dataset import CCDataset
from pathlib import Path
import shutil
import subprocess
from sec_certs.utils import helpers
from urllib.parse import unquote_plus, urlparse

In [None]:
def hash_file(path):
    """Return the digest that ``helpers.get_sha256_filepath`` computes for ``path``.

    Thin convenience wrapper; going by the helper's name, the result is the
    SHA-256 hash of the file located at ``path``.
    """
    file_digest = helpers.get_sha256_filepath(path)
    return file_digest
def extract_filename(link):
    """Return the URL-decoded file name (last path segment) of a document link.

    Args:
        link: URL of the document, or None when there is no link.

    Returns:
        The last "/"-separated segment of the URL path, decoded with
        ``unquote_plus`` (both "%20" and "+" become spaces). Returns None
        when ``link`` is None.
    """
    # urlparse() does not reject None: it takes its bytes code path, and str()
    # of the resulting empty bytes path would yield the bogus filename "b''".
    if link is None:
        return None
    url_path = str(urlparse(link).path)
    return unquote_plus(url_path.split("/")[-1])

In [2]:
# Input: a directory with a dump of the documents that will enrich the dataset.
doc_dump_dir = Path("cc_certs_09_10_2022")
# Output: the directory of the dataset itself.
dataset_dir = Path("cc_09_10_2022")

# Show whether each directory is present (dump first, then dataset).
for directory in (doc_dump_dir, dataset_dir):
    print(directory.exists())

True
True


In [3]:
# Print the abbreviated hash of the current commit. git writes the hash to the
# cell output; the returned CompletedProcess is displayed as the cell value.
git_head_cmd = ["git", "rev-parse", "--short", "HEAD"]
subprocess.run(git_head_cmd)

95d1ec0


CompletedProcess(args=['git', 'rev-parse', '--short', 'HEAD'], returncode=0)

In [4]:
# Print the versions of the external tools: pdftotext first, then tesseract.
# The loop yields no cell value, so no CompletedProcess repr is displayed.
for tool in ("pdftotext", "tesseract"):
    subprocess.run([tool, "-v"])

tesseract 5.2.0
 leptonica-1.82.0
  libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 2.1.3) : libpng 1.6.38 : libtiff 4.4.0 : zlib 1.2.12 : libwebp 1.2.4 : libopenjp2 2.5.0
 Found AVX2
 Found AVX
 Found FMA
 Found SSE4.1
 Found OpenMP 201511
 Found libarchive 3.6.1 zlib/1.2.12 liblzma/5.2.5 bz2lib/1.0.8 liblz4/1.9.3 libzstd/1.5.2
 Found libcurl/7.85.0 OpenSSL/1.1.1q zlib/1.2.12 brotli/1.0.9 zstd/1.5.2 libidn2/2.3.3 libpsl/0.21.1 (+libidn2/2.3.0) libssh2/1.10.0 nghttp2/1.50.0


pdftotext version 22.09.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011, 2022 Glyph & Cog, LLC


In [5]:
# Instantiate the CC dataset rooted at the output directory.
dset = CCDataset(
    root_dir=dataset_dir,
    name="cc",
    description="Final run on 09.10.2022",
)

In [6]:
# Get the metadata: fetch the certificate records from the web.
# Per the recorded output, this works with CSVs under <dataset_dir>/web
# (cc_products_active.csv, cc_products_archived.csv) and logs duplicate rows
# as well as attribute mismatches found while merging certificates.
dset.get_certs_from_web()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:14<00:00,  7.01s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.00s/it]
The CSV cc_09_10_2022/web/cc_products_active.csv contains 8 duplicates by the primary key.
The CSV cc_09_10_2022/web/cc_products_archived.csv contains 10 duplicates by the primary key.
When merging certificates with dgst b26ce64e0c677e3d, the following mismatch occured: Attribute=security_level, self[security_level]={'ALC_DVS.1', 'EAL2+'}, other[security_level]={'EAL2'}
When merging certificates with dgst 7f043162f88a1c3a, the following mismatch occured: Attribute=not_valid_after, self[not_valid_after]=2024-09-19, other[not_valid_after]=2024-09-18
When merging certificates with dgst 109eb2158ca6a2f9, the following mismatch occured: Attribute=security_level, self[security_le

In [7]:
# Process the protection profiles (PP).
# The recorded output reports duplicate entries found in the PP dataset.
# NOTE(review): the original comment ended with a question mark - confirm
# whether this step is required at this point of the pipeline.
dset.process_protection_profiles()

Duplicate entry in PP dataset: ('Protection Profile for Enterprise Security Management Policy Management', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ESM_PM_V1.4.pdf')
Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')
Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')
Duplicate entry in PP dataset: ('Public Key-Enabled Application Family of Protection Profiles', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_VID3004-PP.pdf')
Duplicate entry in PP dataset: ('Protection Profile for Wireless Local Area Network (WLAN) Access Systems', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_WLAN_AS_V1.0.pdf')
Duplicate entry in PP dataset: ('PC Client Specific Trusted Platform Mod

In [8]:
# Seed the dataset with the pre-downloaded PDFs (collected over multiple runs
# to fill in the timeouts and disappearances).
for pdf_dir in (dset.reports_pdf_dir, dset.targets_pdf_dir):
    pdf_dir.mkdir(parents=True, exist_ok=True)

# Locations of the dumped documents; files are named "<cert digest>.pdf".
report_dump_dir = doc_dump_dir / "report" / "pdf"
target_dump_dir = doc_dump_dir / "target" / "pdf"

for cert in dset:
    dump_name = f"{cert.dgst}.pdf"

    # Certification report: copy it in, mark it as downloaded, record its hash.
    report_src = report_dump_dir / dump_name
    if report_src.exists():
        shutil.copy(report_src, cert.state.report_pdf_path)
        cert.state.report_download_ok = True
        cert.state.report_pdf_hash = hash_file(cert.state.report_pdf_path)
    # The file name is derived from the link even when no dumped PDF exists.
    cert.pdf_data.report_filename = extract_filename(cert.report_link)

    # Security target: same treatment as the report.
    target_src = target_dump_dir / dump_name
    if target_src.exists():
        shutil.copy(target_src, cert.state.st_pdf_path)
        cert.state.st_download_ok = True
        cert.state.st_pdf_hash = hash_file(cert.state.st_pdf_path)
    cert.pdf_data.st_filename = extract_filename(cert.st_link)

In [9]:
# Try to download the documents that are still missing (or were not present
# in the pre-downloaded dump).
# NOTE(review): fresh=False presumably skips documents already marked as
# downloaded; the progress total of only 14 reports in the recorded output is
# consistent with that - confirm against the sec_certs documentation.
dset.download_all_pdfs(fresh=False)

Downloading reports:   0%|                                                                                                             | 0/14 [00:00<?, ?it/s]Cert dgst: 82fcce2db9ef6063 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/emCA%20Certification%20Report%20v1.0.pdf, code: nok
Cert dgst: 02482228eb547c15 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/st_vid4016a-vr.pdf, code: nok
Cert dgst: d79ecebfe21f2144 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/st_vid10024a-vr.pdf, code: nok
Cert dgst: a90d78924e91dc6c failed to download report from https://www.commoncriteriaportal.org/files/epfiles/ANSSI-CC_2010-03en1.pdf, code: nok
Cert dgst: 8ac22970bd8042d2 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/st_vid4016b-vr.pdf, code: nok
Cert dgst: e27eb9efbd67c4f7 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/Docu

In [10]:
# Convert all PDFs (reports and security targets) to text.
# The recorded output shows pdf->txt conversion, garbage detection and OCR
# attempts; some documents fail and are logged with their certificate digest.
dset.convert_all_pdfs()

Converting reports to txt:   2%|██                                                                                         | 118/5115 [00:06<04:56, 16.83it/s]Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf
Error during OCR of cc_09_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1
Cert dgst: bfc86f14e2484335 failed to convert report pdf->txt
Converting reports to txt:   3%|███                                                                                        | 169/5115 [00:09<04:21, 18.89it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/1ee7ecee9e7e131c.pdf
Converting reports to txt:   4%|███▍                                                                                       | 194/5115 [00:10<04:03, 20.19it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/d8c205b4924f91b3.pdf
Detected garbage du

Converting reports to txt:  10%|████████▌                                                                               | 498/5115 [17:18<15:20:52, 11.97s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/47913a485c3c8a18.pdf
Converting reports to txt:  10%|████████▊                                                                                | 503/5115 [17:29<9:17:26,  7.25s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/b0b7b073ca2dfe5f.pdf
Converting reports to txt:  10%|████████▋                                                                               | 504/5115 [18:31<17:41:13, 13.81s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/60f3bd10ee9be85b.pdf
Converting reports to txt:  10%|████████▋                                                                               | 505/5115 [19:14<23:14:43, 18.15s/it]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/5c3806bf79eeab7f.pdf
Conv

Cert dgst: f9c8da9deff77ab5 failed to convert report pdf->txt
Converting reports to txt:  76%|████████████████████████████████████████████████████████████████████▌                     | 3900/5115 [28:07<01:29, 13.63it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/3570791ff9c92912.pdf
Converting reports to txt:  77%|█████████████████████████████████████████████████████████████████████▍                    | 3948/5115 [28:12<01:59,  9.80it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/ed886279d0d61096.pdf
Converting reports to txt:  86%|█████████████████████████████████████████████████████████████████████████████▌            | 4407/5115 [29:04<01:00, 11.64it/s]Detected garbage during conversion of cc_09_10_2022/certs/reports/pdf/c763b37bb95517e9.pdf
Converting reports to txt:  87%|█████████████████████████████████████████████████████████████████████████████▉            | 4430/5115 [29:05<00:53, 12.70it/s]Detected garbage during conversio

Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Error during OCR of cc_09_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1
Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf
Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/422dc5758723c7d1.pdf
Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/bfc86f14e2484335.pdf
Cert dgst: 7147c2f70d983d57 failed to convert security target pdf->txt
Detected garbage during conversion of cc_09_10_2022/certs/targets/pdf/4c9468f20fdb04f7.pdf
Error during OCR of cc_09_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf, using garbage: pdftoppm failed: 1
Cert dgst: a2b962c7f1d2bc56 failed to convert security target pdf->txt
Error during OCR of cc

In [11]:
# Analyze all certificates.
# Per the recorded output, this includes extracting PDF metadata of reports
# and targets, predicting CPE matches and computing related CVEs.
# NOTE(review): the stray "geee" lines in the output look like leftover debug
# prints - presumably from the library; verify.
dset.analyze_certificates()

Extracting report metadata:   0%|                                                                                                    | 0/5107 [00:00<?, ?it/s]Failed to read metadata of certs/reports/pdf/1e91595cd5e7fece.pdf, error: Unexpected escaped string: b'\x83'
Failed to read metadata of certs/reports/pdf/451e0806dfcfd29d.pdf, error: PDF metadata unavailable
Extracting report metadata:  14%|████████████▍                                                                            | 713/5107 [00:06<00:34, 128.26it/s]Failed to read metadata of certs/reports/pdf/e51bbdd7e3705d61.pdf, error: PDF metadata unavailable
Extracting report metadata:  25%|██████████████████████                                                                  | 1281/5107 [00:14<00:33, 113.06it/s]Failed to read metadata of certs/reports/pdf/dc54e899ab191a2a.pdf, error: PDF metadata unavailable
Extracting report metadata:  33%|████████████████████████████▉                                                          

Extracting target metadata:  32%|████████████████████████████▏                                                            | 1595/5033 [00:48<01:23, 41.13it/s]Failed to read metadata of certs/targets/pdf/bbcfe0aee7ea78ea.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/f0bd6a29eee94a2c.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/463ecd64b7506048.pdf, error: PDF metadata unavailable
Extracting target metadata:  36%|████████████████████████████████▎                                                        | 1825/5033 [00:51<00:51, 62.73it/s]Failed to read metadata of certs/targets/pdf/d3568613c552f9e8.pdf, error: EOF marker not found
Extracting target metadata:  44%|███████████████████████████████████████▏                                                 | 2214/5033 [00:56<00:34, 82.26it/s]Failed to read metadata of certs/targets/pdf/4d2a177384b23fd6.pdf, error: PDF metadata unavailable
Extracting target metadata:  58%|██

geee
geee
geee
geee
geee


Predicting CPE matches with the classifier: 100%|█████████████████████████████████████████████████████████████████████████| 5129/5129 [02:57<00:00, 28.82it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:13<00:00,  1.58it/s]
Building CVEDataset from jsons: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 21/21 [04:17<00:00, 12.24s/it]
parsing cpe matching (by NIST) dictionary: 100%|███████████████████████████████████████████████████████████████████| 398786/398786 [00:26<00:00, 15135.38it/s]
Building-up lookup dictionaries for fast CVE matching: 100%|███████████████████████████████████████████████████████| 197102/197102 [00:18<00:00, 10511.33it/s]
Computing related CVES: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 650/650 [00:00<00:00, 70385.13it/s]


In [12]:
# Process the maintenance updates as well (recorded output: their reports and
# targets are downloaded and the reports converted to text).
# The call is the last expression of the cell, so the returned
# CCDatasetMaintenanceUpdates object is displayed as the cell value.
dset.process_maintenance_updates()

Downloading reports: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 459/459 [01:39<00:00,  4.62it/s]
Downloading targets:  75%|█████████████████████████████████████████████████████████████████████████▏                        | 343/459 [01:56<00:46,  2.49it/s]ERROR:sec_certs.sample.certificate:Cert dgst: cert_5ab17f47ec6820f3_update_0cbfc8dd71545605 failed to download ST from https://www.commoncriteriaportal.org/files/epfiles/[ST]%20Samsung%20Multifunction%20MultiXpress%20X4300%20K4350_v1.4(???).pdf, code: nok
Downloading targets: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 459/459 [02:31<00:00,  3.03it/s]
Converting reports to txt:  86%|███████████████████████████████████████████████████████████████████████████████▏            | 395/459 [00:09<00:01, 43.29it/s]ERROR:sec_certs.utils.pdf:Error when converting pdf->txt: poppler error creating document
Converting reports t

<sec_certs.dataset.common_criteria.CCDatasetMaintenanceUpdates at 0x7f780d79df00>

In [13]:
# Finally, dump the whole dataset via to_json().
# NOTE(review): no path is passed, so the output location is chosen by the
# dataset itself - presumably under dataset_dir; confirm.
dset.to_json()