In [1]:
from sec_certs.dataset import CCDataset
from pathlib import Path
import shutil
import subprocess

In [2]:
# Input: directory holding a dump of the documents used to enrich the dataset.
doc_dump_dir = Path("cc_certs_03_10_2022")
# Output: directory the dataset is written to.
dataset_dir = Path("cc_03_10_2022")

# Report, one line per directory, whether it already exists on disk.
for directory in (doc_dump_dir, dataset_dir):
    print(directory.exists())

True
True


In [3]:
# Print the version banners of the external command-line tools.
# A plain loop leaves no trailing expression, so no cell result is displayed.
for tool in ("pdftotext", "tesseract"):
    subprocess.run([tool, "-v"])

tesseract 5.2.0
 leptonica-1.82.0
  libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 2.1.3) : libpng 1.6.38 : libtiff 4.4.0 : zlib 1.2.12 : libwebp 1.2.4 : libopenjp2 2.5.0
 Found AVX2
 Found AVX
 Found FMA
 Found SSE4.1
 Found OpenMP 201511
 Found libarchive 3.6.1 zlib/1.2.12 liblzma/5.2.5 bz2lib/1.0.8 liblz4/1.9.3 libzstd/1.5.2
 Found libcurl/7.85.0 OpenSSL/1.1.1q zlib/1.2.12 brotli/1.0.9 zstd/1.5.2 libidn2/2.3.3 libpsl/0.21.1 (+libidn2/2.3.0) libssh2/1.10.0 nghttp2/1.50.0


pdftotext version 22.09.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011, 2022 Glyph & Cog, LLC


In [4]:
# Instantiate the Common Criteria dataset rooted at the output directory.
dset = CCDataset(
    root_dir=dataset_dir,
    name="cc",
    description="Final run on 03.10.2022",
)

In [5]:
# Fetch the certificate metadata from the web.
# Judging by the log below, this works with CSVs under <dataset_dir>/web/ and
# merges certificates that share a digest, logging any attribute mismatches.
dset.get_certs_from_web()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:06<00:00,  3.01s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.00s/it]
The CSV cc_03_10_2022/web/cc_products_active.csv contains 8 duplicates by the primary key.
The CSV cc_03_10_2022/web/cc_products_archived.csv contains 10 duplicates by the primary key.
When merging certificates with dgst b26ce64e0c677e3d, the following mismatch occured: Attribute=security_level, self[security_level]={'ALC_DVS.1', 'EAL2+'}, other[security_level]={'EAL2'}
When merging certificates with dgst 7f043162f88a1c3a, the following mismatch occured: Attribute=not_valid_after, self[not_valid_after]=2024-09-19, other[not_valid_after]=2024-09-18
When merging certificates with dgst 109eb2158ca6a2f9, the following mismatch occured: Attribute=security_level, self[security_le

In [6]:
# Process the protection profiles (PP).
# The log below lists duplicate (name, URL) entries found in the PP dataset.
dset.process_protection_profiles()

Duplicate entry in PP dataset: ('Protection Profile for Enterprise Security Management Policy Management', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ESM_PM_V1.4.pdf')
Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')
Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')
Duplicate entry in PP dataset: ('Public Key-Enabled Application Family of Protection Profiles', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_VID3004-PP.pdf')
Duplicate entry in PP dataset: ('Protection Profile for Wireless Local Area Network (WLAN) Access Systems', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_WLAN_AS_V1.0.pdf')
Duplicate entry in PP dataset: ('PC Client Specific Trusted Platform Mod

In [7]:
# Seed the dataset with the pre-downloaded PDFs (gathered over several runs to
# make up for timeouts and documents that disappeared).
dset.reports_pdf_dir.mkdir(parents=True, exist_ok=True)
dset.targets_pdf_dir.mkdir(parents=True, exist_ok=True)

# One row per document kind:
# (sub-directory in the dump, state attribute holding the destination path,
#  state flag to raise once the file has been copied).
document_kinds = (
    ("report", "report_pdf_path", "report_download_ok"),
    ("target", "st_pdf_path", "st_download_ok"),
)

for cert in dset:
    for dump_subdir, path_attr, ok_attr in document_kinds:
        dumped_pdf = doc_dump_dir / dump_subdir / "pdf" / f"{cert.dgst}.pdf"
        if not dumped_pdf.exists():
            # Nothing in the dump for this document; leave the state untouched.
            continue
        shutil.copy(dumped_pdf, getattr(cert.state, path_attr))
        setattr(cert.state, ok_attr, True)

In [8]:
# Try to download the documents that are still missing, i.e. those that were
# not present among the pre-downloaded ones.
# NOTE(review): presumably fresh=False skips documents already marked as
# downloaded -- the progress bar below covers only 15 reports; confirm.
dset.download_all_pdfs(fresh=False)

Downloading reports:   0%|                                                                                                             | 0/15 [00:00<?, ?it/s]Cert dgst: e27eb9efbd67c4f7 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/Document%20SAFER%20Blue%202%20Security%20Target%20V1.10_???.pdf, code: nok
Cert dgst: 8ac22970bd8042d2 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/st_vid4016b-vr.pdf, code: nok
Cert dgst: 82fcce2db9ef6063 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/emCA%20Certification%20Report%20v1.0.pdf, code: nok
Cert dgst: d79ecebfe21f2144 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/st_vid10024a-vr.pdf, code: nok
Cert dgst: 305af0eec85bb856 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/[KECS-CR-13-28]%20AhnLab%20TrusGuard%20V2.2(??).pdf, code: nok
Cert dgst: a90d78924e91dc6c failed to dow

In [9]:
# Convert all PDFs (reports and security targets) to text.
# Long-running: the report conversion alone ran for about an hour in this run.
# The log below shows poppler errors, garbage detection followed by OCR
# attempts, and per-certificate conversion failures; the run continues past them.
dset.convert_all_pdfs()

Converting reports to txt:   2%|█▋                                                                                          | 95/5112 [00:07<06:21, 13.17it/s]Error when converting pdf->txt: poppler error creating document
Converting reports to txt:   2%|██                                                                                         | 115/5112 [00:08<06:00, 13.86it/s]Detected garbage during conversion of cc_03_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf
Error during OCR of cc_03_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1
Cert dgst: bfc86f14e2484335 failed to convert report pdf->txt
Converting reports to txt:   4%|███▏                                                                                       | 181/5112 [00:11<04:31, 18.14it/s]Detected garbage during conversion of cc_03_10_2022/certs/reports/pdf/1ee7ecee9e7e131c.pdf
Converting reports to txt:   4%|███▍                                                                          

Error during OCR of cc_03_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf, using garbage: pdftoppm failed: 1
Cert dgst: f9c8da9deff77ab5 failed to convert report pdf->txt
Converting reports to txt:  76%|████████████████████████████████████████████████████████████████████▌                     | 3896/5112 [58:08<01:13, 16.65it/s]Detected garbage during conversion of cc_03_10_2022/certs/reports/pdf/3570791ff9c92912.pdf
Converting reports to txt:  77%|█████████████████████████████████████████████████████████████████████▍                    | 3947/5112 [58:11<01:21, 14.22it/s]Detected garbage during conversion of cc_03_10_2022/certs/reports/pdf/ed886279d0d61096.pdf
Converting reports to txt:  86%|█████████████████████████████████████████████████████████████████████████████▍            | 4399/5112 [58:56<00:59, 11.91it/s]Detected garbage during conversion of cc_03_10_2022/certs/reports/pdf/c763b37bb95517e9.pdf
Converting reports to txt:  86%|███████████████████████████████████████████████████

Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_03_10_2022/certs/targets/pdf/422dc5758723c7d1.pdf
Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_03_10_2022/certs/targets/pdf/4c9468f20fdb04f7.pdf
Detected garbage during conversion of cc_03_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf
Detected garbage during conversion of cc_03_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf
Error during OCR of cc_03_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf, using garbage: pdftoppm failed: 1
Cert dgst: a2b962c7f1d2bc56 failed to convert security target pdf->txt
Error during OCR of cc_03_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1
Cert dgst: 7147c2f70d983d57 failed to convert security target pdf->txt
Error during OCR of cc

In [5]:
# Analyze all certificates. According to the progress bars below, this covers
# metadata, frontpage and keyword extraction for reports and targets, followed
# by CPE matching; keyword extraction dominates the runtime.
# NOTE(review): the execution count restarts here (In [5] after In [9]) and the
# logged paths are no longer prefixed with the dataset directory, so this and
# the following cells were apparently run in a different kernel session. `dset`
# must have been re-created or loaded there by code not kept in this notebook --
# confirm that the notebook survives Restart & Run All.
dset.analyze_certificates()

Extracting report metadata:   0%|                                                                                                    | 0/5104 [00:00<?, ?it/s]Failed to read metadata of certs/reports/pdf/1e91595cd5e7fece.pdf, error: Unexpected escaped string: b'\x83'
Failed to read metadata of certs/reports/pdf/451e0806dfcfd29d.pdf, error: PDF metadata unavailable
Extracting report metadata:  11%|██████████                                                                               | 574/5104 [00:02<00:15, 288.04it/s]Failed to read metadata of certs/reports/pdf/e51bbdd7e3705d61.pdf, error: PDF metadata unavailable
Extracting report metadata:  26%|███████████████████████                                                                 | 1339/5104 [00:09<00:25, 147.71it/s]Failed to read metadata of certs/reports/pdf/dc54e899ab191a2a.pdf, error: PDF metadata unavailable
Extracting report metadata:  34%|██████████████████████████████                                                         

Extracting report metadata:  72%|███████████████████████████████████████████████████████████████                         | 3657/5104 [00:22<00:12, 118.13it/s]Failed to read metadata of certs/reports/pdf/9114f0f9938e558c.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/reports/pdf/08b09322ee9df0af.pdf, error: PDF metadata unavailable
Extracting report metadata:  77%|███████████████████████████████████████████████████████████████████▊                    | 3935/5104 [00:23<00:07, 163.39it/s]Failed to read metadata of certs/reports/pdf/518b3066ebfabcd3.pdf, error: PDF metadata unavailable
Extracting report metadata:  85%|██████████████████████████████████████████████████████████████████████████▊             | 4339/5104 [00:25<00:04, 180.47it/s]Failed to read metadata of certs/reports/pdf/d455e5408b744b44.pdf, error: PDF metadata unavailable
Extracting report metadata: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5104/51

Failed to read metadata of certs/targets/pdf/bbcfe0aee7ea78ea.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/f0bd6a29eee94a2c.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/463ecd64b7506048.pdf, error: PDF metadata unavailable
Extracting target metadata:  36%|████████████████████████████████                                                         | 1815/5029 [00:47<00:49, 64.98it/s]Failed to read metadata of certs/targets/pdf/d3568613c552f9e8.pdf, error: EOF marker not found
Extracting target metadata:  44%|███████████████████████████████████████▎                                                 | 2223/5029 [00:52<00:38, 73.68it/s]Failed to read metadata of certs/targets/pdf/4d2a177384b23fd6.pdf, error: PDF metadata unavailable
Extracting target metadata:  59%|████████████████████████████████████████████████████                                     | 2945/5029 [01:12<00:46, 44.62it/s]Failed to read metadata of certs/ta

Extracting report frontpages: 100%|██████████████████████████████████████████████████████████████████████████████████████| 5104/5104 [00:13<00:00, 374.77it/s]
Extracting target frontpages: 100%|██████████████████████████████████████████████████████████████████████████████████████| 5029/5029 [00:18<00:00, 269.71it/s]
Extracting report keywords: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 5104/5104 [08:49<00:00,  9.64it/s]
Extracting target keywords: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5029/5029 [32:59<00:00,  2.54it/s]
Fitting the CPE classifier: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 888243/888243 [00:02<00:00, 413368.35it/s]
Predicting CPE matches with the classifier: 100%|██████████████████████████████████████████████████

In [6]:
# Process the maintenance updates as well. The log below shows their reports
# and security targets being downloaded and their metadata being extracted.
# The call returns a CCDatasetMaintenanceUpdates object (see the repr in the
# cell output); it is not bound to a name here.
dset.process_maintenance_updates()

Downloading reports: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 457/457 [01:40<00:00,  4.56it/s]
Downloading targets:  41%|██████████████████████████████████████████████████████████████▌                                                                                         | 188/457 [01:13<03:08,  1.43it/s]ERROR:sec_certs.sample.certificate:Cert dgst: cert_4d38305c4a2ee035_update_d37e22d00eafccc0 failed to download ST from https://www.commoncriteriaportal.org/files/epfiles/0976ra1b_pdf.pdf, code: 408
ERROR:sec_certs.sample.certificate:Cert dgst: cert_bcac7e3f9f614ec6_update_531b2055de9b34f2 failed to download ST from https://www.commoncriteriaportal.org/files/epfiles/XSMART%20e-Passport%20V1.3_R4_ASE_LITE(ENG)_v1.0.1.pdf, code: 408
Downloading targets:  42%|███████████████████████████████████████████████████████████████▏                                                    

Extracting report metadata:   0%|                                                                                                                                                           | 0/456 [00:00<?, ?it/s]ERROR:sec_certs.utils.pdf:Failed to read metadata of maintenances/reports/pdf/cert_d455e5408b744b44_update_16f77dbd34439401.pdf, error: PDF metadata unavailable
Extracting report metadata: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 456/456 [00:01<00:00, 452.87it/s]
Extracting target metadata:  56%|█████████████████████████████████████████████████████████████████████████████████▉                                                               | 253/448 [00:06<00:04, 39.48it/s]ERROR:sec_certs.utils.pdf:Failed to read metadata of maintenances/targets/pdf/cert_bbc41d7d09e40c0c_update_784535cca61b58a7.pdf, error: too many values to unpack (expected 2)
Extracting target metadata:

<sec_certs.dataset.common_criteria.CCDatasetMaintenanceUpdates at 0x7fcd979bb400>

In [7]:
# Finally, serialize the whole dataset to JSON.
# NOTE(review): no path is passed -- presumably a default location belonging to
# the dataset is used; confirm where the file ends up.
dset.to_json()