In [1]:
from sec_certs.dataset import CCDataset
from pathlib import Path
import shutil
import subprocess

In [2]:
# Input: directory holding a dump of the documents that will enrich the dataset.
doc_dump_dir = Path("cc_certs_04_10_2022")
# Output: directory in which the dataset is built.
dataset_dir = Path("cc_04_10_2022")

# Report whether each directory exists, one result per line
# (document dump first, dataset output second).
print(doc_dump_dir.exists(), dataset_dir.exists(), sep="\n")

True
False


In [3]:
# Record the versions of the external tools so the run can be reproduced.
# Each tool writes its own version banner; the loop itself yields no cell output.
for tool in ("pdftotext", "tesseract"):
    subprocess.run([tool, "-v"])

tesseract 5.2.0
 leptonica-1.82.0
  libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 2.1.3) : libpng 1.6.38 : libtiff 4.4.0 : zlib 1.2.12 : libwebp 1.2.4 : libopenjp2 2.5.0
 Found AVX2
 Found AVX
 Found FMA
 Found SSE4.1
 Found OpenMP 201511
 Found libarchive 3.6.1 zlib/1.2.12 liblzma/5.2.5 bz2lib/1.0.8 liblz4/1.9.3 libzstd/1.5.2
 Found libcurl/7.85.0 OpenSSL/1.1.1q zlib/1.2.12 brotli/1.0.9 zstd/1.5.2 libidn2/2.3.3 libpsl/0.21.1 (+libidn2/2.3.0) libssh2/1.10.0 nghttp2/1.50.0


pdftotext version 22.09.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011, 2022 Glyph & Cog, LLC


In [4]:
# Instantiate the dataset object that every later cell operates on,
# rooted at the output directory configured above.
dset = CCDataset(
    root_dir=dataset_dir,
    name="cc",
    description="Final run on 04.10.2022",
)

In [5]:
# Fetch the certificate metadata from the web.
# Judging by the log below, this reads the active/archived product CSVs stored
# under <dataset_dir>/web/, reports rows that are duplicates by primary key, and
# merges certificates sharing a digest, logging any attribute mismatches
# (e.g. security_level, not_valid_after).
dset.get_certs_from_web()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:04<00:00,  2.00s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.00s/it]
The CSV cc_04_10_2022/web/cc_products_active.csv contains 8 duplicates by the primary key.
The CSV cc_04_10_2022/web/cc_products_archived.csv contains 10 duplicates by the primary key.
When merging certificates with dgst b26ce64e0c677e3d, the following mismatch occured: Attribute=security_level, self[security_level]={'ALC_DVS.1', 'EAL2+'}, other[security_level]={'EAL2'}
When merging certificates with dgst 7f043162f88a1c3a, the following mismatch occured: Attribute=not_valid_after, self[not_valid_after]=2024-09-19, other[not_valid_after]=2024-09-18
When merging certificates with dgst 109eb2158ca6a2f9, the following mismatch occured: Attribute=security_level, self[security_le

In [8]:
# Process the protection profiles (PP).
# The log below shows duplicate (name, URL) entries being reported for the PP dataset.
# NOTE(review): the original comment ended with a question mark - confirm whether
# this step is actually required at this point of the pipeline.
dset.process_protection_profiles()

Duplicate entry in PP dataset: ('Protection Profile for Enterprise Security Management Policy Management', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ESM_PM_V1.4.pdf')
Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')
Duplicate entry in PP dataset: ('Stateful Traffic Filter Firewall Extended Package for Network Device Protection Profile', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_ND_TFFWEP_V1.0.pdf')
Duplicate entry in PP dataset: ('Public Key-Enabled Application Family of Protection Profiles', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_VID3004-PP.pdf')
Duplicate entry in PP dataset: ('Protection Profile for Wireless Local Area Network (WLAN) Access Systems', 'https://www.commoncriteriaportal.org/files/ppfiles/PP_WLAN_AS_V1.0.pdf')
Duplicate entry in PP dataset: ('PC Client Specific Trusted Platform Mod

In [9]:
# Seed the dataset with PDFs from the local dump instead of downloading them.
# The dump was collected over multiple runs, so it also covers documents that
# timed out or have since disappeared.
for pdf_dir in (dset.reports_pdf_dir, dset.targets_pdf_dir):
    pdf_dir.mkdir(parents=True, exist_ok=True)

# (dump subdirectory, state attribute holding the destination path, state flag to raise)
dump_layout = (
    ("report", "report_pdf_path", "report_download_ok"),
    ("target", "st_pdf_path", "st_download_ok"),
)
for cert in dset:
    for subdir, path_attr, ok_attr in dump_layout:
        dumped_pdf = doc_dump_dir / subdir / "pdf" / f"{cert.dgst}.pdf"
        if dumped_pdf.exists():
            # Copy into the dataset and mark the document as obtained,
            # exactly as if it had been downloaded.
            shutil.copy(dumped_pdf, getattr(cert.state, path_attr))
            setattr(cert.state, ok_attr, True)

In [10]:
# Try to download the PDFs that are still missing, i.e. those that were not
# present in the pre-downloaded dump.
# fresh=False presumably restricts the run to documents not yet marked as
# downloaded (the log shows only 20 reports being attempted) - TODO confirm.
# Per the log, individual download failures are reported and the run continues.
dset.download_all_pdfs(fresh=False)

Downloading reports:   0%|                                                                                                             | 0/20 [00:00<?, ?it/s]Cert dgst: 82fcce2db9ef6063 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/emCA%20Certification%20Report%20v1.0.pdf, code: nok
Cert dgst: e27eb9efbd67c4f7 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/Document%20SAFER%20Blue%202%20Security%20Target%20V1.10_???.pdf, code: nok
Downloading reports:  10%|██████████                                                                                           | 2/20 [00:01<00:09,  2.00it/s]Cert dgst: 305af0eec85bb856 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/[KECS-CR-13-28]%20AhnLab%20TrusGuard%20V2.2(??).pdf, code: nok
Cert dgst: d79ecebfe21f2144 failed to download report from https://www.commoncriteriaportal.org/files/epfiles/st_vid10024a-vr.pdf, code: nok
Cert dgst: 8ac22970bd80

In [11]:
# Convert all report and security-target PDFs to text.
# Per the log below: when the plain conversion yields garbage, OCR is attempted;
# if OCR fails as well, the garbage output is used and, where conversion failed
# outright, the certificate is logged as "failed to convert".
# This is a long-running cell (the progress bars show tens of minutes).
dset.convert_all_pdfs()

Converting reports to txt:   2%|█▋                                                                                          | 96/5115 [00:07<06:29, 12.89it/s]Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf
Converting reports to txt:   2%|██                                                                                         | 114/5115 [00:09<06:14, 13.36it/s]Error during OCR of cc_04_10_2022/certs/reports/pdf/bfc86f14e2484335.pdf, using garbage: pdftoppm failed: 1
Cert dgst: bfc86f14e2484335 failed to convert report pdf->txt
Converting reports to txt:   3%|███                                                                                        | 173/5115 [00:12<05:17, 15.55it/s]Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/1ee7ecee9e7e131c.pdf
Converting reports to txt:   4%|███▎                                                                          

Converting reports to txt:  10%|████████▌                                                                               | 497/5115 [13:38<15:10:43, 11.83s/it]Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/6f8d7a6a1dea6a3a.pdf
Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/e6cffd14e732a030.pdf
Converting reports to txt:  10%|████████▌                                                                               | 498/5115 [13:57<16:59:25, 13.25s/it]Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/47913a485c3c8a18.pdf
Converting reports to txt:  10%|████████▊                                                                                | 503/5115 [14:01<7:40:59,  6.00s/it]Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/b0b7b073ca2dfe5f.pdf
Converting reports to txt:  10%|████████▋                                                                               | 504/5115 [14:27<11:21:27,  8.87s/it]Dete

Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/6db03eff148934a4.pdf
Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf
Error during OCR of cc_04_10_2022/certs/reports/pdf/f9c8da9deff77ab5.pdf, using garbage: pdftoppm failed: 1
Cert dgst: f9c8da9deff77ab5 failed to convert report pdf->txt
Converting reports to txt:  77%|████████████████████████████████████████████████████████████████████▊                     | 3913/5115 [23:36<01:23, 14.40it/s]Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/3570791ff9c92912.pdf
Converting reports to txt:  77%|█████████████████████████████████████████████████████████████████████▌                    | 3955/5115 [23:40<01:42, 11.27it/s]Detected garbage during conversion of cc_04_10_2022/certs/reports/pdf/ed886279d0d61096.pdf
Converting reports to txt:  86%|██████████████████████████████████████████████████████

Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/ce987167d42db722.pdf
Converting targets to txt:  40%|████████████████████████████████████▏                                                     | 2020/5032 [09:23<17:00,  2.95it/s]Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/c16a92b40550193d.pdf
Converting targets to txt:  40%|███████████████████████████████████▌                                                    | 2032/5032 [10:42<2:44:41,  3.29s/it]Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/2ff761edd4ed9b72.pdf
Converting targets to txt:  41%|███████████████████████████████████▊                                                    | 2050/5032 [12:33<2:34:26,  3.11s/it]Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/9e4d3347efd95ec9.pdf
Converting targets to txt:  43%|███████████████████████████████████████                                                   | 2182/5032 [13:38<14:26,  3.29it/s]Dete

Converting reports to txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  7.98it/s]
Converting targets to txt:   0%|                                                                                                        | 0/5 [00:00<?, ?it/s]Error when converting pdf->txt: poppler error creating document
Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf
Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Error when converting pdf->txt: poppler error creating document
Error during OCR of cc_04_10_2022/certs/targets/pdf/7147c2f70d983d57.pdf, using garbage: pdftoppm failed: 1
Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/a2b962c7f1d2bc56.pdf
Detected garbage during conversion of cc_04_10_2022/certs/targets/pdf/bfc

In [12]:
# Run the analysis over all certificates.
# Per the log below, this extracts PDF metadata from the reports and targets
# (unreadable files are logged, not fatal) and then builds the CVE dataset and
# CPE lookup structures used to compute related CVEs.
dset.analyze_certificates()

Extracting report metadata:   0%|                                                                                                    | 0/5107 [00:00<?, ?it/s]Failed to read metadata of certs/reports/pdf/1e91595cd5e7fece.pdf, error: Unexpected escaped string: b'\x83'
Failed to read metadata of certs/reports/pdf/451e0806dfcfd29d.pdf, error: PDF metadata unavailable
Extracting report metadata:  14%|████████████▌                                                                            | 720/5107 [00:03<00:20, 217.24it/s]Failed to read metadata of certs/reports/pdf/e51bbdd7e3705d61.pdf, error: PDF metadata unavailable
Extracting report metadata:  28%|████████████████████████▋                                                               | 1430/5107 [00:13<00:28, 128.36it/s]Failed to read metadata of certs/reports/pdf/dc54e899ab191a2a.pdf, error: PDF metadata unavailable
Extracting report metadata:  35%|██████████████████████████████▊                                                        

Failed to read metadata of certs/reports/pdf/518b3066ebfabcd3.pdf, error: PDF metadata unavailable
Extracting report metadata:  87%|████████████████████████████████████████████████████████████████████████████▊           | 4459/5107 [00:32<00:03, 181.19it/s]Failed to read metadata of certs/reports/pdf/d455e5408b744b44.pdf, error: PDF metadata unavailable
Extracting report metadata: 100%|████████████████████████████████████████████████████████████████████████████████████████| 5107/5107 [00:38<00:00, 133.02it/s]
Failed to read metadata of certs/targets/pdf/d47a03be1cb342b1.pdf, error: PDF metadata unavailable
Extracting target metadata:   2%|██▏                                                                                       | 120/5027 [00:02<01:23, 58.92it/s]Failed to read metadata of certs/targets/pdf/5af5a1b535422b51.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/21d6eff0bfe3971f.pdf, error: PDF metadata unavailable
Extracting target metadata:   

Failed to read metadata of certs/targets/pdf/bbcfe0aee7ea78ea.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/f0bd6a29eee94a2c.pdf, error: PDF metadata unavailable
Failed to read metadata of certs/targets/pdf/463ecd64b7506048.pdf, error: PDF metadata unavailable
Extracting target metadata:  35%|███████████████████████████████▌                                                         | 1784/5027 [00:55<00:59, 54.46it/s]Failed to read metadata of certs/targets/pdf/d3568613c552f9e8.pdf, error: EOF marker not found
Extracting target metadata:  45%|████████████████████████████████████████                                                 | 2263/5027 [01:02<00:48, 56.91it/s]Failed to read metadata of certs/targets/pdf/4d2a177384b23fd6.pdf, error: PDF metadata unavailable
Extracting target metadata:  59%|████████████████████████████████████████████████████▏                                    | 2950/5027 [01:25<00:56, 36.82it/s]Failed to read metadata of certs/ta

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:11<00:00,  1.81it/s]
Building CVEDataset from jsons: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 21/21 [01:25<00:00,  4.09s/it]
parsing cpe matching (by NIST) dictionary: 100%|███████████████████████████████████████████████████████████████████| 398394/398394 [00:27<00:00, 14339.71it/s]
Building-up lookup dictionaries for fast CVE matching: 100%|███████████████████████████████████████████████████████| 196907/196907 [00:12<00:00, 16078.76it/s]
Computing related CVES: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 698/698 [00:00<00:00, 82482.23it/s]


In [13]:
# Process the maintenance updates as well; per the log below this downloads
# their reports and security targets.
# The call returns a CCDatasetMaintenanceUpdates object, which is why its repr
# appears as this cell's output.
dset.process_maintenance_updates()

Downloading reports: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 457/457 [01:12<00:00,  6.33it/s]
Downloading targets:  74%|████████████████████████████████████████████████████████████████████████▋                         | 339/457 [01:08<00:24,  4.89it/s]ERROR:sec_certs.sample.certificate:Cert dgst: cert_5ab17f47ec6820f3_update_0cbfc8dd71545605 failed to download ST from https://www.commoncriteriaportal.org/files/epfiles/[ST]%20Samsung%20Multifunction%20MultiXpress%20X4300%20K4350_v1.4(???).pdf, code: nok
Downloading targets:  92%|██████████████████████████████████████████████████████████████████████████████████████████▎       | 421/457 [01:24<00:06,  5.25it/s]ERROR:sec_certs.sample.certificate:Cert dgst: cert_d07cacdb732c0b0f_update_9dad882431047687 failed to download ST from https://www.commoncriteriaportal.org/files/epfiles/Senetas%20CN%20series%20STv1.1%20Dec%2014.pdf, code: 408
Downloading targets: 100%|████████████████

<sec_certs.dataset.common_criteria.CCDatasetMaintenanceUpdates at 0x7fa02b720250>

In [14]:
# Finally, serialize the whole dataset to JSON.
# NOTE(review): no path is passed, so the output location is whatever to_json()
# defaults to - presumably inside dataset_dir; confirm.
dset.to_json()