In [None]:
from sec_certs.dataset import CCDataset
from pathlib import Path
import shutil
import subprocess

In [None]:
# Input: directory holding a dump of the documents that will enrich the dataset.
doc_dump_dir = Path("cc_certs_09_10_2022")
# Output: directory the dataset is built in.
dataset_dir = Path("cc_09_10_2022")

# Report, one per line, whether the dump directory and then the output directory exist.
print(doc_dump_dir.exists(), dataset_dir.exists(), sep="\n")

In [None]:
# Record which commit of the repository this run was produced from
# by printing the abbreviated hash of the current HEAD.
head_hash_cmd = ["git", "rev-parse", "--short", "HEAD"]
subprocess.run(head_hash_cmd)

In [None]:
# Print the versions of the external tools by invoking each one with `-v`.
for tool_name in ("pdftotext", "tesseract"):
    subprocess.run([tool_name, "-v"])

In [None]:
# Create the dataset object rooted at the output directory.
# NOTE(review): the description is dated 04.10.2022 while the directory names
# defined earlier are dated 09_10_2022 -- confirm which date is intended.
dset = CCDataset(
    root_dir=dataset_dir,
    name="cc",
    description="Final run on 04.10.2022",
)

In [None]:
# Get the certificate metadata into the dataset.
# NOTE(review): per the method name this fetches from the web, so it presumably
# needs network access -- confirm against the sec_certs documentation.
dset.get_certs_from_web()

In [None]:
# Process the protection profiles.
# NOTE(review): the original comment ended with a question mark, so it is unclear
# whether this step is required for the final dataset -- confirm.
dset.process_protection_profiles()

In [None]:
# Seed the dataset with the pre-downloaded PDFs. The dump was collected over
# multiple runs to fill in downloads that timed out and documents that disappeared.
dset.reports_pdf_dir.mkdir(parents=True, exist_ok=True)
dset.targets_pdf_dir.mkdir(parents=True, exist_ok=True)

# The dump keeps reports and security targets in separate subdirectories,
# with each file named after the certificate digest.
dumped_reports_dir = doc_dump_dir / "report" / "pdf"
dumped_targets_dir = doc_dump_dir / "target" / "pdf"

for cert in dset:
    dumped_report = dumped_reports_dir / f"{cert.dgst}.pdf"
    if dumped_report.exists():
        shutil.copy(dumped_report, cert.state.report_pdf_path)
        # Flag the report as successfully obtained.
        cert.state.report_download_ok = True

    dumped_target = dumped_targets_dir / f"{cert.dgst}.pdf"
    if dumped_target.exists():
        shutil.copy(dumped_target, cert.state.st_pdf_path)
        # Flag the security target as successfully obtained.
        cert.state.st_download_ok = True

In [None]:
# Try to download the documents that are still missing, i.e. the ones that
# were not present in the pre-downloaded dump.
# NOTE(review): fresh=False presumably skips documents already marked as
# downloaded -- confirm against the sec_certs documentation.
dset.download_all_pdfs(fresh=False)

In [None]:
# Convert all the PDFs in the dataset.
# NOTE(review): presumably to text, using the pdftotext/tesseract tools whose
# versions are printed earlier in this notebook -- confirm.
dset.convert_all_pdfs()

In [None]:
# Analyze all the certificates in the dataset.
dset.analyze_certificates()

In [None]:
# Process the maintenance updates as well.
dset.process_maintenance_updates()

In [None]:
# Finally, dump the whole dataset to JSON.
# NOTE(review): no output path is passed, so the file presumably lands under the
# dataset's root_dir (dataset_dir) -- confirm.
dset.to_json()