Skip to content

Commit

Permalink
Move CCDocumentState to cert class.
Browse files Browse the repository at this point in the history
  • Loading branch information
J08nY committed Feb 13, 2024
1 parent bb3b54d commit 77303ca
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 102 deletions.
130 changes: 70 additions & 60 deletions src/sec_certs/sample/cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,63 +38,6 @@
}


@dataclass
class CCDocumentState(ComplexSerializableType):
    """
    Processing state of a single document: per-stage success flags, hashes of
    the PDF and TXT artifacts, and the (optional) paths to those files.

    Only the flags and hashes are listed in ``serialized_attributes``; the
    private path fields are not.
    """

    download_ok: bool = False  # Download finished successfully
    convert_garbage: bool = False  # The initial conversion produced garbage
    convert_ok: bool = False  # Conversion succeeded overall (pdftotext or OCR)
    extract_ok: bool = False  # Extraction finished successfully

    pdf_hash: str | None = None
    txt_hash: str | None = None

    _pdf_path: Path | None = None
    _txt_path: Path | None = None

    def is_ok_to_download(self, fresh: bool = True) -> bool:
        """Return True when ``fresh``; otherwise only if the download has not succeeded yet."""
        if fresh:
            return True
        return not self.download_ok

    def is_ok_to_convert(self, fresh: bool = True) -> bool:
        """Require a successful download; when not ``fresh``, also require that conversion has not succeeded."""
        if fresh:
            return self.download_ok
        return self.download_ok and not self.convert_ok

    def is_ok_to_analyze(self, fresh: bool = True) -> bool:
        """Require successful download and conversion; when not ``fresh``, also require that extraction has not succeeded."""
        prerequisites_met = self.download_ok and self.convert_ok
        if fresh:
            return prerequisites_met
        return prerequisites_met and not self.extract_ok

    @property
    def pdf_path(self) -> Path:
        """
        Path to the PDF file.

        :raises ValueError: if no PDF path is set.
        """
        current = self._pdf_path
        if not current:
            raise ValueError(f"pdf_path not set on {type(self)}")
        return current

    @pdf_path.setter
    def pdf_path(self, pth: str | Path | None) -> None:
        # Falsy input (None, empty string) clears the stored path.
        if pth:
            self._pdf_path = Path(pth)
        else:
            self._pdf_path = None

    @property
    def txt_path(self) -> Path:
        """
        Path to the TXT file.

        :raises ValueError: if no TXT path is set.
        """
        current = self._txt_path
        if not current:
            raise ValueError(f"txt_path not set on {type(self)}")
        return current

    @txt_path.setter
    def txt_path(self, pth: str | Path | None) -> None:
        # Falsy input (None, empty string) clears the stored path.
        if pth:
            self._txt_path = Path(pth)
        else:
            self._txt_path = None

    @property
    def serialized_attributes(self) -> list[str]:
        """Names of the attributes listed for serialization (flags and hashes, no paths)."""
        status_flags = ["download_ok", "convert_garbage", "convert_ok", "extract_ok"]
        digests = ["pdf_hash", "txt_hash"]
        return status_flags + digests


class CCCertificate(
Certificate["CCCertificate", "CCCertificate.Heuristics", "CCCertificate.PdfData"],
PandasSerializableType,
Expand Down Expand Up @@ -147,15 +90,82 @@ def __lt__(self, other):
return self.maintenance_date < other.maintenance_date

@dataclass
class DocumentState(ComplexSerializableType):
    """
    Processing state of a single document: per-stage success flags, hashes of
    the PDF and TXT artifacts, and the (optional) paths to those files.

    Only the flags and hashes are listed in ``serialized_attributes``; the
    private path fields are not.
    """

    download_ok: bool = False  # Download finished successfully
    convert_garbage: bool = False  # The initial conversion produced garbage
    convert_ok: bool = False  # Conversion succeeded overall (pdftotext or OCR)
    extract_ok: bool = False  # Extraction finished successfully

    pdf_hash: str | None = None
    txt_hash: str | None = None

    _pdf_path: Path | None = None
    _txt_path: Path | None = None

    def is_ok_to_download(self, fresh: bool = True) -> bool:
        """Return True when ``fresh``; otherwise only if the download has not succeeded yet."""
        if fresh:
            return True
        return not self.download_ok

    def is_ok_to_convert(self, fresh: bool = True) -> bool:
        """Require a successful download; when not ``fresh``, also require that conversion has not succeeded."""
        if fresh:
            return self.download_ok
        return self.download_ok and not self.convert_ok

    def is_ok_to_analyze(self, fresh: bool = True) -> bool:
        """Require successful download and conversion; when not ``fresh``, also require that extraction has not succeeded."""
        prerequisites_met = self.download_ok and self.convert_ok
        if fresh:
            return prerequisites_met
        return prerequisites_met and not self.extract_ok

    @property
    def pdf_path(self) -> Path:
        """
        Path to the PDF file.

        :raises ValueError: if no PDF path is set.
        """
        current = self._pdf_path
        if not current:
            raise ValueError(f"pdf_path not set on {type(self)}")
        return current

    @pdf_path.setter
    def pdf_path(self, pth: str | Path | None) -> None:
        # Falsy input (None, empty string) clears the stored path.
        if pth:
            self._pdf_path = Path(pth)
        else:
            self._pdf_path = None

    @property
    def txt_path(self) -> Path:
        """
        Path to the TXT file.

        :raises ValueError: if no TXT path is set.
        """
        current = self._txt_path
        if not current:
            raise ValueError(f"txt_path not set on {type(self)}")
        return current

    @txt_path.setter
    def txt_path(self, pth: str | Path | None) -> None:
        # Falsy input (None, empty string) clears the stored path.
        if pth:
            self._txt_path = Path(pth)
        else:
            self._txt_path = None

    @property
    def serialized_attributes(self) -> list[str]:
        """Names of the attributes listed for serialization (flags and hashes, no paths)."""
        status_flags = ["download_ok", "convert_garbage", "convert_ok", "extract_ok"]
        digests = ["pdf_hash", "txt_hash"]
        return status_flags + digests

@dataclass(init=False)
class InternalState(ComplexSerializableType):
"""
Holds internal state of the certificate, whether downloads and converts of individual components succeeded. Also
holds information about errors and paths to the files.
"""

report: CCDocumentState = field(default_factory=CCDocumentState)
st: CCDocumentState = field(default_factory=CCDocumentState)
cert: CCDocumentState = field(default_factory=CCDocumentState)
report: CCCertificate.DocumentState
st: CCCertificate.DocumentState
cert: CCCertificate.DocumentState

def __init__(
self,
report: CCCertificate.DocumentState | None = None,
st: CCCertificate.DocumentState | None = None,
cert: CCCertificate.DocumentState | None = None,
):
super().__init__()
self.report = report if report is not None else CCCertificate.DocumentState()
self.st = st if st is not None else CCCertificate.DocumentState()
self.cert = cert if cert is not None else CCCertificate.DocumentState()

@property
def serialized_attributes(self) -> list[str]:
Expand Down
6 changes: 3 additions & 3 deletions tests/data/cc/analysis/cc_full_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -65,7 +65,7 @@
"txt_hash": "35627594d3806ac3926ec47f466503fe27781533da12beb6f8705882fccf125e"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -74,7 +74,7 @@
"txt_hash": "c8b4c5667a3f60edc845051e5a31a2d17b9d9a11df9e56dd89681d25e727a622"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down
18 changes: 9 additions & 9 deletions tests/data/cc/analysis/reference_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -54,7 +54,7 @@
"txt_hash": "460e8010dbc8f5de5b87bf96fd45c71cfd9f3869f34ca6ac1ab02cbd70d2523f"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -63,7 +63,7 @@
"txt_hash": "81c53d1e5b1c2fcb129ce1053d13cd1308f7a556921f0b9024cedf75c6b2efb7"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down Expand Up @@ -622,7 +622,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -631,7 +631,7 @@
"txt_hash": "0535df1c56fb4f87153cbffee51ba4d77fac47a6f17f024aa7d9df461028bc65"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -640,7 +640,7 @@
"txt_hash": "926668bea7c427a4fcf82857bfc63420f3597b6bff39699927a58f335620eaac"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down Expand Up @@ -1277,7 +1277,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -1286,7 +1286,7 @@
"txt_hash": "11e1262fd8f5df1b140f5e8813883b71447503781399427b35adbbecd00b4d63"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -1295,7 +1295,7 @@
"txt_hash": "179b07b4fc7402066a884edea494b28e324315108a5e0820184031f2e2062ad5"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down
18 changes: 9 additions & 9 deletions tests/data/cc/analysis/transitive_vulnerability_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -64,7 +64,7 @@
"txt_hash": "9d360141a98e764b15855f519b456c4e4639f993c4f8b5ab67e9c8ae7fbfc9e4"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -73,7 +73,7 @@
"txt_hash": "66271d8bf0b581a2f189301438f2aee13ff3da0bb0bb180bcf518261eb695496"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down Expand Up @@ -1378,7 +1378,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -1387,7 +1387,7 @@
"txt_hash": "dd120ba7667c2385839c96ee70c56f2a4d464fc95e3ea2818d31b3347d06fd4f"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -1396,7 +1396,7 @@
"txt_hash": "f7f7b8f31dddde3f0756cde8843061f01b606bdf266eca71dbcc56b3672d1db5"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down Expand Up @@ -2346,7 +2346,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -2355,7 +2355,7 @@
"txt_hash": "0a7c65e3d11f082c8f75aba7de0079c0b1aa5e67bb28d4635cbcaa4cd200d1c2"
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand All @@ -2364,7 +2364,7 @@
"txt_hash": "90b8e48add278faea4668eccba591d3992bf782669cca1b0a63bf6f21b514cd9"
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down
12 changes: 6 additions & 6 deletions tests/data/cc/analysis/vulnerable_dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": true,
"convert_garbage": false,
"convert_ok": true,
Expand All @@ -49,7 +49,7 @@
"txt_hash": null
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": true,
"convert_garbage": false,
"convert_ok": true,
Expand All @@ -58,7 +58,7 @@
"txt_hash": null
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down Expand Up @@ -121,7 +121,7 @@
"state": {
"_type": "sec_certs.sample.cc.CCCertificate.InternalState",
"report": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": true,
"convert_garbage": false,
"convert_ok": true,
Expand All @@ -130,7 +130,7 @@
"txt_hash": null
},
"st": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": true,
"convert_garbage": false,
"convert_ok": true,
Expand All @@ -139,7 +139,7 @@
"txt_hash": null
},
"cert": {
"_type": "sec_certs.sample.cc.CCDocumentState",
"_type": "sec_certs.sample.cc.CCCertificate.DocumentState",
"download_ok": false,
"convert_garbage": false,
"convert_ok": false,
Expand Down
Loading

0 comments on commit 77303ca

Please sign in to comment.