From eabcc26e4a7e38f47d9b83e1059d9148bbe3326b Mon Sep 17 00:00:00 2001
From: Adam Janovsky <janovsky.vm@gmail.com>
Date: Wed, 29 Mar 2023 16:33:42 +0200
Subject: [PATCH] switch to pydantic in settings management

---
 docs/configuration.md                       |  46 ++++-
 pyproject.toml                              |   1 +
 src/sec_certs/cli.py                        |  25 +--
 src/sec_certs/config/__init__.py            |   0
 src/sec_certs/config/configuration.py       |  44 -----
 src/sec_certs/config/settings-schema.json   | 182 --------------------
 src/sec_certs/config/settings.yaml          |  59 -------
 src/sec_certs/configuration.py              | 128 ++++++++++++++
 src/sec_certs/dataset/cc.py                 |   2 +-
 src/sec_certs/dataset/dataset.py            |   2 +-
 src/sec_certs/dataset/fips.py               |   2 +-
 src/sec_certs/dataset/fips_iut.py           |   2 +-
 src/sec_certs/dataset/fips_mip.py           |   2 +-
 src/sec_certs/dataset/protection_profile.py |   2 +-
 src/sec_certs/sample/fips.py                |   2 +-
 src/sec_certs/sample/fips_iut.py            |   2 +-
 src/sec_certs/sample/fips_mip.py            |   2 +-
 src/sec_certs/utils/parallel_processing.py  |   6 +-
 src/sec_certs/utils/tqdm.py                 |   2 +-
 tests/conftest.py                           |   6 +-
 tests/data/settings_tests.yml               |  60 +------
 tests/test_config.py                        |  69 ++++++++
 22 files changed, 264 insertions(+), 382 deletions(-)
 delete mode 100644 src/sec_certs/config/__init__.py
 delete mode 100644 src/sec_certs/config/configuration.py
 delete mode 100644 src/sec_certs/config/settings-schema.json
 delete mode 100644 src/sec_certs/config/settings.yaml
 create mode 100644 src/sec_certs/configuration.py
 create mode 100644 tests/test_config.py

diff --git a/docs/configuration.md b/docs/configuration.md
index 93c926ba..dcf2b8d8 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -6,21 +6,49 @@ mystnb:
 ---
 # Configuration
 
-The configuration is stored in yaml file `settings.yaml` at `sec_certs.config` package. Below are the supported options, descriptions and default values.
+The configuration class is defined in [configuration.py](https://github.com/crocs-muni/sec-certs/tree/main/src/sec_certs/configuration.py). From the CLI, you can load a custom configuration yaml with the `-c` or `--config` argument. From Python, you can replace the default configuration with
+
+```python
+from pathlib import Path
+import sec_certs.configuration as config_module
+
+config_module.config.load_from_yaml("/path/to/your/config.yaml")
+
+# or just set the individual key
+config_module.config.log_filepath = Path("/some/path/where/log/will/be/stored.txt")
+```
+
+The configuration yaml is a simple flat dictionary of keys and values. The configuration file can specify only *some* of the fields. For each unspecified field, an environment variable with the `seccerts_` prefix (case insensitive) will be checked. If no such variable is set, the default value will be used. Content in the yaml always beats the environment variable.
+
+For instance, when user provides the following yaml
+
+```yaml
+log_filepath: my_own_log_file.txt
+n_threads: 7
+```
+
+and sets `SECCERTS_MINIMAL_TOKEN_LENGTH=4` as an environment variable, only these 3 keys will be loaded with `config.load_from_yaml()`; the others will be left untouched.
+
+```{tip}
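+You can load settings even without providing a yaml configuration. Simply set the corresponding environment variables. Note that a `.env` file is not read automatically, because `Configuration` does not set pydantic's `env_file` option; export its content into the environment yourself.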
+```
+
+## Configuration keys, types, default values and descriptions
 
 
 ```{code-cell} python
-from sec_certs.config import configuration
+from sec_certs.configuration import config, Configuration
 from myst_nb import glue
 from IPython.display import Markdown
+import typing
 
-cfg = configuration.config
+type_hints = typing.get_type_hints(Configuration)
 text = ""
-for key in cfg.__dict__:
-    text += f"`{key}`\n\n- Description: {cfg.get_desription(key)}\n"
-    text += f"- Default value: `{cfg.__getattribute__(key)}`\n\n"
+for field, value in config.__fields__.items():
+    text += f"`{field}`\n\n"
+    text += f"- type: `{type_hints[field]}`\n"
+    text += f"- default: `{value.default}`\n"
+    text += f"- description: {value.field_info.description}\n"
+    text += f"- env name: `{list(value.field_info.extra['env_names'])[0]}`\n\n"
 glue("text", Markdown(text))
 ```
-```{glue:md} text
-:format: myst
-```
diff --git a/pyproject.toml b/pyproject.toml
index 0c7f33d0..7b7715f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,6 +58,7 @@
     "pySankeyBeta",
     "scipy>=1.9.0",
     "networkx",
+    "pydantic",
   ]
 
   [project.optional-dependencies]
diff --git a/src/sec_certs/cli.py b/src/sec_certs/cli.py
index 08ef6abd..d1a1eb90 100644
--- a/src/sec_certs/cli.py
+++ b/src/sec_certs/cli.py
@@ -9,8 +9,9 @@
 from typing import Callable
 
 import click
+from pydantic import ValidationError
 
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.dataset import CCDataset, FIPSDataset
 from sec_certs.dataset.dataset import Dataset
 from sec_certs.utils.helpers import warn_if_missing_poppler, warn_if_missing_tesseract
@@ -157,7 +158,7 @@ def build_or_load_dataset(
     "configpath",
     default=None,
     type=click.Path(file_okay=True, dir_okay=False, writable=True, readable=True),
-    help="Path to your own config yaml file that will override the default one.",
+    help="Path to your own config yaml file that will override the default config.",
 )
 @click.option(
     "-i",
@@ -176,6 +177,16 @@ def main(
     quiet: bool,
 ):
     try:
+        if configpath:
+            try:
+                config.load_from_yaml(configpath)
+            except FileNotFoundError:
+                click.echo("Error: Bad path to configuration file", err=True)
+                sys.exit(EXIT_CODE_NOK)
+            except (ValueError, ValidationError) as e:
+                click.echo(f"Error: Bad format of configuration file: {e}", err=True)
+                sys.exit(EXIT_CODE_NOK)
+
         file_handler = logging.FileHandler(config.log_filepath)
         stream_handler = logging.StreamHandler(sys.stderr)
         formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
@@ -185,16 +196,6 @@ def main(
         logging.basicConfig(level=logging.INFO, handlers=handlers)
         start = datetime.now()
 
-        if configpath:
-            try:
-                config.load(configpath)
-            except FileNotFoundError:
-                click.echo("Error: Bad path to configuration file", err=True)
-                sys.exit(EXIT_CODE_NOK)
-            except ValueError as e:
-                click.echo(f"Error: Bad format of configuration file: {e}", err=True)
-                sys.exit(EXIT_CODE_NOK)
-
         actions_set = (
             {"build", "process-aux-dsets", "download", "convert", "analyze"} if "all" in actions else set(actions)
         )
diff --git a/src/sec_certs/config/__init__.py b/src/sec_certs/config/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/sec_certs/config/configuration.py b/src/sec_certs/config/configuration.py
deleted file mode 100644
index 43cfe698..00000000
--- a/src/sec_certs/config/configuration.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from __future__ import annotations
-
-import json
-from pathlib import Path
-from typing import Any
-
-import jsonschema
-import yaml
-
-
-class Configuration:
-    def load(self, filepath: str | Path) -> None:
-        with Path(filepath).open("r") as file:
-            state = yaml.load(file, Loader=yaml.FullLoader)
-
-        script_dir = Path(__file__).parent
-
-        with (Path(script_dir) / "settings-schema.json").open("r") as file:
-            schema = json.loads(file.read())
-
-        try:
-            jsonschema.validate(state, schema)
-        except jsonschema.exceptions.ValidationError as e:
-            print(f"{e}\n\nIn file {filepath}")
-
-        for k, v in state.items():
-            setattr(self, k, v)
-
-    def __getattribute__(self, key: str) -> Any:
-        res = object.__getattribute__(self, key)
-        if isinstance(res, dict) and "value" in res:
-            return res["value"]
-        return object.__getattribute__(self, key)
-
-    def get_desription(self, key: str) -> str | None:
-        res = object.__getattribute__(self, key)
-        if isinstance(res, dict) and "description" in res:
-            return res["description"]
-        return None
-
-
-DEFAULT_CONFIG_PATH = Path(__file__).parent / "settings.yaml"
-config = Configuration()
-config.load(DEFAULT_CONFIG_PATH)
diff --git a/src/sec_certs/config/settings-schema.json b/src/sec_certs/config/settings-schema.json
deleted file mode 100644
index 2d5c17a6..00000000
--- a/src/sec_certs/config/settings-schema.json
+++ /dev/null
@@ -1,182 +0,0 @@
-{
-    "title": "settings for sec-certs",
-    "type": "object",
-    "definitions": {
-        "settings_string_entry": {
-            "required": [
-                "description",
-                "value"
-            ],
-            "type": "object",
-            "properties": {
-                "description": {
-                    "type": "string"
-                },
-                "value": {
-                    "type": "string"
-                }
-            }
-        },
-        "settings_boolean_entry": {
-            "type": "object",
-            "required": [
-                "description",
-                "value"
-            ],
-            "properties": {
-                "description": {
-                    "type": "string"
-                },
-                "value": {
-                    "type": "boolean"
-                }
-            }
-        },
-        "settings_number_entry": {
-            "type": "object",
-            "required": [
-                "description",
-                "value"
-            ],
-            "properties": {
-                "description": {
-                    "type": "string"
-                },
-                "value": {
-                    "type": "number"
-                }
-            }
-        },
-        "settings_url_entry": {
-            "type": "object",
-            "required": [
-                "description",
-                "value"
-            ],
-            "properties": {
-                "description": {
-                    "type": "string"
-                },
-                "value": {
-                    "type": "string",
-                    "format": "uri",
-                    "pattern": "^(https?|http?)://",
-                    "minLength": 1,
-                    "maxLength": 255
-                }
-            }
-        }
-    },
-    "properties": {
-        "log_filepath": {
-            "$ref": "#/definitions/settings_string_entry"
-        },
-        "always_false_positive_fips_cert_id_threshold": {
-            "$ref": "#/definitions/settings_number_entry"
-        },
-        "year_difference_between_validations": {
-            "allOf": [
-                {
-                    "$ref": "#/definitions/settings_number_entry"
-                },
-                {
-                    "properties": {
-                        "value": {
-                            "minimum": 0
-                        }
-                    }
-                }
-            ]
-        },
-        "n_threads": {
-            "allOf": [
-                {
-                    "$ref": "#/definitions/settings_number_entry"
-                },
-                {
-                    "properties": {
-                        "value": {
-                            "minimum": -1
-                        }
-                    }
-                }
-            ]
-        },
-        "cpe_n_matching_threshold": {
-            "allOf": [
-                {
-                    "$ref": "#/definitions/settings_number_entry"
-                },
-                {
-                    "properties": {
-                        "value": {
-                            "minimum": 0,
-                            "maximum": 100
-                        }
-                    }
-                }
-            ]
-        },
-        "cpe_n_max_matches": {
-            "allOf": [
-                {
-                    "$ref": "#/definitions/settings_number_entry"
-                },
-                {
-                    "properties": {
-                        "value": {
-                            "exclusiveMinimum": 0
-                        }
-                    }
-                }
-            ]
-        },
-        "cc_latest_snapshot": {
-            "$ref": "#/definitions/settings_url_entry"
-        },
-        "cc_maintenances_latest_snapshot": {
-            "$ref": "#/definitions/settings_url_entry"
-        },
-        "pp_latest_snapshot": {
-            "$ref": "#/definitions/settings_url_entry"
-        },
-        "ignore_first_page": {
-            "$ref": "#/definitions/settings_boolean_entry"
-        },
-        "cert_threshold": {
-            "allOf": [
-                {
-                    "$ref": "#/definitions/settings_number_entry"
-                },
-                {
-                    "properties": {
-                        "value": {
-                            "minimum": 0
-                        }
-                    }
-                }
-            ]
-        },
-        "fips_latest_snapshot": {
-            "$ref": "#/definitions/settings_url_entry"
-        },
-        "enable_progress_bars": {
-            "$ref": "#/definitions/settings_boolean_entry"
-        }
-    },
-    "required": [
-        "log_filepath",
-        "always_false_positive_fips_cert_id_threshold",
-        "year_difference_between_validations",
-        "n_threads",
-        "cpe_matching_threshold",
-        "cpe_n_max_matches",
-        "cc_latest_snapshot",
-        "cc_maintenances_latest_snapshot",
-        "pp_latest_snapshot",
-        "ignore_first_page",
-        "cert_threshold",
-        "fips_latest_snapshot",
-        "enable_progress_bars"
-    ]
-}
\ No newline at end of file
diff --git a/src/sec_certs/config/settings.yaml b/src/sec_certs/config/settings.yaml
deleted file mode 100644
index c69f7cdf..00000000
--- a/src/sec_certs/config/settings.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
----
-log_filepath:
-  description: Path to the file, relative to working directory, where the log will be stored
-  value: ./cert_processing_log.txt
-always_false_positive_fips_cert_id_threshold:
-  description:
-    During validation we don't connect certificates with number lower than
-    _this_ to connections due to these numbers being typically false positives
-  value: 40
-year_difference_between_validations:
-  description:
-    During validation we don't connect certificates with validation dates
-    difference higher than _this_
-  value: 7
-n_threads:
-  description: How many threads to use for parallel computations. Set to -1 to use all cores (*2 with multithreading).
-  value: -1
-cpe_matching_threshold:
-  description: Level of required string similarity between CPE and certificate name on CC CPE matching, 0-100. Lower values yield more false negatives, higher values more false positives
-  value: 92
-cpe_n_max_matches:
-  description: Maximum number of candidate CPE items that may be related to given certificate, >0
-  value: 99
-cc_latest_snapshot:
-  description: URL from where to fetch the latest snapshot of fully processed CC dataset
-  value: https://seccerts.org/cc/dataset.json
-cc_maintenances_latest_snapshot:
-  description: URL from where to fetch the latest snapshot of CC maintenance updates
-  value: https://seccerts.org/cc/maintenance_updates.json
-pp_latest_snapshot:
-  description: URL from where to fetch the latest snapshot of the PP dataset
-  value: https://seccerts.org/static/pp.json
-ignore_first_page:
-  description: During keyword search, first page usually contains addresses - ignore it.
-  value: true
-cert_threshold:
-  description: Used with --higher-precision-results. Determines the amount of mismatched algorithms to be considered faulty.
-  value: 5
-fips_latest_snapshot:
-  description: URL for the latest snapshot of FIPS dataset
-  value: https://seccerts.org/fips/dataset.json
-fips_iut_dataset:
-  description: URL for the dataset of FIPS IUT data
-  value: https://seccerts.org/fips/iut/dataset.json
-fips_iut_latest_snapshot:
-  description: URL for the latest snapshot of FIPS IUT data
-  value: https://seccerts.org/fips/iut/latest.json
-fips_mip_dataset:
-  description: URL for the dataset of FIPS MIP data
-  value: https://seccerts.org/fips/mip/dataset.json
-fips_mip_latest_snapshot:
-  description: URL for the latest snapshot of FIPS MIP data
-  value: https://seccerts.org/fips/mip/latest.json
-minimal_token_length:
-  description: Minimal length of a string that will be considered as a token during keyword extraction in CVE matching
-  value: 3
-enable_progress_bars:
-  description: Whether to enable pretty-printed progress bars while processing.
-  value: true
diff --git a/src/sec_certs/configuration.py b/src/sec_certs/configuration.py
new file mode 100644
index 00000000..59ed0d34
--- /dev/null
+++ b/src/sec_certs/configuration.py
@@ -0,0 +1,128 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Optional
+
+import yaml
+from pydantic import AnyHttpUrl, BaseSettings, Field
+
+
+class Configuration(BaseSettings):
+    """
+    Class that holds configuration.
+    While not a singleton, the `config` instance from this module is meant to be primarily used.
+    """
+
+    class Config:
+        env_prefix = "seccerts_"
+
+    log_filepath: Path = Field(
+        "./cert_processing_log.txt",
+        description="Path to the file, relative to working directory, where the log will be stored.",
+    )
+    always_false_positive_fips_cert_id_threshold: int = Field(
+        40,
+        description="During validation we don't connect certificates with number lower than _this_ to connections due to these numbers being typically false positives.",
+        ge=0,
+    )
+    year_difference_between_validations: int = Field(
+        7,
+        description=" During validation we don't connect certificates with validation dates difference higher than _this_.",
+    )
+    n_threads: int = Field(
+        -1, description="How many threads to use for parallel computations. Set to -1 to use all logical cores.", ge=-1
+    )
+    cpe_matching_threshold: int = Field(
+        92,
+        description="Level of required string similarity between CPE and certificate name on CC CPE matching, 0-100. Lower values yield more false negatives, higher values more false positives",
+        ge=0,
+        le=100,
+    )
+    cpe_n_max_matches: int = Field(
+        99, description="Maximum number of candidate CPE items that may be related to given certificate, >0", gt=0
+    )
+    cc_latest_snapshot: AnyHttpUrl = Field(
+        "https://seccerts.org/cc/dataset.json",
+        description="URL from where to fetch the latest snapshot of fully processed CC dataset.",
+    )
+    cc_maintenances_latest_snapshot: AnyHttpUrl = Field(
+        "https://seccerts.org/cc/maintenance_updates.json",
+        description="URL from where to fetch the latest snapshot of CC maintenance updates",
+    )
+    pp_latest_snapshot: AnyHttpUrl = Field(
+        "https://seccerts.org/static/pp.json",
+        description="URL from where to fetch the latest snapshot of the PP dataset.",
+    )
+    fips_latest_snapshot: AnyHttpUrl = Field(
+        "https://seccerts.org/fips/dataset.json", description="URL for the latest snapshot of FIPS dataset."
+    )
+    fips_iut_dataset: AnyHttpUrl = Field(
+        "https://seccerts.org/fips/iut/dataset.json", description="URL for the dataset of FIPS IUT data."
+    )
+    fips_iut_latest_snapshot: AnyHttpUrl = Field(
+        "https://seccerts.org/fips/iut/latest.json", description="URL for the latest snapshot of FIPS IUT data."
+    )
+    fips_mip_dataset: AnyHttpUrl = Field(
+        "https://seccerts.org/fips/mip/dataset.json", description="URL for the dataset of FIPS MIP data"
+    )
+    fips_mip_latest_snapshot: AnyHttpUrl = Field(
+        "https://seccerts.org/fips/mip/latest.json", description="URL for the latest snapshot of FIPS MIP data"
+    )
+    minimal_token_length: int = Field(
+        3,
+        description="Minimal length of a string that will be considered as a token during keyword extraction in CVE matching",
+        ge=0,
+    )
+    ignore_first_page: bool = Field(
+        True, description="During keyword search, first page usually contains addresses - ignore it."
+    )
+    cc_reference_annotator_dir: Optional[Path] = Field(  # noqa: UP007
+        None,
+        description="Path to directory with serialized reference annotator model. If set to `null`, tool will search default directory for the given dataset.",
+    )
+    cc_reference_annotator_should_train: bool = Field(
+        True, description="True if new reference annotator model shall be build, False otherwise."
+    )
+
+    enable_progress_bars: bool = Field(
+        True, description="If true, progress bars will be printed to stdout during computation."
+    )
+
+    def _get_nondefault_keys(self) -> set[str]:
+        """
+        Returns keys of the config that have a non-default value, i.e. were provided as kwargs, env. vars. or additionally set.
+        """
+        return {key for key, value in Configuration.__fields__.items() if getattr(self, key) != value.default}
+
+    def _set_attrs_from_cfg(self, other_cfg: Configuration, fields_to_set: set[str] | None) -> None:
+        if fields_to_set is None:  # only None means "copy every field"; an empty set must copy nothing
+            fields_to_set = set(Configuration.__fields__.keys())
+        for field in [x for x in other_cfg.__fields__ if x in fields_to_set]:
+            setattr(self, field, getattr(other_cfg, field))
+
+    def load_from_yaml(self, yaml_path: str | Path) -> None:
+        """
+        Will read configuration keys from `yaml_path` and overwrite the corresponding keys in `self`.
+        Also, will check environment variables with `seccerts_` prefix; keys present in the yaml take precedence.
+
+        :param str | Path yaml_path: path to yaml to read for configuration.
+        """
+        with Path(yaml_path).open("r") as handle:
+            data = yaml.safe_load(handle)
+        other_cfg = Configuration.parse_obj(data)
+        keys_to_rewrite = set(data.keys()).union(other_cfg._get_nondefault_keys())
+        self._set_attrs_from_cfg(other_cfg, keys_to_rewrite)
+
+    def to_yaml(self, yaml_path: str | Path) -> None:
+        """
+        Will dump the configuration to yaml file.
+
+        :param str | Path yaml_path: path where the configuration will be dumped.
+        """
+        model_dict = json.loads(self.json())  # to assure that we have serializable values
+        with Path(yaml_path).open("w") as handle:
+            yaml.safe_dump(model_dict, handle)
+
+
+config = Configuration()
diff --git a/src/sec_certs/dataset/cc.py b/src/sec_certs/dataset/cc.py
index c2849772..7d0fd207 100644
--- a/src/sec_certs/dataset/cc.py
+++ b/src/sec_certs/dataset/cc.py
@@ -18,7 +18,7 @@
 
 import sec_certs.utils.sanitization
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.dataset.cpe import CPEDataset
 from sec_certs.dataset.cve import CVEDataset
 from sec_certs.dataset.dataset import AuxiliaryDatasets, Dataset, logger
diff --git a/src/sec_certs/dataset/dataset.py b/src/sec_certs/dataset/dataset.py
index 350e5126..d388db2e 100644
--- a/src/sec_certs/dataset/dataset.py
+++ b/src/sec_certs/dataset/dataset.py
@@ -15,7 +15,7 @@
 import pandas as pd
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.dataset.cpe import CPEDataset
 from sec_certs.dataset.cve import CVEDataset
 from sec_certs.model.cpe_matching import CPEClassifier
diff --git a/src/sec_certs/dataset/fips.py b/src/sec_certs/dataset/fips.py
index ce7563a0..3c057266 100644
--- a/src/sec_certs/dataset/fips.py
+++ b/src/sec_certs/dataset/fips.py
@@ -12,7 +12,7 @@
 from bs4 import BeautifulSoup, NavigableString
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.dataset.cpe import CPEDataset
 from sec_certs.dataset.cve import CVEDataset
 from sec_certs.dataset.dataset import AuxiliaryDatasets, Dataset
diff --git a/src/sec_certs/dataset/fips_iut.py b/src/sec_certs/dataset/fips_iut.py
index ce0f2f76..0251bf9b 100644
--- a/src/sec_certs/dataset/fips_iut.py
+++ b/src/sec_certs/dataset/fips_iut.py
@@ -8,7 +8,7 @@
 import requests
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.dataset.dataset import logger
 from sec_certs.dataset.json_path_dataset import JSONPathDataset
 from sec_certs.sample.fips_iut import IUTSnapshot
diff --git a/src/sec_certs/dataset/fips_mip.py b/src/sec_certs/dataset/fips_mip.py
index 05ca5854..1b0d2032 100644
--- a/src/sec_certs/dataset/fips_mip.py
+++ b/src/sec_certs/dataset/fips_mip.py
@@ -8,7 +8,7 @@
 import requests
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.dataset.dataset import logger
 from sec_certs.dataset.json_path_dataset import JSONPathDataset
 from sec_certs.sample.fips_mip import MIPSnapshot
diff --git a/src/sec_certs/dataset/protection_profile.py b/src/sec_certs/dataset/protection_profile.py
index 9730a477..af7733a8 100644
--- a/src/sec_certs/dataset/protection_profile.py
+++ b/src/sec_certs/dataset/protection_profile.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.sample.protection_profile import ProtectionProfile
 from sec_certs.serialization.json import get_class_fullname
 from sec_certs.utils import helpers
diff --git a/src/sec_certs/sample/fips.py b/src/sec_certs/sample/fips.py
index f3c82117..82aa1c18 100644
--- a/src/sec_certs/sample/fips.py
+++ b/src/sec_certs/sample/fips.py
@@ -16,7 +16,7 @@
 
 from sec_certs import constants
 from sec_certs.cert_rules import FIPS_ALGS_IN_TABLE, fips_rules
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.sample.certificate import Certificate, References, logger
 from sec_certs.sample.certificate import Heuristics as BaseHeuristics
 from sec_certs.sample.certificate import PdfData as BasePdfData
diff --git a/src/sec_certs/sample/fips_iut.py b/src/sec_certs/sample/fips_iut.py
index f6010346..a603a2c2 100644
--- a/src/sec_certs/sample/fips_iut.py
+++ b/src/sec_certs/sample/fips_iut.py
@@ -10,7 +10,7 @@
 from bs4 import BeautifulSoup, Tag
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.serialization.json import ComplexSerializableType
 from sec_certs.utils.helpers import to_utc
 
diff --git a/src/sec_certs/sample/fips_mip.py b/src/sec_certs/sample/fips_mip.py
index 7e0ddff0..f1ef05c8 100644
--- a/src/sec_certs/sample/fips_mip.py
+++ b/src/sec_certs/sample/fips_mip.py
@@ -12,7 +12,7 @@
 from bs4 import BeautifulSoup, Tag
 
 from sec_certs import constants
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.constants import FIPS_MIP_STATUS_RE
 from sec_certs.serialization.json import ComplexSerializableType
 from sec_certs.utils.helpers import to_utc
diff --git a/src/sec_certs/utils/parallel_processing.py b/src/sec_certs/utils/parallel_processing.py
index b3016695..ae3f0b44 100644
--- a/src/sec_certs/utils/parallel_processing.py
+++ b/src/sec_certs/utils/parallel_processing.py
@@ -2,12 +2,10 @@
 
 import time
 from multiprocessing import cpu_count
-from multiprocessing.pool import ThreadPool
+from multiprocessing.pool import Pool, ThreadPool
 from typing import Any, Callable, Iterable
 
-from billiard.pool import Pool
-
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 from sec_certs.utils.tqdm import tqdm
 
 
diff --git a/src/sec_certs/utils/tqdm.py b/src/sec_certs/utils/tqdm.py
index 77eeae94..581295ad 100644
--- a/src/sec_certs/utils/tqdm.py
+++ b/src/sec_certs/utils/tqdm.py
@@ -1,6 +1,6 @@
 from tqdm import tqdm as tqdm_original
 
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 
 
 def tqdm(*args, **kwargs):
diff --git a/tests/conftest.py b/tests/conftest.py
index b00c1bbe..1c8d23c0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,10 +3,10 @@
 import pytest
 
 import tests.data
-from sec_certs.config.configuration import config
+from sec_certs.configuration import config
 
 
-@pytest.fixture(scope="session", autouse=True)
+@pytest.fixture(scope="module", autouse=True)
 def load_test_config():
     pth = Path(tests.data.__path__[0]) / "settings_tests.yml"
-    config.load(pth)
+    config.load_from_yaml(pth)
diff --git a/tests/data/settings_tests.yml b/tests/data/settings_tests.yml
index cf5a29b7..645f11ab 100644
--- a/tests/data/settings_tests.yml
+++ b/tests/data/settings_tests.yml
@@ -1,59 +1 @@
----
-log_filepath:
-  description: Path to the file, relative to working directory, where the log will be stored
-  value: ./cert_processing_log.txt
-always_false_positive_fips_cert_id_threshold:
-  description:
-    During validation we don't connect certificates with number lower than
-    _this_ to connections due to these numbers being typically false positives
-  value: 40
-year_difference_between_validations:
-  description:
-    During validation we don't connect certificates with validation dates
-    difference higher than _this_
-  value: 7
-n_threads:
-  description: How many threads to use for parallel computations
-  value: 8
-cpe_matching_threshold:
-  description: Level of required string similarity between CPE and certificate name on CC CPE matching, 0-100. Lower values yield more false negatives, higher values more false positives
-  value: 92
-cpe_n_max_matches:
-  description: Maximum number of candidate CPE items that may be related to given certificate, >0
-  value: 99
-cc_latest_snapshot:
-  description: URL from where to fetch the latest snapshot of fully processed CC dataset
-  value: https://seccerts.org/cc/dataset.json
-cc_maintenances_latest_snapshot:
-  description: URL from where to fetch the latest snapshot of CC maintenance updates
-  value: https://seccerts.org/cc/maintenance_updates.json
-pp_latest_snapshot:
-  description: URL from where to fetch the latest snapshot of the PP dataset
-  value: https://seccerts.org/static/pp.json
-ignore_first_page:
-  description: During keyword search, first page usually contains addresses - ignore it.
-  value: true
-cert_threshold:
-  description: Used with --higher-precision-results. Determines the amount of mismatched algorithms to be considered faulty.
-  value: 5
-fips_latest_snapshot:
-  description: URL for the latest snapshot of FIPS dataset
-  value: https://seccerts.org/fips/dataset.json
-fips_iut_dataset:
-  description: URL for the dataset of FIPS IUT data
-  value: https://seccerts.org/fips/iut/dataset.json
-fips_iut_latest_snapshot:
-  description: URL for the latest snapshot of FIPS IUT data
-  value: https://seccerts.org/fips/iut/latest.json
-fips_mip_dataset:
-  description: URL for the dataset of FIPS MIP data
-  value: https://seccerts.org/fips/mip/dataset.json
-fips_mip_latest_snapshot:
-  description: URL for the latest snapshot of FIPS MIP data
-  value: https://seccerts.org/fips/mip/latest.json
-minimal_token_length:
-  description: Minimal length of a string that will be considered as a token during keyword extraction in CVE matching
-  value: 3
-enable_progress_bars:
-  description: Whether to enable pretty-printed progress bars while processing.
-  value: False
+enable_progress_bars: false
diff --git a/tests/test_config.py b/tests/test_config.py
new file mode 100644
index 00000000..39102111
--- /dev/null
+++ b/tests/test_config.py
@@ -0,0 +1,69 @@
+from __future__ import annotations
+
+import os
+from pathlib import Path
+from typing import Any
+
+import pytest
+import yaml
+
+import sec_certs.configuration as config_module
+import tests.data
+
+
+@pytest.fixture(autouse=True)
+def load_test_config():
+    pth = Path(tests.data.__path__[0]) / "settings_tests.yml"
+    config_module.config.load_from_yaml(pth)
+
+
+@pytest.fixture
+def simple_config_dict() -> dict[str, Any]:
+    return {
+        "always_false_positive_fips_cert_id_threshold": 42,
+        "cc_reference_annotator_should_train": False,
+    }
+
+
+@pytest.fixture
+def simple_config_yaml(simple_config_dict, tmp_path) -> Path:
+    yaml_path = tmp_path / "config.yaml"
+    with yaml_path.open("w") as handle:
+        yaml.safe_dump(simple_config_dict, handle)
+    return yaml_path
+
+
+def test_config_from_yaml(simple_config_dict, simple_config_yaml: Path) -> None:
+    config_module.config.load_from_yaml(simple_config_yaml)
+
+    for key, val in simple_config_dict.items():
+        assert getattr(config_module.config, key) == val
+
+
+def test_load_env_values(simple_config_dict, simple_config_yaml, monkeypatch):
+    monkeypatch.setenv("seccerts_log_filepath", "/some/nonsense/path")  # undone at teardown, no leak
+    monkeypatch.setenv("seccerts_always_false_positive_fips_cert_id_threshold", "10")
+
+    config_module.config.load_from_yaml(simple_config_yaml)
+
+    # the yaml value should also beat the (correctly prefixed) env value set above
+    for key, val in simple_config_dict.items():
+        assert getattr(config_module.config, key) == val
+
+    assert config_module.config.log_filepath == Path("/some/nonsense/path")
+
+
+def test_complex_config_load(simple_config_dict, simple_config_yaml, monkeypatch):
+    monkeypatch.setattr(config_module.config, "year_difference_between_validations", 123456789)
+    monkeypatch.setattr(config_module.config, "n_threads", 987654321)
+    monkeypatch.setitem(os.environ, "seccerts_n_threads", "1")  # restored at teardown, no leak
+
+    config_module.config.load_from_yaml(simple_config_yaml)
+    for key, val in simple_config_dict.items():
+        assert getattr(config_module.config, key) == val
+
+    # year_difference_between_validations should not get overwritten
+    assert config_module.config.year_difference_between_validations == 123456789
+
+    # n_threads should get overwritten
+    assert config_module.config.n_threads == 1