### Validate mandatory fields in the water_column and soft_sediment sampling sheets (the measured sheets are not validated here)

In [1]:
import sys
from pathlib import Path

# Make the parent of the current working directory, and its "src" folder,
# importable from this notebook.
PROJECT_DIR = Path.cwd().parents[0]
sys.path.extend([str(PROJECT_DIR), str(PROJECT_DIR / "src")])

In [2]:
# %load_ext mypy_ipython
# %load_ext autoreload
# %autoreload 2
# %reload_ext autoreload

import math
import sys
from pathlib import Path
from pprint import pprint

import pandas as pd
import validators
from pydantic import ValidationError

from validation_classes import (
    measuredModel,
    samplingModel,
    softSedimentMandatoryModel,
    waterColumnMandatoryModel,
)

############################ CAUTION ##################################################
# Switch read by parse_sample_sheets(): when truthy, the validator is looked up
# under the key "<sampling_strategy>_mandatory"; when falsy, parse_sample_sheets()
# raises ValueError because no other model type is selected.
MANDATORY: bool = True  # As defined by Ioulia but not checking for mandatory fields
#######################################################################################


def _filter_on_source_mat_id(record: dict) -> bool:
    """Decide whether a sheet row should be kept for validation.

    A row is kept when its ``source_mat_id`` is a string with at least four
    "_"-separated parts and its ``source_mat_id_orig`` (when that column
    exists and holds text) does not contain "si les filets".

    Args:
        record: One sheet row as a ``{column name: value}`` dict.

    Returns:
        True to keep the row, False to discard it.

    Raises:
        ValueError: If the row has no ``source_mat_id`` key, or if that value
            is a float other than NaN.
    """
    try:
        source_mat_id = record["source_mat_id"]
    except KeyError as e:
        raise ValueError("Cannot find source_mat_id field") from e

    if isinstance(source_mat_id, float):
        if math.isnan(source_mat_id):
            # Empty cell: the row carries no sample identifier
            return False
        raise ValueError(
            f"Unrecognised float value: {source_mat_id} in source_mat_id"
        )
    if source_mat_id is None or len(source_mat_id.split("_")) < 4:
        return False

    # VB_IMEV has "à vérifier si les filets sont présents" for source_mat_id_orig
    # but valid source_mat_id
    # https://github.com/emo-bon/observatory-profile/issues/37
    try:
        source_mat_id_orig = record["source_mat_id_orig"]
    except KeyError:
        # Measured sheets don't have source_mat_id_orig
        return True
    if not isinstance(source_mat_id_orig, str):
        # An empty cell arrives as NaN (a float); a substring test on it would
        # raise TypeError, and there is no text to reject, so keep the row.
        return True
    return "si les filets" not in source_mat_id_orig


def parse_sample_sheets(
    sampling_strategy: str,
    sheet_type: str,
    addresses: list[tuple[str, str]],
) -> None:
    """Download, filter and validate one sheet per observatory.

    For every ``(observatory_id, sheet_link)`` pair the sheet named
    ``sheet_type`` is read as CSV, rows without a usable ``source_mat_id`` are
    dropped, and the remaining rows are validated with the model registered in
    the module-level ``validator_classes`` under
    ``f"{sampling_strategy}_mandatory"``.

    Side effects (paths are relative to the module-level ``PROJECT_DIR``):
      * if any row fails validation, the errors are pretty-printed to
        ``logs/validation_errors/<obs>_<strategy>_<model>_ERRORS.log``;
      * otherwise the validated rows are written to
        ``validated-data/logsheets_mandatory/<obs>_<strategy>_<model>_validated.csv``.
    Progress is reported with ``print``.

    Args:
        sampling_strategy: Strategy name, e.g. "water_column" or "soft_sediment".
        sheet_type: Name of the sheet (tab) to fetch, e.g. "sampling".
        addresses: ``(observatory_id, sheet_link)`` pairs; a NaN ``sheet_link``
            means the observatory does not do this sampling strategy.

    Raises:
        ValueError: If a ``sheet_link`` is neither a string nor NaN, is not a
            valid URL, a row is rejected by the source_mat_id filter with an
            error, or the module-level ``MANDATORY`` flag is falsy.
        RuntimeError: If no validation error was recorded but the number of
            validated rows differs from the number of filtered rows.
    """
    for observatory_id, sheet_link in addresses:
        if not isinstance(sheet_link, str):
            if isinstance(sheet_link, float) and math.isnan(sheet_link):
                print(
                    f"\nObservatory {observatory_id} does not do {sampling_strategy}"
                )
                continue
            # Anything else (including a non-NaN float, which used to be
            # skipped silently) is not a usable link.
            raise ValueError(
                f"Unknown URL value {sheet_link} to observatory {observatory_id}"
            )

        if not validators.url(sheet_link):
            raise ValueError(f"URL {sheet_link=} is not valid")

        if observatory_id == "Plenzia":
            continue  # Sheets not publicly available
        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "soft_sediment" and observatory_id == "UMF":
            continue

        print(f"\n\nProcessing {observatory_id}...")
        # Swap the "/edit..." tail of the sharing link for the CSV query
        # endpoint of the requested sheet.
        sampling_sheet_base: str = sheet_link.split("/edit")[0]
        sample_sheet_link: str = (
            f"{sampling_sheet_base}/gviz/tq?tqx=out:csv&sheet={sheet_type}"
        )
        print(f"Sample sheet data: {sample_sheet_link}")
        print(f"Sheet type link: {sheet_link}")
        df: pd.DataFrame = pd.read_csv(sample_sheet_link, encoding="utf-8")
        data_records_all: list[dict] = df.to_dict(orient="records")

        data_records_filtered: list[dict] = [
            record
            for record in data_records_all
            if _filter_on_source_mat_id(record)
        ]

        if len(data_records_all) > len(data_records_filtered):
            print(
                f"Discarded {len(data_records_all) - len(data_records_filtered)}"
                f" records leaving {len(data_records_filtered)}"
            )
        else:
            print(f"All {len(data_records_filtered)} records passed through filter")

        if not data_records_filtered:
            # Nothing to validate; building the output frame from zero rows
            # would fail because the "source_mat_id" index column is absent.
            print(f"No records left to validate for {observatory_id}, skipping")
            continue

        if MANDATORY:
            model_type = f"{sampling_strategy}_mandatory"
        else:
            # Only the mandatory models are selectable from this function
            raise ValueError(
                "No model type specified set MANDATORY to True or False"
            )
        print(f"Using model: {model_type}")

        validator = validator_classes[model_type]
        validated_rows = []
        # Each entry is a one-element list: [(source_mat_id, list of error dicts)]
        errors = []
        for row in data_records_filtered:
            try:
                vr = validator(**row)
            except ValidationError as e:
                errors.append([(row["source_mat_id"], e.errors())])
            else:
                validated_rows.append(vr.model_dump())

        if errors:
            total_number_errors: int = sum(
                len(row_errors)
                for entry in errors
                for _source_mat_id, row_errors in entry
            )
            print(f"Errors were found... {total_number_errors} in total")
            save_dir_errors: Path = PROJECT_DIR / "logs" / "validation_errors"
            # The log directory may not exist on a fresh checkout
            save_dir_errors.mkdir(parents=True, exist_ok=True)
            outfile_name_log: str = (
                f"{observatory_id}_{sampling_strategy}_{model_type}_ERRORS.log"
            )
            out_path_log: Path = save_dir_errors / outfile_name_log
            # Explicit encoding: sheet values may contain non-ASCII text
            with open(out_path_log, "w", encoding="utf-8") as f:
                pprint(errors, f)
            print(f"Written {out_path_log}")
            continue

        if len(validated_rows) != len(data_records_filtered):
            raise RuntimeError(
                f"Error: {len(validated_rows)=} != {len(data_records_filtered)=}"
            )

        print("All records passed without errors!")
        ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
        save_dir_logsheets: Path = (
            PROJECT_DIR / "validated-data" / "logsheets_mandatory"
        )
        save_dir_logsheets.mkdir(parents=True, exist_ok=True)
        outfile_name: str = (
            f"{observatory_id}_{sampling_strategy}_{model_type}_validated.csv"
        )
        ndf.to_csv(save_dir_logsheets / outfile_name)
        print(f"Written {save_dir_logsheets / outfile_name}")


# Lookup table: model-type key -> validation model imported from validation_classes
validator_classes = dict(
    sampling=samplingModel,
    measured=measuredModel,
    water_column_mandatory=waterColumnMandatoryModel,
    soft_sediment_mandatory=softSedimentMandatoryModel,
)

# Collect the URL of every observatory's sampling sheet.
# NB: the CSV is read straight from its path; a "with" block can't be used
# here when reading the Pandas df.
validated_csv = PROJECT_DIR.joinpath(
    "validated-data", "governance", "logsheets_validated.csv"
)
df: pd.DataFrame = pd.read_csv(validated_csv)
# Each entry is an [observatory_id, link] pair (a two-item list from .tolist())
water_column_sheet_addresses: list[tuple[str, str]] = (
    df.loc[:, ["observatory_id", "water_column"]].values.tolist()
)
soft_sediment_sheet_addresses: list[tuple[str, str]] = (
    df.loc[:, ["observatory_id", "soft_sediment"]].values.tolist()
)
del df  # the governance frame is not needed once the address lists exist

parse_sample_sheets("water_column", "sampling", water_column_sheet_addresses)
parse_sample_sheets("soft_sediment", "sampling", soft_sediment_sheet_addresses)



Processing ESC68N...
Sample sheet data: https://docs.google.com/spreadsheets/d/11_Eu0W1-sDiuzKx1cIl6YuxjRHmWezN6u9v3Ly8JZ3A/gviz/tq?tqx=out:csv&sheet=sampling
Sheet type link: https://docs.google.com/spreadsheets/d/11_Eu0W1-sDiuzKx1cIl6YuxjRHmWezN6u9v3Ly8JZ3A/edit?usp=sharing
All 170 records passed through filter
Using model: water_column_mandatory
All records passed without errors!
Written /home/cymon/vscode/git-repos/emo-bon-data-validation/validated-data/logsheets_mandatory/ESC68N_water_column_water_column_mandatory_validated.csv


Processing Bergen...
Sample sheet data: https://docs.google.com/spreadsheets/d/1HuXHiUJICZrmCrJ4EZDyU5aSCMzDAc1cy_tne5YVPTg/gviz/tq?tqx=out:csv&sheet=sampling
Sheet type link: https://docs.google.com/spreadsheets/d/1HuXHiUJICZrmCrJ4EZDyU5aSCMzDAc1cy_tne5YVPTg/edit?usp=sharing
All 150 records passed through filter
Using model: water_column_mandatory
Errors were found... 150 in total
Written /home/cymon/vscode/git-repos/emo-bon-data-validation/logs/valida

In [3]:
# %reload_ext mypy_ipython
# %mypy