### Mandatory fields

The problem is that the mandatory fields differ between water-column sampling, soft-sediment sampling, and (possibly) the wc-measured and ss-measured sheets.

We need to parse the "Updated definitions" sheets of both sampling types and compare them to work out which fields are common to both, which are not, which are mandatory and which are optional.



In [1]:
import math
import sys
from pathlib import Path

import pandas as pd

# Make the project directory (the parent of the current working directory)
# and its "src" sub-directory importable.
PROJECT_DIR = Path.cwd().parents[0]
sys.path.extend(str(entry) for entry in (PROJECT_DIR, PROJECT_DIR / "src"))

### UTILS

In [3]:
def get_fields_and_requirements_from_governance_sheet(sheet_link):
    """
    Download the "Updated definitions" tab of a logsheet and list its
    sampling terms together with their requirement status.

    The CSV export URL is built from sheet_link by dropping everything from
    "/edit" onwards. Only rows whose Metadata_Category is one of
    "Sampling; Measured", "Sampling" or "sampling" are kept.

    Returns a list of [Metadata_term, Requirement] pairs (one per kept row).
    """
    csv_export_url = (
        sheet_link.split("/edit")[0]
        + "/gviz/tq?tqx=out:csv&sheet=Updated%20definitions"
    )
    print(f"Sample sheet data: {csv_export_url}")
    print(f"Sheet type link: {sheet_link}")

    definitions = pd.read_csv(csv_export_url, encoding="utf-8")
    is_sampling_row = definitions["Metadata_Category"].isin(
        ["Sampling; Measured", "Sampling", "sampling"]
    )
    selected = definitions.loc[is_sampling_row, ["Metadata_term", "Requirement"]]
    return selected.values.tolist()


def get_observatory_ids_from_governance_sheet():
    """
    Return every value of the "observatory_id" column of the validated
    governance logsheet CSV, as a plain list.

    NB  you cant use a "with" closure here when reading the Pandas df
    """
    governance_csv = PROJECT_DIR.joinpath(
        "validated-data", "governance", "logsheets_validated.csv"
    )
    governance = pd.read_csv(governance_csv)
    observatory_column = governance["observatory_id"]
    return observatory_column.values.tolist()


def get_sheet_link_from_governance_sheet(observatory_id, env_package):
    """
    Look up the logsheet link of one observatory for one environmental package.

    observatory_id is matched against the "observatory_id" column of the
    validated governance CSV; env_package is the name of the column to read
    (this notebook passes "water_column" or "soft_sediment").

    Returns the cell value of the first matching row. When that cell is a
    float NaN a message is printed and the NaN is still returned, so callers
    must check for it themselves.

    Raises IndexError if no row matches observatory_id.
    """
    validated_csv = (
        PROJECT_DIR / "validated-data" / "governance" / "logsheets_validated.csv"
    )
    df = pd.read_csv(validated_csv)
    matching_links = df.loc[df["observatory_id"] == observatory_id, env_package]
    sheet_link = matching_links.values[0]
    if isinstance(sheet_link, float) and math.isnan(sheet_link):
        # Name the package actually requested; the message used to say
        # "Soft sediment" even for water_column look-ups.
        print(f"Sheet link for {env_package} is not available for {observatory_id}")
    return sheet_link


def get_mandatory_and_optional_fields(sheet_link):
    """
    Split the terms of a governance definitions sheet by requirement.

    Returns a (mandatory_fields, optional_fields) tuple of lists. A term is
    mandatory when its requirement is exactly "Mandatory (M)" and optional
    when it is exactly "Optional (O)"; terms with any other requirement
    value appear in neither list.
    """
    term_status_pairs = get_fields_and_requirements_from_governance_sheet(
        sheet_link
    )
    mandatory_fields = [
        term for term, status in term_status_pairs if status == "Mandatory (M)"
    ]
    optional_fields = [
        term for term, status in term_status_pairs if status == "Optional (O)"
    ]
    return mandatory_fields, optional_fields


def check_for_duplicates(fields):
    """
    Report every value that occurs more than once in the fields list.

    One line is printed per duplicated value, in order of first appearance,
    stating how many times it occurs.

    Returns the list of duplicated values (empty when there are none), so
    callers can act on the result instead of only reading the printout.
    """
    from collections import Counter

    # Count once up front instead of calling fields.count() for every element.
    occurrences = Counter(fields)
    dups = [field for field, count in occurrences.items() if count > 1]
    for field in dups:
        print(f"Field {field} appears {occurrences[field]} times")
    return dups

### Numbers of mandatory fields

In [6]:
def show_statistics_for_fields(
    wc_fields_mandatory, wc_fields_optional, ss_fields_mandatory, ss_fields_optional
):
    """
    Print summary counts comparing water-column (WC) and soft-sediment (SS)
    field lists.

    Reports the total number of fields per sheet, runs the duplicate check
    on each of the four lists, then prints the sizes of the union,
    intersection and both one-sided differences, first for the mandatory
    fields and then for the optional fields.
    """
    print(f"\nLength of wc_fields: {len(wc_fields_mandatory + wc_fields_optional)}")
    print(f"Length of ss_fields: {len(ss_fields_mandatory + ss_fields_optional)}")

    print("Checking for duplicates in water_column...")
    for wc_fields in (wc_fields_mandatory, wc_fields_optional):
        check_for_duplicates(wc_fields)
    print("Checking for duplicates in soft_sediment...")
    for ss_fields in (ss_fields_mandatory, ss_fields_optional):
        check_for_duplicates(ss_fields)

    print(f"Water column mandatory fields: {len(wc_fields_mandatory)}")
    print(f"Water column optional fields: {len(wc_fields_optional)}")
    print(f"Soft sediment mandatory fields: {len(ss_fields_mandatory)}")
    print(f"Soft sediment optional fields: {len(ss_fields_optional)}")

    # Mandatory fields: union, intersection, then each one-sided difference
    wc_mandatory = set(wc_fields_mandatory)
    ss_mandatory = set(ss_fields_mandatory)
    print(f"All MANDATORY fields: {len(wc_mandatory | ss_mandatory)}")
    print(
        f"Common MANDATORY fields in both WC and SS: {len(wc_mandatory & ss_mandatory)}"
    )
    print(f"MANDATORY fields only in WC: {len(wc_mandatory - ss_mandatory)}")
    print(f"MANDATORY fields only in SS: {len(ss_mandatory - wc_mandatory)}")

    # Optional fields: intersection, union, then each one-sided difference
    wc_optional = set(wc_fields_optional)
    ss_optional = set(ss_fields_optional)
    print(
        f"Common OPTIONAL fields in both WC and SS: {len(wc_optional & ss_optional)}"
    )
    print(f"All OPTIONAL fields: {len(wc_optional | ss_optional)}")
    print(f"OPTIONAL fields only in WC: {len(wc_optional - ss_optional)}")
    print(f"OPTIONAL fields only in SS: {len(ss_optional - wc_optional)}")

### Main loop

In [7]:
obs_ids = get_observatory_ids_from_governance_sheet()

# The full list read above is immediately replaced by this single observatory.
obs_ids = ["BPNS"]

for observatory_id in obs_ids:
    if observatory_id == "Plenzia":
        continue

    # Water column: move on to the next observatory when there is no sheet link (NaN)
    sheet_link = get_sheet_link_from_governance_sheet(observatory_id, "water_column")
    if isinstance(sheet_link, float) and math.isnan(sheet_link):
        continue
    print(f"\n\nProcessing {observatory_id}... water column")
    wc_fields_mandatory, wc_fields_optional = get_mandatory_and_optional_fields(
        sheet_link
    )
    print(f"Length wc_mandatory fields {len(wc_fields_mandatory)}")
    print(f"Length wc_optional fields {len(wc_fields_optional)}")
    print(f"Fields: {wc_fields_mandatory}")
    print(f"Fields: {wc_fields_optional}")

    # Soft sediment: same treatment, so statistics are only shown when both exist
    sheet_link = get_sheet_link_from_governance_sheet(observatory_id, "soft_sediment")
    if isinstance(sheet_link, float) and math.isnan(sheet_link):
        continue
    print(f"\n\nProcessing {observatory_id}... soft sediment")
    ss_fields_mandatory, ss_fields_optional = get_mandatory_and_optional_fields(
        sheet_link
    )
    print(f"Length ss_mandatory fields {len(ss_fields_mandatory)}")
    print(f"Length ss_optional fields {len(ss_fields_optional)}")
    print(f"Fields: {ss_fields_mandatory}")
    print(f"Fields: {ss_fields_optional}")

    show_statistics_for_fields(
        wc_fields_mandatory, wc_fields_optional, ss_fields_mandatory, ss_fields_optional
    )



Processing BPNS... water column
Sample sheet data: https://docs.google.com/spreadsheets/d/1mEi4Bd2YR63WD0j54FQ6QkzcUw_As9Wilue9kaXO2DE/gviz/tq?tqx=out:csv&sheet=Updated%20definitions
Sheet type link: https://docs.google.com/spreadsheets/d/1mEi4Bd2YR63WD0j54FQ6QkzcUw_As9Wilue9kaXO2DE/edit?usp=sharing
Length wc_mandatory fields 32
Length wc_optional fields 6
Fields: ['arr_date_hq', 'arr_date_seq', 'collection_date', 'depth', 'env_material', 'failure', 'failure_comment', 'investigation_type', 'long_store', 'membr_cut', 'replicate', 'samp_collect_device', 'samp_description', 'samp_mat_process', 'samp_mat_process_dev', 'samp_size_vol', 'samp_store_date', 'samp_store_loc', 'samp_store_temp', 'sampl_person', 'sampling_event', 'ship_date', 'ship_date_seq', 'size_frac', 'size_frac_low', 'size_frac_up', 'source_mat_id_orig', 'source_mat_id', 'store_person', 'store_temp_hq', 'tax_id', 'time_fi']
Fields: ['noteworthy_env_cond', 'other_person', 'other_person_orcid', 'sampl_person_orcid', 'store_p

### Run the mandatory validators

In [4]:
# %load_ext mypy_ipython
# %load_ext autoreload
# %autoreload 2
# %reload_ext autoreload

import math
import sys
from pathlib import Path
from pprint import pprint

import validators
from pydantic import ValidationError

from validation_classes import (
    measuredModel,
    samplingModel,
    softSedimentMandatoryModel,
    waterColumnMandatoryModel,
)

############################ CAUTION ##################################################
# MANDATORY is read by parse_sample_sheets: when True the validator is looked up in
# validator_classes under "<sampling_strategy>_mandatory"; when False that function
# raises ValueError, because no other model type is selected there.
MANDATORY: bool = True  # As defined by Ioulia checking for mandatory fields
#######################################################################################


def _filter_on_source_mat_id(d: dict) -> bool:
    """
    Decide whether one logsheet record should be kept for validation.

    Returns False when source_mat_id is NaN, None, or has fewer than four
    "_"-separated parts, or when source_mat_id_orig is a string containing
    "si les filets". Returns True otherwise, including when the record has
    no source_mat_id_orig key or that value is not a string.

    Raises ValueError if the record has no source_mat_id key, or if
    source_mat_id is a float other than NaN.
    """
    try:
        source_mat_id = d["source_mat_id"]
    except KeyError as e:
        raise ValueError("Cannot find source_mat_id field") from e
    if isinstance(source_mat_id, float):
        if math.isnan(source_mat_id):
            return False
        raise ValueError(
            f"Unrecognised float value: {source_mat_id} in source_mat_id"
        )
    if source_mat_id is None or len(source_mat_id.split("_")) < 4:
        return False

    # VB_IMEV has "à vérifier si les filets sont présents" for source_mat_id_orig
    # but valid source_mat_id
    # https://github.com/emo-bon/observatory-profile/issues/37
    try:
        source_mat_id_orig = d["source_mat_id_orig"]
    except KeyError:
        # Measured sheets dont have source_mat_id_orig
        return True
    # Only search strings: a NaN (float) value would make `in` raise
    # TypeError. Such a record is kept so the validator can report on it.
    return not (
        isinstance(source_mat_id_orig, str) and "si les filets" in source_mat_id_orig
    )


def parse_sample_sheets(
    sampling_strategy: str,
    sheet_type: str,
    addresses: list[tuple[str, str]],
) -> None:
    """
    Validate the logsheet of every observatory listed in addresses.

    For each (observatory_id, sheet_link) pair the sheet_type tab of the
    linked sheet is downloaded as CSV, records rejected by the
    source_mat_id filter are discarded, and the remaining records are run
    through the model registered in validator_classes under
    "<sampling_strategy>_mandatory". If any record fails, the errors are
    written to logs/validation_errors under PROJECT_DIR; otherwise the
    validated records are written as CSV to
    validated-data/logsheets_mandatory under PROJECT_DIR. Both directories
    are created when missing.

    Args:
        sampling_strategy: "water_column" or "soft_sediment" in this notebook;
            used to pick the model and to name the output files.
        sheet_type: name of the sheet tab to download.
        addresses: pairs of observatory id and sheet link; a NaN link means
            the observatory has no sheet for this strategy and is skipped.

    Raises:
        ValueError: a link is neither a string nor NaN, a link is not a
            valid URL, a record cannot be filtered, or MANDATORY is false.
        RuntimeError: no errors were collected but the number of validated
            rows differs from the number of filtered records.
    """
    for observatory_id, sheet_link in addresses:
        if not isinstance(sheet_link, str):
            if isinstance(sheet_link, float) and math.isnan(sheet_link):
                print(
                    f"\nObservatory {observatory_id} does not do {sampling_strategy}"
                )
                continue
            # Anything else (including a non-NaN float, which used to be
            # skipped silently) is not a usable link.
            raise ValueError(
                f"Unknown URL value {sheet_link} to observatory {observatory_id}"
            )
        if not validators.url(sheet_link):
            raise ValueError(f"URL {sheet_link=} is not valid")

        if observatory_id == "Plenzia":
            continue  # Sheets not publically available
        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "soft_sediment" and observatory_id == "UMF":
            continue

        print(f"\n\nProcessing {observatory_id}...")
        sampling_sheet_base: str = sheet_link.split("/edit")[0]
        sample_sheet_link: str = (
            f"{sampling_sheet_base}/gviz/tq?tqx=out:csv&sheet={sheet_type}"
        )
        print(f"Sample sheet data: {sample_sheet_link}")
        print(f"Sheet type link: {sheet_link}")
        df: pd.core.frame.DataFrame = pd.read_csv(sample_sheet_link, encoding="utf-8")
        data_records_all: list[dict] = df.to_dict(orient="records")

        data_records_filtered: list[dict] = [
            record for record in data_records_all if _filter_on_source_mat_id(record)
        ]

        if len(data_records_all) > len(data_records_filtered):
            print(
                f"Discarded {len(data_records_all) - len(data_records_filtered)}"
                f" records leaving {len(data_records_filtered)}"
            )
        else:
            print(f"All {len(data_records_filtered)} records passed through filter")

        if MANDATORY:
            model_type = f"{sampling_strategy}_mandatory"
        else:
            raise ValueError(
                "No model type specified set MANDATORY to True or False"
            )
        print(f"Using model: {model_type}")

        validator = validator_classes[model_type]
        validated_rows = []
        # Each entry is a one-element list holding (source_mat_id, error list);
        # this nesting is what ends up in the log file.
        errors = []
        for row in data_records_filtered:
            try:
                vr = validator(**row)
            except ValidationError as e:
                errors.append([(row["source_mat_id"], e.errors())])
            else:
                validated_rows.append(vr.model_dump())

        if errors:
            total_number_errors: int = sum(
                len(row[1]) for e in errors for row in e
            )
            print(f"Errors were found... {total_number_errors} in total")
            save_dir_errors: Path = PROJECT_DIR / "logs" / "validation_errors"
            # Create the log directory so the write below cannot fail on a fresh checkout
            save_dir_errors.mkdir(parents=True, exist_ok=True)
            outfile_name_log: str = (
                f"{observatory_id}_{sampling_strategy}_{model_type}_ERRORS.log"
            )
            out_path_log: Path = save_dir_errors / outfile_name_log
            with open(out_path_log, "w", encoding="utf-8") as f:
                pprint(errors, f)
            print(f"Written {out_path_log}")
            continue

        # Sanity check: with no errors every filtered record must have validated
        if len(validated_rows) != len(data_records_filtered):
            raise RuntimeError(
                f"Error: {len(validated_rows)=} != {len(data_records_filtered)=}"
            )

        print("All records passed without errors!")
        ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
        save_dir_logsheets: Path = (
            PROJECT_DIR / "validated-data" / "logsheets_mandatory"
        )
        save_dir_logsheets.mkdir(parents=True, exist_ok=True)
        outfile_name: str = (
            f"{observatory_id}_{sampling_strategy}_{model_type}_validated.csv"
        )
        ndf.to_csv(save_dir_logsheets / outfile_name)
        print(f"Written {save_dir_logsheets / outfile_name}")


# Lookup table from model-type name to the class used to validate each record
validator_classes = dict(
    sampling=samplingModel,
    measured=measuredModel,
    water_column_mandatory=waterColumnMandatoryModel,
    soft_sediment_mandatory=softSedimentMandatoryModel,
)

# Read the (observatory_id, sheet link) pairs for both sampling strategies
# from the validated governance CSV.
# NB  you cant use a "with" closure here when reading the Pandas df
validated_csv = PROJECT_DIR.joinpath(
    "validated-data", "governance", "logsheets_validated.csv"
)
df: pd.core.frame.DataFrame = pd.read_csv(validated_csv)
water_column_sheet_addresses: list[tuple[str, str]] = df.loc[
    :, ["observatory_id", "water_column"]
].values.tolist()
soft_sediment_sheet_addresses: list[tuple[str, str]] = df.loc[
    :, ["observatory_id", "soft_sediment"]
].values.tolist()
del df

# Validate the "sampling" tab of every water-column sheet, then every soft-sediment sheet
parse_sample_sheets(
    sampling_strategy="water_column",
    sheet_type="sampling",
    addresses=water_column_sheet_addresses,
)
parse_sample_sheets(
    sampling_strategy="soft_sediment",
    sheet_type="sampling",
    addresses=soft_sediment_sheet_addresses,
)



Processing ESC68N...
Sample sheet data: https://docs.google.com/spreadsheets/d/11_Eu0W1-sDiuzKx1cIl6YuxjRHmWezN6u9v3Ly8JZ3A/gviz/tq?tqx=out:csv&sheet=sampling
Sheet type link: https://docs.google.com/spreadsheets/d/11_Eu0W1-sDiuzKx1cIl6YuxjRHmWezN6u9v3Ly8JZ3A/edit?usp=sharing
All 170 records passed through filter
Using model: water_column_mandatory
Errors were found... 1496 in total
Written /home/cymon/vscode/git-repos/emo-bon-data-validation/logs/validation_errors/ESC68N_water_column_water_column_mandatory_ERRORS.log


Processing Bergen...
Sample sheet data: https://docs.google.com/spreadsheets/d/1HuXHiUJICZrmCrJ4EZDyU5aSCMzDAc1cy_tne5YVPTg/gviz/tq?tqx=out:csv&sheet=sampling
Sheet type link: https://docs.google.com/spreadsheets/d/1HuXHiUJICZrmCrJ4EZDyU5aSCMzDAc1cy_tne5YVPTg/edit?usp=sharing
All 150 records passed through filter
Using model: water_column_mandatory
Errors were found... 1242 in total
Written /home/cymon/vscode/git-repos/emo-bon-data-validation/logs/validation_errors/Be