### Mandatory fields

The problem is that the mandatory fields differ between water-column (wc) sampling, soft-sediment (ss) sampling, and (possibly) wc-measured and ss-measured.

We need to parse the "Updated definitions" sheet of both sampling types and compare them to figure out which fields are common, which are not, which are mandatory and which are optional.



In [1]:
import math
import sys
from pathlib import Path

import pandas as pd

# Make the parent of the current working directory, and its "src" folder,
# importable. NOTE(review): presumably the notebook is launched from a direct
# sub-folder of the project root -- confirm.
PROJECT_DIR = Path.cwd().parents[0]
sys.path.extend([str(PROJECT_DIR), str(PROJECT_DIR / "src")])

### UTILS

In [None]:
def get_fields_and_requirements_from_governance_sheet(sheet_link):
    """
    Read the "Updated definitions" tab of a logsheet as CSV and return the
    sampling-related fields together with the content of their Requirement
    column.

    Everything from "/edit" onwards in sheet_link is replaced by the CSV
    export query. Only rows whose Metadata_Category is "Sampling; Measured",
    "Sampling" or "sampling" are kept.

    Returns a list of [Metadata_term, Requirement] pairs, one per kept row.
    """
    csv_export_query = "/gviz/tq?tqx=out:csv&sheet=Updated%20definitions"
    base_link, *_ = sheet_link.split("/edit")
    sample_sheet_link = f"{base_link}{csv_export_query}"
    print(f"Sample sheet data: {sample_sheet_link}")
    print(f"Sheet type link: {sheet_link}")

    definitions = pd.read_csv(sample_sheet_link, encoding="utf-8")
    is_sampling_row = definitions["Metadata_Category"].isin(
        ["Sampling; Measured", "Sampling", "sampling"]
    )
    # Row filter and column selection in a single .loc lookup.
    selected = definitions.loc[is_sampling_row, ["Metadata_term", "Requirement"]]
    return selected.values.tolist()


def get_observatory_ids_from_governance_sheet():
    """
    Read the validated governance logsheet CSV and return the values of its
    "observatory_id" column as a list (one entry per row).
    NB (original author's note): you can't use a "with" closure here when
    reading the Pandas df
    """
    governance_dir = PROJECT_DIR / "validated-data" / "governance"
    logsheets = pd.read_csv(governance_dir / "logsheets_validated.csv")
    return logsheets["observatory_id"].values.tolist()


def get_sheet_link_from_governance_sheet(observatory_id, env_package):
    """
    Get the link to the sampling sheet for a given observatory_id.

    Reads the validated governance CSV, selects the rows whose
    "observatory_id" equals observatory_id and returns the value stored in
    the env_package column of the first matching row.

    When that cell is empty (NaN) a notice naming env_package is printed and
    the NaN is returned unchanged, so callers have to check for NaN
    themselves.

    Raises IndexError when no row matches observatory_id.
    """
    validated_csv = (
        PROJECT_DIR / "validated-data" / "governance" / "logsheets_validated.csv"
    )
    df = pd.read_csv(validated_csv)
    matches = df.loc[df["observatory_id"] == observatory_id, env_package].values
    if len(matches) == 0:
        # Same exception type as indexing the empty array would raise, but
        # with a message that names the missing observatory.
        raise IndexError(
            f"Observatory {observatory_id!r} not found in {validated_csv}"
        )
    sheet_link = matches[0]
    if isinstance(sheet_link, float) and math.isnan(sheet_link):
        # Name the requested package instead of always saying "Soft sediment".
        print(f"{env_package} sheet link is not available for {observatory_id}")
    return sheet_link


def get_mandatory_and_optional_fields(sheet_link):
    """
    Split the fields of a governance sheet by requirement status.

    Returns a tuple (mandatory_fields, optional_fields): the field names
    whose requirement is exactly "Mandatory (M)" and exactly "Optional (O)"
    respectively, in sheet order. Fields with any other requirement value
    appear in neither list.
    """
    pairs = get_fields_and_requirements_from_governance_sheet(sheet_link)
    mandatory_fields = [name for name, status in pairs if status == "Mandatory (M)"]
    optional_fields = [name for name, status in pairs if status == "Optional (O)"]
    return mandatory_fields, optional_fields


def check_for_duplicates(fields):
    """
    Check for duplicates in the fields list.

    Prints one line per value that occurs more than once, in order of first
    appearance, and returns those values as a list (empty when every value
    is unique).
    """
    from collections import Counter

    # One counting pass instead of calling fields.count() per element.
    occurrences = Counter(fields)
    dups = [field for field, count in occurrences.items() if count > 1]
    for field in dups:
        print(f"Field {field} appears {occurrences[field]} times")
    return dups

### Numbers of mandatory fields

In [3]:
def show_statistics_for_fields(
    wc_fields_mandatory, wc_fields_optional, ss_fields_mandatory, ss_fields_optional
):
    """
    Print summary counts comparing water-column (wc) and soft-sediment (ss)
    field lists.

    Reports, in order: the total number of fields per sheet, duplicate
    checks on each of the four lists, the size of each list, and then the
    sizes of the union, intersection and both one-sided differences of the
    mandatory fields followed by the same figures for the optional fields.
    Nothing is returned.
    """
    wc_total = len(wc_fields_mandatory + wc_fields_optional)
    print(f"\nLength of wc_fields: {wc_total}")
    ss_total = len(ss_fields_mandatory + ss_fields_optional)
    print(f"Length of ss_fields: {ss_total}")

    print("Checking for duplicates in water_column...")
    for wc_fields in (wc_fields_mandatory, wc_fields_optional):
        check_for_duplicates(wc_fields)
    print("Checking for duplicates in soft_sediment...")
    for ss_fields in (ss_fields_mandatory, ss_fields_optional):
        check_for_duplicates(ss_fields)

    list_sizes = (
        ("Water column mandatory fields", wc_fields_mandatory),
        ("Water column optional fields", wc_fields_optional),
        ("Soft sediment mandatory fields", ss_fields_mandatory),
        ("Soft sediment optional fields", ss_fields_optional),
    )
    for label, listed in list_sizes:
        print(f"{label}: {len(listed)}")

    # Mandatory fields: union, intersection, then each one-sided difference.
    wc_required, ss_required = set(wc_fields_mandatory), set(ss_fields_mandatory)
    print(f"All MANDATORY fields: {len(wc_required | ss_required)}")
    print(
        f"Common MANDATORY fields in both WC and SS: {len(wc_required & ss_required)}"
    )
    print(f"MANDATORY fields only in WC: {len(wc_required - ss_required)}")
    print(f"MANDATORY fields only in SS: {len(ss_required - wc_required)}")

    # Optional fields: intersection first, then union and the differences.
    wc_extra, ss_extra = set(wc_fields_optional), set(ss_fields_optional)
    print(f"Common OPTIONAL fields in both WC and SS: {len(wc_extra & ss_extra)}")
    print(f"All OPTIONAL fields: {len(wc_extra | ss_extra)}")
    print(f"OPTIONAL fields only in WC: {len(wc_extra - ss_extra)}")
    print(f"OPTIONAL fields only in SS: {len(ss_extra - wc_extra)}")

In [None]:
obs_ids = get_observatory_ids_from_governance_sheet()

# The full list read above is replaced by a single observatory here.
obs_ids = ["BPNS"]

for observatory_id in obs_ids:
    if observatory_id == "Plenzia":
        continue

    # Water column: skip the observatory entirely when the link is missing.
    sheet_link = get_sheet_link_from_governance_sheet(observatory_id, "water_column")
    if isinstance(sheet_link, float) and math.isnan(sheet_link):
        continue

    print(f"\n\nProcessing {observatory_id}... water column")
    wc_fields_mandatory, wc_fields_optional = get_mandatory_and_optional_fields(
        sheet_link
    )
    print(f"Length wc_mandatory fields {len(wc_fields_mandatory)}")
    print(f"Length wc_optional fields {len(wc_fields_optional)}")
    print(f"Fields: {wc_fields_mandatory}")
    print(f"Fields: {wc_fields_optional}")

    # Soft sediment: same rule, so statistics are only shown when both exist.
    sheet_link = get_sheet_link_from_governance_sheet(observatory_id, "soft_sediment")
    if isinstance(sheet_link, float) and math.isnan(sheet_link):
        continue

    print(f"\n\nProcessing {observatory_id}... soft sediment")
    ss_fields_mandatory, ss_fields_optional = get_mandatory_and_optional_fields(
        sheet_link
    )
    print(f"Length ss_mandatory fields {len(ss_fields_mandatory)}")
    print(f"Length ss_optional fields {len(ss_fields_optional)}")
    print(f"Fields: {ss_fields_mandatory}")
    print(f"Fields: {ss_fields_optional}")

    show_statistics_for_fields(
        wc_fields_mandatory, wc_fields_optional, ss_fields_mandatory, ss_fields_optional
    )



Processing BPNS... water column
Sample sheet data: https://docs.google.com/spreadsheets/d/1mEi4Bd2YR63WD0j54FQ6QkzcUw_As9Wilue9kaXO2DE/gviz/tq?tqx=out:csv&sheet=Updated%20definitions
Sheet type link: https://docs.google.com/spreadsheets/d/1mEi4Bd2YR63WD0j54FQ6QkzcUw_As9Wilue9kaXO2DE/edit?usp=sharing
Length wc_mandatory fields 32
Length wc_optional fields 6
Fields: ['arr_date_hq', 'arr_date_seq', 'collection_date', 'depth', 'env_material', 'failure', 'failure_comment', 'investigation_type', 'long_store', 'membr_cut', 'replicate', 'samp_collect_device', 'samp_description', 'samp_mat_process', 'samp_mat_process_dev', 'samp_size_vol', 'samp_store_date', 'samp_store_loc', 'samp_store_temp', 'sampl_person', 'sampling_event', 'ship_date', 'ship_date_seq', 'size_frac', 'size_frac_low', 'size_frac_up', 'source_mat_id_orig', 'source_mat_id', 'store_person', 'store_temp_hq', 'tax_id', 'time_fi']
Fields: ['noteworthy_env_cond', 'other_person', 'other_person_orcid', 'sampl_person_orcid', 'store_p