### Mandatory fields

The problem is that the mandatory fields differ between water-column sampling, soft-sediment sampling, and (possibly) wc-measured and ss-measured as well.

We need to parse the "Updated definitions" sheet of both sampling types and compare them to work out which fields are common to both, which are not, which are mandatory, and which are optional.



In [61]:
import sys
from collections import Counter
from pathlib import Path
from urllib.parse import quote

import pandas as pd

# Put the parent of the working directory, and its "src" folder, on the import path
PROJECT_DIR = Path.cwd().parents[0]
sys.path.extend(str(import_root) for import_root in (PROJECT_DIR, PROJECT_DIR / "src"))

In [62]:
def parse_definitions_sheet(sheet_link, *, sheet_name="Updated definitions"):
    """
    Read one tab of a spreadsheet as CSV and return its term/requirement pairs.

    Everything in ``sheet_link`` from ``/edit`` onwards is dropped and replaced
    with the ``gviz`` CSV-export suffix for the requested tab. Both the derived
    link and the original link are printed before the data is read.

    Args:
        sheet_link: Link to the spreadsheet, e.g. ``.../d/<id>/edit?usp=sharing``.
        sheet_name: Name of the tab to read. It is percent-encoded before being
            placed in the URL; the default yields ``Updated%20definitions``.

    Returns:
        A list with one ``[Metadata_term, Requirement]`` two-item list per row
        of the sheet (a list of pairs, not a dictionary).

    Raises:
        KeyError: If the sheet has no ``Metadata_term`` or ``Requirement`` column.
    """
    sampling_sheet_base = sheet_link.split("/edit")[0]
    # safe="" so that every reserved character in the tab name is escaped
    encoded_sheet_name = quote(sheet_name, safe="")
    sampling_sheet_suffix = f"/gviz/tq?tqx=out:csv&sheet={encoded_sheet_name}"
    sample_sheet_link = sampling_sheet_base + sampling_sheet_suffix
    print(f"Sample sheet data: {sample_sheet_link}")
    print(f"Sheet type link: {sheet_link}")
    df = pd.read_csv(sample_sheet_link, encoding="utf-8")

    return df[["Metadata_term", "Requirement"]].values.tolist()


# Requirement labels as they appear in the "Requirement" column of the sheets
MANDATORY = "Mandatory (M)"
OPTIONAL = "Optional (O)"


def _sheet_link(logsheets, observatory_id, column):
    """Return the first value of *column* for the row(s) matching *observatory_id*.

    Raises:
        ValueError: If no row of *logsheets* has that ``observatory_id``.
    """
    links = logsheets.loc[logsheets["observatory_id"] == observatory_id, column].values
    if len(links) == 0:
        raise ValueError(
            f"Observatory {observatory_id!r} not found in the validated logsheets"
        )
    return links[0]


def _terms_with_requirement(fields, requirement):
    """Return the terms, in sheet order, whose requirement equals *requirement*."""
    return [term for term, term_requirement in fields if term_requirement == requirement]


def _report_duplicates(terms, description):
    """Print every term occurring more than once in *terms*, in first-seen order."""
    counts = Counter(terms)
    if len(counts) == len(terms):
        return
    print(f"Duplicate {description}...")
    for term, count in counts.items():
        if count > 1:
            print(f"Field {term} appears {count} times")


# Get list of all URL links to sampling sheets
# NB: you can't use a "with" block here when reading the Pandas df
validated_csv = (
    PROJECT_DIR / "validated-data" / "governance" / "logsheets_validated.csv"
)
df = pd.read_csv(validated_csv)

for observatory_id in ["BPNS", "RFormosa"]:
    print(f"\n\nProcessing {observatory_id}... water column")
    wc_sheet_link = _sheet_link(df, observatory_id, "water_column")
    wc_fields = parse_definitions_sheet(wc_sheet_link)

    print(f"\n\nProcessing {observatory_id}... soft sediment")
    ss_sheet_link = _sheet_link(df, observatory_id, "soft_sediment")
    ss_fields = parse_definitions_sheet(ss_sheet_link)

    print(f"\nLength of wc_fields: {len(wc_fields)}")
    print(f"Length of ss_fields: {len(ss_fields)}")

    # Split each sheet into mandatory/optional terms and flag repeated terms;
    # the sets collapse duplicates so the comparisons below count each term once.
    wc_fields_mandatory = _terms_with_requirement(wc_fields, MANDATORY)
    wc_fields_mandatory_set = set(wc_fields_mandatory)
    _report_duplicates(wc_fields_mandatory, "mandatory fields in water column")

    wc_fields_optional = _terms_with_requirement(wc_fields, OPTIONAL)
    wc_fields_optional_set = set(wc_fields_optional)
    _report_duplicates(wc_fields_optional, "optional fields in water column")

    ss_fields_mandatory = _terms_with_requirement(ss_fields, MANDATORY)
    ss_fields_mandatory_set = set(ss_fields_mandatory)
    _report_duplicates(ss_fields_mandatory, "mandatory fields in soft sediment")

    ss_fields_optional = _terms_with_requirement(ss_fields, OPTIONAL)
    ss_fields_optional_set = set(ss_fields_optional)
    _report_duplicates(ss_fields_optional, "optional fields in soft sediment")

    print(f"Water column mandatory fields: {wc_fields_mandatory}")
    print(f"Water column mandatory fields: {len(wc_fields_mandatory)}")
    print(f"Water column optional fields: {len(wc_fields_optional)}")
    print(f"Soft sediment mandatory fields: {ss_fields_mandatory}")
    print(f"Soft sediment mandatory fields: {len(ss_fields_mandatory)}")
    print(f"Soft sediment optional fields: {len(ss_fields_optional)}")

    # Same four comparisons (common, all, WC-only, SS-only) for each requirement
    for label, wc_set, ss_set in (
        ("MANDATORY", wc_fields_mandatory_set, ss_fields_mandatory_set),
        ("OPTIONAL", wc_fields_optional_set, ss_fields_optional_set),
    ):
        intersection = wc_set & ss_set
        print(f"Common {label} fields in both WC and SS: {intersection}")
        print(f"Common {label} fields in both WC and SS: {len(intersection)}")

        union = wc_set | ss_set
        print(f"All {label} fields: {union}")
        print(f"All {label} fields: {len(union)}")

        difference = wc_set - ss_set
        print(f"{label} fields only in WC: {difference}")
        print(f"{label} fields only in WC: {len(difference)}")

        difference = ss_set - wc_set
        print(f"{label} fields only in SS: {difference}")
        print(f"{label} fields only in SS: {len(difference)}")



Processing BPNS... water column
Sample sheet data: https://docs.google.com/spreadsheets/d/1mEi4Bd2YR63WD0j54FQ6QkzcUw_As9Wilue9kaXO2DE/gviz/tq?tqx=out:csv&sheet=Updated%20definitions
Sheet type link: https://docs.google.com/spreadsheets/d/1mEi4Bd2YR63WD0j54FQ6QkzcUw_As9Wilue9kaXO2DE/edit?usp=sharing


Processing BPNS... soft sediment
Sample sheet data: https://docs.google.com/spreadsheets/d/1zc0bZdpl-Eoi35lI_5BGkElbscplyQRyNPLkSgeEyEQ/gviz/tq?tqx=out:csv&sheet=Updated%20definitions
Sheet type link: https://docs.google.com/spreadsheets/d/1zc0bZdpl-Eoi35lI_5BGkElbscplyQRyNPLkSgeEyEQ/edit?usp=sharing

Length of wc_fields: 136
Length of ss_fields: 136
Duplicate mandatory fields in soft sediment...
Field samp_store_date appears 2 times
Field tot_depth_water_col appears 2 times
Duplicate optional fields in soft sediment...
Field water_current appears 2 times
Field water_current_method appears 2 times
Water column mandatory fields: ['arr_date_hq', 'arr_date_seq', 'chlorophyll', 'chlorophyll