# Pydantic validation framework for the EMO BON observatory and other metadata log sheets

- **pydantic** Data validation using Python type hints.
    - [pypi](https://pypi.org/project/pydantic/)
    - [Documentation](https://docs.pydantic.dev/latest/)

In [None]:
import os
import sys

# JupyterHub quirk after modules and notebooks were moved around:
# for some reason the CWD is /srv/scratch even though this notebook is in
# /srv/scratch/emo-bon-validation (the terminal also shows us to be there).
# So, when the kernel starts in /srv/scratch, change into the project directory.
# NOTE(review): the comment names "emo-bon-validation" but the code changes
# into "./emo-bon-data-validation" - confirm which directory name is correct.
if os.getcwd() == "/srv/scratch":
    os.chdir("./emo-bon-data-validation")
print(f"CWD is {os.getcwd()}")

import importlib
import subprocess
from pathlib import Path
import pandas as pd
import pydantic
import datetime

##### Init the directory structure

In [14]:
if False:
    # Disabled one-off setup; flip the condition above to run it.
    # Init dirs and paths, write csv files
    # Init the validation classes dir if needed
    # Note that __init__.py will need to be edited manually to import the validators
    # e.g from .observatories import Model as observatoriesModel
    validation_classes_path = "./validation_classes"
    if True:
        if not os.path.exists(validation_classes_path):
            os.mkdir(validation_classes_path)
            # Empty __init__.py so the directory is importable as a package
            Path(os.path.join(validation_classes_path, "__init__.py")).touch()
            # NOTE(review): raw_files_path is not defined in this cell; enabling
            # this block raises NameError here unless it is defined elsewhere.
            os.mkdir(raw_files_path)

## Governance data

#### Read each of the governance CSV files into a Pandas dataframe

In [None]:
# Base URL of the raw governance-data files on GitHub (note the trailing slash)
github_path = "https://raw.githubusercontent.com/emo-bon/governance-data/main/"
file_names = [
    "logsheets.csv",  # contain the URLs of the googlesheets that are the logsheets
    "observatories.csv",  # contain information about each observatory
    # "organisations.csv",         # contain information about the organisations in EMO BON
    # "planned_events.csv"         # contains information about planned EMO BON events (this file is only used by humans, not by any actions) - DONT CARE
    # "ro-crate-metadata.json"     # IGNORE
]

# Read every governance CSV into a dataframe, keyed by its file name
dfs = {}
for f in file_names:
    # Plain concatenation: this is a URL, not a filesystem path, so
    # os.path.join (which uses the OS path separator) is the wrong tool.
    df = pd.read_csv(github_path + f)
    # DataFrame.info() prints its report and returns None, so it must be
    # called on its own rather than interpolated into the message.
    print(f"This is info() for {f}")
    df.info()
    dfs[f] = df

#### Validate Governance tables

In [3]:
from validation_classes import observatoriesModel, logsheetsModel

# Directory of the package that holds the pydantic validation models
validation_classes_path = "./validation_classes"

# Governance CSV file name -> model class used to validate each of its rows
validator_class_paths = {
    "logsheets.csv": logsheetsModel,
    "observatories.csv": observatoriesModel,
}

##### Observatories table

The observatories validator mostly changes the column names to make them consistent (and spelled correctly), removes blank strings ("   ") from cells, and reformats the dates.

In [4]:
# Validate every row of the observatories governance table and write the
# cleaned table to governance/observatories_validated.csv
file_name = "observatories.csv"
data = dfs[file_name]  # dfs is dict of pandas df's
validator = validator_class_paths[file_name]
data_records = data.to_dict(orient="records")
# Instantiating the model validates a row; model_dump() returns it as a dict
validated_rows = [validator(**row).model_dump() for row in data_records]

ndf = pd.DataFrame.from_records(validated_rows, index="observatory_id")
# The directory-initialisation cell is disabled, so make sure the output
# directory exists before writing into it.
os.makedirs("governance", exist_ok=True)
ndf.to_csv(os.path.join("governance", "observatories_validated.csv"))

##### Logsheets table

In [5]:
# Validate every row of the logsheets governance table and write the
# cleaned table to governance/logsheets_validated.csv
cdir = os.getcwd()
file_name = "logsheets.csv"
data = dfs[file_name]  # dfs is dict of pandas df's
validator = validator_class_paths[file_name]
data_records = data.to_dict(orient="records")
# Instantiating the model validates a row; model_dump() returns it as a dict
validated_rows = [validator(**row).model_dump() for row in data_records]

ndf = pd.DataFrame.from_records(validated_rows, index="observatory_id")
# The directory-initialisation cell is disabled, so make sure the output
# directory exists before writing into it.
os.makedirs("governance", exist_ok=True)
ndf.to_csv(os.path.join("governance", "logsheets_validated.csv"))



## Validate logsheets from "water column" and "soft sediments" for the "sampling" and "measured" tables



##### Pulls the raw Google Sheets and does a "lax" validation in which we correct/coerce everything to a consistent type

### !!! A note about Pandas and integer fields with missing values

Pandas will read a raw CSV file and try to determine the type of each column while doing so. If it finds an integer column with missing values, the missing values will be NaN's, which of course are floats. Consequently, the default action here is to read the column as floats by coercing the integer values to float to match the NaNs - this is rarely what you want.

However, Pandas does have a [nullable integer type](https://pandas.pydata.org/docs/user_guide/integer_na.html) - `pandas.Int64Dtype()` or its string alias `"Int64"`. You can force Pandas to use this type when reading the CSV file using `pandas.read_csv('file.csv', dtype={"<int field with missing values>": "Int64"})`; the NaN's will then be changed to [pandas.NA types](https://pandas.pydata.org/docs/reference/api/pandas.NA.html#pandas.NA).

For clarity: this is NOT what happens below: we let the validators deal with it, which coerce the now floats back into ints.

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import os
import sys
import math
import urllib
import pandas as pd
import pydantic
from validation_classes import samplingModel, measuredModel


def get_sheet(
    sheet_type: str, sheet_link: str, format_type: str = "json"
) -> pd.core.frame.DataFrame:
    """Return a Pandas dataframe of the 'sampling' or 'measured' sheet
    from an observatory's Google Sheet.

    The export URL is built from the part of ``sheet_link`` before "/edit"
    plus a ``/gviz/tq?tqx=out:<format>&sheet=<sheet_type>`` suffix.

    Args:
        sheet_type: name of the tab to export (passed as ``sheet=``).
        sheet_link: URL of the Google Sheet.
        format_type: one of "json", "csv", "excel" or "tsv".

    Returns:
        The dataframe produced by the matching pandas reader.

    Raises:
        ValueError: if ``format_type`` is not one of the supported values.

    CSV has a problem with the word "blank" in the replicate field, but
    none of the other formats work because of the header (not sure why
    only CSV doesn't have the header).

    TODO sort out the header parsing from the Google Sheet response.
    """
    # format_type -> (gviz "out:" value, pandas reader, reader keyword args).
    # Previously "json" was tested twice in the if/elif chain, so the
    # "out:json" branch was unreachable and the reachable one built a
    # malformed "tqx=sheet=..." query; there is now a single json entry.
    readers = {
        "json": ("json", pd.read_json, {}),
        # Note that even if we force the replicate field to be a string
        # (dtype={"replicate": str}) "blank" is still read as None, and
        # forcing ints turns the NaNs into NA. So no dtype is forced here:
        # the later validator coerces floats and ints to string | None.
        "csv": ("csv", pd.read_csv, {"encoding": "utf-8"}),
        "excel": ("tsv-xlsx", pd.read_excel, {"engine": "openpyxl"}),
        "tsv": ("tsv", pd.read_csv, {"sep": "\t"}),
    }
    try:
        out_format, reader, reader_kwargs = readers[format_type]
    except KeyError:
        raise ValueError(f"Unrecognised {format_type=}") from None

    sampling_sheet_base = sheet_link.split("/edit")[0]
    sampling_sheet_suffix = f"/gviz/tq?tqx=out:{out_format}&sheet={sheet_type}"
    sample_sheet_link = sampling_sheet_base + sampling_sheet_suffix
    print(f"Sample sheet link: {sample_sheet_link}")
    return reader(sample_sheet_link, **reader_kwargs)


def _has_usable_source_mat_id(record: dict) -> bool:
    """Return True when a row carries a well-formed source_mat_id.

    The id is read from "source_mat_id", falling back to
    "source_material_id" (Bergen has it under that name).

    Raises:
        ValueError: if the row has neither field.
    """
    try:
        value = record["source_mat_id"]
    except KeyError:
        try:
            value = record["source_material_id"]
        except KeyError:
            raise ValueError("Cannot find source_mat_id field") from None
    # NaN (a float), None and any other non-string value mean "no id"
    if not isinstance(value, str):
        return False
    # Remove mis-formatted ids: fewer than six "_"-separated parts
    if len(value.split("_")) < 6:
        return False
    # Edge case of this otherwise blank entry having 6 "bits"
    return value != "EMOBON_VB_Wa_230509_um_"


def parse_sample_sheets(
    sampling_strategy: str, sheet_type: str, addresses: list[list]
) -> None:
    """Download, filter, validate and save one sheet type for each observatory.

    For every ``[observatory_id, sheet_link]`` pair the sheet is fetched as
    CSV with ``get_sheet``, rows without a usable source_mat_id are dropped,
    the remaining rows are validated with ``validator_classes[sheet_type]``
    and the result is written to
    ``./logsheets/<observatory_id>_<sampling_strategy>_<sheet_type>_validated.csv``.

    Args:
        sampling_strategy: "water_column" or "soft_sediment".
        sheet_type: assumed to be either "sampling" or "measured".
        addresses: list of ``[observatory_id, sheet_link]`` pairs; a missing
            link is expected to be a float NaN.

    Raises:
        ValueError: if a sheet link is neither a string nor a float, or a
            row has no source_mat_id field at all.
    """
    for observatory_id, sheet_link in addresses:
        # print(f"Observatory_id {observatory_id} sheet_link {sheet_link}")
        if not isinstance(sheet_link, str):
            if not isinstance(sheet_link, float):
                raise ValueError(
                    f"Unknown link {sheet_link} to observatory {observatory_id}"
                )
            if math.isnan(sheet_link):
                print(
                    f"Observatory {observatory_id} lacks valid sheet URL for {sampling_strategy}"
                )
            # A float is never a usable link, so skip it either way
            continue

        if observatory_id == "Plenzia":
            continue  # Sheets not publically available
        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "soft_sediment" and observatory_id == "UMF":
            continue

        print(
            f"\n\nProcessing {observatory_id=} - {sampling_strategy=} - {sheet_type=}"
        )
        # Assuming either 'sampling' or 'measured' for sheet_type
        df = get_sheet(sheet_type, sheet_link, format_type="csv")
        data_records_all = df.to_dict(orient="records")

        # Many sheets have partially filled rows.
        # The source_mat_id is auto-formatted and the PRIMARY_KEY,
        # therefore filter records on source_mat_id.
        data_records_filtered = [
            record
            for record in data_records_all
            if _has_usable_source_mat_id(record)
        ]

        if len(data_records_all) > len(data_records_filtered):
            print(
                f"Discarded {len(data_records_all) - len(data_records_filtered)} records leaving {len(data_records_filtered)}."
            )

        validator = validator_classes[sheet_type]
        validated_rows = [
            validator(**row).model_dump() for row in data_records_filtered
        ]

        save_dir = "./logsheets"
        # Make sure the output directory exists before writing into it
        os.makedirs(save_dir, exist_ok=True)
        outfile_name = f"{observatory_id}_{sampling_strategy}_{sheet_type}_validated.csv"
        ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
        ndf.to_csv(os.path.join(save_dir, outfile_name))
        print(f"Written {os.path.join(save_dir, outfile_name)}")


# Model used to validate each kind of logsheet tab
validator_classes = {"sampling": samplingModel, "measured": measuredModel}

# Get list of observatory ids
df = pd.read_csv("./governance/observatories_validated.csv")
observatory_ids = df["observatory_id"].tolist()
print(f"{observatory_ids=}")

# Get list of all URL links to sampling sheets
# NB  you cant use a "with" closure here when reading the Pandas df
governance_logsheets_validated_csv = "./governance/logsheets_validated.csv"
df = pd.read_csv(governance_logsheets_validated_csv)
# [observatory_id, link] pairs, one list per sampling strategy
water_column_sheet_addresses = df[["observatory_id", "water_column"]].values.tolist()
soft_sediment_sheet_addresses = df[["observatory_id", "soft_sediment"]].values.tolist()

# Same call order as before: both strategies for "sampling", then "measured"
for _tab in ("sampling", "measured"):
    parse_sample_sheets("water_column", _tab, water_column_sheet_addresses)
    parse_sample_sheets("soft_sediment", _tab, soft_sediment_sheet_addresses)

# Validate the Observatory sheets in the EMO-BON-Metadata Google Sheets

In [None]:
# Validate the Observatory sheets in the EMO-BON-Metadata Google Sheets
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import os
import sys
import math
from pprint import pprint
import pandas as pd
from pathlib import Path, PurePath
from validation_classes import observatoryModel


def get_sheet(sheet_link: str) -> pd.core.frame.DataFrame:
    """Fetch the 'observatory' tab of an observatory's Google Sheet.

    The part of ``sheet_link`` before "/edit" is combined with the gviz
    CSV-export suffix for the "observatory" sheet and read with pandas.
    """
    print(f"Sheet link: {sheet_link}")
    # Everything before the first "/edit" is the spreadsheet's base URL
    spreadsheet_url, _, _ = sheet_link.partition("/edit")
    export_url = spreadsheet_url + "/gviz/tq?tqx=out:csv&sheet=observatory"
    print(f"Sample sheet link: {export_url}")
    return pd.read_csv(export_url, encoding="utf-8")


def parse_sample_sheets(
    sampling_strategy: str, sheet_type: str, addresses: list[list]
) -> None:
    """Download, validate and save the 'observatory' sheet of each observatory.

    For every ``[observatory_id, sheet_link]`` pair the observatory sheet is
    fetched with ``get_sheet``, its single row is validated with
    ``observatoryModel`` and the result is written to
    ``./logsheets/<observatory_id>_<sampling_strategy>_<sheet_type>_validated.csv``.

    Args:
        sampling_strategy: "water_column" or "soft_sediment".
        sheet_type: only used in the log message and the output file name.
        addresses: list of ``[observatory_id, sheet_link]`` pairs; a missing
            link is expected to be a float NaN.

    Raises:
        ValueError: if a sheet link is neither a string nor a float.
        RuntimeError: if a sheet does not contain exactly one row.
        AssertionError: if the row's obs_id differs from the observatory id.
    """
    for observatory_id, sheet_link in addresses:
        # print(f"Observatory_id {observatory_id} sheet_link {sheet_link}")
        if not isinstance(sheet_link, str):
            if not isinstance(sheet_link, float):
                raise ValueError(
                    f"Unknown link {sheet_link} to observatory {observatory_id}"
                )
            if math.isnan(sheet_link):
                print(
                    f"Observatory {observatory_id} lacks valid sheet URL for {sampling_strategy}"
                )
            # A float is never a usable link, so skip it either way
            continue

        if observatory_id == "Plenzia":
            continue  # Sheets not publically available
        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "soft_sediment" and observatory_id == "UMF":
            continue

        print(
            f"\n\nProcessing {observatory_id=} - {sampling_strategy=} - {sheet_type=}"
        )
        df: pd.core.frame.DataFrame = get_sheet(sheet_link)

        # Note there is only one row per sheet
        data_records_all = df.to_dict(orient="records")

        # Check the row count BEFORE indexing row 0, so that an empty sheet
        # gives the intended RuntimeError instead of an IndexError.
        if len(data_records_all) != 1:
            raise RuntimeError(f"Error: {len(data_records_all)} != 1")

        # Get the obs_id from the only row
        obs_id = data_records_all[0]["obs_id"]
        assert (
            observatory_id == obs_id
        ), f"Error: {observatory_id=} != {obs_id=}"

        validated_rows = [
            observatoryModel(**row).model_dump() for row in data_records_all
        ]

        save_dir = Path("./logsheets")
        # Make sure the output directory exists before writing into it
        save_dir.mkdir(parents=True, exist_ok=True)
        outfile_name = Path(
            f"{observatory_id}_{sampling_strategy}_{sheet_type}_validated.csv"
        )
        ndf = pd.DataFrame.from_records(validated_rows, index="obs_id")
        out_file = PurePath(save_dir, outfile_name)
        ndf.to_csv(out_file)
        print(f"Written {out_file}")


# Get list of observatory ids
# df = pd.read_csv("./governance/observatories_validated.csv")
# observatory_ids = [id[0] for id in df[["observatory_id"]].values.tolist()]
# print(f"{observatory_ids=}")

# Get list of all URL links to sampling sheets
# NB  you cant use a "with" closure here when reading the Pandas df
# Validated governance table holding one Google Sheet link per strategy
df = pd.read_csv(Path("./governance/logsheets_validated.csv"))
# [observatory_id, link] pairs, one list per sampling strategy
water_column_sheet_addresses = df.loc[
    :, ["observatory_id", "water_column"]
].values.tolist()
soft_sediment_sheet_addresses = df.loc[
    :, ["observatory_id", "soft_sediment"]
].values.tolist()

# Validate the "observatory" tab for water column first, then soft sediment
for _strategy, _addresses in (
    ("water_column", water_column_sheet_addresses),
    ("soft_sediment", soft_sediment_sheet_addresses),
):
    parse_sample_sheets(_strategy, "observatory", _addresses)

# Write the combined Observatory table

In [None]:
import os
import pandas as pd
from pathlib import Path, PurePath

# Directory holding the validated logsheet CSV files
FILE_PATH = Path("./logsheets")


def filter_on_sheet_type(csv_file):
    """Return True when the file name marks an 'observatory' sheet.

    The sheet type is the second-to-last "_"-separated part of the file
    stem, e.g. ``<id>_<strategy>_observatory_validated.csv``. Names with
    fewer than two parts cannot match and give False rather than raising
    IndexError.
    """
    name_parts = Path(csv_file).stem.split("_")
    return len(name_parts) >= 2 and name_parts[-2] == "observatory"


# Keep only CSV files. Path.suffix copes with names that contain no dot
# (which made f.split(".")[1] raise IndexError) or more than one dot.
csv_files = [f for f in os.listdir(FILE_PATH) if Path(f).suffix == ".csv"]
observatory_files = list(filter(filter_on_sheet_type, csv_files))
# print(f"{observatory_files}")

frames = []
for obs in observatory_files:
    df = pd.read_csv(PurePath(FILE_PATH, obs))
    # Rename the env_package values to the sampling-strategy names.
    # Assign the result back: calling replace(..., inplace=True) on
    # df["env_package"] is a chained assignment, which does not update df
    # when pandas Copy-on-Write is active.
    df["env_package"] = df["env_package"].replace(
        {"water": "water_column", "sediment": "soft_sediment"}
    )
    frames.append(df)

# NOTE: pd.concat raises ValueError when no observatory files were found
combined_df = pd.concat(frames)
outfile_name = "Observatory_combined_logsheets_validated3.csv"
combined_df.to_csv(outfile_name, index=False)
combined_df.info()

# Combined meta-table for each observatory of all validated logsheets

The source_mat_id is the unique key or identifier that links the records in the Batch run_information sheets ([run-information-batch-001.csv](https://raw.githubusercontent.com/emo-bon/sequencing-data/main/shipment/batch-001/run-information-batch-001.csv) and [run-information-batch-002.csv](https://raw.githubusercontent.com/emo-bon/sequencing-data/main/shipment/batch-002/run-information-batch-002.csv)) to the sampling events in the "sampling" and "measured" sheets of the observatory logsheets (Google Sheets) (e.g. [ESC68N](https://docs.google.com/spreadsheets/d/11_Eu0W1-sDiuzKx1cIl6YuxjRHmWezN6u9v3Ly8JZ3A/edit?gid=0#gid=0)).


In [None]:
# TODO
# Check
# EMOBON_BPNS_Wa_211223_3um_1
# EMOBON_BPNS_Wa_211223_3um_2
# EMOBON_BPNS_Wa_211223_0.2um_1
# EMOBON_BPNS_Wa_211223_0.2um_2
# in the combined sheet

# TODO there are 25 missing source_mat_ids because of the "blank_1" problem - this should be
# auto corrected if the replicate column is auto-filled to text not auto there is an open issue
# on the github page

import os
import math
import copy
import difflib
import pandas as pd
import validators
import collections
from pprint import pprint

# Directory the validated sampling/measured logsheet CSV files are read from
LOGSHEETS_PATH = "./logsheets"
# These are real duplicates: sampling events with one of these
# source_mat_ids are skipped (and counted) when the sheets are combined
KNOWN_DUPLICATES = [
    "EMOBON_ROSKOGO_Wa_210618_3um_1",
    "EMOBON_PiEGetxo_Wa_210824_3um_blank",
]


def get_observatory_data() -> list[str, str, str]:
    """Read the validated logsheets governance table.

    Returns one ``[observatory_id, water_column, soft_sediment]`` list per
    row of ./governance/logsheets_validated.csv.
    """
    wanted_columns = ["observatory_id", "water_column", "soft_sediment"]
    logsheets = pd.read_csv("./governance/logsheets_validated.csv")
    return logsheets[wanted_columns].values.tolist()


def get_all_refcodes() -> dict[str, str]:
    """Return a source_material_id -> ref_code mapping for batches 1 and 2.

    Both run-information CSV files are read from the emo-bon/sequencing-data
    GitHub repository and merged into one dict.

    Raises:
        AssertionError: if a source_material_id occurs more than once,
            whether inside one batch or across the two batches.
    """
    run_info_paths = (
        "https://raw.githubusercontent.com/emo-bon/sequencing-data/main/shipment/batch-001/run-information-batch-001.csv",
        "https://raw.githubusercontent.com/emo-bon/sequencing-data/main/shipment/batch-002/run-information-batch-002.csv",
    )
    refcodes: dict[str, str] = {}
    for run_info_path in run_info_paths:
        run_info = pd.read_csv(run_info_path)
        pairs = run_info[["source_material_id", "ref_code"]].values.tolist()
        for source_mat_id, ref_code in pairs:
            # Every id is checked against everything collected so far.
            # Previously batch-2 ids went into a separate dict, so a
            # duplicate inside batch 2 was silently overwritten.
            assert source_mat_id not in refcodes, f"{source_mat_id} maybe duplicated"
            refcodes[source_mat_id] = ref_code
    return refcodes


def parse_observatory_sample_type(
    observatory_id: str,
    obs_refcodes: dict[str, str],
    sampling_type: str,
    save_table: bool = False,
    verbose: bool = True,
) -> tuple[list, list, int, int, list]:
    """Combine the 'sampling' and 'measured' sheets of one observatory.

    An observatory is an EMBRC station and it has an ID.
    Each observatory may take either or both of the "water_column" and
    "soft_sediment" sampling types, and each sampling type has both a
    "sampling" and a "measured" sheet.

    A sampling event is kept only when its source_mat_id has a ref_code in
    ``obs_refcodes`` and a row with the same source_mat_id exists in the
    measured sheet; the two rows are merged into one dict and "ref_code"
    and "obs_id" are added.

    Args:
        observatory_id: id used in the validated logsheet file names.
        obs_refcodes: source_mat_id -> ref_code mapping.
        sampling_type: "water_column" or "soft_sediment".
        save_table: when True and at least one event was combined, write
            ./transformed/<observatory_id>_<sampling_type>_combined_validated.csv.
        verbose: when True, print a summary of the counts.

    Returns:
        A 5-tuple of: the source_mat_ids of the combined events, all
        source_mat_ids in the sampling sheet, the number of events with a
        ref_code but no measured row, the number of known duplicates
        ignored, and the list of combined events (one dict each).

    Raises:
        ValueError: if one ref_code matches more than one sampling event.
        KeyError: if a measured row has no "source_mat_id" field.
        AssertionError: if the counters do not add up to the number of
            sampling events.
    """
    sampling_data_filename = (
        f"{observatory_id}_{sampling_type}_sampling_validated.csv"
    )
    measured_data_filename = (
        f"{observatory_id}_{sampling_type}_measured_validated.csv"
    )

    sampling_data = pd.read_csv(
        os.path.join(LOGSHEETS_PATH, sampling_data_filename)
    )
    measured_data = pd.read_csv(
        os.path.join(LOGSHEETS_PATH, measured_data_filename)
    )

    sampling_events = sampling_data.to_dict(orient="records")
    measured_events = measured_data.to_dict(orient="records")

    # To be returned
    source_mat_ids_from_combined_events = []  # List of all source_mat_ids
    all_sampling_source_mat_ids = sampling_data["source_mat_id"].values.tolist()
    # Shouldn't happen if sheet is not broken
    missing_measured_but_refcode_present = 0
    # How many known duplicate source_mat_ids in the sheets are we ignoring
    duplicates_ignored_counter = 0
    combined_events = []  # List of all sampling/measured events

    # Internal
    # Sampling events without a refcode ie not sent to sequencing
    no_refcode_counter = 0
    # Again shouldn't happen; kept for debugging
    source_mat_ids_in_sampling_with_refcode_missing_from_measured = []
    refcodes_in_run_info = set()  # refcodes already matched to an event

    # Hack for HCMR-1 replicates which are "blank1" and "blank2" in the
    # sampling sheet but become just "blank" in the measured and
    # run_information sheets from where we get the refcodes
    hcmr_renames = {
        "EMOBON_HCMR-1_Wa_210917_3um_blank1": "EMOBON_HCMR-1_Wa_210917_3um_blank",
        "EMOBON_HCMR-1_Wa_210917_0.2um_blank1": "EMOBON_HCMR-1_Wa_210917_0.2um_blank",
    }

    for sampling_event in sampling_events:

        # Checking consistency
        # Does this sampling event have a ref_code
        # If yes, then it should have both a sampling and measured sheet
        # If no, we can ignore it
        event_mat_id = sampling_event["source_mat_id"]

        # Two source_mat_id's in the Batch 1 & 2 run_informations match
        # duplicate sampling events in the sampling sheets - ignore those
        if event_mat_id in KNOWN_DUPLICATES:
            duplicates_ignored_counter += 1
            continue

        event_mat_id = hcmr_renames.get(event_mat_id, event_mat_id)

        try:
            refcode = obs_refcodes[event_mat_id]
        except KeyError:
            no_refcode_counter += 1
            # OK so has not been sent to sequencing; ignore
            continue

        # First measured row with the same source_mat_id, copied so that the
        # edits below leave measured_events untouched; None when absent
        event_measured = next(
            (
                copy.deepcopy(measured_event)
                for measured_event in measured_events
                if measured_event["source_mat_id"] == event_mat_id
            ),
            None,
        )

        if event_measured is None:
            # sampling sheet source_mat_id has ref_code in run_information
            # but the corresponding measured sheet lacks the same source_mat_id
            # This shouldn't happen unless the auto-formatting of the
            # 'source_mat_id' field in the 'measured' sheet is broken -
            # which is exactly what happened.
            missing_measured_but_refcode_present += 1
            source_mat_ids_in_sampling_with_refcode_missing_from_measured.append(
                event_mat_id
            )
            continue

        if refcode in refcodes_in_run_info:
            # The message used to interpolate `source_mat_id`, a name that
            # does not exist in this function; report the id of this event.
            raise ValueError(
                f"Error: {refcode=} match more that one sampling event "
                f"with the source_mat_id={event_mat_id!r}"
            )
        refcodes_in_run_info.add(refcode)

        sampling_event["ref_code"] = refcode  # key to sequence data
        sampling_event["obs_id"] = observatory_id  # key to observatory data
        # Delete the now duplicated source_mat_id in the combined event
        del event_measured["source_mat_id"]
        source_mat_ids_from_combined_events.append(event_mat_id)
        sampling_event.update(event_measured)
        combined_events.append(sampling_event)

    if verbose:
        print(
            f"Observatory {observatory_id}-{sampling_type} has {len(sampling_events)} sampling events.\n"
            f"{no_refcode_counter} have no ref_code (i.e. they were not sent for sequencing), \n"
            f"{missing_measured_but_refcode_present} 'sampling' events have a refcode but no "
            f"'measured' data with the corresponding source_mat_id: \n"
            # f"{source_mat_ids_in_sampling_with_refcode_missing_from_measured} \n"
            f"A total of {len(combined_events)} sampling events with refcode and measured sheet were found.\n"
        )

    # Did we find all the sampling events?
    se = len(sampling_events)
    ce = len(combined_events)
    mmbrcp = missing_measured_but_refcode_present
    nrc = no_refcode_counter
    dc = duplicates_ignored_counter
    assert se == (ce + nrc + mmbrcp + dc), (
        f"Something is a foot: len(sampling_events) {se} != "
        f"(len(combined_events) {ce} + no_refcode_counter {nrc} "
        f"missing_measured_but_refcode_present {mmbrcp}) "
        f"known duplicates ignored was {dc}"
    )

    if combined_events and save_table:
        save_dir = "./transformed"
        # Make sure the output directory exists before writing into it
        os.makedirs(save_dir, exist_ok=True)
        outfile_name = (
            f"{observatory_id}_{sampling_type}_combined_validated.csv"
        )
        ndf = pd.DataFrame.from_records(combined_events, index="source_mat_id")
        ndf.to_csv(os.path.join(save_dir, outfile_name))

    return (
        source_mat_ids_from_combined_events,
        all_sampling_source_mat_ids,
        missing_measured_but_refcode_present,
        duplicates_ignored_counter,
        combined_events,
    )


def validate_observatories(
    observatory_data, obs_refcodes, save_table=False
) -> tuple[list, list, int, int, list]:
    """Run parse_observatory_sample_type over every observatory and strategy.

    ``observatory_data`` yields ``(observatory_id, water_column,
    soft_sediment)`` triples. A strategy is processed when its value passes
    ``validators.url``. The per-strategy results are accumulated and
    returned as a 5-tuple: source_mat_ids of combined events, all
    source_mat_ids from the sheets, count of events with a ref_code but no
    measured data, count of ignored duplicates, and all combined events.
    """
    combined_ids: list[str] = []
    sheet_ids: list[str] = []
    missing_measured_total: int = 0
    duplicates_total: int = 0
    events: list[dict] = []

    for observatory_id, water_column, soft_sediment in observatory_data:
        if observatory_id == "Plenzia":
            continue  # Data not public
        # NOTE(review): this skips UMF completely (water column included)
        # whenever soft_sediment is truthy - confirm that is intended.
        if observatory_id == "UMF" and soft_sediment:
            continue  # Broken sheet

        # Strategies for which the governance table holds a valid URL
        strategies = [
            strategy
            for strategy, link in (
                ("water_column", water_column),
                ("soft_sediment", soft_sediment),
            )
            if validators.url(link)
        ]

        for sampling_strategy in strategies:
            (
                ids_combined,
                ids_in_sheet,
                n_missing_measured,
                n_duplicates,
                combined,
            ) = parse_observatory_sample_type(
                observatory_id, obs_refcodes, sampling_strategy, save_table
            )
            combined_ids.extend(ids_combined)
            sheet_ids.extend(ids_in_sheet)
            missing_measured_total += n_missing_measured
            duplicates_total += n_duplicates
            events.extend(combined)

    return (
        combined_ids,
        sheet_ids,
        missing_measured_total,
        duplicates_total,
        events,
    )


############ VALIDATE OBSERVATORIES ###################################################

# Load the observatory links and the source_mat_id -> ref_code mapping
observatory_data: list[str, str, str] = get_observatory_data()
obs_refcodes: dict[str, str] = get_all_refcodes()

# Combine all sheets, writing one table per observatory and strategy
result = validate_observatories(observatory_data, obs_refcodes, save_table=True)

# Name the five parts of the result
(
    all_source_mat_ids_from_combined_events,
    all_source_mat_ids_from_sheets,
    all_missing_measured_but_refcode_present,
    all_duplicates_ignored,
    all_combined_events,
) = result[:5]

############## REPORT STATISTICS ########################################################

############### REFCODES ####################################
print(f"There are {len(obs_refcodes)} total ref_codes assigned")
# We are ignoring both of the duplicates but it's only 1 record expected missing from total_combined events

############## DUPLICATES ###################################
# Halved for the reason given above
total_dups_ignored = all_duplicates_ignored // 2
print(
    f"A total of {total_dups_ignored} sequencing event in the Batch 1 & 2 run_information sheets "
    f"have refcodes that match source_mat_ids in the sample sheets that have duplicate entries"
)
print(
    f"Total number of combined sampling events with ref_codes: {len(all_combined_events)}"
)
# source_mat_ids that occur more than once across all sampling sheets
source_mat_id_counts = collections.Counter(all_source_mat_ids_from_sheets)
duplicates = [
    sheet_id for sheet_id, occurrences in source_mat_id_counts.items() if occurrences > 1
]
print(
    f"Total number of all_source_mat_ids_from_sheets: {len(all_source_mat_ids_from_sheets)}"
    f" of which {len(duplicates)} were duplicates"
)

############# SOURCE_MAT_IDS ################################
# Hack for HCMR-1: these run_information ids are looked up in the sampling
# sheets under their "blank1" spelling
_hcmr_sheet_ids = {
    "EMOBON_HCMR-1_Wa_210917_3um_blank": "EMOBON_HCMR-1_Wa_210917_3um_blank1",
    "EMOBON_HCMR-1_Wa_210917_0.2um_blank": "EMOBON_HCMR-1_Wa_210917_0.2um_blank1",
}

# Each entry is [source_mat_id, close matches found in the sampling sheets]
missing_source_mat_ids = []
for source_mat_id in obs_refcodes:
    # source_mat_ids are the keys in the refcode dict of the run_information
    source_mat_id = _hcmr_sheet_ids.get(source_mat_id, source_mat_id)
    if source_mat_id in all_source_mat_ids_from_sheets:
        continue

    # Get close matches to missing source_mat_id
    matches = difflib.get_close_matches(
        source_mat_id, all_source_mat_ids_from_sheets, n=3
    )
    missing_source_mat_ids.append([source_mat_id, matches])

if missing_source_mat_ids:
    print(
        "\n\nThe missing source_mat_ids that are in the run information sheets are:"
    )
    for missing_id, close_matches in missing_source_mat_ids:
        join = " ".join(close_matches)
        print(
            f"Missing source_mat_id is {missing_id} close matches are \n\t {join}"
        )

    # Raw [id, matches] entries, one per line
    for missing in missing_source_mat_ids:
        print(missing)

print(
    f"A total of {len(missing_source_mat_ids)} source_mat_ids "
    f"in the batch 1 & 2 run information sheets are missing from the "
    f"observatory sampling sheets"
)

# Reverse check: every combined event's source_mat_id should be a key of the
# run_information refcode dict, so this counter should stay at zero.
# TODO: take all_source_mat_ids_from_combined_events directly from all_combined_events rather than having
# a separate list
counter = 0
for source_mat_id in all_source_mat_ids_from_combined_events:
    if source_mat_id not in obs_refcodes:
        counter += 1
missing = counter > 0  # CAUTION: THIS SHOULD BE False (counter == 0)!
if missing:
    print(
        f"ERROR: A total of {counter} source_mat_ids "
        f"in the sampling sheets that also have refcodes are missing from the "
        f"Batch 1 & 2 run information sheets"
    )

##################### SUMMARY #################################
n_combined = len(all_combined_events)
n_missing = len(missing_source_mat_ids)
n_refcodes = len(obs_refcodes)
total = n_combined + n_missing
print(
    f"\nTotal combined_events: {n_combined} \n"
    f"Missing source_mat_ids in combined events: {n_missing} \n"
    f"EMO BON ref_codes in run_information sheets: {n_refcodes} \n"
    f"Events ignored due to duplications in sampling sheets: {total_dups_ignored} \n"
    f"{n_combined} + {n_missing} = {n_refcodes} - {total_dups_ignored}"
)

# Create the single metadata table from the combined observatories tables

Combined observatories metadata tables are in ./transformed and have the file name format:
`<observatory_id>_<sampling_strategy>_combined_validated.csv`

In [None]:
# Directory holding the per-observatory combined tables
FILE_PATH = "./transformed"
# Only files named <observatory_id>_<sampling_strategy>_combined_validated.csv
# are combined. Matching on the full suffix avoids the IndexError that
# f.split(".")[1] raised for names without a dot, and skips stray CSV files
# whose names could not be split into id and strategy below.
combined_suffix = "_combined_validated.csv"
csv_files = [f for f in os.listdir(FILE_PATH) if f.endswith(combined_suffix)]

number_of_rows = 0
frames = []
for obs in csv_files:
    front = obs.removesuffix(combined_suffix)
    # The part before the first "_" is the observatory id, the rest is the strategy
    obs_id, strategy = front.split("_", 1)
    # print(f"{obs_id=} -> - {strategy=}")
    # tax_id is read as nullable "Int64" so that missing values do not turn
    # the integer column into floats
    df = pd.read_csv(
        os.path.join(FILE_PATH, obs), dtype={"tax_id": "Int64"}
    )
    # This is the name given in the Observatory sheet Google Spreadsheet.
    # One column assignment replaces the row-by-row iterrows() loop.
    df["env_package"] = strategy
    number_of_rows += len(df)
    frames.append(df)

print(f"{number_of_rows=}")
# NOTE: pd.concat raises ValueError when no combined tables were found
combined_df = pd.concat(frames)
today = datetime.datetime.now().strftime("%Y-%m-%d")
outfile_name = f"Batch1and2_combined_logsheets_{today}.csv"
# combined_df.set_index("source_mat_id")
combined_df.to_csv(outfile_name, index=False)
combined_df.info()