# Pydantic validation framework for the EMO BON observatory and other metadata log sheets

- **pydantic** Data validation using Python type hints.
    - [pypi](https://pypi.org/project/pydantic/)
    - [Documentation](https://docs.pydantic.dev/latest/)

In [1]:
import os
import sys

# Workaround for JupyterHub after modules and notebooks were moved around:
# for some reason the kernel's CWD is /srv/scratch even though the terminal
# shows us to be inside the project directory below it.
# So change into the project directory (./emo-bon-data-validation) when that happens.
if os.getcwd() == "/srv/scratch":
    os.chdir("./emo-bon-data-validation")
print(f"CWD is {os.getcwd()}")

import importlib
import subprocess
from pathlib import Path
import pandas as pd
import pydantic

CWD is /srv/scratch/emo-bon-data-validation


##### Init the directory structure

In [14]:
if False:
    # Disabled one-off setup: initialise directories and paths.
    # Creates the validation classes directory (with an empty __init__.py) if it is missing.
    # Note that __init__.py will need to be edited manually to import the validators,
    # e.g. from .observatories import Model as observatoriesModel
    validation_classes_path = "./validation_classes"
    if True:
        if not os.path.exists(validation_classes_path):
            os.mkdir(validation_classes_path)
            Path(os.path.join(validation_classes_path, "__init__.py")).touch()
            # NOTE(review): raw_files_path is not defined anywhere in the visible notebook;
            # enabling this cell would raise NameError here unless it is defined first.
            os.mkdir(raw_files_path)

## Governance data

#### Read each of the governance CSV files into a Pandas dataframe

In [15]:
# Base URL (raw file access) of the governance-data repository; note the trailing "/".
github_path = "https://raw.githubusercontent.com/emo-bon/governance-data/main/"
file_names = [
        "logsheets.csv",              # contains the URLs of the Google Sheets that are the logsheets
        "observatories.csv",          # contains information about each observatory
        #"organisations.csv",         # contains information about the organisations in EMO BON
        #"planned_events.csv"         # contains information about planned EMO BON events (this file is only used by humans, not by any actions) - DONT CARE
        #"ro-crate-metadata.json"     # IGNORE
        ]
# Maps each governance file name to its DataFrame; used by the validation cells below.
dfs = {}
for f in file_names:
    # Plain concatenation: os.path.join is meant for filesystem paths, not URLs.
    df = pd.read_csv(github_path + f)
    # DataFrame.info() prints its report itself and returns None, so it must not be
    # interpolated into the f-string (that printed "This is info() for None").
    print(f"This is info() for {f}")
    df.info()
    dfs[f] = df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 9 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   EMBRC Node                           18 non-null     object
 1   EMBRC Site                           18 non-null     object
 2   EMOBON_observatory_id                18 non-null     object
 3   Water Column                         17 non-null     object
 4   Soft sediment                        7 non-null      object
 5   data_quality_control_threshold_date  18 non-null     object
 6   data_quality_control_assignee        18 non-null     object
 7   rocrate_profile_uri                  18 non-null     object
 8   autogenerate                         18 non-null     int64 
dtypes: int64(1), object(8)
memory usage: 1.4+ KB
This is info() for None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 22 columns):
 #   C

#### Validate Governance tables

In [16]:
from validation_classes import observatoriesModel, logsheetsModel
# Map each governance CSV file name to the validation model used for its rows.
validator_class_paths = {"logsheets.csv": logsheetsModel, "observatories.csv": observatoriesModel}
# Directory of the validation classes package.
validation_classes_path = "./validation_classes"

##### Observatories table

The observatories validator mostly changes the column names to make them consistent (and spelled correctly), removes blank strings ("   ") from cells, and reformats the dates.

In [17]:
file_name = "observatories.csv"
data = dfs[file_name] # dfs is dict of pandas df's
validator = validator_class_paths[file_name]
# One dict per CSV row; each is validated and dumped back to a plain dict.
data_records = data.to_dict(orient="records")
validated_rows = [validator(**row).model_dump() for row in data_records]

ndf = pd.DataFrame.from_records(validated_rows, index="observatory_id")
# The directory-initialisation cell is disabled, so make sure the output directory exists.
os.makedirs("governance", exist_ok=True)
ndf.to_csv(os.path.join("governance", "observatories_validated.csv"))

# Take a look at ./governance/observatories_validated.csv

##### Logsheets table

In [None]:
file_name = "logsheets.csv"
data = dfs[file_name] # dfs is dict of pandas df's
validator = validator_class_paths[file_name]
# One dict per CSV row; each is validated and dumped back to a plain dict.
data_records = data.to_dict(orient="records")
validated_rows = [validator(**row).model_dump() for row in data_records]

ndf = pd.DataFrame.from_records(validated_rows, index="observatory_id")
# Write next to observatories_validated.csv: the later cells read
# "./governance/logsheets_validated.csv", not a path under ./validation_classes.
os.makedirs("governance", exist_ok=True)
ndf.to_csv(os.path.join("governance", "logsheets_validated.csv"))

# Take a look at ./governance/logsheets_validated.csv



## Logsheets from water column and soft sediments sampling and "measured" tables



### Version 1 - pulls from Google Sheets; only does "lax" validation

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
    
import os
import sys
import math
import pandas as pd
import pydantic
from validation_classes import samplingModel, measuredModel

def parse_sample_sheets(sampling_strategy: str, sheet_type: str, addresses: list) -> None:
    """Download, filter, validate and save one sheet type for every observatory.

    For each ``[observatory_id, sheet_link]`` pair the tab named ``sheet_type``
    of the linked Google Sheet is read as CSV, records without a usable
    source_mat_id are discarded, the remaining records are validated with
    ``validator_classes[sheet_type]`` and the result is written to
    ``./logsheets/<observatory_id>_<sampling_strategy>_<sheet_type>_validated.csv``.

    Args:
        sampling_strategy: Sampling strategy label (e.g. "water_column" or
            "soft_sediment"); used for the UMF skip and in the output file name.
        sheet_type: Name of the sheet tab and key into ``validator_classes``.
        addresses: List of ``[observatory_id, sheet_link]`` pairs. A NaN link
            means the observatory has no sheet for this strategy.

    Raises:
        ValueError: If a link is neither a string nor a float, or a sheet has
            neither a ``source_mat_id`` nor a ``source_material_id`` column.
    """

    def filter_on_source_mat_id(d):
        """Return True when record ``d`` has a well-formed source_mat_id."""
        # Bergen has it as source_material_id
        if "source_mat_id" in d:
            value = d["source_mat_id"]
        elif "source_material_id" in d:
            value = d["source_material_id"]
        else:
            raise ValueError("Cannot find source_mat_id field")
        # NaN (empty cell), None or any other non-text value is not a usable id
        if not isinstance(value, str):
            return False
        # Remove mis-formatted
        if len(value.split("_")) < 6:
            return False
        # Edge case of this otherwise blank entry having 6 "bits"
        return value != "EMOBON_VB_Wa_230509_um_"

    save_dir = "./logsheets"

    for observatory_id, sheet_link in addresses:
        if not isinstance(sheet_link, str):
            if not isinstance(sheet_link, float):
                raise ValueError(f"Unknown link {sheet_link} to observatory {observatory_id}")
            if math.isnan(sheet_link):
                print(f"Observatory {observatory_id} lacks valid sheet URL {sheet_link}")
            continue

        if observatory_id == "Plenzia":
            continue  # Sheets not publicly available
        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "soft_sediment" and observatory_id == "UMF":
            continue

        print(f"Processing {observatory_id}...")
        # Turn the "edit" link of the Google Sheet into a CSV export link for the tab
        sampling_sheet_base = sheet_link.split("/edit")[0]
        sample_sheet_link = f"{sampling_sheet_base}/gviz/tq?tqx=out:csv&sheet={sheet_type}"
        print(f"Sample sheet link: {sample_sheet_link}")
        df = pd.read_csv(sample_sheet_link, encoding='utf-8')
        data_records_all = df.to_dict(orient="records")

        # Many sheets have partially filled rows
        # The source_mat_id is manually curated and the PRIMARY_KEY
        # Therefore filter records on source_mat_id
        data_records_filtered = list(filter(filter_on_source_mat_id, data_records_all))

        if len(data_records_all) > len(data_records_filtered):
            print(f"Discarded {len(data_records_all) - len(data_records_filtered)} records leaving {len(data_records_filtered)}.")

        validator = validator_classes[sheet_type]
        validated_rows = [validator(**row).model_dump() for row in data_records_filtered]

        outfile_name = f"{observatory_id}_{sampling_strategy}_{sheet_type}_validated.csv"
        ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
        # Nothing else in the notebook creates the output directory
        os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, outfile_name)
        ndf.to_csv(out_path)
        print(f"Written {out_path}")


# Map each sheet type to the validator applied to its rows (looked up in parse_sample_sheets).
validator_classes = {"sampling": samplingModel, "measured": measuredModel}
# Get list of all URL links to sampling sheets
# NB: you can't use a "with" block here when reading the pandas DataFrame
governance_logsheets_validated_csv = "./governance/logsheets_validated.csv"
df = pd.read_csv(governance_logsheets_validated_csv)
# Each entry is an [observatory_id, sheet_link] pair
water_column_sheet_addresses = df[["observatory_id", "water_column"]].values.tolist()
soft_sediment_sheet_addresses  = df[["observatory_id", "soft_sediment"]].values.tolist()
del df

# Validate both sheet types for both sampling strategies
parse_sample_sheets("water_column", "sampling", water_column_sheet_addresses)
parse_sample_sheets("soft_sediment", "sampling", soft_sediment_sheet_addresses)
parse_sample_sheets("water_column", "measured", water_column_sheet_addresses)
parse_sample_sheets("soft_sediment", "measured", soft_sediment_sheet_addresses)


### Version 2: pulls from Google Sheets; does "lax" (default), "strict" or "semi-strict" validation depending on the `STRICT` / `SEMI_STRICT` constants

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
    
import os
import sys
import math
import pickle
import pandas as pd
from pydantic import ValidationError
from pprint import pprint
from validation_classes import samplingModel, measuredModel, samplingModelStrict, samplingModelSemiStrict

############################ CAUTION #############################################
# STRICT is checked before SEMI_STRICT; with both False the lax models are used.
STRICT      = False  # As defined by Ioulia, dates corrected, NA's removed etc
SEMI_STRICT = True   # As defined by Ioulia but not checking for mandatory fields
                     # ints and str coerced to floats when possible
##################################################################################

def parse_sample_sheets(sampling_strategy: str,
                        sheet_type: str,
                        addresses: list,
                       ) -> None:
    """Download, filter and validate one sheet type for every observatory.

    The validator is chosen from ``validator_classes`` by ``sheet_type`` plus a
    suffix: ``_strict`` when ``STRICT`` is set, else ``_semistrict`` when
    ``SEMI_STRICT`` is set, else no suffix (lax). Rows that fail validation
    are collected; if there are any, they are written to ``./validation_errors``
    as a pickle and a pretty-printed log. Validated rows are written to
    ``./logsheets`` only in lax mode and only when every row passed.

    Args:
        sampling_strategy: Sampling strategy label (e.g. "water_column" or
            "soft_sediment"); used for the UMF skip and in output file names.
        sheet_type: Name of the sheet tab and base of the validator key.
        addresses: List of ``[observatory_id, sheet_link]`` pairs. A NaN link
            means the observatory has no sheet for this strategy.

    Raises:
        ValueError: If a link is neither a string nor a float, or a sheet has
            neither a ``source_mat_id`` nor a ``source_material_id`` column.
        KeyError: If no validator is registered for the selected model type.
    """

    def filter_on_source_mat_id(d):
        """Return True when record ``d`` has a well-formed source_mat_id."""
        # Bergen has it as source_material_id
        if "source_mat_id" in d:
            value = d["source_mat_id"]
        elif "source_material_id" in d:
            value = d["source_material_id"]
        else:
            raise ValueError("Cannot find source_mat_id field")
        # NaN (empty cell), None or any other non-text value is not a usable id
        if not isinstance(value, str):
            return False
        # Remove mis-formatted
        if len(value.split("_")) < 6:
            return False
        # Edge case of this otherwise blank entry having 6 "bits"
        return value != "EMOBON_VB_Wa_230509_um_"

    for observatory_id, sheet_link in addresses:
        if not isinstance(sheet_link, str):
            if not isinstance(sheet_link, float):
                raise ValueError(f"Unknown link {sheet_link} to observatory {observatory_id}")
            if math.isnan(sheet_link):
                print(f"Observatory {observatory_id} lacks valid sheet URL {sheet_link}")
            continue

        if observatory_id == "Plenzia":
            continue  # Sheets not publicly available
        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "soft_sediment" and observatory_id == "UMF":
            continue

        print(f"Processing {observatory_id}...")
        # Turn the "edit" link of the Google Sheet into a CSV export link for the tab
        sampling_sheet_base = sheet_link.split("/edit")[0]
        sample_sheet_link = f"{sampling_sheet_base}/gviz/tq?tqx=out:csv&sheet={sheet_type}"
        print(f"Sample sheet link: {sample_sheet_link}")
        df = pd.read_csv(sample_sheet_link, encoding='utf-8')
        data_records_all = df.to_dict(orient="records")

        # Many sheets have partially filled rows
        # The source_mat_id is manually curated and the PRIMARY_KEY
        # Therefore filter records on source_mat_id
        data_records_filtered = list(filter(filter_on_source_mat_id, data_records_all))

        if len(data_records_all) > len(data_records_filtered):
            print(f"Discarded {len(data_records_all) - len(data_records_filtered)} records leaving {len(data_records_filtered)}.")

        if STRICT:
            model_type = f"{sheet_type}_strict"
        elif SEMI_STRICT:
            model_type = f"{sheet_type}_semistrict"
        else:
            model_type = sheet_type

        validator = validator_classes[model_type]

        validated_rows = []
        # One entry per failing row: [(source_mat_id, [error dict, ...])]
        errors: list[list[tuple[str, list[dict]]]] = []
        for row in data_records_filtered:
            try:
                vr = validator(**row)
            except ValidationError as e:
                # Use the same column fallback as the filter above, so any sheet that
                # names the column source_material_id (not only Bergen) is handled.
                row_id = row["source_mat_id"] if "source_mat_id" in row else row["source_material_id"]
                errors.append([(row_id, e.errors())])
            else:
                validated_rows.append(vr.model_dump())

        if errors:
            total_number_errors = sum(len(row[1]) for e in errors for row in e)
            print(f"Errors were found... {total_number_errors} in total")
            save_dir = "./validation_errors"
            # Nothing else in the notebook creates the error directory
            os.makedirs(save_dir, exist_ok=True)
            outfile_name_pk = f"{observatory_id}_{sampling_strategy}_{model_type}_ERRORS.pickle"
            out_path_pk = os.path.join(save_dir, outfile_name_pk)
            with open(out_path_pk, "wb") as f:
                pickle.dump(errors, f, pickle.HIGHEST_PROTOCOL)
            outfile_name_log = f"{observatory_id}_{sampling_strategy}_{model_type}_ERRORS.log"
            out_path_log = os.path.join(save_dir, outfile_name_log)
            with open(out_path_log, "w") as f:
                pprint(errors, f)
        else:
            assert len(validated_rows) == len(data_records_filtered), "Not sure what happenned, but len(validated_rows) != len(data_filtered_records)"
            print("All records passed!")

            # Only the lax run writes the validated sheet
            if not STRICT and not SEMI_STRICT:
                save_dir = "./logsheets"
                os.makedirs(save_dir, exist_ok=True)
                outfile_name = f"{observatory_id}_{sampling_strategy}_{model_type}_validated.csv"
                ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
                out_path = os.path.join(save_dir, outfile_name)
                ndf.to_csv(out_path)
                print(f"Written {out_path}")


# Map model-type keys (sheet type plus optional strictness suffix) to validator classes.
validator_classes = {"sampling": samplingModel,
                     "measured": measuredModel,
                     "sampling_strict": samplingModelStrict,
                     "sampling_semistrict": samplingModelSemiStrict
                    }

# Get list of all URL links to sampling sheets
# NB: you can't use a "with" block here when reading the pandas DataFrame
governance_logsheets_validated_csv = "./governance/logsheets_validated.csv"
df = pd.read_csv(governance_logsheets_validated_csv)
# Each entry is an [observatory_id, sheet_link] pair
water_column_sheet_addresses = df[["observatory_id", "water_column"]].values.tolist()
soft_sediment_sheet_addresses  = df[["observatory_id", "soft_sediment"]].values.tolist()
del df

parse_sample_sheets("water_column", "sampling", water_column_sheet_addresses)

# Full set of runs, currently disabled:
#parse_sample_sheets("water_column", "sampling", water_column_sheet_addresses)
#parse_sample_sheets("soft_sediment", "sampling", soft_sediment_sheet_addresses)
#parse_sample_sheets("water_column", "measured", water_column_sheet_addresses)
#parse_sample_sheets("soft_sediment", "measured", soft_sediment_sheet_addresses)


### Version 3: pulls sheets from Github after curation, does "lax", "strict", and "semi-strict" validation

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
    
import os
import sys
import math
import pickle
from urllib.request import HTTPError
from enum import Enum
import pandas as pd
from pydantic import ValidationError
from pprint import pprint
from validation_classes import (samplingModelGithub,               # lax validator for EMO-BON Github repository
                                samplingModelGithubStrict,         # strict validator for EMO-BON Github repository
                                samplingModelGithubSemiStrict      # semi-strict validator for EMO-BON Github repository                  
                               )
class SamplingStrategy(Enum):
    """Names of the two sampling strategies.

    NOTE(review): the visible callers pass the plain strings "water" and
    "sediment" rather than these members - confirm whether the enum is used.
    """
    WATER    = "water"    # Originally water_column
    SEDIMENT = "sediment" # Originally soft_sediment

class SheetType(Enum):
    """Names of the two sheet types kept for each sampling strategy."""
    SAMPLING = "sampling"
    MEASURED = "measured"

# Not all observatories have "transformed" sheets on GH, but they may have "raw" ones.
# The field types in the raw sheets are different, which makes them difficult to validate
# with a single validator - best just to ignore the raw sheets
USE_RAW = False

############################ CAUTION #############################################
# As defined by Ioulia, dates corrected, NA's removed etc
STRICT      = False

# As defined by Ioulia but not checking for mandatory fields
# ints and str coerced to floats when possible
SEMI_STRICT = False
##################################################################################

def get_sheet_from_github(observatory_id: str,
                          sampling_strategy: str,
                          sheet_type: str) -> "pd.DataFrame | None":
    """Read one curated logsheet CSV for an observatory from GitHub.

    Here we pull the "sampling" or "measured" sheets from Github. These are the curated
    sheets downloaded by the Github actions and hopefully do not have the errors that
    the CSV's pulled directly from Google Sheets had (e.g. the word "blank" magically
    disappearing from the "replicate" field).

    Github paths look like:
    https://raw.githubusercontent.com/emo-bon/observatory-umf-crate/main/logsheets/transformed/sediment_measured.csv
    https://raw.githubusercontent.com/emo-bon/observatory-bergen-crate/main/logsheets/raw/water_sampling.csv

    Args:
        observatory_id: Observatory identifier, inserted into the repository name.
        sampling_strategy: First part of the sheet file name (e.g. "water", "sediment").
        sheet_type: Second part of the sheet file name (e.g. "sampling", "measured").

    Returns:
        The sheet as a DataFrame, or None when there is no "transformed" sheet
        and ``USE_RAW`` is False.

    Raises:
        ValueError: If ``USE_RAW`` is True and neither the "transformed" nor the
            "raw" sheet can be fetched.
    """
    # URLs are joined with "/" explicitly; os.path.join is for filesystem paths
    # and would use the platform separator.
    base_url = (
        "https://raw.githubusercontent.com/emo-bon"
        f"/observatory-{observatory_id}-crate/main/logsheets"
    )
    sheet_name = f"{sampling_strategy}_{sheet_type}.csv"

    print(f"Processing {observatory_id}... {sheet_name}")
    try:
        return pd.read_csv(f"{base_url}/transformed/{sheet_name}")
    except HTTPError:
        # Some observatories don't yet have transformed sheets
        if not USE_RAW:
            print(f"Observatory {observatory_id} does not have a transformed {sheet_name} on GH")
            return None

    # Try for the raw sheets
    print("Unable to find 'transformed' sheet, reading the 'raw' sheet")
    try:
        return pd.read_csv(f"{base_url}/raw/{sheet_name}")
    except HTTPError as err:
        raise ValueError("Unable to find transformed or raw sheet") from err

def filter_on_source_mat_id(d):
    """Return True when record ``d`` carries a well-formed source_mat_id.

    The id is read from the ``source_mat_id`` key, falling back to
    ``source_material_id``. A record is rejected when the id is not a string
    (NaN from an empty cell, None, or any other value), when it has fewer than
    six "_"-separated parts, or when it is the known blank placeholder
    "EMOBON_VB_Wa_230509_um_".

    Raises:
        ValueError: If the record has neither id key.
    """
    # Bergen has it as source_material_id on Google and Github
    if "source_mat_id" in d:
        value = d["source_mat_id"]
    elif "source_material_id" in d:
        value = d["source_material_id"]
    else:
        raise ValueError("Cannot find source_mat_id field")
    # Always answer with a bool: non-text values used to fall through (returning
    # None) or crash on .split().
    if not isinstance(value, str):
        return False
    # Remove mis-formatted
    if len(value.split("_")) < 6:
        return False
    # Edge case of this otherwise blank entry having 6 "bits"
    return value != "EMOBON_VB_Wa_230509_um_"

def parse_sample_sheets(sampling_strategy: str,
                        sheet_type: str,
                        addresses: list,
                       ) -> None:
    """Fetch, filter and validate one curated sheet type for every observatory.

    Sheets come from ``get_sheet_from_github``; observatories for which it
    returns None are skipped. The validator is chosen from ``validator_classes``
    with the key ``<sheet_type>_github`` plus ``_strict`` when ``STRICT`` is set,
    else ``_semistrict`` when ``SEMI_STRICT`` is set. Rows that fail validation
    are written as a pretty-printed log to ``./validation_errors_github``.
    Validated rows are written to ``./logsheets_github`` only in lax mode and
    only when every row passed.

    Args:
        sampling_strategy: Sampling strategy label (e.g. "water" or "sediment");
            used for the UMF skip, the sheet file name and output file names.
        sheet_type: Sheet type (e.g. "sampling"); base of the validator key.
        addresses: List of ``[observatory_id, sheet_link]`` pairs. The link is
            only used to tell whether the observatory has this strategy (NaN
            means it does not).

    Raises:
        ValueError: If a link is neither a string nor a float.
        KeyError: If no validator is registered for the selected model type.
    """
    for observatory_id, sheet_link in addresses:
        if not isinstance(sheet_link, str):
            if not isinstance(sheet_link, float):
                raise ValueError(f"Unknown value '{sheet_link}' in {sampling_strategy} cell of {observatory_id}")
            # Only OOB doesnt do water_column
            # But most do not do soft-sediments
            if math.isnan(sheet_link):
                print(f"Observatory {observatory_id} does not have a {sampling_strategy} sampling strategy.")
            continue

        if observatory_id == "Plenzia":
            continue  # Sheets not publicly available

        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "sediment" and observatory_id == "UMF":
            continue

        df = get_sheet_from_github(observatory_id, sampling_strategy, sheet_type)
        if df is None:
            continue
        data_records_all = df.to_dict(orient="records")

        # Many sheets have partially filled rows
        # The source_mat_id is manually curated and the PRIMARY_KEY
        # Therefore filter records on source_mat_id
        data_records_filtered = list(filter(filter_on_source_mat_id, data_records_all))

        if len(data_records_all) > len(data_records_filtered):
            print(f"Discarded {len(data_records_all) - len(data_records_filtered)} records leaving {len(data_records_filtered)}.")

        if STRICT:
            model_type = f"{sheet_type}_github_strict"
        elif SEMI_STRICT:
            model_type = f"{sheet_type}_github_semistrict"
        else:
            model_type = f"{sheet_type}_github"

        validator = validator_classes[model_type]

        validated_rows = []
        # One entry per failing row: [(source_mat_id, [error dict, ...])]
        errors: list[list[tuple[str, list[dict]]]] = []
        for row in data_records_filtered:
            try:
                vr = validator(**row)
            except ValidationError as e:
                # Fall back to source_material_id whenever source_mat_id is absent,
                # not only for Bergen, so logging an error cannot itself raise KeyError.
                row_id = row["source_mat_id"] if "source_mat_id" in row else row["source_material_id"]
                errors.append([(row_id, e.errors())])
            else:
                validated_rows.append(vr.model_dump())

        if errors:
            total_number_errors = sum(len(row[1]) for e in errors for row in e)
            print(f"Errors were found... {total_number_errors} in total")
            save_dir = "./validation_errors_github"
            # Nothing else in the notebook creates the error directory
            os.makedirs(save_dir, exist_ok=True)
            outfile_name_log = f"{observatory_id}_{sampling_strategy}_{model_type}_ERRORS.log"
            out_path_log = os.path.join(save_dir, outfile_name_log)
            with open(out_path_log, "w") as f:
                pprint(errors, f)
        else:
            assert len(validated_rows) == len(data_records_filtered), \
                "Not sure what happenned, but len(validated_rows) != len(data_filtered_records)"
            print("All records passed!")

            # Only the lax run writes the validated sheet
            if not STRICT and not SEMI_STRICT:
                save_dir = "./logsheets_github"
                os.makedirs(save_dir, exist_ok=True)
                outfile_name = f"{observatory_id}_{sampling_strategy}_{model_type}_validated.csv"
                ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
                out_path = os.path.join(save_dir, outfile_name)
                ndf.to_csv(out_path)
                print(f"Written {out_path}")

# Map model-type keys (built in parse_sample_sheets from sheet type and strictness) to validators.
# NOTE(review): only "sampling" models are registered here, so the "measured"
# runs below would fail the validator lookup if re-enabled.
validator_classes = {"sampling_github"           : samplingModelGithub,
                     "sampling_github_strict"    : samplingModelGithubStrict,
                     "sampling_github_semistrict": samplingModelGithubSemiStrict
                    }

# Get list of all URL links to sampling sheets
# NB: you can't use a "with" block here when reading the pandas DataFrame
governance_logsheets_validated_csv = "./governance/logsheets_validated.csv"
df = pd.read_csv(governance_logsheets_validated_csv)
# Each entry is an [observatory_id, sheet_link] pair
water_column_sheet_addresses = df[["observatory_id", "water_column"]].values.tolist()
soft_sediment_sheet_addresses  = df[["observatory_id", "soft_sediment"]].values.tolist()
del df

parse_sample_sheets("water", "sampling", water_column_sheet_addresses)
parse_sample_sheets("sediment", "sampling", soft_sediment_sheet_addresses)
#parse_sample_sheets("water", "measured", water_column_sheet_addresses)
#parse_sample_sheets("sediment", "measured", soft_sediment_sheet_addresses)


Processing ESC68N... water_sampling.csv
Discarded 124 records leaving 120.
All records passed!
Written ./logsheets_github/ESC68N_water_sampling_github_validated.csv
Processing Bergen... water_sampling.csv
Observatory Bergen does not have a transformed water_sampling.csv on GH
Processing MBAL4... water_sampling.csv
Discarded 162 records leaving 80.
All records passed!
Written ./logsheets_github/MBAL4_water_sampling_github_validated.csv
Processing BPNS... water_sampling.csv
All records passed!
Written ./logsheets_github/BPNS_water_sampling_github_validated.csv
Processing ROSKOGO... water_sampling.csv
All records passed!
Written ./logsheets_github/ROSKOGO_water_sampling_github_validated.csv
Processing VB... water_sampling.csv
Discarded 296 records leaving 554.
All records passed!
Written ./logsheets_github/VB_water_sampling_github_validated.csv
Observatory OOB does not have a water sampling strategy.
Processing EMT21... water_sampling.csv
Discarded 17 records leaving 188.
All records pass

# Create meta-table of all logsheets.sheets

In [6]:
import os
import copy
import difflib
import pandas as pd
import validators
from pprint import pprint

######################## CAUTION #######################
# It seems the source_mat_ids in the run sheet do not match the source_mat_ids in the sampling sheets.
# CHECK, for example:
# EMOBON_OOB_So_210608_micro_1
# How many source_mat_ids in the run-information-batch-001.csv sheet have no equivalent in the sampling sheets?
######################## CAUTION #######################

# Directory from which the validated logsheet CSV files are read.
LOGSHEETS_PREFIX = "./logsheets"

def get_observatory_data() -> list[list]:
    """Return one ``[observatory_id, water_column, soft_sediment]`` row per observatory.

    The rows are read from ``./governance/logsheets_validated.csv``; empty
    cells come back as NaN.
    """
    df = pd.read_csv("./governance/logsheets_validated.csv")
    return df[["observatory_id", "water_column", "soft_sediment"]].values.tolist()

def get_refcodes(run_info_paths=None) -> dict[str, str]:
    """Build a ``source_material_id -> ref_code`` mapping from the run-information sheets.

    Args:
        run_info_paths: Optional iterable of CSV paths or URLs, each with
            ``source_material_id`` and ``ref_code`` columns. Defaults to the
            batch-001 and batch-002 run-information sheets of the
            emo-bon/sequencing-data repository.

    Returns:
        Dict mapping every source_material_id found to its ref_code.

    Raises:
        AssertionError: If a source_material_id occurs more than once, within
            one sheet or across sheets.
    """
    if run_info_paths is None:
        run_info_paths = (
            "https://raw.githubusercontent.com/emo-bon/sequencing-data/main/shipment/batch-001/run-information-batch-001.csv",
            "https://raw.githubusercontent.com/emo-bon/sequencing-data/main/shipment/batch-002/run-information-batch-002.csv",
        )
    refcodes = {}
    for path in run_info_paths:
        df = pd.read_csv(path)
        for source_material_id, ref_code in df[["source_material_id", "ref_code"]].values.tolist():
            # The message previously referenced an undefined name, so a duplicate
            # surfaced as NameError instead of this assertion.
            assert source_material_id not in refcodes, f"{source_material_id} maybe duplicated"
            refcodes[source_material_id] = ref_code
    return refcodes

def parse_observatory_sample_type(
    observatory_id: str,
    sampling_type: str,
    save_table: bool = False,
    refcodes: "dict[str, str] | None" = None,
    logsheets_prefix: "str | None" = None,
) -> tuple[list, list, int, list[dict]]:
    """Combine the "sampling" and "measured" sheets of one observatory sampling type.

    An observatory is an EMBRC station and it has an ID. Each observatory may
    take either or both of the "water_column" and "soft_sediment" sampling
    types, and each sampling type has both a "sampling" and a "measured" sheet.

    Only sampling events whose ``source_mat_id`` has a ref_code are combined:
    the event gets a ``ref_code`` field and is then updated with the fields of
    the first measured record carrying the same ``source_mat_id``.

    Args:
        observatory_id: Observatory ID used as the file-name prefix.
        sampling_type: Sampling type used in the file name, e.g.
            ``"water_column"`` or ``"soft_sediment"``.
        save_table: When True, write the combined events to
            ``./transformed/<observatory_id>_<sampling_type>_combined_validated.csv``.
        refcodes: Mapping ``source_mat_id -> ref_code``. Defaults to the
            notebook-level ``obs_refcodes``, which must then already exist.
        logsheets_prefix: Directory holding the validated CSV files. Defaults
            to the notebook-level ``LOGSHEETS_PREFIX``.

    Returns:
        A 4-tuple ``(combined_ids, all_sampling_ids, missing_measured, combined_events)``:
        the source_mat_ids of the combined events, every source_mat_id in the
        sampling sheet, the number of ref-coded events with no measured record,
        and the combined events as a list of dicts.

    Raises:
        KeyError: If either sheet has no ``source_mat_id`` column.
        AssertionError: If the event counts do not add up (see the note at the
            assertion).
    """
    if refcodes is None:
        # Backward-compatible fallback to the notebook global.
        refcodes = obs_refcodes
    if logsheets_prefix is None:
        logsheets_prefix = LOGSHEETS_PREFIX

    sampling_data_filename = f"{observatory_id}_{sampling_type}_sampling_validated.csv"
    measured_data_filename = f"{observatory_id}_{sampling_type}_measured_validated.csv"

    sampling_data = pd.read_csv(os.path.join(logsheets_prefix, sampling_data_filename))
    measured_data = pd.read_csv(os.path.join(logsheets_prefix, measured_data_filename))

    if "source_mat_id" not in measured_data.columns:
        # Fail fast with a clear message instead of a bare KeyError deep in the loop.
        raise KeyError(f"'source_mat_id' column missing from {measured_data_filename}")

    all_sampling_source_mat_ids = sampling_data["source_mat_id"].values.tolist()

    sampling_events = sampling_data.to_dict(orient="records")
    measured_events = measured_data.to_dict(orient="records")

    # Index measured records once; setdefault keeps the FIRST record per id,
    # matching a linear scan that stops at the first match.
    measured_by_id = {}
    for measured_event in measured_events:
        measured_by_id.setdefault(measured_event["source_mat_id"], measured_event)

    combined_events = []
    source_mat_ids_from_combined_events = []
    no_refcode_counter = 0
    missing_measured_but_refcode_present = 0
    for sampling_event in sampling_events:
        event_mat_id = sampling_event["source_mat_id"]

        if event_mat_id not in refcodes:
            # No ref_code: the sample has not been sent for sequencing; ignore.
            no_refcode_counter += 1
            continue

        measured_event = measured_by_id.get(event_mat_id)
        if measured_event is None:
            missing_measured_but_refcode_present += 1
            continue

        sampling_event["ref_code"] = refcodes[event_mat_id]
        # Merge the measured fields, dropping the duplicated source_mat_id.
        # Measured fields win on any name clash, including "ref_code".
        sampling_event.update(
            {key: value for key, value in measured_event.items() if key != "source_mat_id"}
        )
        source_mat_ids_from_combined_events.append(event_mat_id)
        combined_events.append(sampling_event)

    print(
          f"Observatory {observatory_id}-{sampling_type} has {len(sampling_events)} sampling events, "
          f"of which {no_refcode_counter} have missing no ref_code (not sent for sequencing), "
          f"and {missing_measured_but_refcode_present} have a refcode but not 'measured' data. "
          f"A total of {len(combined_events)} sampling events with refcode and measured sheet were found.\n"
         )

    # Did we find all the sampling events?
    # NOTE(review): this does not count missing_measured_but_refcode_present,
    # so it fails whenever a ref-coded sample has no measured record. That
    # looks like a deliberate strictness check - confirm before relaxing it.
    assert len(sampling_events) == (len(combined_events) + no_refcode_counter), \
        "Something is a foot: len(sampling_events) != (len(combined_events) + no_refcode_counter)"

    if save_table:
        save_dir = "./transformed"
        # Make sure the output directory exists so to_csv() cannot fail on it.
        os.makedirs(save_dir, exist_ok=True)
        outfile_name = f"{observatory_id}_{sampling_type}_combined_validated.csv"
        ndf = pd.DataFrame.from_records(combined_events, index="source_mat_id")
        ndf.to_csv(os.path.join(save_dir, outfile_name))

    return source_mat_ids_from_combined_events, \
           all_sampling_source_mat_ids, \
           missing_measured_but_refcode_present, \
           combined_events

# Load the observatory list and the source_mat_id -> ref_code mapping.
# parse_observatory_sample_type() reads the notebook-level `obs_refcodes`,
# so it must be defined before the loop below runs.
observatory_data = get_observatory_data()
obs_refcodes = get_refcodes()

total_combined_events = 0
total_missing_measured_but_refcode_present = 0
all_source_mat_ids_from_sheets = []
source_mat_ids_from_combined_events = []
for observatory_id, water_column, soft_sediment in observatory_data:

    if observatory_id == "Plenzia":
        continue  # Data not public
    if observatory_id == "UMF" and soft_sediment:
        continue  # Broken sheet

    # Both sampling types are processed identically; only the sheet URL and
    # the sampling-type label differ.
    sheet_urls = {"water_column": water_column, "soft_sediment": soft_sediment}
    for sampling_type, sheet_url in sheet_urls.items():
        # Empty CSV cells are read as NaN (a float), so accept only real
        # strings before handing the value to the URL validator.
        if not (isinstance(sheet_url, str) and validators.url(sheet_url)):
            continue

        (
            combined_ids,
            sheet_source_mat_ids,
            missing_measured,
            combined_events,
        ) = parse_observatory_sample_type(observatory_id, sampling_type, save_table=False)

        total_combined_events += len(combined_events)
        total_missing_measured_but_refcode_present += missing_measured
        all_source_mat_ids_from_sheets.extend(sheet_source_mat_ids)
        source_mat_ids_from_combined_events.extend(combined_ids)

# print(), not pprint(): pprint shows the quoted repr of a string.
print(f"There are {len(obs_refcodes)} total ref_codes assigned")
print(f"Total number of combined sampling events with ref_codes: {total_combined_events}")
# Did we find all the ref_codes?
#assert len(obs_refcodes) == (total_combined_events + total_missing_measured_but_refcode_present), \
#    "Something is a foot: len(obs_refcodes) != (len(total_combined_events) + total_missing_measured_but_refcode_present)"

print(f"Total number of all_source_mat_ids_from_sheets: {len(all_source_mat_ids_from_sheets)}")

# source_mat_ids are the keys of the ref_code dict built from the run
# information. Collect those absent from every sampling sheet, each with up
# to three close matches to help spot typos.
# A set gives O(1) membership tests; the list is kept for difflib.
sheet_source_mat_id_set = set(all_source_mat_ids_from_sheets)
missing_source_mat_ids = [
    [source_mat_id, difflib.get_close_matches(source_mat_id, all_source_mat_ids_from_sheets, n=3)]
    for source_mat_id in obs_refcodes
    if source_mat_id not in sheet_source_mat_id_set
]

print(f"A total of {len(missing_source_mat_ids)} source_mat_ids "
      f"in the batch 1 & 2 run information sheets are missing from the "
      f"observatory sampling sheets")

###### CAUTION: THIS SHOULD BE ZERO! ################
# Every combined event was selected because its source_mat_id has a ref_code,
# so none of them should be absent from the run information.
counter = sum(
    1 for source_mat_id in source_mat_ids_from_combined_events
    if source_mat_id not in obs_refcodes
)
missing = counter > 0
if missing:
    print(f"A total of {counter} source_mat_ids "
          f"in the sampling sheets that also have refcodes are missing from the "
          f"Batch 1 & 2 run information sheets")

total = total_combined_events + len(missing_source_mat_ids)
print(
    f"Total combined_events {total_combined_events} + {len(missing_source_mat_ids)} "
    f"= {total} and should be equal to total number of refcodes assigned in the run information "
    f"sheets {len(obs_refcodes)}"
    )

#pprint("The missing source_mat_ids that are in the run information sheets are:")
#for missing in missing_source_mat_ids:
#    join = " ".join(missing[1])
#    print(f"Missing source_mat_id is {missing[0]} close matches are \n\t {join}")

#for missing in missing_source_mat_ids:
#    print(missing)


Observatory ESC68N-water_column has 150 sampling events, of which 136 have missing no ref_code (not sent for sequencing), and 0 have a refcode but not 'measured' data. A total of 14 sampling events with refcode and measured sheet were found.

Observatory Bergen-water_column has 108 sampling events, of which 108 have missing no ref_code (not sent for sequencing), and 0 have a refcode but not 'measured' data. A total of 0 sampling events with refcode and measured sheet were found.

Observatory MBAL4-water_column has 78 sampling events, of which 74 have missing no ref_code (not sent for sequencing), and 0 have a refcode but not 'measured' data. A total of 4 sampling events with refcode and measured sheet were found.

Observatory BPNS-water_column has 280 sampling events, of which 271 have missing no ref_code (not sent for sequencing), and 0 have a refcode but not 'measured' data. A total of 9 sampling events with refcode and measured sheet were found.

Observatory BPNS-soft_sediment has 1