### Pulls sheets from Github after QC curation by EMBRC, does "lax", "strict", and "semi-strict" validation

In [None]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
    
import os
import sys
import math
import pickle
from urllib.request import HTTPError
from enum import Enum
import pandas as pd
from pydantic import ValidationError
from pprint import pprint
from validation_classes import (samplingModelGithub,               # lax validator for EMO-BON Github repository
                                samplingModelGithubStrict,         # strict validator for EMO-BON Github repository
                                samplingModelGithubSemiStrict      # semi-strict validator for EMO-BON Github repository                  
                               )
class SamplingStrategy(Enum):
    """Sampling strategies, named as they appear in the Github sheet file names."""

    # Originally water_column
    WATER = "water"
    # Originally soft_sediment
    SEDIMENT = "sediment"

class SheetType(Enum):
    """Kinds of logsheet, named as they appear in the Github sheet file names."""

    SAMPLING = "sampling"
    MEASURED = "measured"

# Not all observatories have "transformed" sheets on GH, but they may have "raw" ones.
# The field types in the raw sheets differ, so they are difficult to validate
# with a single validator - best just to ignore the raw sheets.
# Read by get_sheet_from_github(): when True, a missing "transformed" sheet
# falls back to the "raw" sheet instead of skipping the observatory.
USE_RAW = False

############################ CAUTION #############################################
# These two flags select the validator model used by parse_sample_sheets().
# STRICT is checked first, so it wins if both are True; the validated CSV is
# only written when both are False (i.e. the lax validator was used).
#
# STRICT: as defined by Ioulia, dates corrected, NA's removed etc
STRICT      = False

# SEMI_STRICT: as defined by Ioulia but not checking for mandatory fields;
# ints and str coerced to floats when possible
SEMI_STRICT = False  
##################################################################################

def get_sheet_from_github(observatory_id: str,
                          sampling_strategy: str,
                          sheet_type: str) -> "pd.DataFrame | None":
    """
    Pull a "sampling" or "measured" sheet for one observatory from Github.

    These are the curated sheets downloaded by the Github actions and hopefully
    do not have the errors that the CSV's pulled directly from Google Sheets had
    (e.g. the word "blank" magically disappearing from the "replicate" field.)

    Github paths look like:
    https://raw.githubusercontent.com/emo-bon/observatory-umf-crate/main/logsheets/transformed/sediment_measured.csv
    https://raw.githubusercontent.com/emo-bon/observatory-bergen-crate/main/logsheets/raw/water_sampling.csv

    The problem at 29-08-2024 is that the data are out of date.

    Args:
        observatory_id: Observatory name as used in the repository name
            ("observatory-<observatory_id>-crate").
        sampling_strategy: First part of the sheet file name, e.g. "water" or
            "sediment". A SamplingStrategy member is also accepted; its value
            is used.
        sheet_type: Second part of the sheet file name, e.g. "sampling" or
            "measured". A SheetType member is also accepted; its value is used.

    Returns:
        The sheet as a DataFrame, or None when the "transformed" sheet cannot
        be fetched and USE_RAW is False.

    Raises:
        ValueError: if USE_RAW is True and neither the "transformed" nor the
            "raw" sheet can be fetched.
    """
    # Enum members would otherwise be formatted as e.g. "SamplingStrategy.WATER"
    if isinstance(sampling_strategy, Enum):
        sampling_strategy = sampling_strategy.value
    if isinstance(sheet_type, Enum):
        sheet_type = sheet_type.value

    sheet_name = f"{sampling_strategy}_{sheet_type}.csv"
    # URLs always use "/" - os.path.join would produce "\" on Windows
    base_url = "/".join(["https://raw.githubusercontent.com/emo-bon",
                         f"observatory-{observatory_id}-crate",
                         "main/logsheets"])

    print(f"Processing {observatory_id}... {sheet_name}")
    try:
        return pd.read_csv(f"{base_url}/transformed/{sheet_name}")
    except HTTPError:
        # Some observatories don't yet have transformed sheets
        if not USE_RAW:
            print(f"Observatory {observatory_id} does not have a transformed {sheet_name} on GH")
            return None

    # Try for the raw sheets
    print("Unable to find 'transformed' sheet, reading the 'raw' sheet")
    try:
        return pd.read_csv(f"{base_url}/raw/{sheet_name}")
    except HTTPError as err:
        raise ValueError("Unable to find transformed or raw sheet") from err

def filter_on_source_mat_id(d):
    """
    Decide whether a sheet record has a usable source_mat_id.

    The ID is read from the "source_mat_id" key, falling back to
    "source_material_id" (Bergen has it under that name on Google and Github).

    Args:
        d: One record (mapping of column name to cell value).

    Returns:
        True if the ID is a string with at least 6 "_"-separated parts and is
        not the known blank edge case; False otherwise (including NaN, None
        and any other non-string value).

    Raises:
        ValueError: if the record has neither ID key.
    """
    if "source_mat_id" in d:
        value = d["source_mat_id"]
    elif "source_material_id" in d:
        value = d["source_material_id"]
    else:
        raise ValueError("Cannot find source_mat_id field")

    # Empty cells arrive as NaN floats (or None); any non-string is unusable
    if not isinstance(value, str):
        return False
    # Remove mis-formatted
    if len(value.split("_")) < 6:
        return False
    # Edge case of this otherwise blank entry having 6 "bits"
    return value != "EMOBON_VB_Wa_230509_um_"

def parse_sample_sheets(sampling_strategy: str,
                        sheet_type: str,
                        addresses: list,
                       ) -> None:
    """
    Validate the Github sheets of every observatory listed in `addresses`.

    For each observatory the sheet is fetched with get_sheet_from_github(),
    rows without a usable source_mat_id are dropped, and each remaining row is
    passed to the validator selected by the STRICT / SEMI_STRICT flags.

    If any row fails, all row errors are pretty-printed to
    ./validation_errors_github/<obs>_<strategy>_<model>_ERRORS.log.
    If every row passes and the lax validator was used (neither STRICT nor
    SEMI_STRICT), the validated rows are written to
    ./logsheets_github/<obs>_<strategy>_<model>_validated.csv.

    Args:
        sampling_strategy: "water" or "sediment"; used in file names.
        sheet_type: "sampling" or "measured"; selects the sheet and validator.
        addresses: Sequence of (observatory_id, sheet_link) pairs. A NaN link
            means the observatory does not use this sampling strategy.

    Raises:
        ValueError: if a sheet_link is neither a string nor NaN.
        KeyError: if no validator is registered for the selected model type.
    """
    # The validator choice does not depend on the observatory
    if STRICT:
        model_type = f"{sheet_type}_github_strict"
    elif SEMI_STRICT:
        model_type = f"{sheet_type}_github_semistrict"
    else:
        model_type = f"{sheet_type}_github"

    for observatory_id, sheet_link in addresses:
        if not isinstance(sheet_link, str):
            # Only OOB doesnt do water_column
            # But most do not do soft-sediments
            if isinstance(sheet_link, float) and math.isnan(sheet_link):
                print(f"Observatory {observatory_id} does not have a {sampling_strategy} sampling strategy.")
                continue
            # Anything else (including a non-NaN number) is a bad cell
            raise ValueError(f"Unknown value '{sheet_link}' in {sampling_strategy} cell of {observatory_id}")

        # Sheets not publically available
        if observatory_id == "Plenzia":
            continue

        ################ CAUTION ##################
        #if not observatory_id in ["AAOT"]: continue

        # UMF soft_sed has two source_mat_ids
        if sampling_strategy == "sediment" and observatory_id == "UMF":
            continue

        df = get_sheet_from_github(observatory_id, sampling_strategy, sheet_type)
        if df is None:
            continue
        data_records_all = df.to_dict(orient="records")

        # Many sheets have partially filled rows
        # The source_mat_id is manually curated and the PRIMARY_KEY
        # Therefore filter records on source_mat_id
        data_records_filtered = [
            record for record in data_records_all if filter_on_source_mat_id(record)
        ]

        discarded = len(data_records_all) - len(data_records_filtered)
        if discarded > 0:
            print(f"Discarded {discarded} records leaving {len(data_records_filtered)}.")

        ################ CAUTION ##############
        #continue

        validator = validator_classes[model_type]

        validated_rows = []
        # One entry per failing row: [(source_mat_id, [error dict, ...])]
        errors = []
        for row in data_records_filtered:
            try:
                vr = validator(**row)
            except ValidationError as e:
                # Same key fallback as filter_on_source_mat_id, which has
                # already guaranteed that one of the two keys is present
                id_key = "source_mat_id" if "source_mat_id" in row else "source_material_id"
                errors.append([(row[id_key], e.errors())])
            else:
                validated_rows.append(vr.model_dump())

        if errors:
            total_number_errors = sum(len(entry[0][1]) for entry in errors)
            print(f"Errors were found... {total_number_errors} in total")
            save_dir = "./validation_errors_github"
            os.makedirs(save_dir, exist_ok=True)
            outfile_name_log = f"{observatory_id}_{sampling_strategy}_{model_type}_ERRORS.log"
            out_path_log = os.path.join(save_dir, outfile_name_log)
            with open(out_path_log, "w") as f:
                pprint(errors, f)
            continue

        # No errors, so every filtered row was validated
        print("All records passed!")

        if not STRICT and not SEMI_STRICT:
            save_dir = "./logsheets_github"
            os.makedirs(save_dir, exist_ok=True)
            outfile_name = f"{observatory_id}_{sampling_strategy}_{model_type}_validated.csv"
            out_path = os.path.join(save_dir, outfile_name)
            # NOTE(review): assumes the validated record always exposes a
            # "source_mat_id" field, even for sheets that use
            # "source_material_id" - confirm against the validator models
            ndf = pd.DataFrame.from_records(validated_rows, index="source_mat_id")
            ndf.to_csv(out_path)
            print(f"Written {out_path}")

# Maps the model type name built in parse_sample_sheets()
# ("<sheet_type>_github[_strict|_semistrict]") to its validator class
validator_classes = {
    "sampling_github": samplingModelGithub,
    "sampling_github_strict": samplingModelGithubStrict,
    "sampling_github_semistrict": samplingModelGithubSemiStrict,
}

# Build one [observatory_id, sheet link] list per sampling strategy from the
# governance sheet.
# NB  you cant use a "with" closure here when reading the Pandas df
governance_logsheets_validated_csv = "./governance/logsheets_validated.csv"
df = pd.read_csv(governance_logsheets_validated_csv)
water_column_sheet_addresses, soft_sediment_sheet_addresses = (
    df[["observatory_id", strategy_column]].values.tolist()
    for strategy_column in ("water_column", "soft_sediment")
)
del df

# Only the "sampling" sheets are validated; the "measured" runs are disabled
parse_sample_sheets("water", "sampling", water_column_sheet_addresses)
parse_sample_sheets("sediment", "sampling", soft_sediment_sheet_addresses)
#parse_sample_sheets("water", "measured", water_column_sheet_addresses)
#parse_sample_sheets("sediment", "measured", soft_sediment_sheet_addresses)
