## Setup

### Installing Dependencies

In [45]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install prettytable
!{sys.executable} -m pip install imageio

Defaulting to user installation because normal site-packages is not writeable
Collecting prettytable
  Downloading prettytable-3.7.0-py3-none-any.whl (27 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.7.0
--- Logging error ---
Traceback (most recent call last):
  File "/home/max/.local/lib/python3.10/site-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/home/max/.local/lib/python3.10/site-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/home/max/.local/lib/python3.10/site-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/home/max/.local/lib/python3.10/site-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/home/max/.local/lib/python3.10/site-packages/pip/_vendor/rich/segment.py",

### Adding cli to path

In [46]:

import os
import sys

# Directory added to the import path for the local fairscape-cli checkout.
# The default is the path this notebook was written with; set the
# FAIRSCAPE_CLI_PATH environment variable to run on another machine without
# editing this cell.
FAIRSCAPE_CLI_PATH = os.environ.get(
    "FAIRSCAPE_CLI_PATH",
    "/home/max/uva/fairscape-cli/fairscape_cli/"
)

# Keep the entry at the front of sys.path, but do not stack a duplicate every
# time the cell is re-run.
if sys.path[:1] != [FAIRSCAPE_CLI_PATH]:
    sys.path.insert(0, FAIRSCAPE_CLI_PATH)


### Importing Schema Ingredients

In [89]:
import pathlib
import re
from typing import (
    List,
    Optional
)

import pandas as pd
from fairscape_cli.models.schema import (
    ImageSchema,
    ImageValidation,
    ImageValidationException,
    ImagePathNotFoundException,
    DatatypeEnum,
    DatatypeSchema,
    ColumnSchema,
    TabularDataSchema,
    DataValidationException,
    NAValidationException,
    PathNotFoundException
)



### Example Schema

In [59]:

# Column 0: APMS experiment identifier, constrained by a regex format.
apms_datatype = DatatypeSchema(
    name="APMS Experiment",
    description="Identifier for APMS experiment corresponding to the given node2vec vector",
    base="string",
    format="APMS_[0-9]*"
)

apms_column = ColumnSchema(
    name="APMS Experiment",
    description="APMS_column",
    ordered=False,
    required=True,
    number=0,
    datatype=apms_datatype,
    titles=["APMS Experiment"]
)

# Column 1: gene symbol, constrained by a regex format and a length range.
gene_symbol_datatype = DatatypeSchema(
    name="Gene Symbol",
    description="Gene Symbol in String Form",
    base="string",
    format="[A-Z0-9]*",
    minLength=3,
    maxLength=20
)

gene_symbol_column = ColumnSchema(
    name="Gene Symbol",
    description="gene symbol for apms embedding vector",
    ordered=False,
    required=True,
    # Second column of the file: index 1, so it does not collide with the
    # APMS column at index 0 and sits directly before the "2::" embedding span.
    number=1,
    datatype=gene_symbol_datatype,
    valueURL="http://edamontology.org/data_1026",
    titles=["Gene Symbol"]
)

# Remaining columns: the embedding vector values.
# NOTE(review): "2::" presumably means "column 2 onward" — confirm against
# ColumnSchema's handling of `number`.
embedding_column = ColumnSchema(
    name="embedding values",
    datatype="float",
    required=True,
    description="node2vec embedding vector values for genes",
    number="2::",
    titles=["node2vec embedding vector"]
)

# Schema for the whole headerless, comma-separated embedding file.
embedding_schema = TabularDataSchema(
    guid="ark:99999/schema/apms_embedding_schema",
    name="apms embedding schema",
    description="embedding vector values for genes determined by running node2vec on APMS networks",
    seperator=",",
    header=False,
    columns=[
        apms_column,
        gene_symbol_column,
        embedding_column
    ]
)

### Read Example Data

In [60]:
# Load the example CSV through the schema, then keep the first column and the
# matching column schema for the validation cells that follow.
embedding_path = pathlib.Path("./data/APMS_embedding_MUSIC.csv")
embedding_df = embedding_schema.ReadTabularData(embedding_path)

column = embedding_df.iloc[:, 0]
col_schema = embedding_schema.columns[0]


In [102]:
# data designed to fail
# Read the same CSV into a second frame; later cells overwrite values in this
# frame so that validation fails.
embedding_path = pathlib.Path("./data/APMS_embedding_MUSIC.csv")
embedding_df_fail = embedding_schema.ReadTabularData(embedding_path)



In [104]:
# Set NA validation up to fail: in the first column, row 2 becomes None and
# row 3 becomes an empty string.
embedding_df_fail.iat[2, 0] = None
embedding_df_fail.iat[3, 0] = ""
fail_column = embedding_df_fail.iloc[:, 0]

In [116]:
# Inspect the column labels of the loaded frame.
embedding_df.columns

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025],
           dtype='int64', length=1026)

In [117]:
# Inspect the dtype of the column that had None and "" written into it.
fail_column.dtype

dtype('O')

## Schema Validation

### Null Values

In [112]:
# if values are required check if any missing values are present
def ValidateNullValue(null_value: Optional[str], column) -> None:
    """Raise if a required column contains missing or null-marker values.

    Args:
        null_value: Optional marker that the schema treats as null; when it
            is None, only pandas-missing values are checked.
        column: pandas Series holding the column's values.

    Raises:
        NAValidationException: if any value is missing (None/NaN) or equals
            ``null_value``.
    """
    # isna() already covers both None and NaN. An elementwise `column == None`
    # comparison never matches in pandas, so no separate None check is made.
    if column.isna().any():
        raise NAValidationException(error="Column has missing Values")

    # compare to the schema's explicit null marker, if one was passed
    if null_value is not None and (column == null_value).any():
        raise NAValidationException(error="Column has null values")

    return None


#### Test Null Values

In [110]:
# Validate the unmodified first column against its schema's null marker.
ValidateNullValue(col_schema.null, column)

In [111]:
# fail_column had None written into it, so this call is expected to raise.
# The error is caught and displayed so that a full Restart & Run All still
# reaches the cells after this demonstration.
try:
    ValidateNullValue(col_schema.null, fail_column)
except NAValidationException as null_error:
    print(f"{type(null_error).__name__}: {null_error}")

NAValidationException: DataValidationException: Validation Failed

### Validate Datatype

In [121]:
# Preview the first column cast to pandas' string dtype.
column.astype('string')

0        APMS_1
1        APMS_2
2        APMS_3
3        APMS_4
4        APMS_5
         ...   
656    APMS_657
657    APMS_658
658    APMS_659
659    APMS_660
660    APMS_661
Name: 0, Length: 661, dtype: string

In [124]:
# Casting the identifier column to 64-bit integers is expected to be rejected;
# report the failure instead of letting the ValueError propagate.
try:
    column.astype('int64')
except ValueError:
    print("Failed to Cast types")

Failed to Cast types


In [125]:
def ValidateDatatypeEnum(column, datatype_enum: DatatypeEnum) -> None:
    """Validate that all column values can be correctly cast to the schema's specified datatype
    """
    
    if datatype_enum == DatatypeEnum.string:
        column.astype('string')
        
    if datatype_enum == DatatypeEnum.float:
        column.astype('float64')

    if datatype_enum == DatatypeEnum.integer:
        column.astype('int64')
        
    if datatype_enum == DatatypeEnum.datetime:
        pd.to_datetime(column)
    
    if datatype_enum == DatatypeEnum.any:
        pass
        
    return None
        

In [126]:
def ValidateDatatypeFormat(column, datatype_base: DatatypeEnum, datatype_format: str)-> None:
    """Apply the format constraint that belongs to ``datatype_base``.

    Follows the CSV on the Web notion of a datatype format. Only string bases
    are checked at present (the format is used as a regex via
    ValidateStringFormat); float and datetime bases are accepted unchecked.
    """
    needs_regex_check = datatype_base == DatatypeEnum.string
    if needs_regex_check:
        ValidateStringFormat(column, datatype_format)

    # DatatypeEnum.float and DatatypeEnum.datetime: no format check yet
    return None



In [129]:
class StringFormatException(Exception):
    """Raised when string values do not match the schema's ``datatype_format``.

    The text is stored on ``self.message`` and also passed to ``Exception``,
    so ``str(error)`` returns the same text.
    """

    def __init__(self, message="StringFormatException: String Values Failed to Match datatype_format"):
        Exception.__init__(self, message)
        self.message = message

In [128]:
def ValidateStringFormat(column, datatype_format: str) -> None:
    """Require every value in ``column`` to fully match ``datatype_format``.

    Args:
        column: iterable of cell values (a pandas Series in this notebook).
        datatype_format: regular expression each value must match in full.

    Raises:
        StringFormatException: if any value is not a string or does not match
            the pattern from start to end.
    """
    pattern = re.compile(datatype_format)

    def matches_format(cell) -> bool:
        # Non-string cells (None, NaN, numbers) cannot satisfy a string format
        # and would make re raise a TypeError, so count them as failures.
        return isinstance(cell, str) and pattern.fullmatch(cell) is not None

    # Test each cell directly: comparing a Series of match results with
    # `== None` is always False elementwise, so it can never flag a mismatch.
    if not all(matches_format(cell) for cell in column):
        raise StringFormatException

    return None

In [63]:
def ValidateDatatype(datatype_schema: DatatypeSchema, column) -> List[Exception]:
    """Collect every constraint of ``datatype_schema`` that ``column`` violates.

    Checks, in order: exact length, maxLength, minLength, lower bound, upper
    bound, and finally the format via ValidateDatatypeFormat. Lengths are
    measured per value, on the string form of each value.

    Args:
        datatype_schema: schema object exposing ``length``, ``maxLength``,
            ``minLength``, ``base`` and ``format``, plus optional value bounds.
        column: pandas Series holding the column's values.

    Returns:
        A list with one exception per failed constraint; empty when the
        column satisfies the schema.
    """
    validation_failures = []

    def add_failure(constraint: str, message: str) -> None:
        # NOTE(review): DatatypeValidationException is not among the names this
        # notebook imports from fairscape_cli.models.schema — confirm where it
        # is defined before relying on the failure paths.
        validation_failures.append(
            DatatypeValidationException(
                error=constraint,
                message=message
            )
        )

    def first_set(*attribute_names):
        # Return the first listed attribute that exists and is not None.
        # NOTE(review): the bound checks originally mixed the spellings
        # min / maximum / max; both spellings are accepted here until the
        # schema's real field names are confirmed.
        for attribute_name in attribute_names:
            value = getattr(datatype_schema, attribute_name, None)
            if value is not None:
                return value
        return None

    length = datatype_schema.length
    max_length = datatype_schema.maxLength
    min_length = datatype_schema.minLength

    # len(column) is the number of rows, not the length of each value, so
    # measure the values themselves (only when a length constraint is set).
    if length is not None or max_length is not None or min_length is not None:
        value_lengths = column.astype(str).str.len()

    # validate length
    if length is not None and not (value_lengths == length).all():
        add_failure(
            "length",
            f"DatatypeValidationException: length validation failure\n all values do not have length {length}"
        )

    # validate maxLength
    if max_length is not None and (value_lengths > max_length).any():
        add_failure(
            "maxLength",
            f"DatatypeValidationException: maxLength validation failure\nsome values have lengths > maxLength {max_length}"
        )

    # validate minLength
    if min_length is not None and (value_lengths < min_length).any():
        add_failure(
            "minLength",
            f"DatatypeValidationException: minLength validation failure\nsome values have lengths < minLength {min_length}"
        )

    # validate lower bound
    lower_bound = first_set("min", "minimum")
    if lower_bound is not None and (column < lower_bound).any():
        add_failure(
            "min",
            f"DatatypeValidationException: min validation failure\n some values have values < min {lower_bound}"
        )

    # validate upper bound: values ABOVE the bound are the violations
    upper_bound = first_set("maximum", "max")
    if upper_bound is not None and (column > upper_bound).any():
        add_failure(
            "max",
            f"DatatypeValidationException: max validation failure\n some values have values > max {upper_bound}"
        )

    # validate format, using the schema that was passed in rather than
    # whatever `col_schema` happens to be bound to in the kernel
    datatype_base = datatype_schema.base
    datatype_format = datatype_schema.format

    # a schema without a format has nothing to check
    if datatype_format is not None:
        try:
            ValidateDatatypeFormat(column, datatype_base, datatype_format)
        except Exception as e:
            validation_failures.append(e)

    return validation_failures

In [64]:

def ValidateColumn(column_schema: ColumnSchema, column):
    failures = []

    if column_schema.required == True:
        try:
            ValidateNullValue(column_schema.null, column) 
        except NAValidationException as e:
            print(e)

    # validate the datatype
    match type(col_schema.datatype):

        case DatatypeSchema():
            datatype_validation_failures = ValidateDatatype(column, datatype_schema) 


        case DatatypeEnum():
            try:
                ValidateDatatypeEnum(column, datatype_schema.datatype)
            except ValueError as e:
                
                failures.append(e) 