## JSON Schema Proposal

Utilize JSON Schema for validation logic, either using csv on the web as the interface or as metadata.


In [2]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pydantic
!{sys.executable} -m pip install jsonschema
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install openpyxl


Collecting pydantic
  Downloading pydantic-2.5.2-py3-none-any.whl.metadata (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.2/65.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting annotated-types>=0.4.0 (from pydantic)
  Downloading annotated_types-0.6.0-py3-none-any.whl.metadata (12 kB)
Collecting pydantic-core==2.14.5 (from pydantic)
  Downloading pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.5 kB)
Downloading pydantic-2.5.2-py3-none-any.whl (381 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading pydantic_core-2.14.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading annotated_types-0.6.0-py3-none-any.whl (12 kB)
Installing collected pac

## Pydantic Models

In [3]:
from jsonschema import validate
import pandas as pd

from pydantic import (
	BaseModel,
	Field
)
from typing import (
	Dict, 
	List, 
	Optional,
	Union,
	Literal
)

from enum import Enum
import re

# datatype enum
class DatatypeEnum(str, Enum):
	NULL = "null"
	BOOLEAN = "boolean"
	OBJECT = "object"
	STRING = "string"
	NUMBER = "number"
	INTEGER = "integer"
	ARRAY = "array"

class Items(BaseModel):
	datatype: DatatypeEnum = Field(alias="type")

class BaseProperty(BaseModel):
	description: str = Field(description="description of field")
	number: Union[int,str] = Field(description="index of the column for this value")
	valueURL: Optional[str] = Field(default=None)	
	#multiple: Optional[bool]
	#seperator: Optional[str]

class NullProperty(BaseModel):
	datatype: Literal['null'] = Field(alias="type")

class StringProperty(BaseProperty):
	datatype: Literal['string'] = Field(alias="type")
	pattern: Optional[str] = Field(description="regex pattern for field", default=None)

class ArrayProperty(BaseProperty):
	datatype: Literal['array'] = Field(alias="type")
	maxItems: int = Field(description="max items in array, validation fails if length is greater than this value")
	minItems: Optional[int] = Field(description="min items in array, validation fails if lenght is shorter than this value")
	uniqueItems: Optional[bool] = Field()
	items: Items

class BooleanProperty(BaseProperty):
	datatype: Literal['boolean'] = Field(alias="type")

class NumberProperty(BaseProperty):
	datatype: Literal['number'] = Field(alias="type")

class IntegerProperty(BaseProperty):
	datatype: Literal['integer'] = Field(alias="type")


PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty, NullProperty]


In [4]:
class ValidationSchema(BaseModel):
	schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="schema")
	guid: str = Field(alias="@id")
	properties: Dict[str, PropertyUnion]
	datatype: str = Field(default="object", alias="type")
	additionalProperties: bool = Field()
	required: List[str] = Field(description="list of required properties by name")
	seperator: str = Field(description="Field seperator for the file")
	header: bool = Field(description="Do files of this schema have a header row")
	examples: Optional[List[Dict[str, str ]]]

	def load_data(self, path: str) -> pd.DataFrame:
		# TODO deal with alternative filetypes
		return pd.read_csv(path, sep=self.seperator,  header=None)

	def execute_validation_csv(self, data_frame):
		schema_definition = self.model_dump(by_alias=True)

		property_slice = {
			property_name: {
				"number": property_data.get("number"),
				"type": property_data.get("type")
			}
			for property_name, property_data in schema_definition.get("properties").items()
		}


		def json_row(row):
			json_output = {}
			for property_name, property_values in property_slice.items():
			
				index_slice = property_values.get("number")
				datatype = property_values.get("type")

				if isinstance(index_slice, int): 

					if datatype == "boolean":
						json_output[property_name] = bool(row.iloc[index_slice])
					else:
						json_output[property_name] = row.iloc[index_slice]
				
				elif isinstance(index_slice, str):

					n_to_end_slice_match = re.search("^([0-9]*)::$", index_slice)
					start_to_n_slice_match = re.search("^::([0-9]*)$", index_slice)
					n_to_m_slice_match = re.search("^([0-9]*):([0-9]*)$", index_slice)

					if n_to_end_slice_match:
						start = int(n_to_end_slice_match.group(1))
						generated_slice = slice(start, len(row))
					elif start_to_n_slice_match:
						end = int(start_to_n_slice_match.group(1))
						generated_slice = slice(0,end)
					elif n_to_m_slice_match:
						start = int(n_to_m_slice_match.group(1))
						end = int(n_to_m_slice_match.group(2))
						generated_slice = slice(start, end)
					else:
						# raise exception for improperly passing a slice 
						raise Exception()

					# slice rows according to matched slice

					# if datatype is boolean coerce datatype
					if datatype=="boolean":
						json_output[property_name] = [ bool(item) for item in list(row.iloc[generated_slice])]
					else:
						json_output[property_name] = list(row.iloc[generated_slice])

			return json_output

		# run conversion on data frame 
		for i in range(data_frame.shape[0]):
			data_row = data_frame.iloc[i,:]

			# TODO catch all validation errors and then return
			validate(
				instance=json_row(data_row),
				schema= schema_definition 
			)

In [10]:
pd.read_excel("../tests/data/test_xlsx.xlsx")

Unnamed: 0,col1,col2,col3
0,a,1,True
1,b,2,False
2,c,3,True


UnicodeDecodeError: 'utf-8' codec can't decode bytes in position 15-16: invalid continuation byte

In [8]:
# validation execution
def execute_validation(schema_definition, data_frame):

	property_slice = {
		property_name: {
			"number": property_data.get("number"),
			"type": property_data.get("type")
		}
		for property_name, property_data in schema_definition.get("properties").items()
	}


	def json_row(row):
		json_output = {}
		for property_name, property_values in property_slice.items():
		
			index_slice = property_values.get("number")
			datatype = property_values.get("type")

			if isinstance(index_slice, int): 

				if datatype == "boolean":
					json_output[property_name] = bool(row.iloc[index_slice])
				else:
					json_output[property_name] = row.iloc[index_slice]
			
			elif isinstance(index_slice, str):

				n_to_end_slice_match = re.search("^([0-9]*)::$", index_slice)
				start_to_n_slice_match = re.search("^::([0-9]*)$", index_slice)
				n_to_m_slice_match = re.search("^([0-9]*):([0-9]*)$", index_slice)

				if n_to_end_slice_match:
					start = int(n_to_end_slice_match.group(1))
					generated_slice = slice(start, len(row))
				elif start_to_n_slice_match:
					end = int(start_to_n_slice_match.group(1))
					generated_slice = slice(0,end)
				elif n_to_m_slice_match:
					start = int(n_to_m_slice_match.group(1))
					end = int(n_to_m_slice_match.group(2))
					generated_slice = slice(start, end)
				else:
					# raise exception for improperly passing a slice 
					raise Exception()

				# slice rows according to matched slice

				# if datatype is boolean coerce datatype
				if datatype=="boolean":
					json_output[property_name] = [ bool(item) for item in list(row.iloc[generated_slice])]
				else:
					json_output[property_name] = list(row.iloc[generated_slice])

		return json_output

	# run conversion on data frame 
	for i in range(data_frame.shape[0]):
		data_row = data_frame.iloc[i,:]

		validate(
			instance=json_row(data_row),
			schema= schema_definition 
		)

### Testing Property Union

In [121]:
property_test_data = {"properties": embedding_schema_definition.get("properties")}

class PropertyTest(BaseModel):
	properties: Dict[str, PropertyUnion]

PropertyTest(**property_test_data)

PropertyTest(properties={'experiment_identifier': StringProperty(description='Identifier for APMS experiment corresponding to the given node2vec vector', number=0, valueURL=None, datatype='string', pattern='APMS_[0-9]*'), 'gene_symbol': StringProperty(description='gene symbol for apms embedding vector', number=1, valueURL=None, datatype='string', pattern='[A-Z0-9]*'), 'embedding': ArrayProperty(description='embedding vector values for genes determined by running node2vec on APMS networks', number='2::', valueURL=None, datatype='array', maxItems=1024, minItems=1024, uniqueItems=False, items=Items(datatype=<DatatypeEnum.NUMBER: 'number'>))})

## Tests

#### Embedding Schema

In [5]:
embedding_schema_definition = {
	"@context": {
		"evi": "https://example.org/",
		"csvw:": "https://example.org/",
		"@vocab": "https://schema.org/"
	},
	"$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema",
  "@id": "https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema",
  "title": "APMS Embedding Schema",
  "description": "Schema for APMS Embedding Results from pipeline",
  "type": "object",
	"properties": {
		"experiment_identifier": {
			"type": "string",
			"description": "Identifier for APMS experiment corresponding to the given node2vec vector",
			"pattern": "APMS_[0-9]*",
			"number": 0
		},
		"gene_symbol": {
			"type": "string",
			"description": "gene symbol for apms embedding vector",
			"pattern": "[A-Z0-9]*",
			"number": 1,
			"csvw:valueURL": "http://edamontology.org/data_1026"
		},
		"embedding": {
			"type": "array",
			"minItems": 1024,
			"maxItems": 1024,
			"items": {
				"type": "number"
			},
			"uniqueItems": False,
			"description": "embedding vector values for genes determined by running node2vec on APMS networks",
			"number": "2::"
		},
	},
	"additionalProperties": False,
	"required": ["experiment_identifier", "gene_symbol", "embedding"],
	"header": False,
	"seperator": ",",
	"examples": [
		{"@id": "ark:59852/apms_data_example"}
	]
}

In [6]:
embedding_schema_pydantic = ValidationSchema(**embedding_schema_definition)
schema_json = embedding_schema_pydantic.model_dump(by_alias=True)

In [7]:
schema_json

{'schema': 'https://json-schema.org/draft/2020-12/schema',
 '@id': 'https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema',
 'properties': {'experiment_identifier': {'description': 'Identifier for APMS experiment corresponding to the given node2vec vector',
   'number': 0,
   'valueURL': None,
   'type': 'string',
   'pattern': None},
  'gene_symbol': {'description': 'gene symbol for apms embedding vector',
   'number': 1,
   'valueURL': None,
   'type': 'string',
   'pattern': '[A-Z0-9]*'},
  'embedding': {'description': 'embedding vector values for genes determined by running node2vec on APMS networks',
   'number': '2::',
   'valueURL': None,
   'type': 'array',
   'maxItems': 1024,
   'minItems': 1024,
   'uniqueItems': False,
   'items': {'type': <DatatypeEnum.NUMBER: 'number'>}}},
 'type': 'object',
 'additionalProperties': False,
 'required': ['experiment_identifier', 'gene_symbol', 'embedding'],
 'seperator': ',',
 'header': False,
 'examples': [{'@id': 'ark:59852/apm

In [9]:
embed_data = pd.read_csv("../tests/data/APMS_embedding_MUSIC.csv", header=None)
embedding_schema_pydantic = ValidationSchema(**embedding_schema_definition)
schema_json = embedding_schema_pydantic.model_dump(by_alias=True)

execute_validation(schema_definition=schema_json, data_frame=embed_data)

SchemaError: None is not of type 'string'

Failed validating 'type' in metaschema['allOf'][1]['properties']['properties']['additionalProperties']['$dynamicRef']['allOf'][3]['properties']['pattern']:
    {'format': 'regex', 'type': 'string'}

On schema['properties']['experiment_identifier']['pattern']:
    None

#### Gene Node Attributes Schema

In [161]:
ppi_gene_node_attributes_schema = {
	"@context": {
		"evi": "https://example.org/",
		"csvw:": "https://example.org/",
		"@vocab": "https://schema.org/"
	},
	"$schema": "https://json-schema.org/draft/2020-12/schema",
  "@id": "https://fairscape.pods.uvarc.io/ark:59852/apms_ppi_gene_node_attributes",
  "title": "APMS PPI Gene Node Attributes Schema",
  "description": "Schema for APMS Raw Data ",
  "type": "object",
	"properties": {
		"name": {
			"type": "string",
			"description": "Gene Symbol for apms protein",
			"pattern": "[A-Z0-9]*",
			"number": 0,
			"csvw:valueURL": "http://edamontology.org/data_1026"
		},
		"represents": {
			"type": "string",
			"description": "Ensembl Gene ID for protien",
			"number": 1,
			"pattern": "[A-Z0-9]*",
			#"multiple": True,
			#"seperator": ";",
			"csvw:valueURL": "http://edamontology.org/data_1033"
		},
		#"ambiguous": {
			#"type": "null",
			#"description": "Is this description ambiguous",
			#"number": 2
		#},
		"bait": {
			"type": "boolean",
			"description": "True means this protien was utilized as a bait protein in an experiment",
			"number": 3
		}
	},
	"additionalProperties": False,
	"required": ["name", "represents", "bait"],
	"header": False,
	"seperator": ",",
	"examples": [
		{"@id": "ark:59852/apms_data_example"}
	]
}

In [162]:
ppi_gene_node_attributes_data = pd.read_csv(
	"../tests/data/crates/1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha/ppi_gene_node_attributes.tsv",
	sep="\t",
	dtype={3: 'bool'}
	)

apms_gene_node_schema = ValidationSchema(**ppi_gene_node_attributes_schema)
gene_node_schema = apms_gene_node_schema.model_dump(by_alias=True)

execute_validation(
	schema_definition= gene_node_schema,
	data_frame=ppi_gene_node_attributes_data
	)

In [155]:
type(ppi_gene_node_attributes_data.iloc[0, :].iloc[3])

numpy.bool_

#### PPI Edgelist Schema

In [157]:
amps_ppi_edgelist_schema = {
	"@context": {
		"evi": "https://example.org/",
		"csvw:": "https://example.org/",
		"@vocab": "https://schema.org/"
	},
	"$schema": "https://json-schema.org/draft/2020-12/schema",
  "@id": "https://fairscape.pods.uvarc.io/ark:59852/amps_ppi_edgelist",
  "title": "APMS Protien Protien Interaction Edgelist",
  "description": "Schema for APMS Raw Edgelist",
  "type": "object",
	"properties": {
		"geneA": {
			"type": "string",
			"description": "Gene Symbol for Bait Protien",
			"pattern": "[A-Z0-9]*",
			"number": 0,
			"csvw:valueURL": "http://edamontology.org/data_1026"
		},
		"geneB": {
			"type": "string",
			"description": "Gene Symbol for Prey Protien",
			"pattern": "[A-Z0-9]*",
			"number": 1,
			"csvw:valueURL": "http://edamontology.org/data_1026"
		},
	},
	"additionalProperties": False,
	"required": ["geneA","geneB"],
	"header": True,
	"seperator": "\t",
	"examples": [
		{"@id": "ark:59852/apms_ppi_edgelist_example"}
	]
}

In [158]:

ppi_gene_node_attributes_data = pd.read_csv(
	"../tests/data/crates/1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha/ppi_edgelist.tsv",
	sep="\t"
	)

apms_ppi_edgelist_pydantic = ValidationSchema(**amps_ppi_edgelist_schema)

execute_validation(
	schema_definition=apms_ppi_edgelist_pydantic.model_dump(by_alias=True), 
	data_frame=ppi_gene_node_attributes_data
	)

### Future Work

#### CLI implementation for creating a schema in one command

#### Uploading a Schema to Fairscape

Need to create an endpoint, first dealing with schemas as only metadata

```
requests.post(
	url="https://fairscape.pods.uvarc.io/schema",	
	json=embedding_schema_definition,
)

```


#### CLI implementation of interactive process


```
fairscape-cli create schema
> title: USER SPCECIED
> description: 
> ...
> add a property Y/N
> property title:
> property type [string|number|boolean|]: "boolean"
> add another property: Y/N

...
> execute on an example file? Y/N
> example file path:

> errors found
> dumping 
```