## JSON Schema Proposal

Utilize JSON Schema for validation logic, either using csv on the web as the interface or as metadata.


In [75]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pydantic



In [76]:
import pydantic

In [73]:
from jsonschema import validate
import pathlib

In [22]:
embed_data = pd.read_csv("../tests/data/APMS_embedding_MUSIC.csv", header=None)

In [23]:
embed_data.columns

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025],
           dtype='int64', length=1026)

In [1]:
embedding_schema_definition = {
	"@context": {
		"evi": "https://example.org/",
		"csvw:": "https://example.org/",
		"@vocab": "https://schema.org/"
	},
	"$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema",
  "@id": "https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema",
  "title": "APMS Embedding Schema",
  "description": "Schema for APMS Embedding Results from pipeline",
  "type": "object",
	"properties": {
		"experiment_identifier": {
			"type": "string",
			"description": "Identifier for APMS experiment corresponding to the given node2vec vector",
			"pattern": "APMS_[0-9]*",
			"number": 0
		},
		"gene_symbol": {
			"type": "string",
			"description": "gene symbol for apms embedding vector",
			"pattern": "[A-Z0-9]*",
			"number": 1,
			"csvw:valueURL": "http://edamontology.org/data_1026"
		},
		"embedding": {
			"type": "array",
			"minItems": 1024,
			"maxItems": 1024,
			"items": {
				"type": "number"
			},
			"uniqueItems": False,
			"description": "embedding vector values for genes determined by running node2vec on APMS networks",
			"number": "2::"
		},
	},
	"additionalProperties": False,
	"required": ["experiment_identifier", "gene_symbol", "embedding"],
	"examples": [
		{"@id": "ark:59852/apms_data_example"}
	]
}

In [2]:
from pydantic import (
	BaseModel,
	Field
)
from typing import (
	Dict, 
	List, 
	Optional,
	Union,
	Literal
)

from enum import Enum

In [3]:
# datatype enum
class DatatypeEnum(str, Enum):
	NULL = "null"
	BOOLEAN = "boolean"
	OBJECT = "object"
	STRING = "string"
	NUMBER = "number"
	INTEGER = "integer"
	ARRAY = "array"

In [126]:
DatatypeEnum.BOOLEAN

<DatatypeEnum.BOOLEAN: 'boolean'>

In [9]:
class Items(BaseModel):
	datatype: DatatypeEnum = Field(alias="type")

class BaseProperty(BaseModel):
	description: str = Field(description="description of field")
	number: int | str = Field(description="index of the column for this value")
	valueURL: Optional[str] = Field(default=None)	

class StringProperty(BaseProperty):
	datatype: Literal['string'] = Field(alias="type")
	pattern: Optional[str] = Field(description="regex pattern for field")

class ArrayProperty(BaseProperty):
	datatype: Literal['array'] = Field(alias="type")
	maxItems: int = Field(description="max items in array, validation fails if length is greater than this value")
	minItems: Optional[int] = Field(description="min items in array, validation fails if lenght is shorter than this value")
	uniqueItems: Optional[bool] = Field()
	items: Items

class BooleanProperty(BaseProperty):
	datatype: Literal['boolean'] = Field(alias="type")

class NumberProperty(BaseProperty):
	datatype: Literal['number'] = Field(alias="type")

class IntegerProperty(BaseProperty):
	datatype: Literal['integer'] = Field(alias="type")

In [10]:
PropertyUnion = Union[StringProperty, ArrayProperty, BooleanProperty, NumberProperty, IntegerProperty]

In [11]:
class PropertyTest(BaseModel):
	properties: Dict[str, PropertyUnion]

In [12]:
property_test_data = {"properties": {
		"experiment_identifier": {
			"type": "string",
			"description": "Identifier for APMS experiment corresponding to the given node2vec vector",
			"pattern": "APMS_[0-9]*",
			"number": 0
		},
		"gene_symbol": {
			"type": "string",
			"description": "gene symbol for apms embedding vector",
			"pattern": "[A-Z0-9]*",
			"number": 1,
			"valueURL": "http://edamontology.org/data_1026"
		},
		"embedding": {
			"type": "array",
			"minItems": 1024,
			"maxItems": 1024,
			"items": {
				"type": "number"
			},
			"uniqueItems": False,
			"description": "embedding vector values for genes determined by running node2vec on APMS networks",
			"number": "2::"
		},
	}
}

In [13]:
PropertyTest(**property_test_data)

PropertyTest(properties={'experiment_identifier': StringProperty(description='Identifier for APMS experiment corresponding to the given node2vec vector', number=0, valueURL=None, datatype='string', pattern='APMS_[0-9]*'), 'gene_symbol': StringProperty(description='gene symbol for apms embedding vector', number=1, valueURL='http://edamontology.org/data_1026', datatype='string', pattern='[A-Z0-9]*'), 'embedding': ArrayProperty(description='embedding vector values for genes determined by running node2vec on APMS networks', number='2::', valueURL=None, datatype='array', maxItems=1024, minItems=1024, uniqueItems=False, items=Items(datatype=<DatatypeEnum.NUMBER: 'number'>))})

In [14]:
class ValidationSchema(BaseModel):
	schema_version: str = Field(default="https://json-schema.org/draft/2020-12/schema", alias="schema")
	guid: str = Field(alias="@id")
	properties: Dict[str, PropertyUnion]
	datatype: str = Field(default="object", alias="type")
	additionalProperties: bool = Field()
	required: List[str] = Field(description="list of required properties by name")
	examples: Optional[List[Dict[str, str ]]]

In [15]:
ValidationSchema(**embedding_schema_definition)

ValidationSchema(schema_version='https://json-schema.org/draft/2020-12/schema', guid='https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema', properties={'experiment_identifier': StringProperty(description='Identifier for APMS experiment corresponding to the given node2vec vector', number=0, valueURL=None, datatype='string', pattern='APMS_[0-9]*'), 'gene_symbol': StringProperty(description='gene symbol for apms embedding vector', number=1, valueURL=None, datatype='string', pattern='[A-Z0-9]*'), 'embedding': ArrayProperty(description='embedding vector values for genes determined by running node2vec on APMS networks', number='2::', valueURL=None, datatype='array', maxItems=1024, minItems=1024, uniqueItems=False, items=Items(datatype=<DatatypeEnum.NUMBER: 'number'>))}, datatype='object', additionalProperties=False, required=['experiment_identifier', 'gene_symbol', 'embedding'], examples=[{'@id': 'ark:59852/apms_data_example'}])

In [142]:
embedding_schema_definition

{'$schema': 'https://json-schema.org/draft/2020-12/schema',
 '$id': 'https://fairscape.pods.uvarc.io/ark:59852/apms_embedding_schema',
 'title': 'APMS Embedding Schema',
 'description': 'Schema for APMS Embedding Results from pipeline',
 'type': 'object',
 'properties': {'experiment_identifier': {'type': 'string',
   'description': 'Identifier for APMS experiment corresponding to the given node2vec vector',
   'pattern': 'APMS_[0-9]*'},
  'gene_symbol': {'type': 'string',
   'description': 'gene symbol for apms embedding vector',
   'pattern': '[A-Z0-9]*',
   'number': 1},
  'embedding': {'type': 'array',
   'minItems': 1024,
   'maxItems': 1024,
   'items': {'type': 'number'},
   'uniqueItems': False,
   'description': 'embedding vector values for genes determined by running node2vec on APMS networks'}},
 'additionalProperties': False,
 'required': ['experiment_identifier', 'gene_symbol', 'embedding']}

Uploading a Schema to Fairscape

```
requests.post(
	url="https://fairscape.pods.uvarc.io/schema",	
	json=embedding_schema_definition,
)

```

## Client Creating a Schema


```
fairscape-cli create schema
> title: USER SPCECIED
> description: 
> ...
> add a property Y/N
> property title:
> property type [string|number|boolean|]: "boolean"
> add another property: Y/N

...
> execute on an example file? Y/N
> example file path:

> errors found
> dumping 
```


In [28]:
row[0]

'APMS_1'

In [32]:
list(row[2::])

[0.07591,
 0.161315,
 -0.025731,
 0.071347,
 -0.175898,
 0.041408,
 -0.061304,
 -0.136247,
 0.106549,
 -0.075448,
 0.033692,
 0.193541,
 -0.098475,
 -0.048085,
 -0.12567,
 0.007304,
 -0.091125,
 -0.025974,
 0.131792,
 -0.219541,
 -0.0586789999999999,
 -0.040777,
 0.018422,
 -0.02981,
 -0.056754,
 0.156514,
 0.0220129999999999,
 0.071653,
 0.00029,
 0.126702,
 0.014124,
 -0.014273,
 -0.074498,
 0.067484,
 -0.014903,
 -0.095951,
 0.133019,
 -0.367448,
 0.06527,
 -0.015587,
 0.071432,
 -0.1249369999999999,
 0.153505,
 -0.327368,
 -0.030467,
 -0.127108,
 0.012345,
 -0.081027,
 0.15074,
 0.1200589999999999,
 -0.061838,
 -0.02048,
 0.196326,
 0.208088,
 -0.028729,
 0.021224,
 -0.112507,
 -0.147349,
 -0.007333,
 0.2101449999999999,
 -0.012915,
 -0.001191,
 -0.018385,
 -0.171959,
 0.255183,
 -0.106547,
 -0.148266,
 0.130291,
 0.0082659999999999,
 -0.14399,
 -0.230831,
 0.047902,
 0.0539679999999999,
 0.194763,
 -0.114253,
 -0.014531,
 -0.182695,
 0.0015689999999999,
 -0.101109,
 0.045073,
 -0.

In [69]:

def json_row(row):
	return {
		"experiment_identifier": row[0],
		"gene_symbol": row[1],
		"embedding": list(row[2::])
	}

In [70]:
def validate_jsonschema(passed_csv_row, passed_schema):
	validate(
		instance=json_row(passed_csv_row),
		schema= passed_schema
	)

In [71]:
len(json_row(row).get("embedding"))

1024

In [72]:
validate_jsonschema(row , embedding_schema_definition)