In [2]:
from pydantic import (
    BaseModel,
    Field,
    constr,
    computed_field,
    ConfigDict
)

In [3]:
from typing import (
    Optional,
    List,
    Dict,
    Union
)

In [4]:
import hashlib
import json
import re

## GUID Generation Formula from Tim

```
{
"project":"CM4AI
computation=“AP-MS initial processing”
cell_line=“MDA-MB468”
treatment=“untreated”
release_name=“0.1 alpha”
version=[int
}
```

data_release_name=0.1 alphaeger]

## ROCrate Models

In [None]:
ARK_NAAN = "59852"

In [5]:
default_context = {
    "@vocab": "https://schema.org/",
    "evi": "https://w3id.org/EVI#"
}


class Identifier(BaseModel):
    guid: str = Field(
        title="guid",
        alias="@id" 
    )
    metadataType: str = Field(
        title="metadataType",
        alias="@type" 
    )
    name: str


class FairscapeBaseModel(BaseModel):
    model_config = ConfigDict(
        populate_by_name = True,
        validate_assignment = True,  
    )
    context: Dict[str,str] = Field(
        default=default_context,
        title="context",
        alias="@context"
    )
    metadataType: str = Field(
        title="metadataType",
        alias="@type"
    )
    url: Optional[str] = Field(default=None)
    name: str = Field(max_length=200)
    keywords: List[str] = Field(default=[])
    description: str = Field(min_length=5)

    @computed_field(alias="@id")
    @property
    def guid(self) -> str:
        # TODO url encode values
        # TODO add random hash digest

        # if 
        return f"ark:{ARK_NAAN}/rocrate-{self.name.replace(' ', '')}"

In [6]:
class FairscapeProvModel(FairscapeBaseModel):
     sourceOrganization: Optional[str] = Field(default=None)

### rocrate

```
class ROCrate(BaseModel):
    guid: Optional[str] = Field(default="")
    metadataType: str = Field(default="https://schema.org/Dataset")
    name: str = Field(max_length=200)
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    projectName: Optional[str]
    organizationName: Optional[str]
    path: pathlib.Path
    metadataGraph: Optional[List[Union[Dataset,Software, Computation]]] = Field(alias="@graph")
```

In [84]:
# if part of organization


In [7]:
# dataset

class Dataset(FairscapeProvModel):
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Dataset")
    additionalType: Optional[str] = Field(default="Dataset")
    author: str = Field(max_length=64)
    datePublished: str = Field(...)
    version: str
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    associatedPublication: Optional[str] = None
    additionalDocumentation: Optional[str] = None
    fileFormat: str = Field(alias="format")
    dataSchema: Optional[Union[str, dict]] = Field(alias="schema", default=None)
    generatedBy: Optional[List[Union[str, Identifier]]] = Field(default=[])
    derivedFrom: Optional[List[Union[str, Identifier]]] = Field(default=[])
    usedBy: Optional[List[Union[str, Identifier]]] = Field(default = [])
    contentUrl: Optional[str] = Field(default=None)


class DatasetContainer(FairscapeProvModel): 
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Dataset", alias="@type")
    additionalType: Optional[str] = Field(default="DatasetContainer")
    name: str
    version: str = Field(default="0.1.0")
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    generatedBy: Optional[List[Union[str, Identifier]]] = Field(default=[])
    derivedFrom: Optional[List[Union[str, Identifier]]] = Field(default=[])
    usedBy: Optional[List[Union[str, Identifier]]] = Field(default = [])
    hasPart: Optional[List[Union[str, Identifier]]] = Field(default=[])
    isPartOf: Optional[List[Union[str, Identifier]]] = Field(default=[])

In [8]:
# software
class Software(FairscapeProvModel): 
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Software")
    additionalType: Optional[str] = Field(default="Software")
    author: str = Field(min_length=4, max_length=64)
    dateModified: str
    version: str
    description: str =  Field(min_length=10)
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    fileFormat: str = Field(title="fileFormat", alias="format")
    usedByComputation: Optional[List[str]] = Field(default=[])
    contentUrl: Optional[str] = Field(default=None)
 

In [9]:
# computation
class Computation(FairscapeProvModel):
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Computation")
    additionalType: Optional[str] = Field(default="Computation")
    runBy: str
    dateCreated: str 
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    command: Optional[Union[List[str], str]] = Field(default="")
    usedSoftware: Optional[List[str]] = Field(default=[])
    usedDataset: Optional[Union[List[str], str]] = Field(default=[])
    generated: Optional[Union[str,List[str]]] = Field(default=[])

In [11]:
class ROCrate(FairscapeBaseModel):
    metadataType: str = Field(alias="@type", default="https://schema.org/Dataset")
    metadataGraph: List[Union[Dataset, Software, Computation, DatasetContainer]] = Field(..., discriminator='addtionalType', alias="@graph")


    @computed_field(alias="@id")
    @property
    def guid(self) -> str:

        # remove trailing whitespace 
        cleaned_name = re.sub('\s+$', '', self.name)

        # remove restricted characters
        url_name = re.sub('\W','', cleaned_name.replace('', '-'))
        
        # add md5 hash digest on remainder of metadata
        sha_256_hash = hashlib.sha256()

        # use a subset of properties for hash digest
        digest_dict = {
            "name": self.name,
            "@graph": [model.model_dump_json(by_alias=True) for model in self.metadataGraph]
        }
        encoded = json.dumps(digest_dict, sort_keys=True).encode()
        sha_256_hash.update(encoded)
        digest_string = sha_256_hash.hexdigest()
        
        return f"ark:{ARK_NAAN}/rocrate-{url_name}-{digest_string[0:10]}"
        

In [12]:
class Organization(FairscapeBaseModel):
    metadataType: Optional[str] = Field(default="")
    additionalType: Optional[str] = Field(default="Organization")
    funder: Optional[List[str]] = Field(..., description="A list of guids for organizations that are responsible for funding this organization")

In [13]:
class Project(FairscapeBaseModel):
    parentOrganization: str = Field(...)

## Instance Test

In [14]:
test_dataset_one = Dataset(
    author="Max Levinson",
    datePublished = "08-09-2023",
    version="0.1",
    name="test dataset",
    description = "my example test dataset",
    keywords = ["test"],
    fileFormat = "text/csv",
    contentUrl="file://test-input-1.csv"
)

test_dataset_two = Dataset(
    author="Max Levinson",
    datePublished = "08-09-2023",
    version="0.1",
    name="test dataset two",
    description = "my example test dataset",
    keywords = ["test"],
    fileFormat = "text/csv",
    contentUrl="file://test-input-2.csv"
)

test_dataset_container = DatasetContainer(
    version="0.1",
    name="test dataset container",
    description = "my example test dataset",
    keywords = ["test"],
    hasPart=[
        test_dataset_one.guid,
        test_dataset_two.guid    
    ]
)

test_output = Dataset(
    author="Max Levinson",
    datePublished = "08-09-2023",
    version="0.1",
    name="test result",
    description = "my example test dataset",
    keywords = ["test"],
    fileFormat = "text/csv",
    contentUrl="file://test-output.csv",
    generatedBy=[]
)

In [15]:
test_software = Software(
 #   guid=f"ark:{ARK_NAAN}/test-software",
    name="test software",
    author="Max Levinson",
    description="A test software",
    dateModified="08-09-2023",
    version="0.1.0",
    format=".py",
    keywords=["test"],
    contentUrl="file://script.py"
)

In [28]:
test_computation = Computation(
    name="computation",
    author="Max Levinson",
    description="A pretend computation",
    dateModified="08-09-2023",
    dateCreated="08-09-2023",
    version="0.1.0",
    format=".py",
    keywords=["test"],
    runBy="Max Levinson",
    usedSoftware=[test_software.guid],
    usedDataset=[test_dataset_container.guid],
    generated=[test_output.guid]
)

In [29]:
test_crate = ROCrate(
    name="test crate",
    description="a testing example",
    keywords=["test"],
    projectName="test_proj",
    organizationName="test_org",
    metadataGraph= [
        test_dataset_one,
        test_dataset_two,
        test_output,
        test_dataset_container,
        test_software,
        test_computation
    ]
)

In [23]:
import os

'/'

In [26]:
os.chdir("/com.docker.devenvironments.code/")

In [30]:
import json

with open("./tests/data/test-crate/ro-crate-metadata.json", "w") as metadata_file:
	json.dump(test_crate.model_dump(by_alias=True), metadata_file, indent=2)

In [18]:
test_crate.metadataGraph[4]

Computation(context={'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'}, metadataType='https://w3id.org/EVI#Computation', url=None, name='test software', keywords=['test'], description='A test software', sourceOrganization=None, additionalType='Computation', runBy='Max Levinson', dateCreated='08-09-2023', associatedPublication=None, additionalDocumentation=None, command='', usedSoftware=['ark:59853/rocrate-testsoftware'], usedDataset=['ark:59853/rocrate-testdatasetcontainer'], generated=['ark:59853/rocrate-testresult'], guid='ark:59853/rocrate-testsoftware')

In [224]:
test_crate.metadataGraph[2].model_dump(by_alias=True)

{'@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'},
 '@type': 'https://w3id.org/EVI#Dataset',
 'url': None,
 'name': 'test dataset container',
 'keywords': ['test'],
 'description': 'my example test dataset',
 'sourceOrganization': None,
 'additionalType': 'DatasetContainer',
 'version': '0.1',
 'generatedBy': [],
 'derivedFrom': [],
 'usedBy': [],
 'hasPart': ['ark:59853/rocrate-testdataset',
  'ark:59853/rocrate-testdatasettwo'],
 'isPartOf': [],
 '@id': 'ark:59853/rocrate-testdatasetcontainer'}

In [225]:
dataset_one_json = test_crate.metadataGraph[0].model_dump(by_alias=True)
dataset_one_json

{'@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'},
 'metadataType': 'https://w3id.org/EVI#Dataset',
 'url': None,
 'name': 'test dataset',
 'keywords': ['test'],
 'description': 'my example test dataset',
 'sourceOrganization': None,
 'additionalType': 'Dataset',
 'author': 'Max Levinson',
 'datePublished': '08-09-2023',
 'version': '0.1',
 'associatedPublication': None,
 'additionalDocumentation': None,
 'format': 'text/csv',
 'schema': None,
 'generatedBy': [],
 'derivedFrom': [],
 'usedBy': [],
 'contentUrl': 'file://test-one.csv',
 '@id': 'ark:59853/rocrate-testdataset'}

In [80]:
dataset_one_json['usedBy'] = [test_computation.guid]
dataset_one_json['isPartOf'] = [test_dataset_container.guid]

In [226]:
dataset_one_json

{'@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'},
 'metadataType': 'https://w3id.org/EVI#Dataset',
 'url': None,
 'name': 'test dataset',
 'keywords': ['test'],
 'description': 'my example test dataset',
 'sourceOrganization': None,
 'additionalType': 'Dataset',
 'author': 'Max Levinson',
 'datePublished': '08-09-2023',
 'version': '0.1',
 'associatedPublication': None,
 'additionalDocumentation': None,
 'format': 'text/csv',
 'schema': None,
 'generatedBy': [],
 'derivedFrom': [],
 'usedBy': [],
 'contentUrl': 'file://test-one.csv',
 '@id': 'ark:59853/rocrate-testdataset'}

In [59]:
test_crate.metadataGraph[0]

Dataset(guid='ark:59853/test', context={'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'}, metadataType='https://w3id.org/EVI#Dataset', url=None, name='test dataset', keywords=['test'], description='my example test dataset', author='Max Levinson', datePublished='08-09-2023', version='0.1', associatedPublication=None, additionalDocumentation=None, fileFormat='text/csv', dataSchema=None, generatedBy=[], derivedFrom=[], usedBy=[], contentUrl='file://test.csv')

In [143]:
test_crate.guid

'ark:59853/rocrate-testcrate-f99da2870e'

In [144]:
test_crate.model_dump_json(by_alias=True)

'{"@context":{"@vocab":"https://schema.org/","evi":"https://w3id.org/EVI#"},"@type":"https://schema.org/Dataset","url":null,"name":"test crate","keywords":["test"],"description":"a testing example","projectName":"test_proj","organizationName":"test_org","@graph":[{"@context":{"@vocab":"https://schema.org/","evi":"https://w3id.org/EVI#"},"metadataType":"https://w3id.org/EVI#Dataset","url":null,"name":"test dataset","keywords":["test"],"description":"my example test dataset","sourceOrganization":null,"additionalType":"Dataset","author":"Max Levinson","datePublished":"08-09-2023","version":"0.1","associatedPublication":null,"additionalDocumentation":null,"format":"text/csv","schema":null,"generatedBy":[],"derivedFrom":[],"usedBy":[],"contentUrl":"file://test-one.csv","@id":"ark:59853/rocrate-testdataset"},{"@context":{"@vocab":"https://schema.org/","evi":"https://w3id.org/EVI#"},"metadataType":"https://w3id.org/EVI#Dataset","url":null,"name":"test dataset two","keywords":["test"],"descrip

## Entailment properties

### Inverse Entailment
- Filter all computations
    - usedDataset
    - usedSoftware
    - generated

- For each of these lists, filter for those guids and add inverse property

### Transitivity for Dataset Containers

If a computation used a dataset container, all elements are usedBy that computation

- Leave TODO for reverse all elements are usedBy computation, entails that the set is usedBy scomputation

### Transitivity for ROCrate

Using an ROCrate means only the tail of a provenance chain is 

In [19]:
passed_crate = test_crate

In [228]:
computations = list(filter(lambda x: x.additionalType == "Computation", passed_crate.metadataGraph))

# can alter pydantic data from 
computations[0].name = "my software"
computations

passed_crate.metadataGraph

[Dataset(context={'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'}, metadataType='https://w3id.org/EVI#Dataset', url=None, name='test dataset', keywords=['test'], description='my example test dataset', sourceOrganization=None, additionalType='Dataset', author='Max Levinson', datePublished='08-09-2023', version='0.1', associatedPublication=None, additionalDocumentation=None, fileFormat='text/csv', dataSchema=None, generatedBy=[], derivedFrom=[], usedBy=[], contentUrl='file://test-one.csv', guid='ark:59853/rocrate-testdataset'),
 Dataset(context={'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'}, metadataType='https://w3id.org/EVI#Dataset', url=None, name='test dataset two', keywords=['test'], description='my example test dataset', sourceOrganization=None, additionalType='Dataset', author='Max Levinson', datePublished='08-09-2023', version='0.1', associatedPublication=None, additionalDocumentation=None, fileFormat='text/csv', dataSchema=None, generatedBy=[]

In [229]:
# if an element from a list is selected
# and data property is changed it chages globally
passed_computation = computations[0]
passed_computation.name = "test computation"
passed_computation.name

'test computation'

In [230]:
# three lists of ids
used_datasets = passed_computation.usedDataset
used_software = passed_computation.usedSoftware
generated_datasets = passed_computation.generated 

In [None]:
def entailment(passed_crate):

    computations = list(filter(lambda x: x.additionalType == "Computation", passed_crate.metadataGraph))

    def filterCrateByGUID(guid):
        return list(filter(lambda x: x.guid==guid, passed_crate.metadataGraph))

    def inverseUsedDataset(used_dataset_guid, computation_guid):
        used_dataset_list = filterCrateByGUID(used_dataset_guid)
        
        # update each dataset as 
        for used_dataset in used_dataset_list:
            used_dataset.usedBy.append(computation_guid)

    def inverseUsedSoftware(used_software_guid, computation_guid):
        used_software_list = filterCrateByGUID(used_software_guid) 
        
        for used_software in used_software_list:
            used_software.usedBy.append(computation_guid)


    def inverseGenerated(generated_guid, computation_guid):
        generated_list = filterCrateByGUID(generated_guid)

        for generated_element in generated_list:
            generated_element.generatedBy.append(computation_guid)


    for computation_element in computations:
        #used_datasets = computation.usedDatasets
        #used_software = computation.usedSoftware
        #  generated = computation.generated

        [ inverseUsedDataset(used_dataset.guid, computation_element.guid) for used_dataset in computation_element.usedDatasets]
        [ inverseUsedSoftware(used_software.guid, computation_element.guid) for used_software in computation_element.usedSoftware]
        [ inverseGenerated(generated.guid, computation_element.guid) for generated in computation_element.generated]
        


['ark:59853/rocrate-testcomputation']