In [4]:
from pydantic import (
    BaseModel,
    Field,
    constr,
    computed_field,
    ConfigDict
)

from typing import (
    Optional,
    List,
    Dict,
    Union,
    Literal
)

import hashlib
import json
import re

## ROCrate Models

In [5]:
ARK_NAAN = "59852"

In [16]:
default_context = {
    "@vocab": "https://schema.org/",
    "evi": "https://w3id.org/EVI#"
}


class Identifier(BaseModel):
    guid: str = Field(
        title="guid",
        alias="@id" 
    )
    metadataType: str = Field(
        title="metadataType",
        alias="@type" 
    )
    name: str


class FairscapeBaseModel(BaseModel):
    model_config = ConfigDict(
        populate_by_name = True,
        validate_assignment = True,  
    )
    context: Dict[str,str] = Field(
        default=default_context,
        title="context",
        alias="@context"
    )
    metadataType: str = Field(
        title="metadataType",
        alias="@type"
    )
    url: Optional[str] = Field(default=None)
    name: str = Field(max_length=200)
    keywords: List[str] = Field(default=[])
    description: str = Field(min_length=5)
    isPartOf: Optional[List[str]] = Field(default=[])


### Prov Elements

In [14]:
# dataset
class Dataset(FairscapeBaseModel):
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Dataset")
    additionalType: Literal['Dataset']
    #additionalType: Optional[str] = Field(default="Dataset")
    author: str = Field(max_length=64)
    datePublished: str = Field(...)
    version: str = Field(default="0.1.0")
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    fileFormat: str = Field(alias="format")
    dataSchema: Optional[Union[str, dict]] = Field(alias="schema", default=None)
    generatedBy: Optional[List[str]] = Field(default=[])
    derivedFrom: Optional[List[str]] = Field(default=[])
    usedBy: Optional[List[str]] = Field(default = [])
    contentUrl: Optional[str] = Field(default=None)


class DatasetContainer(FairscapeBaseModel): 
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Dataset", alias="@type")
    additionalType: Literal['DatasetContainer']
    #additionalType: Optional[str] = Field(default="DatasetContainer")
    name: str
    version: str = Field(default="0.1.0")
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    generatedBy: Optional[List[str]] = Field(default=[])
    derivedFrom: Optional[List[str]] = Field(default=[])
    usedBy: Optional[List[str]] = Field(default = [])
    hasPart: Optional[List[str]] = Field(default=[])
    isPartOf: Optional[List[str]] = Field(default=[])


# software
class Software(FairscapeBaseModel): 
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Software")
    additionalType: Optional[str] = Field(default="Software")
    author: str = Field(min_length=4, max_length=64)
    dateModified: str
    version: str
    description: str =  Field(min_length=10)
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    fileFormat: str = Field(title="fileFormat", alias="format")
    usedByComputation: Optional[List[str]] = Field(default=[])
    contentUrl: Optional[str] = Field(default=None)

# computation
class Computation(FairscapeBaseModel):
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Computation")
    additionalType: Optional[str] = Field(default="Computation")
    runBy: str
    dateCreated: str 
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    command: Optional[Union[List[str], str]] = Field(default="")
    usedSoftware: Optional[List[str]] = Field(default=[])
    usedDataset: Optional[Union[List[str], str]] = Field(default=[])
    generated: Optional[Union[str,List[str]]] = Field(default=[])

### ROCrate Model

```
class ROCrate(BaseModel):
    guid: Optional[str] = Field(default="")
    metadataType: str = Field(default="https://schema.org/Dataset")
    name: str = Field(max_length=200)
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    projectName: Optional[str]
    organizationName: Optional[str]
    path: pathlib.Path
    metadataGraph: Optional[List[Union[Dataset,Software, Computation]]] = Field(alias="@graph")
```

In [15]:
class ROCrate(FairscapeBaseModel):
    metadataType: str = Field(alias="@type", default="https://schema.org/Dataset")
    metadataGraph: List[Union[Dataset, Software, Computation, DatasetContainer]] = Field(..., discriminator='addtionalType', alias="@graph") 

## Computed GUID Property

- Can be put on any pydantic class
- Unclear how and when this is called if a property is specified.
- TODO: Need to detect identifier collisions before inserting records into mongo, and replacing identifiers in evi graph

In [None]:

#@computed_field(alias="@id")
#    @property
def guid(self) -> str:

	# remove trailing whitespace 
	cleaned_name = re.sub('\s+$', '', self.name)

	# remove restricted characters
	url_name = re.sub('\W','', cleaned_name.replace('', '-'))

	# add md5 hash digest on remainder of metadata
	sha_256_hash = hashlib.sha256()

	# use a subset of properties for hash digest
	digest_dict = {
		"name": self.name,
		"@graph": [model.model_dump_json(by_alias=True) for model in self.metadataGraph]
	}
	encoded = json.dumps(digest_dict, sort_keys=True).encode()
	sha_256_hash.update(encoded)
	digest_string = sha_256_hash.hexdigest()

	return f"ark:{ARK_NAAN}/rocrate-{url_name}-{digest_string[0:10]}"

### Organization Model

In [12]:
class Organization(FairscapeBaseModel):
    metadataType: Optional[str] = Field(default="")
    additionalType: Optional[str] = Field(default="Organization")
    funder: Optional[List[str]] = Field(..., description="A list of guids for organizations that are responsible for funding this organization")

### Project Model

In [None]:
class Project(FairscapeBaseModel):
    parentOrganization: str = Field(...)

## ROCrate Instance Test

In [None]:
test_dataset_one = Dataset(
    author="Max Levinson",
    datePublished = "08-09-2023",
    version="0.1",
    name="test dataset",
    description = "my example test dataset",
    keywords = ["test"],
    fileFormat = "text/csv",
    contentUrl="file://test-input-1.csv"
)

test_dataset_two = Dataset(
    author="Max Levinson",
    datePublished = "08-09-2023",
    version="0.1",
    name="test dataset two",
    description = "my example test dataset",
    keywords = ["test"],
    fileFormat = "text/csv",
    contentUrl="file://test-input-2.csv"
)

test_dataset_container = DatasetContainer(
    version="0.1",
    name="test dataset container",
    description = "my example test dataset",
    keywords = ["test"],
    hasPart=[
        test_dataset_one.guid,
        test_dataset_two.guid    
    ]
)

test_output = Dataset(
    author="Max Levinson",
    datePublished = "08-09-2023",
    version="0.1",
    name="test result",
    description = "my example test dataset",
    keywords = ["test"],
    fileFormat = "text/csv",
    contentUrl="file://test-output.csv",
    generatedBy=[]
)

In [15]:
test_software = Software(
 #   guid=f"ark:{ARK_NAAN}/test-software",
    name="test software",
    author="Max Levinson",
    description="A test software",
    dateModified="08-09-2023",
    version="0.1.0",
    format=".py",
    keywords=["test"],
    contentUrl="file://script.py"
)

In [None]:
test_computation = Computation(
    name="computation",
    author="Max Levinson",
    description="A pretend computation",
    dateModified="08-09-2023",
    dateCreated="08-09-2023",
    version="0.1.0",
    format=".py",
    keywords=["test"],
    runBy="Max Levinson",
    usedSoftware=[test_software.guid],
    usedDataset=[test_dataset_container.guid],
    generated=[test_output.guid]
)

In [None]:
test_crate = ROCrate(
    name="test crate",
    description="a testing example",
    keywords=["test"],
    projectName="test_proj",
    organizationName="test_org",
    metadataGraph= [
        test_dataset_one,
        test_dataset_two,
        test_output,
        test_dataset_container,
        test_software,
        test_computation
    ]
)

# Create RO-Crate zip file 

In [18]:
test_crate.metadataGraph[4]

In [None]:
test_crate.metadataGraph[2].model_dump(by_alias=True)

In [None]:
dataset_one_json = test_crate.metadataGraph[0].model_dump(by_alias=True)
dataset_one_json

In [None]:
dataset_one_json['usedBy'] = [test_computation.guid]
dataset_one_json['isPartOf'] = [test_dataset_container.guid]

In [None]:
dataset_one_json

In [None]:
test_crate.metadataGraph[0]

In [None]:
test_crate.guid

In [None]:
test_crate.model_dump_json(by_alias=True)

## Entailment properties

### Inverse Entailment
- Filter all computations
    - usedDataset
    - usedSoftware
    - generated

- For each of these lists, filter for those guids and add inverse property

metadata_validation_status = validate_rocrate_object_reference(rocrate, minio_client, mongo_client, rocrate_zip_file)

# In a nutshell: Upload RO-Crate archive to the object store and validate 

In [76]:
import requests
import json
from requests_toolbelt.multipart.encoder import MultipartEncoder
### Transitivity for Dataset Containers

If a computation used a dataset container, all elements are usedBy that computation

# upload a rocrate to minio object store
rocrate_transfer = requests.post(
    root_url + f"rocrate/upload",
    data=mp_encoder,                              
    # The MultipartEncoder provides the content-type header with the boundary:
    headers={'Content-Type': mp_encoder.content_type}
)
rocrate_transfer.json()

{'created': {'@id': 'ark:59853/rocrate-Exampleinputdataset-298dd2e013',
  '@type': 'Dataset',
  'name': 'Example input dataset'}}

In [None]:
passed_crate = test_crate

In [None]:
def entailPartOf(rocrate: ROCrate):
    """ entailPartOf

root_url = "http://localhost:8080/"

rocrate_download = requests.get(root_url + f"rocrate/archived/download/ark:59853/rocrate-Exampleinputdataset-298dd2e013", headers={"Content-Type":"application/zip"})

rocrate_download.raise_for_status() # ensure we notice bad responses

# save the downloaded crate in the project root directory
with open("downloaded-rocrate.zip", "wb") as file:
    file.write(rocrate_download.content)


# Download extracted ROCrate as a zip file using the API

In [None]:
def provEntailment(passed_crate):

    computations = list(filter(lambda x: x.additionalType == "Computation", passed_crate.metadataGraph))

    def filterCrateByGUID(guid):
        return list(filter(lambda x: x.guid==guid, passed_crate.metadataGraph))

    def inverseUsedDataset(used_dataset_guid, computation_guid):
        used_dataset_list = filterCrateByGUID(used_dataset_guid)
        
        # update each dataset as 
        for used_dataset in used_dataset_list:
            used_dataset.usedBy.append(computation_guid)

    def inverseUsedSoftware(used_software_guid, computation_guid):
        used_software_list = filterCrateByGUID(used_software_guid) 
        
        for used_software in used_software_list:
            used_software.usedBy.append(computation_guid)


    def inverseGenerated(generated_guid, computation_guid):
        generated_list = filterCrateByGUID(generated_guid)

rocrate_download = requests.get(root_url + f"rocrate/extracted/download/ark:59853/rocrate-Exampleinputdataset-298dd2e013", headers={"Content-Type":"application/zip"})
        for generated_element in generated_list:
            generated_element.generatedBy.append(computation_guid)


# save the downloaded crate in the project root directory
with open("downloaded-rocrate.zip", "wb") as file:
    file.write(rocrate_download.content)

In [3]:
import pymongo

def registerROCrateIdentifiers(rocrate: ROCrate, identifier_collection: pymongo.collection.Collection):
    """ Run to insert all identifiers into ROCrate
    """
    pass

NameError: name 'ROCrate' is not defined

## API Tests

### Zipping Test Data

In [None]:
# dump test instance into test-crate
import json
with open("./tests/data/test-crate/ro-crate-metadata.json", "w") as metadata_file:
	json.dump(test_crate.model_dump(by_alias=True), metadata_file, indent=1)


import shutil
shutil.make_archive(
    base_name="/com.docker.devenvironments.code/tests/data/test-crate", 
    format="zip", 
    base_dir="/com.docker.devenvironments.code/tests/data/test-crate/"
    )


# alternatively use zipfile to iterate through files to write to a zip archive

#import zipfile
#with zipfile.ZipFile('/com.docker.devenvironments.code/tests/data/test-crate.zip', 'r') as zip_crate:
#	pass 

'/com.docker.devenvironments.code/tests/data/test-crate.zip'

### Success Test

In [44]:
import requests
fairscape_root_url = "http://localhost:8080"

In [45]:
with open('/com.docker.devenvironments.code/tests/data/test-crate.zip', 'rb') as zipped_crate:
	upload_request = requests.post(
		url=f"{fairscape_root_url}/rocrate/upload",
		files={"rocrate": ("test-crate.zip", zipped_crate)}
	)

In [46]:
upload_request.status_code

500

In [47]:
upload_request.json()

Exception ignored in: <function Minio.__del__ at 0x7f7dfaca8860>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/minio/api.py", line 160, in __del__
    self._http.clear()
    ^^^^^^^^^^
AttributeError: 'Minio' object has no attribute '_http'


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

### Parsing Fail Test

In [17]:
import minio

In [25]:
from dotenv import load_dotenv

load_dotenv()

False

In [35]:
from dotenv import dotenv_values
from pathlib import Path

config = dotenv_values("/com.docker.devenvironments.code/.env")

In [39]:
bool(config['MINIO_SECURE']=="True")

False

In [33]:
minio_client = minio.Minio(
    endpoint=f"{config['MINIO_URI']}:{config['MINIO_PORT']}", 
    access_key=config['MINIO_ACCESS_KEY'],
    secret_key=config['MINIO_SECRET_KEY'],
    secure=False
    #http_client=
    )

In [34]:
minio_client.list_buckets()

[]