In [None]:
from pydantic import (
    BaseModel,
    Field,
    constr,
    computed_field,
    ConfigDict
)

In [None]:
from typing import (
    Optional,
    List,
    Dict,
    Union
)

In [None]:
import hashlib
import json
import re

## ROCrate Models

In [None]:
# Name Assigning Authority Number (NAAN) segment interpolated into every
# "ark:<NAAN>/..." identifier built by the guid properties in this file.
ARK_NAAN = "59852"

In [None]:
# Default "@context" mapping used as the default value of
# FairscapeBaseModel.context: schema.org is the default vocabulary and the
# "evi" prefix maps to the EVI namespace.
default_context = {
    "@vocab": "https://schema.org/",
    "evi": "https://w3id.org/EVI#"
}


class Identifier(BaseModel):
    """Compact reference to another metadata record: identifier, type and name."""

    # Exposed under the key "@id".
    guid: str = Field(title="guid", alias="@id")
    # Exposed under the key "@type".
    metadataType: str = Field(title="metadataType", alias="@type")
    name: str


class FairscapeBaseModel(BaseModel):
    """Shared base for the metadata models defined in this file.

    Declares the common descriptive fields and a computed ``guid`` (exposed
    under the ``@id`` alias) derived from ``name``.
    """

    # Accept field names as well as aliases on input, and re-validate on
    # attribute assignment.
    model_config = ConfigDict(populate_by_name=True, validate_assignment=True)

    context: Dict[str, str] = Field(
        title="context",
        alias="@context",
        default=default_context,
    )
    metadataType: str = Field(title="metadataType", alias="@type")
    url: Optional[str] = Field(default=None)
    name: str = Field(max_length=200)
    keywords: List[str] = Field(default=[])
    description: str = Field(min_length=5)

    @computed_field(alias="@id")
    @property
    def guid(self) -> str:
        """Return ``ark:<NAAN>/rocrate-<name with spaces removed>``."""
        # TODO url encode values
        # TODO add random hash digest
        compact_name = "".join(self.name.split(" "))
        return f"ark:{ARK_NAAN}/rocrate-{compact_name}"

In [None]:
class FairscapeProvModel(FairscapeBaseModel):
    """Base for the provenance models; adds an optional source organization."""

    sourceOrganization: Optional[str] = None

### rocrate

```
class ROCrate(BaseModel):
    guid: Optional[str] = Field(default="")
    metadataType: str = Field(default="https://schema.org/Dataset")
    name: str = Field(max_length=200)
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    projectName: Optional[str]
    organizationName: Optional[str]
    path: pathlib.Path
    metadataGraph: Optional[List[Union[Dataset,Software, Computation]]] = Field(alias="@graph")
```

In [None]:
# if part of organization


In [None]:
# dataset

class Dataset(FairscapeProvModel):
    """Metadata record for a single dataset.

    Extends FairscapeProvModel with authorship, versioning, file format and
    provenance links (``generatedBy``, ``derivedFrom``, ``usedBy``).
    """

    # Redeclaring metadataType replaces the inherited field definition, so the
    # "@type" alias must be repeated here; without it an aliased dump would
    # emit "metadataType" instead of "@type".
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Dataset", alias="@type")
    additionalType: Optional[str] = Field(default="Dataset")
    author: str = Field(max_length=64)
    datePublished: str = Field(...)
    version: str
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)
    associatedPublication: Optional[str] = None
    additionalDocumentation: Optional[str] = None
    # Exposed as "format"; can also be populated as fileFormat because the
    # base model enables populate_by_name.
    fileFormat: str = Field(alias="format")
    # Exposed as "schema".
    dataSchema: Optional[Union[str, dict]] = Field(alias="schema", default=None)
    # Provenance links: guid strings or Identifier records.
    generatedBy: Optional[List[Union[str, Identifier]]] = Field(default=[])
    derivedFrom: Optional[List[Union[str, Identifier]]] = Field(default=[])
    usedBy: Optional[List[Union[str, Identifier]]] = Field(default=[])
    contentUrl: Optional[str] = Field(default=None)


class DatasetContainer(FairscapeProvModel):
    """Metadata record grouping datasets through ``hasPart`` / ``isPartOf`` links."""

    metadataType: Optional[str] = Field(
        alias="@type",
        default="https://w3id.org/EVI#Dataset",
    )
    additionalType: Optional[str] = "DatasetContainer"
    # Redeclared without the length constraint used on the base model.
    name: str
    version: str = "0.1.0"
    description: str = Field(min_length=10)
    keywords: List[str] = Field(...)

    # Provenance links: guid strings or Identifier records.
    generatedBy: Optional[List[Union[str, Identifier]]] = Field(default=[])
    derivedFrom: Optional[List[Union[str, Identifier]]] = Field(default=[])
    usedBy: Optional[List[Union[str, Identifier]]] = Field(default=[])

    # Membership links: guid strings or Identifier records.
    hasPart: Optional[List[Union[str, Identifier]]] = Field(default=[])
    isPartOf: Optional[List[Union[str, Identifier]]] = Field(default=[])

In [None]:
# software
class Software(FairscapeProvModel):
    """Metadata record for a piece of software.

    Extends FairscapeProvModel with authorship, versioning, file format and
    the list of computations that used the software.
    """

    # Redeclaring metadataType replaces the inherited field definition, so the
    # "@type" alias must be repeated here; without it an aliased dump would
    # emit "metadataType" instead of "@type".
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Software", alias="@type")
    additionalType: Optional[str] = Field(default="Software")
    author: str = Field(min_length=4, max_length=64)
    dateModified: str
    version: str
    description: str = Field(min_length=10)
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    # Exposed as "format"; can also be populated as fileFormat because the
    # base model enables populate_by_name.
    fileFormat: str = Field(title="fileFormat", alias="format")
    # Guids of the computations that used this software.
    usedByComputation: Optional[List[str]] = Field(default=[])
    contentUrl: Optional[str] = Field(default=None)
 

In [None]:
# computation
class Computation(FairscapeProvModel):
    """Metadata record for a computation run.

    Links the software and datasets that were used (``usedSoftware``,
    ``usedDataset``) to the outputs that were produced (``generated``).
    """

    # Redeclaring metadataType replaces the inherited field definition, so the
    # "@type" alias must be repeated here; without it an aliased dump would
    # emit "metadataType" instead of "@type".
    metadataType: Optional[str] = Field(default="https://w3id.org/EVI#Computation", alias="@type")
    additionalType: Optional[str] = Field(default="Computation")
    runBy: str
    dateCreated: str
    associatedPublication: Optional[str] = Field(default=None)
    additionalDocumentation: Optional[str] = Field(default=None)
    # Either a single command string or a list of command parts.
    command: Optional[Union[List[str], str]] = Field(default="")
    # Guid references; usedDataset and generated also accept a single string.
    usedSoftware: Optional[List[str]] = Field(default=[])
    usedDataset: Optional[Union[List[str], str]] = Field(default=[])
    generated: Optional[Union[str, List[str]]] = Field(default=[])

In [None]:
class ROCrate(FairscapeBaseModel):
    """An RO-Crate: descriptive metadata plus an ``@graph`` of member records.

    The computed ``guid`` (exposed as ``@id``) combines the crate name, with
    every non-word character removed, and a short SHA-256 digest of the name
    and graph contents.
    """

    metadataType: str = Field(alias="@type", default="https://schema.org/Dataset")
    # No discriminator is configured. A pydantic discriminated union needs the
    # discriminator on the Union itself and a Literal-typed tag field on every
    # member, but additionalType is declared as Optional[str] on the members
    # (and the previous value, 'addtionalType', was misspelled). Pydantic's
    # default union matching is used instead.
    metadataGraph: List[Union[Dataset, Software, Computation, DatasetContainer]] = Field(
        ...,
        alias="@graph",
    )

    @computed_field(alias="@id")
    @property
    def guid(self) -> str:
        """Return ``ark:<NAAN>/rocrate-<name>-<first 10 hex chars of digest>``."""
        # Drop every non-word character (whitespace, punctuation, hyphens).
        # This yields the same value as the earlier three-step cleanup, whose
        # inserted hyphens were removed again by the \W substitution.
        url_name = re.sub(r'\W', '', self.name)

        # SHA-256 digest over a subset of the metadata: the name and the
        # aliased JSON dump of each graph member.
        digest_dict = {
            "name": self.name,
            "@graph": [model.model_dump_json(by_alias=True) for model in self.metadataGraph]
        }
        encoded = json.dumps(digest_dict, sort_keys=True).encode()
        digest_string = hashlib.sha256(encoded).hexdigest()

        return f"ark:{ARK_NAAN}/rocrate-{url_name}-{digest_string[0:10]}"
        

In [None]:
class Organization(FairscapeBaseModel):
    """Metadata record for an organization and the organizations funding it."""

    # Redeclaring metadataType replaces the inherited field definition, so the
    # "@type" alias must be repeated here; without it an aliased dump would
    # emit "metadataType" instead of "@type".
    metadataType: Optional[str] = Field(default="", alias="@type")
    additionalType: Optional[str] = Field(default="Organization")
    # Required field; the value itself may be None.
    funder: Optional[List[str]] = Field(..., description="A list of guids for organizations that are responsible for funding this organization")

In [None]:
class Project(FairscapeBaseModel):
    """Metadata record for a project that belongs to a parent organization."""

    # Required reference to the owning organization.
    parentOrganization: str

## Instance Test

In [None]:
def _example_dataset(name, content_url, **extra):
    """Build an example Dataset; only the name, content URL and extras vary."""
    return Dataset(
        author="Max Levinson",
        datePublished="08-09-2023",
        version="0.1",
        name=name,
        description="my example test dataset",
        keywords=["test"],
        fileFormat="text/csv",
        contentUrl=content_url,
        **extra,
    )


# Two input datasets.
test_dataset_one = _example_dataset("test dataset", "file://test-input-1.csv")
test_dataset_two = _example_dataset("test dataset two", "file://test-input-2.csv")

# Container grouping both inputs by guid.
test_dataset_container = DatasetContainer(
    name="test dataset container",
    version="0.1",
    description="my example test dataset",
    keywords=["test"],
    hasPart=[test_dataset_one.guid, test_dataset_two.guid],
)

# Output dataset; generatedBy starts out empty.
test_output = _example_dataset("test result", "file://test-output.csv", generatedBy=[])

In [None]:
# Example Software record; "format" is the alias of Software.fileFormat.
test_software = Software(
    #   guid=f"ark:{ARK_NAAN}/test-software",
    name="test software",
    description="A test software",
    keywords=["test"],
    author="Max Levinson",
    version="0.1.0",
    dateModified="08-09-2023",
    format=".py",
    contentUrl="file://script.py",
)

In [None]:
# Example Computation tying the software and the dataset container to the
# output dataset.
# NOTE(review): author, dateModified, version and format are not fields
# declared on Computation in this file; presumably pydantic's default handling
# of extra keys drops them — confirm.
test_computation = Computation(
    name="computation",
    description="A pretend computation",
    keywords=["test"],
    runBy="Max Levinson",
    dateCreated="08-09-2023",
    usedSoftware=[test_software.guid],
    usedDataset=[test_dataset_container.guid],
    generated=[test_output.guid],
    author="Max Levinson",
    dateModified="08-09-2023",
    version="0.1.0",
    format=".py",
)

In [None]:
# Graph members, in the order they are stored in the crate.
_test_graph_members = [
    test_dataset_one,
    test_dataset_two,
    test_output,
    test_dataset_container,
    test_software,
    test_computation,
]

# NOTE(review): projectName and organizationName are not fields declared on
# ROCrate in this file; presumably they are dropped on construction — confirm.
test_crate = ROCrate(
    name="test crate",
    description="a testing example",
    keywords=["test"],
    projectName="test_proj",
    organizationName="test_org",
    metadataGraph=_test_graph_members,
)

# Create RO-Crate zip file 

In [None]:
import zipfile
import os

# Serialize the crate (aliased keys, 2-space indent) into the metadata file.
with open("./tests/data/test-crate/ro-crate-metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(test_crate.model_dump(by_alias=True), indent=2))

# create a zip file from a directory   
def zipdir(target_dir, handler):
    """Add every file under ``target_dir`` to ``handler``.

    ``handler`` must provide ``write(path, arcname)``, as ``zipfile.ZipFile``
    does. Archive names are taken relative to the parent of ``target_dir``,
    so entries keep the directory's own name as their leading component.
    """
    archive_base = os.path.join(target_dir, '..')
    for current_dir, _subdirs, filenames in os.walk(target_dir):
        for filename in filenames:
            source_path = os.path.join(current_dir, filename)
            handler.write(source_path, os.path.relpath(source_path, archive_base))

# Archive to create. Exactly one target/source pair is active at a time; the
# commented alternatives select the other example crates. rocrate_zip_file is
# reused by later cells for upload and validation.
rocrate_zip_file = './tests/data/1.ppi_download.zip'
#rocrate_zip_file = './tests/data/2.ppi_embedding.zip'
#rocrate_zip_file = './tests/data/3.coembedding.zip'
#rocrate_zip_file = './tests/data/4.hierarchy.zip'
# Write the chosen directory into a DEFLATE-compressed archive.
with zipfile.ZipFile(rocrate_zip_file, 'w', zipfile.ZIP_DEFLATED) as zip_handler:
    zipdir('./tests/data/1.ppi_download/', zip_handler)
#    zipdir('./tests/data/2.ppi_embedding/', zip_handler)
#    zipdir('./tests/data/3.coembedding/', zip_handler)
#    zipdir('./tests/data/4.hierarchy/', zip_handler) 

# Prepare MinIO object store for upload

In [None]:
import json
import io
import minio
from minio.error import MinioException
from fastapi.responses import JSONResponse

class MinioConfig(BaseModel):
    """Connection settings for a MinIO server."""

    uri: str
    user: str
    password: str
    default_bucket: str
    secure: bool

    def CreateClient(self):
        """Return a ``minio.Minio`` client built from these settings."""
        client_options = {
            "access_key": self.user,
            "secret_key": self.password,
            "secure": self.secure,
        }
        return minio.Minio(self.uri, **client_options)

# Local development settings.
# NOTE(review): credentials are hard-coded; fine for a local test server only.
minio_config = MinioConfig(
    uri="localhost:9000",
    user="testroot",
    password="testroot",
    default_bucket="test",
    secure=False
)

minio_client = minio_config.CreateClient()

# Bucket that receives the extracted crate contents.
ROCRATE_BUCKET_NAME = "crate-contents"


# bucket_exists() reports whether the bucket is present; a False result means
# the bucket is missing, not that the connection failed.
found = minio_client.bucket_exists(ROCRATE_BUCKET_NAME)
if found:
    print("Bucket exists: ", ROCRATE_BUCKET_NAME)
else:
    print("Bucket does not exist: ", ROCRATE_BUCKET_NAME)

# Set up Mongo for metadata upload

In [None]:
from pymongo import MongoClient
from urllib.parse import quote_plus

class MongoConfig(BaseModel):
    """Connection settings and collection names for the MongoDB backend."""

    host: Optional[str] = "localhost"
    port: Optional[str] = "27017"
    user: Optional[str] = "root"
    password: Optional[str] = "rootpass"
    db: Optional[str] = "fairscape"
    identifier_collection: Optional[str] = "mds"
    rocrate_collection: Optional[str] = "rocrate"
    user_collection: Optional[str] = "users"
    session_collection: Optional[str] = "sessions"

    def CreateClient(self):
        """Return a ``MongoClient`` connected with these settings.

        The user name and password are URL-quoted before being placed in the
        connection string. The database name is not part of the URI. The
        target is printed with the password redacted so credentials do not
        end up in notebook output or logs.
        """
        location = f"{self.host}:{self.port}"
        quoted_user = quote_plus(self.user)

        #connection_string = f"mongodb://{quote_plus(self.user)}:{quote_plus(self.password)}@{self.host}:{self.port}/{self.db}"
        connection_string = f"mongodb://{quoted_user}:{quote_plus(self.password)}@{location}"
        print(f"mongodb://{quoted_user}:***@{location}")
        return MongoClient(connection_string)

def get_mongo_config():
    """Return the MongoConfig used for the local development database."""
    local_settings = {
        "host": "localhost",
        "port": "27017",
        "user": "root",
        "password": "rootpass",
        "db": "fairscape",
        "rocrate_collection": "rocrate",
    }
    return MongoConfig(**local_settings)

# Build the configuration and open the client shared by the cells below.
mongo_config = get_mongo_config()
mongo_client = mongo_config.CreateClient()
# Uncomment to check that the server is reachable:
# print(mongo_client.server_info())




# Upload RO-Crate zip file to the bucket

In [None]:
def unzip_and_upload(MinioClient, Object):
    """Extract a zip archive and upload each member to ``ROCRATE_BUCKET_NAME``.

    Args:
        MinioClient: client exposing ``put_object(bucket, name, data, length)``.
        Object: path of the zip archive (anything ``zipfile.ZipFile`` accepts).

    Returns:
        A ``JSONResponse`` with status 200 on success, or status 500 carrying
        the exception text if anything fails while reading or uploading.
    """
    try:
        # ZipFile opens the path itself and reads members on demand, so the
        # whole archive no longer has to be loaded into memory first.
        with zipfile.ZipFile(Object, "r") as zip_file:
            for file_info in zip_file.infolist():
                # Directory entries carry no data; uploading them would only
                # create empty placeholder objects.
                if file_info.is_dir():
                    continue
                file_contents = zip_file.read(file_info.filename)
                MinioClient.put_object(
                    ROCRATE_BUCKET_NAME,
                    file_info.filename,
                    io.BytesIO(file_contents),
                    len(file_contents),
                )
    except Exception as e:
        # Boundary handler: report the failure to the caller as a response.
        return JSONResponse(status_code=500,
                            content=f"Exception uploading ROCrate: {str(e)}")
    return JSONResponse(status_code=200, content="Upload successful :)")

# Extract the archive and push its members to MinIO.
upload_status = unzip_and_upload(minio_client, rocrate_zip_file)

# The response body is printed for success and failure alike.
print(upload_status.body)

# Get metadata from uploaded RO-Crate

In [None]:
from pathlib import Path

def get_metadata_from_crate(minio_client, crate_file_name):
    """Return the raw content of the crate's metadata file, or ``None``.

    Lists the objects in ``ROCRATE_BUCKET_NAME`` under the prefix given by the
    stem of the module-level ``rocrate_zip_file`` and returns the content of
    the first object whose name ends with ``crate_file_name``.

    Args:
        minio_client: client exposing ``list_objects`` and ``get_object``.
        crate_file_name: file name suffix to look for.

    Returns:
        The object's content as returned by ``read()``, or ``None`` when no
        object matches.
    """
    rocrate_root_dir = Path(rocrate_zip_file).stem
    objects = minio_client.list_objects(ROCRATE_BUCKET_NAME, prefix=rocrate_root_dir, recursive=True)

    for obj in objects:
        if not obj.object_name.endswith(crate_file_name):
            continue
        response = minio_client.get_object(ROCRATE_BUCKET_NAME, obj.object_name)
        try:
            return response.read()
        finally:
            # Close the response and return its connection to the pool so it
            # is not leaked.
            response.close()
            response.release_conn()

    return None

RO_CRATE_METADATA_FILE_NAME = 'ro-crate-metadata.json'

rocrate_metadata_read = get_metadata_from_crate(minio_client, RO_CRATE_METADATA_FILE_NAME)

# Stop here when the metadata file is missing: parsing a missing result would
# only fail later with an unrelated TypeError from json.loads.
if not rocrate_metadata_read:
    raise FileNotFoundError(f"{RO_CRATE_METADATA_FILE_NAME} not found in ROCrate")

print(rocrate_metadata_read)

# Rebuild the crate model from the stored JSON document.
rocrate = ROCrate(**json.loads(rocrate_metadata_read))
# print(rocrate.guid)                  

# Validate RO-Crate metadata

In [None]:
from pathlib import Path
import pymongo

def validate_rocrate_object_reference(rocrate, MinioClient, MongoClient: pymongo.MongoClient, Object):
    """Check the crate's file references, archive the zip, and store metadata.

    Steps:
      1. Collect the file names referenced by ``contentUrl`` on the Dataset
         and Software entries of ``rocrate.metadataGraph``.
      2. List the objects in ``ROCRATE_BUCKET_NAME`` under the prefix given by
         the stem of the module-level ``rocrate_zip_file``.
      3. If every referenced name is present, upload the zip at ``Object`` to
         the "test" bucket; otherwise print the missing names.
      4. Insert the crate metadata, extended with a ``distribution`` section,
         into the rocrate collection named by the module-level ``mongo_config``.

    Args:
        rocrate: crate model exposing ``guid``, ``name``, ``metadataGraph``
            and ``model_dump``.
        MinioClient: client exposing ``list_objects`` and ``put_object``.
        MongoClient: pymongo client used for the metadata insert.
        Object: path of the crate zip archive.

    Returns:
        None. Failures in steps 2-3 are printed, not raised, and the metadata
        is inserted regardless.

    Raises:
        ValueError: if ``rocrate.guid`` does not contain exactly one "/".
    """
    # guid is expected to look like "<prefix>/<id>"; only the id is used.
    # prefix, org, proj, creative_work_id = crate.guid.split("/")
    _prefix, creative_work_id = rocrate.guid.split("/")

    archived_object_path = f"{creative_work_id}/{rocrate.name}"

    # File names referenced by Dataset and Software entries. Entries without a
    # contentUrl reference no file and are skipped.
    objects_in_metadata = {
        Path(element.contentUrl).name
        for element in rocrate.metadataGraph
        if element.additionalType in ("Dataset", "Software") and element.contentUrl
    }

    # Bound before the try block so the metadata insert below still works
    # when listing the bucket fails.
    object_paths_in_crate = []

    try:
        rocrate_root_dir = Path(rocrate_zip_file).stem
        object_paths_in_crate = [
            obj_instance.object_name
            for obj_instance in MinioClient.list_objects(
                ROCRATE_BUCKET_NAME, prefix=rocrate_root_dir, recursive=True
            )
        ]
        objects_in_crate = {Path(obj).name for obj in object_paths_in_crate}

        missing_objects = objects_in_metadata - objects_in_crate
        if not missing_objects:
            # All references resolved: archive the original zip. length=-1
            # with a part_size lets the client stream the file in parts.
            with open(Object, "rb") as zip_object:
                MinioClient.put_object(
                    bucket_name="test",
                    object_name=f"{archived_object_path}.zip",
                    data=zip_object,
                    length=-1,
                    part_size=5 * 1024 * 1024,
                    content_type="application/zip"
                )
        else:
            print("missing objects: ", missing_objects)

    except Exception as e:
        print(f"exception validating objects in ROCrate: {str(e)}")

    # Use the client that was passed in, not a module-level one.
    mongo_db = MongoClient[mongo_config.db]
    rocrate_collection = mongo_db[mongo_config.rocrate_collection]

    data = rocrate.model_dump(by_alias=True)
    data["distribution"] = {
        "extractedROCrateBucket": ROCRATE_BUCKET_NAME,
        "archivedROCrateBucket": "test",
        "extractedObjectPath": object_paths_in_crate,
        "archivedObjectPath": f"{archived_object_path}.zip"
    }

    rocrate_collection.insert_one(data)

# Validate, archive and store the crate read back from MinIO.
# NOTE(review): the function has no return statement, so this name is bound
# to None.
metadata_validation_status = validate_rocrate_object_reference(rocrate, minio_client, mongo_client, rocrate_zip_file)

# In a nutshell: Upload RO-Crate archive to the object store and validate 

In [88]:
import requests
import json
from requests_toolbelt.multipart.encoder import MultipartEncoder

root_url = "http://localhost:8080/"

# Upload the archive built earlier (rocrate_zip_file) rather than a
# machine-specific absolute path. The file stays open only for the duration
# of the request, because the encoder streams from the open handle.
with open(rocrate_zip_file, 'rb') as crate_archive:
    mp_encoder = MultipartEncoder(
        fields={
            # 'file' should be same as the argument in the POST method
            'file': ('test-rocrate', crate_archive, 'application/zip')
        }
    )

    # upload a rocrate to minio object store
    rocrate_transfer = requests.post(
        root_url + "rocrate/upload",
        data=mp_encoder,
        # The MultipartEncoder provides the content-type header with the boundary:
        headers={'Content-Type': mp_encoder.content_type}
    )

# Surface HTTP errors before trying to decode the body as JSON.
rocrate_transfer.raise_for_status()
rocrate_transfer.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

# Download archived ROCrate as a zip file using the API

In [85]:
import requests

root_url = "http://localhost:8080/"

# Request the archived crate for a fixed example identifier.
rocrate_download = requests.get(
    root_url + "rocrate/archived/download/ark:59852/rocrate-Exampleinputdataset-61d58dab2b",
    headers={"Content-Type": "application/zip"},
)

# Fail loudly on an HTTP error status.
rocrate_download.raise_for_status()

# Store the response body in the current working directory.
with open("downloaded-rocrate.zip", "wb") as file:
    file.write(rocrate_download.content)


# Download extracted ROCrate as a zip file using the API

In [None]:
import requests

root_url = "http://localhost:8080/"

# Request the extracted crate for a fixed example identifier.
rocrate_download = requests.get(
    root_url + "rocrate/extracted/download/ark:59852/rocrate-Exampleinputdataset-61d58dab2b",
    headers={"Content-Type": "application/zip"},
)

# Fail loudly on an HTTP error status.
rocrate_download.raise_for_status()

# Store the response body in the current working directory.
# NOTE(review): this is the same output file name the archived-download cell
# writes to, so running both overwrites the first result.
with open("downloaded-rocrate.zip", "wb") as file:
    file.write(rocrate_download.content)

In [None]:
import os

In [None]:
# Change the working directory so the relative ./tests/... paths used below
# resolve from here.
# NOTE(review): the path looks specific to a Docker dev environment — confirm.
os.chdir("/com.docker.devenvironments.code/")

In [None]:
import json

# Serialize the crate (aliased keys, 2-space indent) into the metadata file.
with open("./tests/data/test-crate/ro-crate-metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(test_crate.model_dump(by_alias=True), indent=2))

In [None]:
# Show the graph entry at index 4 (test_software in the list passed to ROCrate).
test_crate.metadataGraph[4]

In [None]:
# Dump the graph entry at index 2 (test_output) with aliased keys.
test_crate.metadataGraph[2].model_dump(by_alias=True)

In [None]:
# Work on a plain dict copy of the first graph entry.
dataset_one_json = test_crate.metadataGraph[0].model_dump(by_alias=True)
dataset_one_json

In [None]:
# Add inverse links by hand; this changes only the dict, not the model.
dataset_one_json['usedBy'] = [test_computation.guid]
dataset_one_json['isPartOf'] = [test_dataset_container.guid]

In [None]:
dataset_one_json

In [None]:
# The model entry itself, for comparison with the edited dict.
test_crate.metadataGraph[0]

In [None]:
# Computed identifier of the crate.
test_crate.guid

In [None]:
# Full crate as a JSON string with aliased keys.
test_crate.model_dump_json(by_alias=True)

## Entailment properties

### Inverse Entailment
- Filter all computations
    - usedDataset
    - usedSoftware
    - generated

- For each of these lists, filter for those guids and add inverse property

### Transitivity for Dataset Containers

If a computation used a dataset container, all elements are usedBy that computation

- Leave a TODO for the reverse direction: if all elements are usedBy a computation, that entails the set is usedBy the computation

### Transitivity for ROCrate

Using an ROCrate means only the tail of a provenance chain is 

In [None]:
# Alias for the example crate; no copy is made.
passed_crate = test_crate

In [None]:
# Select the graph entries whose additionalType is "Computation".
computations = list(filter(lambda x: x.additionalType == "Computation", passed_crate.metadataGraph))

# Entries in the filtered list can be modified in place.
computations[0].name = "my software"
computations

passed_crate.metadataGraph

In [None]:
# If an element is taken from the filtered list and one of its properties is
# changed, the change is visible everywhere that element is referenced
# (per the original note — confirm by inspecting passed_crate.metadataGraph).
passed_computation = computations[0]
passed_computation.name = "test computation"
passed_computation.name

In [None]:
# The three lists of guids that drive the inverse entailment.
used_datasets = passed_computation.usedDataset
used_software = passed_computation.usedSoftware
generated_datasets = passed_computation.generated 

In [None]:
def entailment(passed_crate):
    """Add inverse provenance links for every Computation in the crate.

    For each graph element whose ``additionalType`` is "Computation":
      * every element referenced by ``usedDataset`` gets the computation's
        guid appended to ``usedBy``;
      * every element referenced by ``usedSoftware`` gets it appended to
        ``usedByComputation`` (or ``usedBy`` when that attribute is absent);
      * every element referenced by ``generated`` gets it appended to
        ``generatedBy``.

    References may be a single guid string, a list of guid strings, or a list
    of objects exposing ``guid``. Elements are modified in place, guids with
    no matching element are ignored, and a guid is never added twice, so the
    function can be run repeatedly.

    Args:
        passed_crate: object exposing ``metadataGraph``, an iterable of
            elements with ``guid`` and ``additionalType`` attributes.

    Returns:
        None.
    """
    # TODO: transitivity for dataset containers (a computation that used a
    # container also used each of its parts) is not implemented here.

    def _as_guid_list(references):
        # Normalize None / single string / list of strings-or-objects.
        if not references:
            return []
        if isinstance(references, str):
            return [references]
        return [ref if isinstance(ref, str) else ref.guid for ref in references]

    # Index the graph once instead of scanning it for every reference.
    elements_by_guid = {}
    for element in passed_crate.metadataGraph:
        elements_by_guid.setdefault(element.guid, []).append(element)

    def _add_inverse(target_guid, property_names, computation_guid):
        # Record computation_guid on the first of property_names that each
        # matching element actually has.
        for element in elements_by_guid.get(target_guid, []):
            for property_name in property_names:
                if not hasattr(element, property_name):
                    continue
                current = getattr(element, property_name)
                if current is None:
                    setattr(element, property_name, [computation_guid])
                elif computation_guid not in current:
                    current.append(computation_guid)
                break

    # (computation attribute, inverse attribute candidates on the target)
    inverse_rules = (
        ("usedDataset", ("usedBy",)),
        ("usedSoftware", ("usedByComputation", "usedBy")),
        ("generated", ("generatedBy",)),
    )

    for computation_element in passed_crate.metadataGraph:
        if getattr(computation_element, "additionalType", None) != "Computation":
            continue
        computation_guid = computation_element.guid
        for source_property, inverse_properties in inverse_rules:
            references = getattr(computation_element, source_property, None)
            for target_guid in _as_guid_list(references):
                _add_inverse(target_guid, inverse_properties, computation_guid)
        
