In [1]:
# import rocrate models
import os
import sys

#sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../')))
#os.path.join(os.path.dirname(__file__), '../')
# Absolute path of the local mds_python checkout's src/ directory.
# NOTE(review): this is a machine-specific Windows path; on any other machine
# it has to be edited before the fairscape_mds imports in later cells resolve.
srcPath = os.path.abspath('C:\\Users\\Max\\Documents\\GitHub\\mds_python\\src\\' )

# Put srcPath at the front of sys.path exactly once, so re-running this cell
# does not keep stacking duplicate entries.
if srcPath in sys.path:
	sys.path.remove(srcPath)
sys.path.insert(0, srcPath)

In [None]:
import pathlib
from fairscape_mds.config import create_fairscape_config
from dotenv import load_dotenv
import os

# Load ../local.env, then build the fairscape configuration from the process
# environment.
load_dotenv("../local.env")
fairscapeConfig = create_fairscape_config(os.environ)

# LDAP lookup is disabled in this notebook; kept for reference.
#ldapConnection = fairscapeConfig.ldap.connectAdmin()
#currentUserLDAP = getUserByCN(ldapConnection, userCN)
#ldapConnection.unbind()

# Clients created from the minio and mongo sections of the configuration.
minioClient = fairscapeConfig.minio.CreateClient()
mongoClient = fairscapeConfig.mongo.CreateClient()

# Handles to the configured database and the three collections named in the config.
mongoDB = mongoClient[fairscapeConfig.mongo.db]
asyncCollection, identifierCollection, rocrateCollection = (
	mongoDB[collectionName]
	for collectionName in (
		fairscapeConfig.mongo.async_collection,
		fairscapeConfig.mongo.identifier_collection,
		fairscapeConfig.mongo.rocrate_collection,
	)
)

In [3]:
# pydantic model for Distributions
from pydantic import BaseModel, Field
from typing import Union, Optional, List
from fairscape_mds.models.rocrate import (
	ROCrateV1_2,
	ROCrateDataset,
	ROCrateSoftware,
	ROCrateComputation,
	ROCrateMetadataElem
)
from fairscape_mds.models.fairscape_base import (
	IdentifierValue
)

import timeit

class FairscapeDataDistribution(BaseModel):
	"""Location of a stored object: a distribution type, a bucket, and a path in it."""
	# kind of storage holding the object; defaults to 'minio'
	distributionType: str = Field(default="minio")
	# path (object name) of the stored object
	objectPath: str
	# bucket the object is stored in
	objectBucket: str


class MongoDocument(BaseModel):
	"""Document model pairing an ROCrate's metadata with its stored distribution.

	This cell used to define MongoDocument twice in a row. Python binds the
	name to the later definition only, so that one is kept here and runtime
	behavior is unchanged. The shadowed variant typed `metadata` as
	Union[ROCrateV1_2, ROCrateDataset, ROCrateSoftware, ROCrateComputation]
	and `distribution` as Optional[List[FairscapeDataDistribution]].

	A stray line (`"class": algorithms.Blowfish,`) that followed the class was
	also removed: it is not valid Python and made the whole cell fail to parse.
	"""
	# identifier of the document; read from / written to the "@id" key
	guid: str = Field(alias="@id")
	metadataType: str
	owner: str
	metadata: ROCrateV1_2
	distribution: FairscapeDataDistribution


In [4]:
import json
import pathlib
import shutil
import time
import urllib.parse
import uuid
import zipfile

import minio

In [5]:
def uploadLocalFileMinio(
	minioClient: minio.Minio, 
	bucketName: str, 
	objectName: str, 
	filePath: str, 
	metadata: dict= None):
	"""Upload one local file through the given minio client.

	Args:
		minioClient: client whose `fput_object` performs the upload.
		bucketName: destination bucket.
		objectName: name the object is stored under in the bucket.
		filePath: path of the local file to send.
		metadata: optional metadata passed straight through to `fput_object`.

	Returns:
		Whatever `minioClient.fput_object` returns.
	"""
	putArguments = {
		"bucket_name": bucketName,
		"object_name": objectName,
		"file_path": filePath,
		"metadata": metadata,
	}
	return minioClient.fput_object(**putArguments)

In [6]:
# given rocrate
# Raw string literal: this Windows path contains backslash sequences (\W, \D, \c)
# that are not valid escapes and trigger invalid-escape warnings in a normal
# string literal. The resulting path value is unchanged.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
rocrateFolder = pathlib.Path(r"D:\Work\Data\Dataverse-Uploaded\crates_format_1.2")

testROCrate = rocrateFolder / "1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha.zip"

## Mocking the Transaction

1. Create a transaction folder
2. Copy the zipped crate into the transaction folder
3. Unzip its contents inside the transaction folder
4. Process the metadata
5. Upload the extracted datasets
6. Replace ro-crate-metadata.json
7. Zip up and upload the archive

In [7]:
# Mock the common name (CN) of the user performing the upload; it is used
# further down in the object upload path and as the "owner" upload metadata.
userCN = 'mal8ch'

In [8]:
# Each mock transaction is identified by a fresh random UUID, kept as a string.
transactionID = f"{uuid.uuid4()}"

# Jobs live under jobs/, with one sub-folder per transaction.
jobsFolder = pathlib.Path("jobs/")
transactionFolder = jobsFolder / transactionID

# Create the jobs folder first, then the transaction folder inside it;
# folders that already exist are left as they are.
for _folder in (jobsFolder, transactionFolder):
	_folder.mkdir(exist_ok=True)

In [9]:
# Folder inside the transaction folder that the zipped crate is extracted into.
extractFolder = transactionFolder / 'extracted'

In [10]:
# Mock the copy into the transaction directory: the archive keeps its original
# file name and lands directly inside this transaction's folder.
zippedCratePath = transactionFolder.joinpath(testROCrate.name)

shutil.copyfile(src=str(testROCrate), dst=str(zippedCratePath))


'jobs\\91e12cb3-f796-4203-ad40-f490f7b3006c\\1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha.zip'

In [11]:
# Archive file name without its final extension; used later as the name of the
# folder expected inside the extracted archive.
# `.stem` removes exactly one trailing suffix. str.strip(suffix) instead strips
# any of the characters '.', 'z', 'i', 'p' from BOTH ends of the name
# (e.g. "pizza.zip" -> "a"), which only worked for this file name by accident.
crateFolderName = zippedCratePath.stem

In [12]:
# unzip the archive
# Both the raw file handle and the ZipFile wrapper are context-managed, so each
# is closed when extraction finishes or fails (the ZipFile was previously never closed).
with zippedCratePath.open('rb') as zippedCrateFileObj, zipfile.ZipFile(zippedCrateFileObj) as zipCrate:
	zipCrate.extractall(path=str(extractFolder))

In [13]:
# extract the metadata
# Assumes the archive unpacks into a single folder named after the zip file.
extractedContentsFolder = extractFolder / crateFolderName
metadataFilePath = extractedContentsFolder / 'ro-crate-metadata.json'

# Read explicitly as UTF-8. Without an encoding, open() falls back to the
# platform default (e.g. cp1252 on Windows), which can mis-decode or fail on
# non-ASCII characters in the metadata.
with metadataFilePath.open("r", encoding="utf-8") as crateMetadataFileObj:
	crateMetadataJSON = json.load(crateMetadataFileObj)

# parse metadata into pydantic model
fullCrateModel = ROCrateV1_2.model_validate(crateMetadataJSON)

In [None]:
# add schema models to ROCrateV1_2 pydantic model metadataGraph Union
# also look for schemas within the ROCrate before looking for Schemas in minio

In [14]:
def cleanGUID(metadata):
	"""Reduce a URL-form identifier on `metadata` to its bare path, in place.

	If `metadata.guid` is an http(s) URL it is replaced by the URL's path with
	leading slashes removed, e.g.
	"https://fairscape.net/ark:59852/rocrate-x" -> "ark:59852/rocrate-x".
	Any other guid is left untouched.

	The check is on the URL prefix rather than on the substring "http"
	anywhere in the guid: a bare identifier that merely contains "http"
	(e.g. "ark:59852/dataset-http-logs") would otherwise be parsed as a URL
	with scheme "ark" and lose its "ark:" prefix.

	Args:
		metadata: any object with a readable and writable `guid` attribute.

	Returns:
		None. The object is modified in place.
	"""
	guid = metadata.guid
	if not isinstance(guid, str):
		# nothing to normalize (e.g. guid is None)
		return

	if not guid.lower().startswith(("http://", "https://")):
		return

	metadata.guid = urllib.parse.urlparse(guid).path.lstrip('/')

def cleanIdentifiers(crate: ROCrateV1_2):
	"""Run `cleanGUID` over every identifier held by the crate.

	Covers the crate's own metadata element, each EVI element, and the
	references each element holds to related elements:

	- ROCrateDataset: usedByComputation, generatedBy
	- ROCrateSoftware: usedByComputation
	- ROCrateComputation: usedDataset, generated, usedSoftware

	Elements are modified in place; nothing is returned.
	"""
	# relation attributes to walk for each element type, in processing order
	relationsByType = (
		(ROCrateDataset, ("usedByComputation", "generatedBy")),
		(ROCrateSoftware, ("usedByComputation",)),
		(ROCrateComputation, ("usedDataset", "generated", "usedSoftware")),
	)

	# clean the ROCrate metadata element's own identifier
	cleanGUID(crate.getCrateMetadata())

	for eviElem in crate.getEVIElements():
		cleanGUID(eviElem)

		for elemType, relationNames in relationsByType:
			if not isinstance(eviElem, elemType):
				continue
			for relationName in relationNames:
				for reference in getattr(eviElem, relationName):
					cleanGUID(reference)


def processMetadataIsPartOf(crate: ROCrateV1_2):
	"""Link the crate's root element and its EVI elements in both directions.

	Sets `hasPart` on the crate metadata element to one identifier per EVI
	element, and sets `isPartOf` on every EVI element to the root element's
	identifier. The crate is modified in place; nothing is returned.

	Args:
		crate: the crate whose elements are linked.
	"""
	# materialize once: the elements are iterated twice below
	eviElements = list(crate.getEVIElements())

	# Use the `crate` argument. The previous code read a global `crateModel`,
	# which no visible cell defines: NameError on a fresh kernel, or silently
	# the wrong crate if such a variable lingers in the kernel.
	rocrateRootElem = crate.getCrateMetadata()

	rocrateRootElem.hasPart = [
		IdentifierValue.model_validate({"@id": elem.guid}) for elem in eviElements
	]

	# set inverse relation
	elemIsPartOf = IdentifierValue.model_validate({"@id": rocrateRootElem.guid})
	for elem in eviElements:
		elem.isPartOf = elemIsPartOf

def processMetadata(crate: ROCrateV1_2):
	"""Placeholder for crate metadata processing; currently does nothing."""
	return None

In [19]:
# Serialize the parsed crate to a JSON string, keyed by field aliases
# (by_alias=True) such as "@id" and "@graph".
fullCrateModel.model_dump_json(by_alias=True)

'{"@context":{"EVI":"https://w3id.org/EVI#","@vocab":"https://schema.org/"},"@graph":[{"@id":"ro-crate-metadata.json","@type":"CreativeWork","conformsTo":{"@id":"https://w3id.org/ro/crate/1.2-DRAFT"},"about":{"@id":"https://fairscape.net/ark:59852/rocrate-1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha"}},{"@id":"ark:59852/rocrate-1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha","@type":["Dataset","https://w3id.org/EVI#ROCrate"],"name":"Initial integration run","keywords":["Ideker Lab","CM4AI","0.1 alpha","MDA-MB-468","untreated","chromatin","Initial integration run","AP-MS edgelist download"],"isPartOf":[{"@id":"ark:/Ideker_Lab"},{"@id":"ark:/Ideker_Lab/CM4AI"}],"version":"0.5alpha","license":"https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en","associatedPublication":"Clark T, Schaffer L, Obernier K, Al Manir S, Churas CP, Dailamy A, Doctor Y, Forget A, Hansen JN, Hu M, Lenkiewicz J, Levinson MA, Marquez C, Mohan J, Nourreddine S, Nies

In [21]:
# Dump the parsed crate as a plain Python dict, keyed by field aliases
# (by_alias=True) such as "@id" and "@graph".
fullCrateModel.model_dump(by_alias=True)

{'@context': {'EVI': 'https://w3id.org/EVI#', '@vocab': 'https://schema.org/'},
 '@graph': [{'@id': 'ro-crate-metadata.json',
   '@type': 'CreativeWork',
   'conformsTo': {'@id': 'https://w3id.org/ro/crate/1.2-DRAFT'},
   'about': {'@id': 'https://fairscape.net/ark:59852/rocrate-1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha'}},
  {'@id': 'ark:59852/rocrate-1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha',
   '@type': ['Dataset', 'https://w3id.org/EVI#ROCrate'],
   'name': 'Initial integration run',
   'keywords': ['Ideker Lab',
    'CM4AI',
    '0.1 alpha',
    'MDA-MB-468',
    'untreated',
    'chromatin',
    'Initial integration run',
    'AP-MS edgelist download'],
   'isPartOf': [{'@id': 'ark:/Ideker_Lab'}, {'@id': 'ark:/Ideker_Lab/CM4AI'}],
   'version': '0.5alpha',
   'license': 'https://creativecommons.org/licenses/by-nc-sa/4.0/deed.en',
   'associatedPublication': 'Clark T, Schaffer L, Obernier K, Al Manir S, Churas CP, Dailamy A, D

In [84]:
# for datasetElem in datasetList:

# NOTE(review): `datasetList` is not defined in any cell visible here; it has
# to be created by an earlier cell before this one runs.
datasetElem = datasetList[0]
datasetContentUrl = datasetElem.contentUrl

# Resolve the local file behind the dataset, if it has one. Both names stay
# None for remote (http/https) or unrecognised contentUrls, so nothing below
# runs on undefined or stale values left over from a previous execution.
sourcePath = None
datasetFilePath = None
fileScheme = 'file:'
if isinstance(datasetContentUrl, str) and datasetContentUrl.startswith(fileScheme):
	# Drop the exact "file:" prefix and the slashes that follow it.
	# str.lstrip('file:///') strips a character SET, so it would also eat the
	# start of paths such as "file:///files/a.csv" (-> "s/a.csv").
	sourcePath = pathlib.Path(datasetContentUrl[len(fileScheme):].lstrip('/'))
	# the file is looked up by name directly inside the extracted crate folder
	datasetFilePath = extractedContentsFolder / sourcePath.name

if datasetFilePath is None:
	print(f"no local file to upload for {datasetElem.guid}: contentUrl={datasetContentUrl!r}")
else:
	# check that path exists in the crate
	if not datasetFilePath.exists():
		raise FileNotFoundError(
			f"dataset {datasetElem.guid} points to {datasetFilePath}, which is not in the extracted crate"
		)

	uploadPath = pathlib.PurePosixPath(fairscapeConfig.minio.default_bucket_path) / userCN / 'datasets' / sourcePath

	datasetUploadStart = time.time()
	# upload the file
	uploadResult = uploadLocalFileMinio(
		minioClient=minioClient,
		bucketName=fairscapeConfig.minio.default_bucket,
		objectName=str(uploadPath),
		filePath=str(datasetFilePath),
		metadata={"guid": datasetElem.guid, "owner": userCN}
	)
	datasetUploadEnd = time.time()

	# seconds spent in the upload call
	uploadDuration = datasetUploadEnd - datasetUploadStart
	print(uploadDuration)

	# create a distribution for the dataset
	datasetDistribution = FairscapeDataDistribution(
		distributionType = 'minio',
		objectPath=str(uploadPath),
		objectBucket=fairscapeConfig.minio.default_bucket
	)

	# set contentUrl in metadata to fairscape pointer
	datasetElem.contentUrl = f"{fairscapeConfig.url}/dataset/download/{datasetElem.guid}"



0.007513523101806641


In [79]:
# Display the object name used for the upload. uploadPath is built as a
# PurePosixPath, so it renders with forward slashes.
str(pathlib.PurePosixPath(uploadPath))

'default/mal8ch/datasets/1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha/ppi_gene_node_attributes.tsv'

WindowsPath('jobs/625d5589-4956-46a7-bc32-d6571dac285c/extracted/1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha/ppi_gene_node_attributes.tsv')

True

In [None]:
# mint datasets