In [2]:
import pathlib
import zipfile
import io
import json

from pymongo.collection import Collection
import pymongo
from urllib.parse import  quote_plus
from minio.commonconfig import Tags
from minio.versioningconfig import VersioningConfig, ENABLED
from minio import Minio, S3Error
import hashlib

from pydantic import BaseModel

In [85]:
import sys
!{sys.executable} -m pip install rdflib

#sqid

Collecting rdflib
  Obtaining dependency information for rdflib from https://files.pythonhosted.org/packages/d4/b0/7b7d8b5b0d01f1a0b12cc2e5038a868ef3a15825731b8a0d776cf47566c0/rdflib-7.0.0-py3-none-any.whl.metadata
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Obtaining dependency information for isodate<0.7.0,>=0.6.0 from https://files.pythonhosted.org/packages/b6/85/7882d311924cbcfc70b1890780763e36ff0b140c7e51c110fc59a532f087/isodate-0.6.1-py2.py3-none-any.whl.metadata
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
   ---------------------------------------- 0.0/531.9 kB ? eta -:--:--
   ------- -------------------------------- 102.4/531.9 kB 3.0 MB/s eta 0:00:01
   ------------------------------ --------- 399.4/531.9 kB 5.0 MB/s eta 0:00:01
   ---------------------------------------- 531.9/531.9 kB 5.5 MB/s eta 0:00:00
Downloading isodate-0.6.1-py2.p

## Setup Databases

In [59]:
# iterating etag
def get_etag(filePath: pathlib.Path, partsize = 10*1024*1024):
    md5_digests = []
    with filePath.open('rb') as file:
      for chunk in iter(lambda: file.read(partsize), b''):
        md5_digests.append(hashlib.md5(chunk).digest())
    
    return hashlib.md5(b''.join(md5_digests)).hexdigest() + '-' + str(len(md5_digests))

In [3]:
user = "mongotestaccess"
password = "mongotestsecret"
host = "localhost"
port = "27017"
mongo_database = "fairscape"
local_mongo_connection_string = f"mongodb://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}"

In [4]:
# setup mongo client
mongoClient = pymongo.MongoClient(local_mongo_connection_string)
mongoDB = mongoClient['fairscape']
identifierCollection = mongoDB['mds']
rocrateCollection = mongoDB['rocrate']
asyncCollection = mongoDB['async']

In [5]:
# setup minio client
minioClient = Minio(
	endpoint = "localhost:9000", 
	access_key = "miniotestadmin", 
	secret_key = "miniotestsecret",
	secure = False,
)

In [91]:
# create a bucket for testing and set object versioning
if not minioClient.bucket_exists('default'):
	minioClient.make_bucket('default')
	minioClient.set_bucket_versioning("default", VersioningConfig(ENABLED))

### Minio Example set bucket tags

```
from minio.commonconfig import Tags

tags = Tags.new_bucket_tags()
tags["Project"] = "Project One"
tags["User"] = "jsmith"
minioClient.set_bucket_tags("my-bucket", tags)
```

## Upload Job Refactoring for Prov

In [6]:
from pydantic import BaseModel, Field, field_serializer
import datetime
import uuid

In [7]:
class UploadJobMetadata(BaseModel):
  #userCN: str = Field(description="LDAP CN of the user who uploaded this ROCrate")
  jobID: uuid.UUID = Field(
    default_factory= uuid.uuid4, 
    description="a unique identifier given to this upload attempt"
    )
  zippedCratePath: str = Field(
    description="the path in minio where the zipped crate is deposited"
    )
  crateGUID: str | None = Field(
    default=None,
    description="the guid from the extracted ROCrate"
    )
  md5Hash: str | None = Field(
    default=None,
    description="md5 checksum of the zipped ROCrate"
  )
  success: bool | None = Field(
    default=False,  
    description="boolean indicating that the upload job has completed successfully"
    )
  completed: bool | None = Field(
    default=False, 
    description="boolean that indicates that the upload job has finished"
    )
  errors: list[str] = Field(
    default=[], 
    description="errors that occured during upload job"
    )
  identifiersMinted: list[str] = Field(
    default=[], 
    description="list of all minted identifiers from the uploaded ROCrate"
    )
  processedFiles: list[str] = Field(
    default=[], 
    description="list of all extracted file contents from the uploaded ROCrate"
    )
  timeStarted: datetime.datetime | None = Field(
    default_factory=datetime.datetime.now, 
    description="The time when the upload job began"
    )
  timeFinished: datetime.datetime | None = Field(
    default=None, 
    description="The time when the upload job completed"
    )
  log: list[str] | None = Field(
    default=[], 
    description="Logs for Upload Job Processing"
    )

  @field_serializer('jobID', when_used='always')
  def serialize_job_id(self, jobID: uuid.UUID):
    return str(jobID)

  @field_serializer('timeStarted', when_used='always')
  def serialize_time_started(self, timeStarted: datetime.datetime):
    return timeStarted.isoformat()


  @field_serializer('timeFinished', when_used='always')
  def serialize_time_finished(self, timeFinished: datetime.datetime):
    if timeFinished:
      return timeFinished.isoformat()
    else:
      return None

In [8]:
test_upload_job = UploadJobMetadata(zippedCratePath="/hello/world")
test_upload_job.model_dump()

{'jobID': '5b581872-dafd-4d77-87e3-8c1891ff7a43',
 'zippedCratePath': '/hello/world',
 'crateGUID': None,
 'md5Hash': None,
 'success': False,
 'completed': False,
 'errors': [],
 'identifiersMinted': [],
 'processedFiles': [],
 'timeStarted': '2024-09-26T15:07:35.466666',
 'timeFinished': None,
 'log': []}

In [None]:
def createUploadJob():
	pass

In [9]:
class CrateUpload():
	def __init__(
		self, 
		crate: pathlib.Path, 
		minioClient: Minio, 
		asyncCollection: Collection
		):
		self.crate = crate
		self.userDN = "cn=testUser"
		self.groupDN = "ou=testGroup"

		self.metadata = {}

		self.minioClient = minioClient
		self.asyncCollection = asyncCollection

		self.jobID = None
		self.crateHash = None
		self.zippedCrate = None
		self.partSize = 10*1024*1024
		self.bucketName = "default"
		self.objectName = f'test/{self.crate.name}'
		self.objectTags = Tags(for_object=True)
		self.objectTags["User"] = "test"

	def checkCrateUnique(self)->bool:
		""" Check that the crate to be uploaded is unique
		"""

		# check that the object name doesn't have a conflict
		# if there is no existing key an exception is triggered
		try:
			statResult = self.minioClient.stat_object(
				bucket_name = self.bucketName, 
				object_name = self.objectName
			)
		except S3Error as e:
			if e.code == "NoSuchKey":
				return True
			else:
				# TODO specific exception
				raise Exception(f"minio error {str(e)}")

		# get the crateHash
		localHash = self.getCrateHash()

		if statResult.etag == localHash:
			return False
		else:
			return True


	def getCrateHash(self)-> str:
		""" Get the md5 Hash of the Local File
		"""
		with self.crate.open('rb') as file:
			fileContents = file.read()
			contentHash = hashlib.md5(fileContents).hexdigest()

		self.crateHash = contentHash
		return contentHash


	def updateUploadRecord(self, updateMetadata):
		""" Update the record in mongo for this operation
		"""
		updateResult = self.asyncCollection.update_one({"jobID": str(self.jobID)}, updateMetadata)
		return updateResult

	def createUploadRecord(self):
		""" Create a record in mongo for this operation
		"""
		jobMetadata = UploadJobMetadata(
			zippedCratePath=self.objectName,
			md5Hash=self.crateHash
			)

		# store the generated job ID
		self.jobID = jobMetadata.jobID

		insertMetadata = jobMetadata.model_dump()
		insertResult = self.asyncCollection.insert_one(insertMetadata)

		if insertResult.inserted_id:
			return True
		else:
			return False

	def upload(self)->bool:
		"""Upload the ROCrate to Minio"""
		
		# check that the uploaded ROCrate is unique	
		if not self.checkCrateUnique():
			raise Exception('Crate isnt unique')

		uploadRecordResult = self.createUploadRecord()

		# open file
		with self.crate.open('rb') as crateFile:
			crateData= crateFile.read()
			crateBytes = io.BytesIO(crateData)

		# TODO set user for real example

		uploadResult = self.minioClient.put_object(
			bucket_name=self.bucketName,	
			object_name=self.objectName,
			data=crateBytes,
			length= -1,
			part_size=self.partSize,
			content_type='application/zip'
			)

		# log upload job
		logMessage = f"UploadSuccess created {uploadResult.object_name} object; etag: {uploadResult.etag}, version-id: {uploadResult.version_id}"
		self.updateUploadRecord({"$set": {"md5Hash": self.crateHash, "status": ""}, "$push": {"log": logMessage}})
		return True

	def readZip(self):
		"""" Read the Zipped ROCrate from Minio """
		try:
			response = self.minioClient.get_object(
				bucket_name= self.bucketName,
				object_name = self.objectName
			)
			zippedObjectContent = response.read()
		finally:
			response.close()
			response.release_conn()

		self.zippedCrate = zippedObjectContent
		return zippedObjectContent

	def readMetadataLocal(self):

		with self.crate.open('rb') as crateFile:
			zipContents = io.BytesIO(crateFile.read())

			with zipfile.ZipFile(zipContents, "r") as crateZip:      
				crateInfoList = crateZip.infolist()
			
				# extract the ro-crate-metadata.json from the zipfile
				metadataInfo = list(
					filter(
						lambda info: 'ro-crate-metadata.json' in info.filename, 
						crateInfoList
					)
				)
			
				if len(metadataInfo) != 1:
					raise Exception('ro-crate-metadata.json not found in crate')
				metadataFileInfo = metadataInfo[0]

				# uploaded crate path must be trimmed from all uploaded elements
				# crateParentPath = pathlib.Path(metadataFileInfo.filename).parent

				# extract and upload the content
				roCrateJSONContents = crateZip.read(metadataFileInfo.filename)

		roCrateMetadata = json.loads(roCrateJSONContents)
		self.metadata = roCrateMetadata


	def readMetadata(self):
		""" Extract the ROCrate JSON from the ROCrate"""

		if self.zippedCrate is None:
			self.readZip()

		zipContents = io.BytesIO(self.zippedCrate)
		
		with zipfile.ZipFile(zipContents, "r") as crateZip:      
			crateInfoList = crateZip.infolist()
			# extract the ro-crate-metadata.json from the zipfile
			metadataInfo = list(
				filter(
					lambda info: 'ro-crate-metadata.json' in info.filename, 
					crateInfoList
				)
			)
			
			if len(metadataInfo) != 1:
				raise Exception('ro-crate-metadata.json not found in crate')
			metadataFileInfo = metadataInfo[0]

      # uploaded crate path must be trimmed from all uploaded elements
			# crateParentPath = pathlib.Path(metadataFileInfo.filename).parent

      # extract and upload the content
			roCrateJSONContents = crateZip.read(metadataFileInfo.filename)
        # may have to seek the begining of the file

    # TODO validate ROCrate
		roCrateMetadata = json.loads(roCrateJSONContents)
		self.metadata = roCrateMetadata
		# return roCrateMetadata


	def processMetadata(self):
		""" Preprocess any metadata by normalizing arks, detecting identifier conflicts and reasoning
		"""

		if self.metadata is None:
			self.readMetadata()

		# format arks

		# reason the provenance
		self.reasonProv()
		pass

	def reasonProv(self):
		""" Materialize inverse edges 
		"""
		pass

	def extractCrate(self):
		if self.zippedCrate is None:
			self.readZip()
		
		crateContents = self.zippedCrate
		

		
		pass

In [208]:
testCrateUpload = all_crate_uploads[0]

## Procress Metadata from ROCrate

0. set md5Hash for ROCrate
1. process all arks into `ark:{NAAN}/{postfix}`
2. check for guid conflicts
- if conflict reassign guid in ROCrate and reassign references
3. local reasoning -> inverse edges and relations for all prov elements
4. global reasoning on existing database
- if input Dataset is an ROCrate, getOutputROCrate

In [27]:
# helper
# get the ark from identifier
import re

def parseArk(arkGUID):
    """ Parse the Ark and return a trimmed version"""
    arkMatches = re.search(r'ark:\d{5}/(.*)$', arkGUID)
    if arkMatches is None:
        return arkGUID
    else:
        return arkMatches[0]

def formatArks(crateMetadata):
    """ Given ro crate metadata normalize all Arks and references"""
    crateMetadata['@id'] = parseArk(crateMetadata['@id'])
    for metadataElem in crateMetadata['@graph']:
    # parse the ark
        metadataElem['@id'] = parseArk(metadataElem['@id'])

        elemType = metadataElem['@type']
        match elemType:
            case 'EVI:Software':
                metadataElem['usedByComputation'] = list(map(parseArk, metadataElem['usedByComputation']))

            case 'EVI:Dataset':
                metadataElem['generatedBy'] = list(map(parseArk, metadataElem['generatedBy']))
                metadataElem['usedBy'] = list(map(parseArk, metadataElem['usedBy']))

                if metadataElem.get('schema'):
                    metadataElem['schema'] = parseArk(metadataElem['schema'])

            case 'EVI:Computation':
                metadataElem['usedSoftware'] = list(map(parseArk, metadataElem['usedSoftware']))
                metadataElem['usedDataset'] = list(map(parseArk, metadataElem['usedDataset']))
                metadataElem['generated'] = list(map(parseArk, metadataElem['generated']))


def filterByGUID(crateMetadata, guid: str):
	""" Given a specified GUID """

	matchingCrateElements = list(
		filter(
			lambda elem: elem.get("@id") == guid, 
			crateMetadata['@graph']
			)
		)
	
	if len(matchingCrateElements) == 0:
		raise Exception(f"Crate Element Not Found: {guid}")

	if len(matchingCrateElements) > 1:
		raise Exception(f"Duplicative Identifiers Found: {guid}")

	if len(matchingCrateElements) == 1:
		return matchingCrateElements[0]

In [28]:
def processROCrateMetadata(crate: CrateUpload):

	crateMetadata = crate.metadata

	# set the ROCrate Metadata type
	crateMetadata["@type"] = "EVI:ROCrate"

	# format all arks
	formatArks(crateMetadata)

	# remove the isPartOf for ROCrate
	try:
		del crateMetadata['isPartOf']
	except KeyError:
		pass
	
	# set each member as being part of the ROCrate
	for crateElem in crateMetadata['@graph']:
		crateElem['isPartOf'] = {
			"@id": crateMetadata['@id'], 
			"@type": "EVI:ROCrate",
			"name": crateMetadata['name']
			}

	# set distribution
	setDistribution(
		crateMetadata=crateMetadata,
		minioBucket=crate.bucketName,
		minioObjectPath=crate.objectName
		)

	# set permissions on all documents
	userDN = "cn=testUser"
	groupDN = "ou=testGroup"
	setPermissions(crateMetadata=crateMetadata, userDN=userDN, groupDN=groupDN)

	# local entailment 
	# filter the computation element
	computation = filterComputation(crateMetadata)	
	metadataGraphGUIDS = [ elem.get("@id") for elem in crateMetadata['@graph']]
	computationLocalProv = {
		"@id": computation['@id'],
		"generated": [guidValue for guidValue in computation['generated'] if guidValue in metadataGraphGUIDS],
		"usedDataset": [guidValue for guidValue in computation['usedDataset'] if guidValue in metadataGraphGUIDS],
		"usedSoftware": [guidValue for guidValue in computation['usedSoftware'] if guidValue in metadataGraphGUIDS]
	}

	# modify the metadata records	
	for crateElem in crateMetadata['@graph']:
		elemGUID = crateElem['@id']

		if elemGUID in computationLocalProv['generated']:
			crateElem['generatedBy'] = computationLocalProv['@id']
		if elemGUID in computationLocalProv['usedDataset']:
			crateElem['usedBy'] = computationLocalProv['@id']
		if elemGUID in computationLocalProv['usedSoftware']:
			crateElem['usedBy'] = computationLocalProv['@id']


	return crateMetadata


def setDistribution(crateMetadata, minioBucket: str, minioObjectPath: str):
	""" Set the distribution property on uploaded rocrate """

	crateMetadata['distribution'] = {
		"archivedROCrateBucket": minioBucket,
		"archivedObjectPath": minioObjectPath
	}

def setPermissions(crateMetadata, userDN, groupDN):
	# set default permissions for uploaded crate
	crateMetadata['permissions'] = {
 	 "owner": userDN,
  "group": groupDN
  }

	# set default permissions for all identifiers
	for crateElem in crateMetadata['@graph']:
		# set permissions on all rocrate identifiers	
		crateElem['permissions'] = {
			"owner": userDN,
			"group": groupDN
		}

	pass

In [29]:
def globalProv(crateMetadata):
	''' Find all non local references in ROCrate metadata
	'''	
	
	computation = filterComputation(crateMetadata)
	
	metadataGraphGUIDS = [ elem.get("@id") for elem in crateMetadata['@graph']]
	computationGLobalProv = {
		"@id": computation['@id'],
		"generated": [guidValue for guidValue in computation['generated'] if guidValue in metadataGraphGUIDS],
		"usedDataset": [guidValue for guidValue in computation['usedDataset'] if guidValue in metadataGraphGUIDS],
		"usedSoftware": [guidValue for guidValue in computation['usedSoftware'] if guidValue in metadataGraphGUIDS]
	}
	pass

def localProv(crateMetadata):
	# find the computations inside the graph
	computation = filterComputation(crateMetadata)

	metadataGraphGUIDS = [ elem.get("@id") for elem in crateMetadata['@graph']]
	# allLocal = lambda guidSearch: all([elem in metadataGraphGUIDS for elem in guidSearch])

	computationProv = {
		"@id": computation['@id'],
		"generated": [guidValue for guidValue in computation['generated'] if guidValue in metadataGraphGUIDS],
		"usedDataset": [guidValue for guidValue in computation['usedDataset'] if guidValue in metadataGraphGUIDS],
		"usedSoftware": [guidValue for guidValue in computation['usedSoftware'] if guidValue in metadataGraphGUIDS]
	}
	
	# local inverse properties
	for localGUID in computationProv['generated']:
		matchedElem = list(filter(
			lambda elem: elem.get("@id") == localGUID, 
			crateMetadata['@graph']))

		generatedElem = matchedElem[0]
		print(generatedElem['@id'])
		generatedElem['generatedBy'] = computationProv['@id']

	for guid_value in computationProv['usedDataset']:
		matchedElem = list(filter(
			lambda elem: elem.get("@id") == guid_value, 
			crateMetadata['@graph']
			))

		datasetElem = matchedElem[0]
		
		print(datasetElem['@id'])
		
		datasetElem['usedByComputation'] = computationProv['@id']

	for guid_value in computationProv['usedSoftware']:
		matchedElem = list(
			filter(
			lambda elem: elem.get("@id") == guid_value, 
			crateMetadata['@graph'])
			)

		softwareElem = matchedElem[0]
		softwareElem['usedBy'] = computationProv['@id']

In [30]:
def filterComputation(crateMetadata):
	""" Return all computation elements from an ROCrat Metadata """
	computationList = list(
		filter(
			lambda elem: elem.get("@type")== "EVI:Computation", 
			crateMetadata['@graph']
			)
	)
	
	if len(computationList) == 0:
		raise Exception('Computation Not Found in Crate')

	#if len(computationList) == 1:
	return computationList[0]
	# TODO handle for multiple computations
	#else:
	#	return computationList
	


In [31]:
# insert all identifiers into Identifier Collection
def publishIdentifiers(crateMetadata):

	# Check if @id already exsists
	rocrateFound = rocrateCollection.find_one(
		{"@id": crateMetadata['@id']}
		)

	if rocrateFound:
		raise Exception("ROCrate Exists")

	# for every element in the rocrate model dump json
	insertMetadata = [ elem for elem in crateMetadata.get("@graph", [])]

	# TODO compressed rocrate representation
	compressedROCrate = crateMetadata.copy()
	compressedROCrate['@graph'] = [
		{
			"@id": crateElem['@id'],
			"@type": crateElem['@type'],
			"name": crateElem['name']
	 } for crateElem in crateMetadata['@graph']
	]
	

	# insert rocrate json into identifier collection
	insertMetadata.append(compressedROCrate)

	insertedIdentifiers = [ elem.get("@id") for elem in insertMetadata]

	# insert all identifiers into the identifier collection
	insertResult = identifierCollection.insert_many(insertMetadata)

	if len(insertResult.inserted_ids) != len(insertMetadata):
		raise Exception("ErrorMintingGUID")

	# insert into mongo
	insertResult = rocrateCollection.insert_one(crateMetadata)

	if insertResult.inserted_id is None:
		raise Exception("ErrorMintingGUID")

In [32]:
# processROCrateMetadata(all_crate_uploads[0])


In [15]:
# clear mongo database
rocrateCollection.delete_many({})	
identifierCollection.delete_many({})

<pymongo.results.DeleteResult at 0x237ba2b6fb0>

In [33]:
# all dataverse uploaded crates
allUploadedCrates= list(pathlib.Path("C:\\Users\\Max\\Documents\\Work\\Dataverse-Uploaded\\Dataverse-Uploaded").glob("*.zip"))

In [34]:
# upload all crates
all_crate_uploads = [ 
	CrateUpload(
		minioClient = minioClient,
		asyncCollection=asyncCollection,
		crate=crateElem
		) for crateElem in allUploadedCrates #[0:4]
]

for crate_upload_elem in all_crate_uploads:
	#crate_upload_elem.createUploadRecord()
	#crate_upload_elem.upload()
	crate_upload_elem.readMetadataLocal()

In [35]:
for crateElem in all_crate_uploads:
	print(crateElem.metadata['@id'])

https://fairscape.net/ark:59852/rocrate-1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-1.cm4ai_chromatin_mda-mb-468_untreated_imageloader_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-2.cm4ai_chromatin_mda-mb-468_untreated_apmsembed_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-2.cm4ai_chromatin_mda-mb-468_untreated_imageembedfold1_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-2.cm4ai_chromatin_mda-mb-468_untreated_imageembedfold2_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-3.cm4ai_chromatin_mda-mb-468_untreated_coembedfold1_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-3.cm4ai_chromatin_mda-mb-468_untreated_coembedfold2_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-4.cm4ai_chromatin_mda-mb-468_untreated_hierarchy_initialrun0.1alpha
https://fairscape.net/ark:59852/rocrate-cm4ai_chromatin_mda-mb-468_paclitaxel_ifimage_0.1_alpha
https://fairscape.n

In [36]:
for crateElem in all_crate_uploads:
	crateElem.metadata = processROCrateMetadata(crateElem)

In [37]:
for crateElem in all_crate_uploads:
	publishIdentifiers(crateElem.metadata)

In [164]:
# global provenance
crateMetadata = all_crate_uploads[0].metadata
metadataGraphGUIDS = [ elem['@id'] for elem in crateMetadata['@graph']]
metadataGraphGUIDS.append(crateMetadata['@id'])

### Testing finding provenance

In [115]:
# find all computations 
cursor = identifierCollection.find({"@type": "EVI:Computation"}, 
	projection={
		"_id": False,
		#"@id": True, 
		#"usedDataset": True, 
		#"usedSoftware": True,
		#"generated": True,
		#"isPartOf": True
	})

In [116]:
computations = list(cursor)

print(len(computations))
print(computations)
computationElem = computations[1]
computationElem['usedDataset']

12
[{'@id': 'ark:59852/computation-AP-MS-Loader-58sGTge', '@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'}, 'metadataType': 'https://w3id.org/EVI#Computation', 'url': None, 'name': 'AP-MS Loader', 'keywords': ['Ideker Lab', 'CM4AI', '0.1 alpha', 'MDA-MB-468', 'untreated', 'chromatin', 'Initial integration run', 'AP-MS edgelist download', 'tools', 'cellmaps_ppidownloader', 'gene', 'attributes', 'file', 'computation', 'download'], 'description': 'Ideker Lab CM4AI 0.1 alpha MDA-MB-468 untreated chromatin Initial integration run AP-MS Edgelist run of cellmaps_ppidownloader', 'runBy': 'Chris Churas', 'dateCreated': '2023-08-31', 'associatedPublication': 'Clark T, Schaffer L, Obernier K, Al Manir S, Churas CP, Dailamy A, Doctor Y, Forget A, Hansen JN, Hu M, Lenkiewicz J, Levinson MA, Marquez C, Mohan J, Nourreddine S, Niestroy J, Pratt D, Qian G, Thaker S, Belisle-Pipon J-C, Brandt C, Chen J, Ding Y, Fodeh S, Krogan N, Lundberg E, Mali P, Payne-Foster P, Ratcliff

['ark:59852/dataset-Simulated-unique-file-9LUEb5Y',
 'ark:59852/dataset-Simulated-Samples-file-9LUEb5Y']

## RDFLib Prov Graph

In [126]:
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import RDF
import rdflib
import json

In [130]:
help(rdflib.Namespace)
EVI = rdflib.Namespace("https://w3id.org/EVI#")
SCHEMA = rdflib.Namespace("https://schema.org/")
# EVI Classes
#EVIComputation = URIRef("https://w3id.org/EVI#Computation")
# EVIDataset = URIRef("https://w3id.org/EVI#Dataset")
#EVISoftware = URIRef("https://w3id.org/EVI#Software")

# EVI Properties
#EVIUsedDataset = URIRef("https://w3id.org/EVI#usedDataset")
#EVIDatasetUsedBy = URIRef("https://w3id.org/EVI#datasetUsedBy")
#EVIUsedSoftware = URIRef("https://w3id.org/EVI#usedSoftware")
#EVIGenerated = URIRef("https://w3id.org/EVI#generated")
#EVIGeneratedBy = URIRef("https://w3id.org/EVI#generatedBy")

Help on class Namespace in module rdflib.namespace:

class Namespace(builtins.str)
 |  Namespace(value: Union[str, bytes]) -> 'Namespace'
 |  
 |  Utility class for quickly generating URIRefs with a common prefix
 |  
 |  >>> from rdflib.namespace import Namespace
 |  >>> n = Namespace("http://example.org/")
 |  >>> n.Person # as attribute
 |  rdflib.term.URIRef('http://example.org/Person')
 |  >>> n['first-name'] # as item - for things that are not valid python identifiers
 |  rdflib.term.URIRef('http://example.org/first-name')
 |  >>> n.Person in n
 |  True
 |  >>> n2 = Namespace("http://example2.org/")
 |  >>> n.Person in n2
 |  False
 |  
 |  Method resolution order:
 |      Namespace
 |      builtins.str
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, ref: str) -> bool
 |      Allows to check if a URI is within (starts with) this Namespace.
 |      
 |      >>> from rdflib import URIRef
 |      >>> namespace = Namespace('http://example.org/')
 | 

In [118]:
computations[0]

{'@id': 'ark:59852/computation-AP-MS-Loader-58sGTge',
 '@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'},
 'metadataType': 'https://w3id.org/EVI#Computation',
 'url': None,
 'name': 'AP-MS Loader',
 'keywords': ['Ideker Lab',
  'CM4AI',
  '0.1 alpha',
  'MDA-MB-468',
  'untreated',
  'chromatin',
  'Initial integration run',
  'AP-MS edgelist download',
  'tools',
  'cellmaps_ppidownloader',
  'gene',
  'attributes',
  'file',
  'computation',
  'download'],
 'description': 'Ideker Lab CM4AI 0.1 alpha MDA-MB-468 untreated chromatin Initial integration run AP-MS Edgelist run of cellmaps_ppidownloader',
 'runBy': 'Chris Churas',
 'dateCreated': '2023-08-31',
 'associatedPublication': 'Clark T, Schaffer L, Obernier K, Al Manir S, Churas CP, Dailamy A, Doctor Y, Forget A, Hansen JN, Hu M, Lenkiewicz J, Levinson MA, Marquez C, Mohan J, Nourreddine S, Niestroy J, Pratt D, Qian G, Thaker S, Belisle-Pipon J-C, Brandt C, Chen J, Ding Y, Fodeh S, Krogan N, Lundberg E,

In [120]:
g = Graph()
g.namepsace_manager.bind("schema", "http://schema.org/", override=True, replace=True)

AttributeError: 'Graph' object has no attribute 'namepsace_manager'

In [124]:
list(g.namespaces())

[('brick', rdflib.term.URIRef('https://brickschema.org/schema/Brick#')),
 ('csvw', rdflib.term.URIRef('http://www.w3.org/ns/csvw#')),
 ('dc', rdflib.term.URIRef('http://purl.org/dc/elements/1.1/')),
 ('dcat', rdflib.term.URIRef('http://www.w3.org/ns/dcat#')),
 ('dcmitype', rdflib.term.URIRef('http://purl.org/dc/dcmitype/')),
 ('dcterms', rdflib.term.URIRef('http://purl.org/dc/terms/')),
 ('dcam', rdflib.term.URIRef('http://purl.org/dc/dcam/')),
 ('doap', rdflib.term.URIRef('http://usefulinc.com/ns/doap#')),
 ('foaf', rdflib.term.URIRef('http://xmlns.com/foaf/0.1/')),
 ('geo', rdflib.term.URIRef('http://www.opengis.net/ont/geosparql#')),
 ('odrl', rdflib.term.URIRef('http://www.w3.org/ns/odrl/2/')),
 ('org', rdflib.term.URIRef('http://www.w3.org/ns/org#')),
 ('prof', rdflib.term.URIRef('http://www.w3.org/ns/dx/prof/')),
 ('prov', rdflib.term.URIRef('http://www.w3.org/ns/prov#')),
 ('qb', rdflib.term.URIRef('http://purl.org/linked-data/cube#')),
 ('schema', rdflib.term.URIRef('https://sc

In [136]:
computation

{'@id': 'ark:59852/computation-AP-MS-Loader-58sGTge',
 '@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'},
 'metadataType': 'https://w3id.org/EVI#Computation',
 'url': None,
 'name': 'AP-MS Loader',
 'keywords': ['Ideker Lab',
  'CM4AI',
  '0.1 alpha',
  'MDA-MB-468',
  'untreated',
  'chromatin',
  'Initial integration run',
  'AP-MS edgelist download',
  'tools',
  'cellmaps_ppidownloader',
  'gene',
  'attributes',
  'file',
  'computation',
  'download'],
 'description': 'Ideker Lab CM4AI 0.1 alpha MDA-MB-468 untreated chromatin Initial integration run AP-MS Edgelist run of cellmaps_ppidownloader',
 'runBy': 'Chris Churas',
 'dateCreated': '2023-08-31',
 'associatedPublication': 'Clark T, Schaffer L, Obernier K, Al Manir S, Churas CP, Dailamy A, Doctor Y, Forget A, Hansen JN, Hu M, Lenkiewicz J, Levinson MA, Marquez C, Mohan J, Nourreddine S, Niestroy J, Pratt D, Qian G, Thaker S, Belisle-Pipon J-C, Brandt C, Chen J, Ding Y, Fodeh S, Krogan N, Lundberg E,

In [143]:
def addComputation(passedGraph, computation):
	pass

computation=computations[0]



fairscapeURI = "https://fairscape.net/"

computationGUID = URIRef(fairscapeURI + computation['@id'])
generatedGUIDS = [URIRef(fairscapeURI + elemGUID) for elemGUID in computation['generated']]
usedDatasetGUIDS = [URIRef(fairscapeURI + elemGUID) for elemGUID in computation['usedDataset']]
usedSoftwareGUID = URIRef(fairscapeURI + computation['usedSoftware'][0])

g = Graph()
g.add((computationGUID, RDF.type, EVI['Computation']))
g.add((computationGUID, SCHEMA['name'], Literal(computation['name']) ))


# add prov
for generated in generatedGUIDS:
	g.add(
		(computationGUID, EVI['generated'], generated)
	)

# add used datasets
for dataset in usedDatasetGUIDS:
	g.add(
		(computationGUID, EVI['usedDataset'], dataset)
	)

# add used software

g.add(
	(computationGUID, EVI['usedSoftware'], usedSoftwareGUID)
)


# add author
g.add(
	(computationGUID, EVI['runBy'], Literal(computation['runBy']))
)

# add keywords 
for key in computation['keywords']:
	g.add(
		(computationGUID, SCHEMA['keywords'], Literal(key))
	)

In [144]:
g.all_nodes()

{rdflib.term.URIRef('https://fairscape.net/ark:59852/computation-AP-MS-Loader-58sGTge'),
 rdflib.term.URIRef('https://fairscape.net/ark:59852/dataset-cellmaps_ppidownloader-output-file-58sGTge'),
 rdflib.term.URIRef('https://fairscape.net/ark:59852/software-cellmaps_ppidownloader-58sGTge'),
 rdflib.term.URIRef('https://w3id.org/EVI#Computation'),
 rdflib.term.Literal('0.1 alpha'),
 rdflib.term.Literal('AP-MS Loader'),
 rdflib.term.Literal('AP-MS edgelist download'),
 rdflib.term.Literal('CM4AI'),
 rdflib.term.Literal('Chris Churas'),
 rdflib.term.Literal('Ideker Lab'),
 rdflib.term.Literal('Initial integration run'),
 rdflib.term.Literal('MDA-MB-468'),
 rdflib.term.Literal('attributes'),
 rdflib.term.Literal('cellmaps_ppidownloader'),
 rdflib.term.Literal('chromatin'),
 rdflib.term.Literal('computation'),
 rdflib.term.Literal('download'),
 rdflib.term.Literal('file'),
 rdflib.term.Literal('gene'),
 rdflib.term.Literal('tools'),
 rdflib.term.Literal('untreated')}

In [55]:
# given a dataset guid
datasetGUID = computationElem['usedDataset'][0]

# find the computation this guid is a part of 

# usedDataset computation
usedDatasetComputation = identifierCollection.find_one(
	{"usedDataset": datasetGUID}
)
# generatedBy
usedGeneratedBy = identifierCollection.find_one(
	{"generated": datasetGUID}
)

In [62]:
def findIdentifierByGUID(guid: str):
	return identifierCollection.find_one({"@id": guid}, projection={"_id": 0})

In [None]:
evidenceGraphList =[]

In [84]:
import rdflib

ModuleNotFoundError: No module named 'rdflib'

In [69]:
isinstance([], list)

True

In [70]:
def appendList(mainList, element):
	if isinstance(element, list):
		for item in element:
			mainList.append(item)
	if isinstance(element, str):
		mainList.append(element)

In [74]:
test_list =['hello', 'world']
test_list.pop()

'world'

In [75]:
test_list

['hello']

In [82]:

queryList = []

def provQuery(initialGUID: str):

	queryList = [initialGUID]
	evidenceGraph = []

	while len(queryList) != 0:
		queryGUID = queryList.pop()
		elementIdentifier = identifierCollection.find_one(
			{"@id": queryGUID}
		)
		evidenceGraph.append(elementIdentifier)

		# determine type of identifier
		match elementIdentifier['@type']:
			case "EVI:Dataset":
				print(f"guid: {elementIdentifier['@id']}\ttype: {elementIdentifier['@type']}")

				appendList(queryList, elementIdentifier['generatedBy'])
				#appendList(queryList, elementIdentifier['usedBy'])

			case "EVI:Software":
				print(f"guid: {elementIdentifier['@id']}\ttype: {elementIdentifier['@type']}")
				appendList(queryList, elementIdentifier['usedBy'])

			case "EVI:Computation":
				print(f"guid: {elementIdentifier['@id']}\ttype: {elementIdentifier['@type']}")
				appendList(queryList, elementIdentifier['usedDataset'])
				appendList(queryList, elementIdentifier['usedSoftware'])


	return evidenceGraph


#findIdentifierByGUID(guid_node['usedBy'])


In [83]:
datasetGUID = computationElem['usedDataset'][0]
provQuery(datasetGUID)

guid: ark:59852/dataset-Simulated-unique-file-9LUEb5Y	type: EVI:Dataset
guid: ark:59852/computation-IF-Image-Loader-xDirGcr	type: EVI:Computation
guid: ark:59852/software-cellmaps_imagedownloader-9LUEb5Y	type: EVI:Software
guid: ark:59852/computation-IF-Image-Loader-xDirGcr	type: EVI:Computation
guid: ark:59852/software-cellmaps_imagedownloader-9LUEb5Y	type: EVI:Software
guid: ark:59852/computation-IF-Image-Loader-xDirGcr	type: EVI:Computation
guid: ark:59852/software-cellmaps_imagedownloader-9LUEb5Y	type: EVI:Software
guid: ark:59852/computation-IF-Image-Loader-xDirGcr	type: EVI:Computation
guid: ark:59852/software-cellmaps_imagedownloader-9LUEb5Y	type: EVI:Software
guid: ark:59852/computation-IF-Image-Loader-xDirGcr	type: EVI:Computation
guid: ark:59852/software-cellmaps_imagedownloader-9LUEb5Y	type: EVI:Software
guid: ark:59852/computation-IF-Image-Loader-xDirGcr	type: EVI:Computation
guid: ark:59852/software-cellmaps_imagedownloader-9LUEb5Y	type: EVI:Software
guid: ark:59852/comput

KeyboardInterrupt: 

In [195]:
# evidence graph query history
testGUID = "ark:59852/dataset-B2AI_1_untreated_C8_R6_red-jpg-red-channel-image-9LUEb5Y"

# find the rocrate this guid is a part of 
testGUID 

['ark:59852/dataset-Simulated-unique-file-9LUEb5Y',
 'ark:59852/dataset-Simulated-Samples-file-9LUEb5Y']

In [187]:
generatedDataset=next(iter(computationElem['generated']))

In [189]:
generatedDatasetMetadata= identifierCollection.find_one(
	{"@id": generatedDataset}
	)

In [191]:
if generatedDatasetMetadata['@type'] == "EVI:ROCrate":
	pass
elif generatedDatasetMetadata['@type'] == "EVI:Dataset":
	# check that generatedBy is set correctly
	# TODO only overwrite if generatedDatasetMetadata['generatedBy'] is none or []
	if generatedDatasetMetadata['generatedBy'] != computationElem['@id']:
		# set generatedBy on generated dataset
		identifierCollection.update_one(
			{"@id": generatedDataset},
			{"$set": {'generatedBy': computationElem['@id']} }
			)

'EVI:Dataset'

True

In [None]:
for generatedDataset in computationElem['generated']:	
	generatedDataset = identifierCollection.find_one({"@id": generatedDataset})

In [185]:
usedSoftware = computationElem['usedSoftware'][0]
usedSoftwareMetadata = identifierCollection.find_one({"@id": usedSoftware})

# TODO handle if node doesnt exist
if usedSoftwareMetadata is None:
	pass

# check if node has inverse
if usedSoftwareMetadata['usedBy'] == computationElem['@id']:
	pass

else:
	# update identifier
	updateResult = identifierCollection.update_one(
		{"@id": usedSoftware},
		{"$set": {"usedBy": computationElem['@id']}}
	)

	# TODO check that update result is successfull


In [184]:
usedSoftwareMetadata

{'_id': ObjectId('66f44829204bd649d3f0ebd7'),
 '@id': 'ark:59852/software-cellmaps_ppidownloader-58sGTge',
 '@context': {'@vocab': 'https://schema.org/', 'evi': 'https://w3id.org/EVI#'},
 'metadataType': 'https://w3id.org/EVI#Software',
 'url': 'https://github.com/idekerlab/cellmaps_ppidownloader',
 'name': 'cellmaps_ppidownloader',
 'keywords': ['Ideker Lab',
  'CM4AI',
  '0.1 alpha',
  'MDA-MB-468',
  'untreated',
  'chromatin',
  'Initial integration run',
  'AP-MS edgelist download',
  'tools',
  'cellmaps_ppidownloader'],
 'description': 'Ideker Lab CM4AI 0.1 alpha MDA-MB-468 untreated chromatin Initial integration run AP-MS Edgelist A tool to download AP-MS data for Cell Maps pipeline',
 'author': 'Clark T, Schaffer L, Obernier K, Al Manir S, Churas CP, Dailamy A, Doctor Y, Forget A, Hansen JN, Hu M, Lenkiewicz J, Levinson MA, Marquez C, Mohan J, Nourreddine S, Niestroy J, Pratt D, Qian G, Thaker S, Belisle-Pipon J-C, Brandt C, Chen J, Ding Y, Fodeh S, Krogan N, Lundberg E, Mali 

In [None]:
# generate inverses globally off of 

for usedSoftware in computationElem['usedSoftware']:
	usedSoftwareMetadata = identifierCollection.find_one({"@id": usedSoftware})
	pass

for usedDataset in computationElem['usedDataset']:
	pass


for generatedDataset in computationElem['generated']:	
	pass

In [169]:
def crateOutput(crateMetadata):
	''' Given input of ROCrate Metadata return all generated datasets'''
	crateComputation = filterComputation(crateMetadata)
	return crateComputation['generated']

In [None]:
# query for computations that usedDataset any of these elems

# if usedDataset is an ROCrate set the crate output as input datasets

# query for computations that generated any of these elements


In [85]:
# datasets that were generated by a previous ROCrate
generatedDatasets = list(filter(
	lambda elem: elem['@type'] == 'EVI:Dataset' and elem.get("generatedBy"), 
	crateMetadata['@graph']
	))



[]

## Evidence Graph Query

Given a Node get all history

In [None]:
def getProv(guid: str):


### Resolve Identifier Conflicts

In [78]:
# TODO check for identifier conflicts
cursor = identifierCollection.find({"$or": [{"@id": elemGUID} for elemGUID in metadataGraphGUIDS] })
guidCollisions = [ elem.get("@id") for elem in list(cursor)]

# deal with identifier collisions
if len(guidCollisions) != 0:
	# find the element in the crate metadata
	for guid in guidCollisions:

		# 
		aliasGUID(crateMetadata, guid)


	pass

In [81]:
def aliasGUID(crateMetadata, oldGUID):

		matchedMetadataElem = filterByGUID(crateMetadata, oldGUID)

		# generate New identifier
		newGUID = generateIdentifier(
			matchedMetadataElem['name'],
			matchedMetadataElem['@type']
		)
		
	pass

In [84]:
# if a conflict is found
import sqid

def generateIdentifier():
	pass

def reassignIdentifier():
	pass

ModuleNotFoundError: No module named 'sqid'

### 4. Global Reasoning

In [None]:
def getROCrateOutput(roCrateCollection: Collection, guid: str):
	""" Get the Output Datasets of a specified ROCrate"""

	pass

# input RO Crate needs usedBy -> next RO Crate

In [21]:
class ProcessCrate():
	def __init__(
			self, 
			objectName: str, 
			bucketName: str, 
			minioClient: Minio, 
			asyncCollection: Collection
		):
		self.objectName = objectName
		self.bucketName = bucketName
		self.minioClient = minioClient
		self.asyncCollection = asyncCollection

		self.metadata = None
		self.zippedCrate = None

	def readZip(self):
		"""" Read the Zipped ROCrate from Minio """
		try:
			response = self.minioClient.get_object(
				bucket_name= self.bucketName,
				object_name = self.objectName
			)
			zippedObjectContent = response.read()
		finally:
			response.close()
			response.release_conn()

		#self.zippedCrate = zippedObjectContent
		return zippedObjectContent

	def readMetadata(self):
		""" Extract the ROCrate JSON from the ROCrate"""

		if self.zippedCrate is None:
			self.readZip()

		zipContents = io.BytesIO(self.zippedCrate)
		
		with zipfile.ZipFile(zipContents, "r") as crateZip:      
			crateInfoList = crateZip.infolist()
			# extract the ro-crate-metadata.json from the zipfile
			metadataInfo = list(filter(lambda info: 'ro-crate-metadata.json' in info.filename, crateInfoList))
			
			if len(metadataInfo) != 1:
				raise Exception('ro-crate-metadata.json not found in crate')
			metadataFileInfo = metadataInfo[0]

        # uploaded crate path must be trimmed from all uploaded elements
			crateParentPath = pathlib.Path(metadataFileInfo.filename).parent

      # extract and upload the content
			roCrateJSONContents = crateZip.read(metadataFileInfo.filename)
        # may have to seek the begining of the file

    # TODO validate ROCrate
		roCrateMetadata = json.loads(roCrateJSONContents)
		return roCrateMetadata

	def processMetadata(self):
		""" Preprocess any metadata by normalizing arks, detecting identifier conflicts and reasoning
		"""

		if self.metadata is None:
			self.readMetadata()


		metadata = self.reasonProv()
		pass

	def reasonProv():
		""" Materialize inverse edges 
		"""
		pass

	def extractCrate(self, crateZip=None):
		if crateZip:
			crateContents = crateZip
		else:
			crateContents = self.readZip()

		
		pass

In [25]:
testCrateUpload.jobID

In [None]:
ProcessCrate(
	objectName="test/1.cm4ai_chromatin_mda-mb-468_untreated_imageloader_initialrun0.1alpha.zip",
	bucketName="default",
	jobID="",
	minioClient=minioClient,
	asyncCollection=asyncCollection
)

In [None]:
# delete the uploaded test crate

In [None]:
# upload all crates into minio


In [15]:
# read the zip 
test_crate = allUploadedCrates[0]

In [12]:
test_crate.name

'1.cm4ai_chromatin_mda-mb-468_untreated_apmsloader_initialrun0.1alpha.zip'

In [35]:
with test_crate.open("rb") as zippedCrate:
	zipContents = zippedCrate.read()

with zipfile.ZipFile(io.BytesIO(zipContents), "r") as crateZip:
	crateInfoList = crateZip.infolist()

	# extract the ro-crate-metadata.json from the zipfile
	metadataInfo = list(filter(lambda info: 'ro-crate-metadata.json' in info.filename, crateInfoList))

	# TODO raise a more descriptive exception
	if len(metadataInfo) != 1:
		raise Exception('ro-crate-metadata.json not found in crate')

	metadataFileInfo = metadataInfo[0]

	fileContents = crateZip.read(metadataFileInfo.filename)

In [36]:

roCrateMetadata = json.loads(fileContents)

In [39]:
# clean all ark references 



In [41]:
formatArks(roCrateMetadata)

In [44]:


def checkProvLocal():
	''' Given an ROCrate find the elements referenced which are not local to the ROCrate
	'''
	pass


metadataGraphGUIDS = [ elem.get("@id") for elem in roCrateMetadata['@graph']]

allLocal = lambda guidSearch: all([elem in metadataGraphGUIDS for elem in guidSearch])

generatedLocal = allLocal(computationProv['generated'])
usedDatasetLocal = allLocal(computationProv['usedDataset'])
usedSoftwareLocal = allLocal(computationProv['usedSoftware'])




True
True
True


In [None]:
[ ]

In [45]:
computationProvk

{'@id': 'ark:59852/computation-AP-MS-Loader-58sGTge',
 'generated': ['ark:59852/dataset-cellmaps_ppidownloader-output-file-58sGTge'],
 'usedDataset': [],
 'usedSoftware': ['ark:59852/software-cellmaps_ppidownloader-58sGTge']}

In [None]:
# if provenance isn't local
from pymongo.collection import Collection

def addProv(
		identifierCollection: Collection,
		sourceGUID: str, 
		sinkGUID: str, 
		propertyName: str,
		):
	'''
  :param identifierCollection pymongo.collection.Collection: the pymongo collection that contains all identifier metadata
	:param sourceGUID str: the GUID to be modified
	:param sinkGUID str: the GUID which is going to be referenced
	:param propertyName str: the property name which is going to be modified
	'''

	updateResult = identifierCollection.update_one(
		{"@id": sourceGUID},
		{"$push": {propertyName: sinkGUID}}
	)

	# assert that 
	if updateResult.matched_count != 1 and updateResult.modified_count != 1:
		raise Exception()



In [None]:
def addInverse(propertyName: str, sourceGUID: str, sinkGUID: str, metadataGraph):
	''' Add the inverse properties within a guid
	'''

	return metadataGraph