In [None]:
# Required libraries
import json
from neo4j import GraphDatabase
import os
import regex as re
import openai
import credentials

# Load the Neo4j connection settings from credentials.py and fail fast if the server cannot be reached.
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI, AUTH = credentials.NEO4J_URI, credentials.NEO4J_AUTH

# A short-lived driver is opened only to confirm that the URI and credentials are usable.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()

In [None]:
def specify_root():
    """Give the root node an id that is unique to its project.

    All root nodes share the id "NODE_ROOT", which can cause clashes once
    several metadata files are loaded. The project id is appended to it, the
    result is stored in the global ``new_root_name``, and both the root node's
    ``id`` and every link whose ``source`` is "NODE_ROOT" are rewritten in
    place inside the global ``metadata``.

    Relies on the module-level ``metadata`` and ``root_key``.
    """
    global new_root_name
    root_node = metadata.get("nodes")[root_key]
    new_root_name = "NODE_ROOT+" + root_node.get("value").get("projectId")
    for edge in metadata.get("links"):
        if edge.get("source") != "NODE_ROOT":
            continue
        edge["source"] = new_root_name
    # Re-fetch the node list exactly as before so the id is written to the live structure
    metadata.get("nodes")[root_key]["id"] = new_root_name

In [None]:
def find_root():
    """Return the index of the root node inside ``metadata["nodes"]``.

    So far the root node has always been at index 0, but this function does
    not rely on that: it scans the node list of the global ``metadata`` for the
    node whose ``id`` is "NODE_ROOT".

    Returns:
        int: position of the root node in the node list.

    Raises:
        Exception: if no node has the id "NODE_ROOT".
    """
    # enumerate keeps the index in step with the node being inspected
    for index, node in enumerate(metadata.get("nodes")):
        if node.get("id") == "NODE_ROOT":
            return index
    raise Exception("Root node could not be found")

In [None]:
def get_dataset_number():
    """Return the dataset number (anywhere between 1-999) of the loaded project.

    The number is the first run of one to three digits found in the root
    node's ``projectId``. Relies on the module-level ``metadata`` and
    ``root_key``.

    Returns:
        int: the first digit run of the project id, as an integer.
    """
    root_value = metadata.get("nodes")[root_key].get("value")
    digit_runs = re.findall(r'\d{1,3}', root_value.get("projectId"))
    first_run = digit_runs[0]
    return int(first_run)

In [None]:
# Settings for the chat (LLM) endpoint, all defined in credentials.py
CHATBOT_API_KEY, CHATBOT_BASE_URL, CHATBOT_MODEL = (
    credentials.CHATBOT_API_KEY,
    credentials.CHATBOT_URL,
    credentials.CHATBOT_MODEL,
)

# Settings for the embedding endpoint, all defined in credentials.py
EMBEDDING_API_KEY, EMBEDDING_URL, EMBEDDING_MODEL = (
    credentials.EMBEDDING_API_KEY,
    credentials.EMBEDDING_URL,
    credentials.EMBEDDING_MODEL,
)

# Sampling parameters (temperature and top_p) are also chosen in credentials.py
TEMP, TOP_P = credentials.TEMP, credentials.TOP_P

# One client per service: keyword_client talks to the LLM, embedding_client to the embedding model
keyword_client = openai.OpenAI(base_url=CHATBOT_BASE_URL, api_key=CHATBOT_API_KEY)
embedding_client = openai.OpenAI(base_url=EMBEDDING_URL, api_key=EMBEDDING_API_KEY)

def keywords(text):
    """Ask the LLM for the key words of a data description.

    Sends ``text`` as the user message, together with a fixed system prompt,
    to ``keyword_client`` using ``CHATBOT_MODEL``, ``TEMP`` and ``TOP_P``.

    Args:
        text: the data description to extract key words from.

    Returns:
        The message content of the first completion choice (the prompt asks
        for a comma-separated list of key words).
    """
    system_message = {"role":"system","content":"You are a helpful assistant who must find key words in a given data description. Return only a comma seperated list of those key words. Return the list only once, do not try to reformat it"}
    user_message = {"role":"user","content":text}
    completion = keyword_client.chat.completions.create(
        model=CHATBOT_MODEL,
        messages=[system_message, user_message],
        temperature=TEMP,
        top_p=TOP_P,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

def embed(text):
    """Embed a piece of text as a vector.

    Args:
        text: the text to embed; it is sent as a one-element input list.

    Returns:
        The ``embedding`` of the first item in the embedding service response.
    """
    result = embedding_client.embeddings.create(input=[text], model=EMBEDDING_MODEL)
    first_item = result.data[0]
    return first_item.embedding

def return_keywords(description):
    """Extract key words from a description and embed them.

    Args:
        description: the data description passed to ``keywords``.

    Returns:
        tuple: ``(key words returned by keywords(), embedding of those key words)``.
    """
    extracted = keywords(description)
    return extracted, embed(extracted)

In [None]:
def author_names():
    """Return every author of the root node as one comma-separated string.

    Because there may be many authors, each author's first and last names are
    concatenated and the full names are then joined with ", " so they can be
    stored in a single property on the dataset node. Relies on the
    module-level ``metadata`` and ``root_key``.

    Returns:
        str: e.g. "Ada Lovelace, Alan Turing"; empty if there are no authors.
    """
    root_value = metadata.get("nodes")[root_key].get("value")
    full_names = [
        author.get('first_name') + ' ' + author.get('last_name')
        for author in root_value.get("authors")
    ]
    return ', '.join(full_names)

In [None]:
def create_dataset():
    """Create the ``Dataset`` node, which is the parent of all other nodes.

    Reads the root node of the global ``metadata`` (at ``root_key``) and stores
    its title, description, DOI, id, joined author names, license and
    publication date on a new ``Dataset`` node, together with the global
    ``dataset_number``.

    LLM key words are currently disabled: the ``llmKeywords`` and
    ``descriptionEmbedding`` properties are commented out in the query. To
    re-enable them, call ``return_keywords(description)``, pass its two
    results as ``llm_keywords`` and ``embedding``, and uncomment those lines.
    """
    root_node = metadata.get("nodes")[root_key]
    root_value = root_node.get("value")
    properties = {
        "title": root_value.get("title"),
        "description": root_value.get("description"),
        "doi": root_value.get("doi"),
        "identifier": root_node.get("id"),
        "authors": author_names(),
        "license": root_value.get("license"),
        "pub_date": root_value.get("publicationDate"),
    }
    query = """
            CREATE (p:Dataset {title: $title})
            SET p.description = $description
            SET p.doi = $doi
            SET p.identifier = $identifier
            SET p.authors = $authors
            SET p.license = $license
            SET p.publicationDate = $pub_date
            SET p.datasetNumber = $dataset_number
            //SET p.llmKeywords = $llm_keywords
            //SET p.descriptionEmbedding = $embedding
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **properties,
            dataset_number=dataset_number,
            database_="neo4j",
        )

In [None]:
def create_related_publication(pub_key):
    """Create a ``RelatedPublication`` node and attach it to the dataset.

    There may be many related publications in the root node's
    ``relatedPublications`` list; ``pub_key`` selects one of them. The new node
    gets a ``PART_OF`` relationship pointing to the ``Dataset`` node whose
    identifier is the global ``new_root_name``.

    Args:
        pub_key: index into the root node's ``relatedPublications`` list.
    """
    publication = metadata.get("nodes")[root_key].get("value").get("relatedPublications")[pub_key]
    fields = {
        "pub_title": publication.get('publicationTitle'),
        "pub_author": publication.get('publicationAuthor'),
        "pub_abstract": publication.get('publicationDescription'),
        "pub_link": publication.get('publicationLink'),
        "pub_publicationDate": publication.get('publicationDateOfPublication'),
    }
    query = """
            CREATE (rp:RelatedPublication {title: $pub_title})
            SET rp.authors = $pub_author
            SET rp.abstract = $pub_abstract
            SET rp.link = $pub_link
            SET rp.publicationDate = $pub_publicationDate
            SET rp.datasetNumber = $dataset_number
            WITH rp
            MATCH (p:Dataset{identifier:$project_id})
            CREATE (p) <-[:PART_OF]- (rp)
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **fields,
            project_id=new_root_name,
            dataset_number=dataset_number,
            database_="neo4j",
        )

In [None]:
def create_related_software(software_key):
    """Create a ``RelatedSoftware`` node and attach it to the dataset.

    NOTE: this function was written before the ``relatedSoftware`` field was
    fully available; it should be tested that it works correctly.

    The new node gets a ``PART_OF`` relationship pointing to the ``Dataset``
    node whose identifier is the global ``new_root_name``.

    Args:
        software_key: index into the root node's ``relatedSoftware`` list.
    """
    software = metadata.get("nodes")[root_key].get("value").get("relatedSoftware")[software_key]
    fields = {
        "software_title": software.get('softwareTitle'),
        "software_description": software.get('softwareDescription'),
        "software_link": software.get('softwareLink'),
    }
    query = """
            CREATE (rs:RelatedSoftware {title: $software_title})
            SET rs.description = $software_description
            SET rs.link = $software_link
            SET rs.datasetNumber = $dataset_number
            WITH rs
            MATCH (p:Dataset{identifier:$project_id})
            CREATE (p) <-[:PART_OF]- (rs)
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **fields,
            project_id=new_root_name,
            dataset_number=dataset_number,
            database_="neo4j",
        )

In [None]:
def create_related_dataset(dataset_key):
    """Create a ``RelatedDataset`` node and attach it to the dataset.

    NOTE: this function was written before the ``relatedDatasets`` field was
    fully available; it should be tested that it works correctly.

    The new node gets a ``PART_OF`` relationship pointing to the ``Dataset``
    node whose identifier is the global ``new_root_name``.

    Args:
        dataset_key: index into the root node's ``relatedDatasets`` list.
    """
    related = metadata.get("nodes")[root_key].get("value").get("relatedDatasets")[dataset_key]
    fields = {
        "dataset_title": related.get('datasetTitle'),
        "dataset_description": related.get('datasetDescription'),
        "dataset_link": related.get('datasetLink'),
    }
    query = """
            CREATE (rd:RelatedDataset {title: $dataset_title})
            SET rd.description = $dataset_description
            SET rd.link = $dataset_link
            SET rd.datasetNumber = $dataset_number
            WITH rd
            MATCH (p:Dataset{identifier:$project_id})
            CREATE (p) <-[:PART_OF]- (rd)
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **fields,
            project_id=new_root_name,
            dataset_number=dataset_number,
            database_="neo4j",
        )

In [None]:
def create_sample(key):
    """Create a ``Sample`` node from the metadata node at index ``key``.

    The node id is stored as ``identifier`` and the fields of the node's
    ``value`` dictionary listed below are copied onto the new node. Fields
    missing from the metadata are passed to the query as ``None``. No
    relationship is created here.

    Args:
        key: index of the sample in ``metadata["nodes"]``.
    """
    node = metadata.get("nodes")[key]
    value = node.get("value")
    # query parameter name -> key in the node's "value" dictionary
    value_fields = {
        "sample_location": "geographicalLocation",
        "sample_media_type": "porousMediaType",
        "sample_porosity": "porosity",
        "sample_name": "name",
        "sample_source": "source",
        "sample_description": "description",
        "sample_geographic_origin": "geographicOrigin",
        "sample_grain_size_avg": "grainSizeAvg",
        "sample_grain_size_min": "grainSizeMin",
        "sample_grain_size_max": "grainSizeMax",
        "sample_grain_size_units": "grainSizeUnits",
        "sample_collection_method": "collectionMethod",
        "sample_onshore_offshore": "onshoreOffshore",
        "sample_depth": "depth",
        "sample_water_depth": "waterDepth",
        "sample_procedure": "procedure",
        "sample_equipment": "equipment",
        "sample_algorithm_description": "algorithmDescription",
    }
    params = {"sample_identifier": node.get("id")}
    for param_name, json_key in value_fields.items():
        params[param_name] = value.get(json_key)
    query = """
            CREATE (s:Sample{title:$sample_name})
            SET s.identifier = $sample_identifier
            SET s.location = $sample_location
            SET s.porousMediaType = $sample_media_type
            SET s.porosity = $sample_porosity
            SET s.source = $sample_source
            SET s.description = $sample_description
            SET s.geographicOrigin = $sample_geographic_origin
            SET s.grainSizeAvg = $sample_grain_size_avg
            SET s.grainSizeMin = $sample_grain_size_min
            SET s.grainSizeMax = $sample_grain_size_max
            SET s.grainSizeUnits = $sample_grain_size_units
            SET s.collectionMethod = $sample_collection_method
            SET s.onshoreOffshore = $sample_onshore_offshore
            SET s.depth = $sample_depth
            SET s.waterDepth = $sample_water_depth
            SET s.procedure = $sample_procedure
            SET s.equipment = $sample_equipment
            SET s.algorithmDescription = $sample_algorithm_description
            SET s.datasetNumber = $dataset_number
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **params,
            dataset_number=dataset_number,
            database_="neo4j",
        )

In [None]:
def voxel_dim(key):
    """Combine all the voxel-dimension information of a node into one string.

    Reads ``voxelX``, ``voxelY``, ``voxelZ`` and ``voxelUnits`` from the
    ``value`` dictionary of ``metadata["nodes"][key]``.

    Args:
        key: index of the node in ``metadata["nodes"]``.

    Returns:
        * ``"X, Y, Z units (in <unit>s): x, y, z"`` when the unit is a string;
        * ``None`` when none of the four fields is present;
        * ``"X, Y, Z units: x, y, z. Unit type not provided"`` otherwise.
    """
    value = metadata.get("nodes")[key].get("value")
    x = value.get("voxelX")
    y = value.get("voxelY")
    z = value.get("voxelZ")
    units = value.get("voxelUnits")
    sizes = str(x) + ", " + str(y) + ", " + str(z)

    # Explicit type check instead of catching the TypeError from str + non-str.
    # A unit that is not a string is still treated as "not provided", as before.
    if isinstance(units, str):
        return "X, Y, Z units (in " + units + "s): " + sizes
    if x is None and y is None and z is None and units is None:
        return None
    return "X, Y, Z units: " + sizes + ". Unit type not provided"

In [None]:
def get_file_types(node):
    """Find the file types of all files attached to a node.

    For every entry in ``node["value"]["fileObjs"]`` the trailing extension
    chain of its ``path`` is extracted and cleaned: the leading dot,
    ``.thumb.jpg`` suffixes, ``npy_xxxxxxx.`` prefixes and
    ``.histogram.csv`` / ``.histogram.jpg`` suffixes are removed, and the
    result is lower-cased.

    Every file is inspected. Entries without a usable path or without a
    recognisable extension are skipped rather than ending the scan.

    Args:
        node: a metadata node (dict) with a ``value`` dictionary.

    Returns:
        list[str] | None: the sorted, de-duplicated file types, or ``None``
        when no file type could be determined (for example no files at all).
    """
    found_types = set()
    for file in node.get('value').get('fileObjs') or []:
        path = file.get('path') if isinstance(file, dict) else None
        if not isinstance(path, str):
            continue
        suffixes = re.findall(r'(?:\.[A-z]+\d?)+$', path)
        if not suffixes:
            # No extension at the end of the path; nothing to record for this file
            continue
        cleaned = re.sub(r'^\.', '', suffixes[0])
        cleaned = re.sub(r'\.thumb\.jpg', '', cleaned)
        cleaned = re.sub(r'npy_[A-z]{7}\.', '', cleaned)
        cleaned = re.sub(r'\.histogram\.(?:(?:csv)|(?:jpg))', '', cleaned)
        found_types.add(cleaned.lower())
    # The return sits after the loop so that all files contribute; sorted for a stable order
    return sorted(found_types) if found_types else None

In [None]:
def create_digital_dataset(key):
    """Create a ``DigitalDataset`` node from the metadata node at index ``key``.

    The node's id and label become ``identifier`` and ``title``. The voxel
    description comes from ``voxel_dim``, the number of files from the length
    of ``fileObjs`` and the file types from ``get_file_types``. The remaining
    properties are copied from the node's ``value`` dictionary. Afterwards the
    new node is linked to the dataset node through ``make_part_of``.

    Args:
        key: index of the digital dataset in ``metadata["nodes"]``.
    """
    node = metadata.get("nodes")[key]
    value = node.get("value")
    dig_identifier = node.get("id")
    params = {
        "dig_identifier": dig_identifier,
        "dig_name": node.get("label"),
        "dig_voxels": voxel_dim(key),
        "dig_description": value.get("description"),
        "dig_segmented": value.get("isSegmented"),
        "dig_imaging_center": value.get("imagingCenter"),
        "dig_imaging_equipment_and_model": value.get("imagingEquipmentAndModel"),
        "dig_image_format": value.get("imageFormat"),
        "dig_image_dimensions": value.get("imageDimensions"),
        "dig_image_byte_order": value.get("imageByteOrder"),
        "dig_dimensionality": value.get("dimensionality"),
        "dig_files": len(value.get("fileObjs")),
        "dig_file_types": get_file_types(node),
    }
    query = """
            CREATE (dd:DigitalDataset{title:$dig_name})
            SET dd.identifier = $dig_identifier
            SET dd.voxelDimensions = $dig_voxels
            SET dd.description = $dig_description
            SET dd.segmented = $dig_segmented
            SET dd.imagingCenter = $dig_imaging_center
            SET dd.imagingEquipmentAndModel = $dig_imaging_equipment_and_model
            SET dd.imageFormat = $dig_image_format
            SET dd.imageDimensions = $dig_image_dimensions
            SET dd.imageByteOrder = $dig_image_byte_order
            SET dd.dimensionality = $dig_dimensionality
            SET dd.numberOfFiles = $dig_files
            SET dd.datasetNumber = $dataset_number
            SET dd.fileTypes = $dig_file_types
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **params,
            dataset_number=dataset_number,
            database_="neo4j",
        )
    # PART_OF links for digital datasets are not in the metadata "links", so add one explicitly
    make_part_of(dig_identifier)

In [None]:
def create_analysis_dataset(key):
    """Create an ``AnalysisDataset`` node from the metadata node at index ``key``.

    The node's id and label become ``identifier`` and ``title``. The number of
    files is the length of ``fileObjs`` and the file types come from
    ``get_file_types``. The remaining properties are copied from the node's
    ``value`` dictionary. Afterwards the new node is linked to the dataset
    node through ``make_part_of``.

    Args:
        key: index of the analysis dataset in ``metadata["nodes"]``.
    """
    node = metadata.get("nodes")[key]
    value = node.get("value")
    analysis_identifier = node.get("id")
    params = {
        "analysis_identifier": analysis_identifier,
        "analysis_name": node.get("label"),
        "analysis_segmented": value.get("isSegmented"),
        "analysis_description": value.get("description"),
        "analysis_type": value.get("datasetType"),
        "analysis_digital_dataset": value.get("digitalDataset"),
        "analysis_sample": value.get("sample"),
        "analysis_files": len(value.get("fileObjs")),
        "analysis_file_types": get_file_types(node),
    }
    query = """
            CREATE (ad:AnalysisDataset{title:$analysis_name})
            SET ad.identifier = $analysis_identifier
            SET ad.segmented = $analysis_segmented
            SET ad.description = $analysis_description
            SET ad.type = $analysis_type
            SET ad.referencedDigitalDataset = $analysis_digital_dataset
            SET ad.referencedSample = $analysis_sample
            SET ad.datasetNumber = $dataset_number
            SET ad.numberOfFiles = $analysis_files
            SET ad.fileTypes = $analysis_file_types
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            query,
            **params,
            dataset_number=dataset_number,
            database_="neo4j",
        )
    # PART_OF links for analysis datasets are not in the metadata "links", so add one explicitly
    make_part_of(analysis_identifier)

In [None]:
def make_part_of(node_id):
    """Link a node to the dataset node with a ``PART_OF`` relationship.

    The PART_OF relationships between digital/analysis dataset nodes and the
    dataset node are not present in the metadata's list of "links", so this
    function creates them. ``MERGE`` is used, so an existing relationship is
    not duplicated.

    Args:
        node_id: ``identifier`` of the node that should point to the
            ``Dataset`` node identified by the global ``new_root_name``.
    """
    link_query = """
            MATCH (n{identifier:$node_id})
            MATCH (r:Dataset{identifier:$identifier})
            MERGE (n) -[:PART_OF]-> (r)
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(
            link_query,
            node_id=node_id,
            identifier=new_root_name,
            database_="neo4j",
        )

In [None]:
def establish_connection(source,target):
    """Create a relationship between the nodes identified by source and target.

    The relationship always points from the ``target`` node to the ``source``
    node. It is ``PART_OF`` when ``source`` is the root (the global
    ``new_root_name``), e.g. a sample pointing to the dataset, and
    ``INPUT_FOR`` for any other source.

    Args:
        source: ``identifier`` of the node the relationship points to.
        target: ``identifier`` of the node the relationship starts from.
    """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        # Only the relationship type differs between the two queries
        if source == new_root_name:
            relationship_query = """
                MATCH (s{identifier:$source})
                MATCH (t{identifier:$target})
                CREATE (s) <-[:PART_OF]- (t)
            """
        else:
            relationship_query = """
                MATCH (s{identifier:$source})
                MATCH (t{identifier:$target})
                CREATE (s) <-[:INPUT_FOR]- (t)
            """
        driver.execute_query(
            relationship_query,
            source=source,
            target=target,
            database_="neo4j",
        )

In [None]:
def create_neo4j(filename):
    """Create the Neo4j graph for a single metadata file.

    The file is loaded as JSON into the global ``metadata``; the globals
    ``root_key`` and ``dataset_number`` are then derived from it and the root
    node is renamed by ``specify_root``. After the dataset node is created,
    one node is created per entry of the root's ``relatedPublications``,
    ``relatedSoftware`` and ``relatedDatasets`` lists. Finally the list of
    nodes is walked, creating the node matching each ``dataType``
    ("sample", "digital_dataset", "analysis_data"), and every entry of
    ``links`` is turned into a relationship.

    Related lists, a node's ``value`` and ``links`` that are missing or null
    are treated as empty instead of raising.

    Args:
        filename: path of the JSON metadata file to load.
    """
    global metadata, root_key, dataset_number
    # JSON is UTF-8 by specification; do not depend on the platform default encoding
    with open(filename, 'r', encoding='utf-8') as single_file:
        metadata = json.load(single_file)
    root_key = find_root()
    dataset_number = get_dataset_number()
    specify_root()
    create_dataset()

    root_value = metadata.get("nodes")[root_key].get("value")
    related_creators = (
        ("relatedPublications", create_related_publication),
        ("relatedSoftware", create_related_software),
        ("relatedDatasets", create_related_dataset),
    )
    for field, creator in related_creators:
        # `or []` covers both an absent key and an explicit null
        for i in range(len(root_value.get(field) or [])):
            creator(i)

    for i, node in enumerate(metadata.get("nodes")):
        data_type = (node.get("value") or {}).get("dataType")
        if data_type == "sample":
            create_sample(i)
        elif data_type == "digital_dataset":
            create_digital_dataset(i)
        elif data_type == "analysis_data":
            create_analysis_dataset(i)

    for link in metadata.get("links") or []:
        establish_connection(link.get("source"), link.get("target"))

In [None]:
def remove_identifiers():
    """Remove the ``identifier`` property from every node in the database.

    The top-level script calls this once after all metadata files have been
    loaded, i.e. after the identifiers have been used to build relationships.
    """
    cleanup_query = """
            MATCH (n)
            REMOVE n.identifier
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(cleanup_query, database_="neo4j")

In [None]:
def clear():
    """Remove all nodes (and their relationships) in the Neo4j instance."""
    wipe_query = """
            MATCH (n)
            DETACH DELETE n
            """
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        driver.execute_query(wipe_query, database_="neo4j")

In [None]:
# Unpacks every file in the folder of metadata files and creates a Neo4j graph for each of them.
# The folder can be overridden with the DRP_METADATA_DIR environment variable so the notebook
# also runs on other machines; the original location remains the default.
folder = os.environ.get('DRP_METADATA_DIR', '/Users/zacharynowacek/Desktop/Austin/DRP-Metadata')

directory = os.fsencode(folder)

# WARNING: this deletes every node in the Neo4j instance before the import starts
clear()

for file in os.listdir(directory):
    filename = os.path.join(folder, os.fsdecode(file))
    # Skip macOS Finder metadata and anything that is not a regular file (e.g. sub-folders),
    # which json.load could not read anyway
    if os.fsdecode(file) == ".DS_Store" or not os.path.isfile(filename):
        continue
    create_neo4j(filename)

# The identifiers are only needed while relationships are being built
remove_identifiers()