# 🧪 Create Assay Nodes & Relationships

This notebook reads the dataset manifest, extracts assay and material metadata, maps tissues/cells to UBERON/Cell Ontology, and writes Neo4j CSVs for assay nodes and their relationships to studies, anatomy, cell types, genes, and methylation regions using `genelab_utils` and `ontology_mapper`.

Author: Peter W. Rose, UC San Diego (pwrose.ucsd@gmail.com)

In [34]:
import os
import pandas as pd
import ontology_mapper
import genelab_utils as gl

In [35]:
# Display settings: never truncate DataFrame previews.
pd.options.display.max_rows = None      # show all rows
pd.options.display.max_colwidth = None  # show the full content of each cell

## Setup Environment Variables
Edit `../.env` to configure the environment.  

In [36]:
# Node and relationship directory paths; the save cells below pass them to
# `gl.save_dataframe_to_kg` as the output location.
# NOTE(review): presumably this call also loads the settings from `../.env`
# (it prints the KG version) — confirm in `genelab_utils`.
node_dir, rel_dir = gl.setup_environment()

Environment setup for KG version: v0.0.3


In [37]:
# Edit `../.env` to define `BIOPORTAL_API_KEY` for ontology lookups.
# The key is read from the process environment and passed to
# `ontology_mapper.map_ontology` further down.
apikey = os.getenv("BIOPORTAL_API_KEY")
if not apikey:
    # Fail fast (unset or empty key) with a specific exception type instead of
    # a bare `Exception`; `RuntimeError` is still caught by `except Exception`.
    raise RuntimeError("BIOPORTAL_API_KEY is not set in the .env file!")

In [38]:
# Relative path of the dataset manifest CSV that is read with `pd.read_csv` below.
MANIFEST_PATH = "../data/manifest.csv"

## Get Info about available Datasets

In [39]:
# Only these manifest columns are needed. Everything is read as plain strings,
# and empty fields stay "" instead of being converted to NaN.
manifest_columns = ["identifier", "technology", "measurement", "assay_name",
                    "organism", "material", "filename"]
# Remove duplicates that have multiple materials: keep one row per
# assay/file combination. The materials are extracted from the factors later
# to ensure a proper mapping.
assay_key_columns = ["identifier", "technology", "measurement", "assay_name", "filename"]
manifest = (
    pd.read_csv(MANIFEST_PATH, dtype=str, keep_default_na=False, usecols=manifest_columns)
    .drop_duplicates(subset=assay_key_columns)
)
manifest.head()

Unnamed: 0,identifier,technology,measurement,assay_name,organism,material,filename
0,OSD-100,RNA Sequencing (RNA-Seq),transcription profiling,OSD-100_transcription-profiling_rna-sequencing-(rna-seq),Mus musculus,left eye,GLDS-100_rna_seq_differential_expression.csv
1,OSD-101,RNA Sequencing (RNA-Seq),transcription profiling,OSD-101_transcription-profiling_rna-sequencing-(rna-seq)_Illumina,Mus musculus,Left gastrocnemius,GLDS-101_rna_seq_differential_expression.csv
2,OSD-102,RNA Sequencing (RNA-Seq),transcription profiling,OSD-102_transcription-profiling_rna-sequencing-(rna-seq)_Illumina HiSeq 4000,Mus musculus,Left kidney,GLDS-102_rna_seq_differential_expression.csv
3,OSD-103,Whole Genome Bisulfite Sequencing,DNA methylation profiling,OSD-103_dna-methylation-profiling_whole-genome-bisulfite-sequencing,Mus musculus,Quadriceps-left,GLDS-103_Gwgbs_differential_methylation_tiles_GLMethylSeq.csv
4,OSD-103,RNA Sequencing (RNA-Seq),transcription profiling,OSD-103_transcription-profiling_rna-sequencing-(rna-seq),Mus musculus,Quadriceps-left,GLDS-103_rna_seq_differential_expression.csv


## Extract Assay Data

In [40]:
# Measurement type -> text prefix associated with that measurement's values.
# NOTE(review): presumably the prefix of the per-comparison columns in the
# data files (e.g. "Log2fc_(A)v(B)") — confirm in `genelab_utils`.
variables = {}
variables["transcription profiling"] = "Log2fc_"
variables["DNA methylation profiling"] = "meth.diff_"

In [41]:
# Derive the assay table from the manifest, using the measurement -> prefix
# mapping defined in `variables`.
# NOTE(review): judging by the preview, the result has one row per factor
# comparison with added `factors`, `factors_1`, `factors_2` columns — confirm
# in `genelab_utils`.
assays = gl.extract_assay_info(manifest, variables)
assays.head()

Unnamed: 0,identifier,technology,measurement,assay_name,organism,material,filename,factors,factors_1,factors_2
0,OSD-100,RNA Sequencing (RNA-Seq),transcription profiling,OSD-100_transcription-profiling_rna-sequencing-(rna-seq),Mus musculus,left eye,GLDS-100_rna_seq_differential_expression.csv,(Ground Control)v(Space Flight),[Ground Control],[Space Flight]
0,OSD-100,RNA Sequencing (RNA-Seq),transcription profiling,OSD-100_transcription-profiling_rna-sequencing-(rna-seq),Mus musculus,left eye,GLDS-100_rna_seq_differential_expression.csv,(Space Flight)v(Ground Control),[Space Flight],[Ground Control]
1,OSD-101,RNA Sequencing (RNA-Seq),transcription profiling,OSD-101_transcription-profiling_rna-sequencing-(rna-seq)_Illumina,Mus musculus,Left gastrocnemius,GLDS-101_rna_seq_differential_expression.csv,(Ground Control)v(Space Flight),[Ground Control],[Space Flight]
1,OSD-101,RNA Sequencing (RNA-Seq),transcription profiling,OSD-101_transcription-profiling_rna-sequencing-(rna-seq)_Illumina,Mus musculus,Left gastrocnemius,GLDS-101_rna_seq_differential_expression.csv,(Space Flight)v(Ground Control),[Space Flight],[Ground Control]
2,OSD-102,RNA Sequencing (RNA-Seq),transcription profiling,OSD-102_transcription-profiling_rna-sequencing-(rna-seq)_Illumina HiSeq 4000,Mus musculus,Left kidney,GLDS-102_rna_seq_differential_expression.csv,(Ground Control)v(Space Flight),[Ground Control],[Space Flight]


## Extract Material Information and Map to UBERON Ontology

In [42]:
materials = gl.extract_materials(assays)
# Drop rows whose 'material' value contains a digit followed by a space
# (e.g., "1 day"), since they cannot match any cell or tissue types.
# Missing values are treated as "no match" and therefore kept.
digit_then_space = materials["material"].str.contains(r'\d ', na=False)
materials = materials[~digit_then_space]
print("Number of materials to map:", materials.shape[0])

Number of materials to map: 258


In [43]:
# Look up an ontology term for every material string (needs the BioPortal API key).
ontology_hits = ontology_mapper.map_ontology(materials, "material", "material", "UBERON", apikey)
# Keep only the rows for which a term was found, i.e. `material_id` is not empty.
has_term = ontology_hits["material_id"] != ""
mapped_materials = ontology_hits[has_term].reset_index()

In [44]:
# Review the mapping: the original material string next to the matched
# ontology term name, identifier, and URI.
mapped_materials[["material", "material_name", "material_id", "material_uri"]]

Unnamed: 0,material,material_name,material_id,material_uri
0,3D Cells,cell,CL:0000000,http://purl.obolibrary.org/obo/CL_0000000
1,3D cells,cell,CL:0000000,http://purl.obolibrary.org/obo/CL_0000000
2,Adrenal gland,adrenal gland,UBERON:0002369,http://purl.obolibrary.org/obo/UBERON_0002369
3,Adrenal glands- both sides,adrenal gland,UBERON:0002369,http://purl.obolibrary.org/obo/UBERON_0002369
4,Blood,blood,UBERON:0000178,http://purl.obolibrary.org/obo/UBERON_0000178
5,Bone Marrow,bone marrow,UBERON:0002371,http://purl.obolibrary.org/obo/UBERON_0002371
6,Brain,brain,UBERON:0000955,http://purl.obolibrary.org/obo/UBERON_0000955
7,Cells,cell,CL:0000000,http://purl.obolibrary.org/obo/CL_0000000
8,Cerebrum,cerebrum,UBERON:6110636,http://purl.obolibrary.org/obo/UBERON_6110636
9,Colon,colon,UBERON:0001155,http://purl.obolibrary.org/obo/UBERON_0001155


## Create Anatomy (proxy) Nodes

In [45]:
# Ontology identifiers of all mapped materials, exposed under the column
# name `identifier` used for node tables.
material_ids = mapped_materials[["material_id"]].rename(columns={"material_id": "identifier"})
# Anatomy nodes are the identifiers in the UBERON namespace.
# `na=False` makes a missing identifier count as "no match", so the mask is
# strictly boolean and the row selection cannot fail on NaN entries.
is_uberon = material_ids["identifier"].str.startswith("UBERON:", na=False)
anatomy = material_ids[is_uberon].copy()

In [46]:
# Save the Anatomy node table via `gl.save_dataframe_to_kg` (target: `node_dir`)
# and report the number of rows in the returned frame.
# NOTE(review): the preview skips repeated identifiers, so the helper presumably
# de-duplicates before saving — confirm in `genelab_utils`.
anatomy_nodes = gl.save_dataframe_to_kg(anatomy, 'Anatomy', node_dir)
print(f"Number of Anatomy nodes: {anatomy_nodes.shape[0]}")
anatomy_nodes.head()

Number of Anatomy nodes: 33


Unnamed: 0,identifier
2,UBERON:0002369
4,UBERON:0000178
5,UBERON:0002371
6,UBERON:0000955
8,UBERON:6110636


## Create CellType (proxy) Nodes

In [47]:
# CellType nodes are the mapped material identifiers that start with "CL:".
# `na=False` keeps the mask strictly boolean even if an identifier is missing,
# so the row selection cannot fail on NaN entries.
is_cell_type = material_ids["identifier"].str.startswith("CL:", na=False)
cell_type = material_ids[is_cell_type].copy()

In [48]:
# Save the CellType node table via `gl.save_dataframe_to_kg` (target: `node_dir`)
# and report the number of rows in the returned frame.
cell_type_nodes = gl.save_dataframe_to_kg(cell_type, 'CellType', node_dir)
print(f"Number of CellType nodes: {cell_type_nodes.shape[0]}")
cell_type_nodes.head()

Number of CellType nodes: 7


Unnamed: 0,identifier
0,CL:0000000
15,CL:4030029
26,CL:0000056
27,CL:0000081
28,CL:0000084


## Assign Materials for Assay Factors 1 and 2

In [49]:
# Combine the assay table with the ontology-mapped materials.
# NOTE(review): the `material_*_1` / `material_*_2` columns selected further
# down presumably originate from this call — confirm in `genelab_utils`.
assays = gl.assign_material_to_assays(assays, mapped_materials)

## Assign Unique Assay Identifier

In [50]:
# Add identifiers to the assay table.
# NOTE(review): the previews below show assay IDs of the form
# `<study id>-<32 hex characters>`, and a `study_id` column is used in the next
# section — presumably both are created here; confirm in `genelab_utils`.
assays = gl.add_assay_identifiers(assays)

## Create Study-PERFORMED_SpAS-Assay Relationships

In [51]:
# Edge list Study -> Assay: the study id is the source ("from"),
# the assay identifier the target ("to").
study_performed_assay = assays[["study_id", "identifier"]].rename(
    columns={"study_id": "from", "identifier": "to"}
)

In [52]:
# Save the Study -> Assay edge list via `gl.save_dataframe_to_kg` (target: `rel_dir`)
# and report the number of rows in the returned frame.
study_performed_assay_rels = gl.save_dataframe_to_kg(study_performed_assay, 'Study-PERFORMED_SpAS-Assay', rel_dir)
print(f"Number of Study-PERFORMED_SpAS-Assay relationships: {study_performed_assay_rels.shape[0]}")
study_performed_assay_rels.head()

Number of Study-PERFORMED_SpAS-Assay relationships: 6185


Unnamed: 0,from,to
0,OSD-100,OSD-100-daab5c089f2a23b4d18cbf45d113535b
0,OSD-100,OSD-100-0ee3281ad117571bed3fd0b4543f80ea
1,OSD-101,OSD-101-59e73a75c7ba08b2907095b0ca9e57e8
1,OSD-101,OSD-101-fefc0a3bd4b859b1e035ced363f4e08f
2,OSD-102,OSD-102-f2039e8d02fe8eb80f4edc65677b0f77


## Create Assay-INVESTIGATED_ASiA-Anatomy Relationships

In [53]:
# Assay -> material edges, built separately from `material_id_1` and
# `material_id_2` (one material column per side of the comparison).
assay_investigated_material_1 = assays[["identifier", "material_id_1"]].rename(
    columns={"identifier": "from", "material_id_1": "to"}
)
assay_investigated_material_2 = assays[["identifier", "material_id_2"]].rename(
    columns={"identifier": "from", "material_id_2": "to"}
)

# When both sides use the same material the two edges are identical,
# so stack the two frames and drop the repeated rows.
assay_investigated_material = pd.concat(
    [assay_investigated_material_1, assay_investigated_material_2]
).drop_duplicates()

# Anatomy edges are those whose target starts with "UBERON:".
# `na=False` makes a missing material id count as "no match", so the mask is
# strictly boolean and the selection cannot fail on NaN entries.
# `.copy()` yields an independent frame instead of a slice of the parent.
is_anatomy = assay_investigated_material["to"].str.startswith("UBERON:", na=False)
assay_investigated_anatomy = assay_investigated_material[is_anatomy].copy()

In [54]:
# Save the Assay -> Anatomy edge list via `gl.save_dataframe_to_kg` (target: `rel_dir`)
# and report the number of rows in the returned frame.
assay_investigated_anatomy_rels = gl.save_dataframe_to_kg(assay_investigated_anatomy, 'Assay-INVESTIGATED_ASiA-Anatomy', rel_dir)
print(f"Number of Assay-INVESTIGATED_ASiA-Anatomy relationships: {assay_investigated_anatomy_rels.shape[0]}")
assay_investigated_anatomy_rels.head()

Number of Assay-INVESTIGATED_ASiA-Anatomy relationships: 6071


Unnamed: 0,from,to
0,OSD-100-daab5c089f2a23b4d18cbf45d113535b,UBERON:0004548
0,OSD-100-0ee3281ad117571bed3fd0b4543f80ea,UBERON:0004548
1,OSD-101-59e73a75c7ba08b2907095b0ca9e57e8,UBERON:0001388
1,OSD-101-fefc0a3bd4b859b1e035ced363f4e08f,UBERON:0001388
2,OSD-102-f2039e8d02fe8eb80f4edc65677b0f77,UBERON:0004538


## Create Assay-INVESTIGATED_ASiCT-CellType Relationships

In [55]:
# Cell-type edges are the Assay -> material edges whose target starts with "CL:".
# `na=False` makes a missing material id count as "no match", so the mask is
# strictly boolean and the selection cannot fail on NaN entries.
# `.copy()` yields an independent frame instead of a slice of the parent.
is_cell_type_edge = assay_investigated_material["to"].str.startswith("CL:", na=False)
assay_investigated_cell_type = assay_investigated_material[is_cell_type_edge].copy()

In [56]:
# Save the Assay -> CellType edge list via `gl.save_dataframe_to_kg` (target: `rel_dir`)
# and report the number of rows in the returned frame.
assay_investigated_cell_type_rels = gl.save_dataframe_to_kg(assay_investigated_cell_type, 'Assay-INVESTIGATED_ASiCT-CellType', rel_dir)
print(f"Number of Assay-INVESTIGATED_ASiCT-CellType: {assay_investigated_cell_type_rels.shape[0]}")
assay_investigated_cell_type_rels.head()

Number of Assay-INVESTIGATED_ASiCT-CellType: 1134


Unnamed: 0,from,to
8,OSD-109-311bb16f4e456a48c4f1bc555878f9fd,CL:0000000
8,OSD-109-24d7925b9a73c1ecb5e18e44174fd043,CL:0000000
8,OSD-109-b647183b7061e4076f2e0251f406ad48,CL:0000000
8,OSD-109-850e09fdbf94fc58e5ef83b1c6cf5d64,CL:0000000
8,OSD-109-810f01e6c3e983ca673d6ce6606e6e9e,CL:0000000


## Create Assay Nodes

In [57]:
# Expose the assay name under the column `name`; `assays` keeps the renamed
# column for the cells that follow.
assays = assays.rename(columns={'assay_name': 'name'})

# Columns stored as properties on each Assay node: identity, technology and
# measurement, the two compared factor groups, and the material of each side
# as original string, ontology term name, and ontology id.
assay_prop_columns = [
    "identifier", "name", "technology", "measurement",
    "factors_1", "factors_2",
    "material_1", "material_2",
    "material_name_1", "material_name_2",
    "material_id_1", "material_id_2",
]
assay_props = assays[assay_prop_columns].copy()

# Save the Assay node table (target: `node_dir`) and report the returned row count.
assay_nodes = gl.save_dataframe_to_kg(assay_props, 'Assay', node_dir)
print(f"Number of Assays nodes: {assay_nodes.shape[0]}")
assay_nodes.head()

Number of Assays nodes: 6185


Unnamed: 0,identifier,name,technology,measurement,factors_1,factors_2,material_1,material_2,material_name_1,material_name_2,material_id_1,material_id_2
0,OSD-100-daab5c089f2a23b4d18cbf45d113535b,OSD-100_transcription-profiling_rna-sequencing-(rna-seq),RNA Sequencing (RNA-Seq),transcription profiling,Ground Control,Space Flight,left eye,left eye,left eye,left eye,UBERON:0004548,UBERON:0004548
0,OSD-100-0ee3281ad117571bed3fd0b4543f80ea,OSD-100_transcription-profiling_rna-sequencing-(rna-seq),RNA Sequencing (RNA-Seq),transcription profiling,Space Flight,Ground Control,left eye,left eye,left eye,left eye,UBERON:0004548,UBERON:0004548
1,OSD-101-59e73a75c7ba08b2907095b0ca9e57e8,OSD-101_transcription-profiling_rna-sequencing-(rna-seq)_Illumina,RNA Sequencing (RNA-Seq),transcription profiling,Ground Control,Space Flight,Left gastrocnemius,Left gastrocnemius,gastrocnemius,gastrocnemius,UBERON:0001388,UBERON:0001388
1,OSD-101-fefc0a3bd4b859b1e035ced363f4e08f,OSD-101_transcription-profiling_rna-sequencing-(rna-seq)_Illumina,RNA Sequencing (RNA-Seq),transcription profiling,Space Flight,Ground Control,Left gastrocnemius,Left gastrocnemius,gastrocnemius,gastrocnemius,UBERON:0001388,UBERON:0001388
2,OSD-102-f2039e8d02fe8eb80f4edc65677b0f77,OSD-102_transcription-profiling_rna-sequencing-(rna-seq)_Illumina HiSeq 4000,RNA Sequencing (RNA-Seq),transcription profiling,Ground Control,Space Flight,Left kidney,Left kidney,left kidney,left kidney,UBERON:0004538,UBERON:0004538


In [58]:
# Collect the gene-level measurements for the assays in `assays`.
# NOTE(review): `threshold=0.05` presumably is the significance cutoff on the
# adjusted p-value (the result carries `log2fc` and `adj_p_value` columns, and
# the log reports comparisons without significant data) — confirm in `genelab_utils`.
assay_measured_mgene = gl.extract_transcription_data(assays, threshold=0.05)

processing: OSD-100
processing: OSD-101
processing: OSD-102
processing: OSD-103
processing: OSD-104
processing: OSD-105
processing: OSD-109
No statistically significant data for OSD-109: Log2fc_(Fe-56 ion radiation & 1 day)v(Fe-56 ion radiation & 3 day)
No statistically significant data for OSD-109: Log2fc_(Fe-56 ion radiation & 1 day)v(sham-irradiated & 1 day)
No statistically significant data for OSD-109: Log2fc_(Fe-56 ion radiation & 3 day)v(Fe-56 ion radiation & 1 day)
No statistically significant data for OSD-109: Log2fc_(sham-irradiated & 1 day)v(Fe-56 ion radiation & 1 day)
processing: OSD-117
No statistically significant data for OSD-117: Log2fc_(proton & 1 day)v(proton & 12 day)
No statistically significant data for OSD-117: Log2fc_(proton & 1 day)v(proton & 26 day)
No statistically significant data for OSD-117: Log2fc_(proton & 1 day)v(proton & 5 day)
No statistically significant data for OSD-117: Log2fc_(proton & 1 day)v(sham-irradiated & 1 day)
No statistically significant 

## Create Assay-MEASURED_ASmMG-MGene Relationships

In [59]:
# Save the Assay -> MGene edge list via `gl.save_dataframe_to_kg` (target: `rel_dir`)
# and report the number of rows in the returned frame.
assay_measured_mgene_rels = gl.save_dataframe_to_kg(assay_measured_mgene, 'Assay-MEASURED_ASmMG-MGene', rel_dir)
print(f"Number of Assay-MEASURED_ASmMG-MGene relationships: {assay_measured_mgene_rels.shape[0]}")
assay_measured_mgene_rels.head()

Number of Assay-MEASURED_ASmMG-MGene relationships: 25878862


Unnamed: 0,from,to,log2fc,adj_p_value
0,OSD-100-daab5c089f2a23b4d18cbf45d113535b,23849,0.211489,0.001988
1,OSD-100-daab5c089f2a23b4d18cbf45d113535b,235339,0.355894,0.007801
2,OSD-100-daab5c089f2a23b4d18cbf45d113535b,12444,0.300122,0.040228
3,OSD-100-daab5c089f2a23b4d18cbf45d113535b,66108,0.293361,0.02302
4,OSD-100-daab5c089f2a23b4d18cbf45d113535b,57278,0.320432,0.024716


In [60]:
# Collect the methylation measurements for the assays in `assays`.
# The result feeds the MethylationRegion nodes and both methylation edge lists below.
# NOTE(review): `threshold=0.05` presumably is the significance cutoff on the
# q-value (the result carries a `q_value` column) — confirm in `genelab_utils`.
methylation_data = gl.extract_methylation_data(assays, threshold=0.05)

processing: OSD-103
processing: OSD-105
processing: OSD-47
processing: OSD-48


## Create MethylationRegion Nodes

In [61]:
# The region id doubles as the display name of the node.
methylation_data["name"] = methylation_data["methylation_id"]

# Select the region-level columns and switch to the node table's column names.
region_columns = ["methylation_id", "name", "chr", "start", "end",
                  "dist.to.feature", "in_promoter", "in_exon", "in_intron"]
region_column_names = {
    "methylation_id": "identifier",
    "chr": "chromosome",
    "dist.to.feature": "dist_to_feature",
}
methylation_region = methylation_data[region_columns].rename(columns=region_column_names)
# Store the distance to the feature as an integer.
methylation_region["dist_to_feature"] = methylation_region["dist_to_feature"].astype(int)

In [62]:
# Save the MethylationRegion node table via `gl.save_dataframe_to_kg` (target: `node_dir`)
# and report the number of rows in the returned frame.
methylation_region_nodes = gl.save_dataframe_to_kg(methylation_region, 'MethylationRegion', node_dir)
print(f"Number of MethylationRegion nodes: {methylation_region_nodes.shape[0]}")
methylation_region_nodes.head()

Number of MethylationRegion nodes: 5663


Unnamed: 0,identifier,name,chromosome,start,end,dist_to_feature,in_promoter,in_exon,in_intron
0,1:17167001-17168000,1:17167001-17168000,1,17167001,17168000,114,True,True,True
1,1:21022001-21023000,1:21022001-21023000,1,21022001,21023000,91,True,True,True
2,1:24496001-24497000,1:24496001-24497000,1,24496001,24497000,-4176,False,False,True
3,1:34341001-34342000,1:34341001-34342000,1,34341001,34342000,7476,False,True,True
4,1:36308001-36309000,1:36308001-36309000,1,36308001,36309000,3517,False,True,True


## Create Assay-MEASURED_ASmMR-MethylationRegion Relationships

In [63]:
# Edge list Assay -> MethylationRegion; the methylation difference and
# q-value travel along as edge properties.
assay_measured_methylation_region = methylation_data[
    ["assay_id", "methylation_id", "methylation_diff", "q_value"]
].rename(columns={"assay_id": "from", "methylation_id": "to"})

In [64]:
# Save the Assay -> MethylationRegion edge list via `gl.save_dataframe_to_kg`
# (target: `rel_dir`) and report the number of rows in the returned frame.
assay_measured_methylation_region_rel = gl.save_dataframe_to_kg(assay_measured_methylation_region, 'Assay-MEASURED_ASmMR-MethylationRegion', rel_dir)
print(f"Number of Assay-MEASURED_ASmMR-MethylationRegion relationships: {assay_measured_methylation_region_rel.shape[0]}")
assay_measured_methylation_region_rel.head()

Number of Assay-MEASURED_ASmMR-MethylationRegion relationships: 9555


Unnamed: 0,from,to,methylation_diff,q_value
0,OSD-103-07920f1ed30d671670e714edffc6f250,1:17167001-17168000,8.671846,0.002030562
1,OSD-103-07920f1ed30d671670e714edffc6f250,1:21022001-21023000,11.667655,0.02470766
2,OSD-103-07920f1ed30d671670e714edffc6f250,1:24496001-24497000,-9.373073,0.03356339
3,OSD-103-07920f1ed30d671670e714edffc6f250,1:34341001-34342000,-10.682111,0.02283094
4,OSD-103-07920f1ed30d671670e714edffc6f250,1:36308001-36309000,-28.820344,2.347983e-14


## Create MGene-METHYLATED_IN_MGmMR-MethylationRegion Relationships

In [65]:
# Edge list MGene -> MethylationRegion: the gene's ENTREZID is the source,
# the region id the target.
mgene_methylated_in_methylation_region = methylation_data[
    ["ENTREZID", "methylation_id"]
].rename(columns={"ENTREZID": "from", "methylation_id": "to"})

In [66]:
# Save the MGene -> MethylationRegion edge list via `gl.save_dataframe_to_kg`
# (target: `rel_dir`) and report the number of rows in the returned frame.
mgene_methylated_in_methylation_region_rels = gl.save_dataframe_to_kg(mgene_methylated_in_methylation_region, 'MGene-METHYLATED_IN_MGmMR-MethylationRegion', rel_dir)
print(f"Number of MGene-METHYLATED_IN_MGmMR-MethylationRegion relationships: {mgene_methylated_in_methylation_region_rels.shape[0]}")
mgene_methylated_in_methylation_region_rels.head()

Number of MGene-METHYLATED_IN_MGmMR-MethylationRegion relationships: 5694


Unnamed: 0,from,to
0,57339,1:17167001-17168000
1,71877,1:21022001-21023000
2,12823,1:24496001-24497000
3,13518,1:34341001-34342000
4,214854,1:36308001-36309000
