# TWOC Jupyter Notebook

Provided by X-omics.
This is the original file using isatools.


In [1]:
# Set the DMZ working directory; downstream paths are meant to be relative to this directory.
# The location can be overridden without editing the notebook by setting the
# TWOC_WORKING_DIRECTORY environment variable; an unset or empty variable falls back to
# the repository-relative default below.

import os

working_directory = os.environ.get("TWOC_WORKING_DIRECTORY") or "../../data/Su_2020_FAIR"

In [2]:
# Quick sanity check: show the entries of the directory the notebook is running in.
import os
os.listdir('.')

['isa-to-rdfs.py',
 'TWOC_create_isa original.ipynb',
 'TWOC_create_isa.ipynb',
 'README.md',
 '.ipynb_checkpoints',
 'TWOC_test_isa.ipynb']

In [3]:
# Load the individuals-vs-samples overview (patients and controls) and make the
# column names easier to reference by turning every space into an underscore.

import pandas as pd
patient_metadata = pd.read_csv(
    '../../data/Su_2020_original/IDs_Individuals-vs-Samples_incl-all-CONTROLS.csv'
)
# A plain (non-regex) replacement is enough: the pattern is a single literal space.
patient_metadata.columns = patient_metadata.columns.str.replace(' ', '_', regex=False)
patient_metadata


Unnamed: 0,Study_Subject_ID,Sample_ID,BD_Time-Point,Transcript.,Metabol.,Prote.,Sex,Age,Unnamed:_8,*_ref_=_see_earlier_Master_Table_;_**_T2_Transcriptomics_samples_are_available_but_were_not_included_in_our_FAIRification_process
0,INCOV001,INCOV001-BL,T1,Y,Y,Y,ref,ref,,
1,INCOV001,INCOV001-AC,T2,N,Y,Y,ref,ref,,
2,INCOV002,INCOV002-BL,T1,Y,Y,Y,ref,ref,,
3,INCOV002,INCOV002-AC,T2,N,Y,Y,ref,ref,,
4,INCOV003,INCOV003-BL,T1,Y,Y,Y,ref,ref,,
...,...,...,...,...,...,...,...,...,...,...
500,Healthy_1975030,1975030,healthy donor,N,Y,Y,Male,50,,
501,Healthy_1980311,1980311,healthy donor,N,Y,N,Female,65,,
502,Healthy_1982074,1982074,healthy donor,N,Y,N,Female,51,,
503,Healthy_1994373,1994373,healthy donor,N,N,Y,Male,78,,


In [4]:
# Load the per-subject phenotype table. header=1 tells pandas that the column names
# are on the second line of the file; spaces in the names become underscores.

import pandas as pd
phenotype_data = pd.read_csv(
    '../../data/Su_2020_original/TWOC-MultiOmics-Studies-COVID_300Samples.csv',
    header=1,
)
# A plain (non-regex) replacement is enough: the pattern is a single literal space.
phenotype_data.columns = phenotype_data.columns.str.replace(' ', '_', regex=False)
phenotype_data

Unnamed: 0,Study_Subject,Patient,Symptoms_Consistent_with_COVID-19,COVID-19,Sex,Age-Years,Patient_Status,ICU,Study_DOI,Ethnicity,...,COVID-19_Disease_Severity_(WHO_Ordinal)_Scale,Charlson_Comorbidity_Index,APACHE_II_Score,Mechanical_Ventilation,Asthma,COPD,DM,CRP_(mg/L),Ferritin_(ng/mL),IL6
0,INCOV001,YES,YES,YES,Female,77.0,Hospital,NO,https://doi.org/10.1016/j.cell.2020.10.037,Caucasian,...,3.0,,,NO,NO,NO,NO,,,
1,INCOV002,YES,YES,YES,Male,39.0,ICU,YES,https://doi.org/10.1016/j.cell.2020.10.037,Caucasian,...,5.0,,,NO,NO,NO,NO,,,
2,INCOV003,YES,YES,YES,Male,64.0,ICU,YES,https://doi.org/10.1016/j.cell.2020.10.037,Caucasian,...,7.0,,,YES,NO,NO,NO,,,
3,INCOV004,YES,YES,YES,Male,76.0,Hospital,NO,https://doi.org/10.1016/j.cell.2020.10.037,Caucasian,...,4.0,,,NO,NO,YES,NO,,,
4,INCOV005,YES,YES,YES,Male,75.0,Hospital,NO,https://doi.org/10.1016/j.cell.2020.10.037,Caucasian,...,4.0,,,NO,NO,YES,NO,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290,GSM4914341,NO,NO,NO,Male,37.0,Home,NO,https://doi.org/10.1016/j.immuni.2020.11.017,Caucasian,...,0.0,,,NO,,,,,,
291,GSM4914342,NO,NO,NO,Male,44.0,Home,NO,https://doi.org/10.1016/j.immuni.2020.11.017,Caucasian,...,0.0,,,NO,,,,,,
292,GSM4914343,NO,NO,NO,Male,38.0,Home,NO,https://doi.org/10.1016/j.immuni.2020.11.017,Caucasian,...,0.0,,,NO,,,,,,
293,GSM4914344,NO,NO,NO,Female,35.0,Home,NO,https://doi.org/10.1016/j.immuni.2020.11.017,Caucasian,...,0.0,,,NO,,,,,,


In [5]:
# Make every Sample_ID a string (the IDs of some control samples are purely numeric).
patient_metadata = patient_metadata.astype({'Sample_ID': str})

In [6]:
from isatools.model import *
# Registry of the ontology sources referenced by the ISA objects built in this notebook.
# Keys are the short prefixes used for look-ups (e.g. ontologies["OBI"]); each value is an
# isatools OntologySource carrying the ontology's display name, file URL and description.
ontologies = {
    "CHEBI": OntologySource(
        name = "CHEBI - Chemical Entities of Biological Interest",
        file = "http://purl.obolibrary.org/obo/chebi.owl",
        description = "A structured classification of molecular entities of biological interest focusing on 'small' chemical compounds."),
    "CHMO": OntologySource(
        name = "CHMO - Chemical Methods Ontology",
        file = "http://purl.obolibrary.org/obo/chmo.owl",
        description = "CHMO, the chemical methods ontology, describes methods used to collect data in chemical experiments, such as mass spectrometry and electron microscopy prepare and separate material for further analysis, such as sample ionisation, chromatography, and electrophoresis synthesise materials, such as epitaxy and continuous vapour deposition It also describes the instruments used in these experiments, such as mass spectrometers and chromatography columns. It is intended to be complementary to the Ontology for Biomedical Investigations (OBI)."),
    "CRO": OntologySource(
        # The Contributor Role Ontology (CRO) is an extension of the CASRAI Contributor Roles Taxonomy (CRediT) and replaces the former Contribution Ontology.
        name = "CRO - Contributor Role Ontology",
        file = "http://purl.obolibrary.org/obo/cro.owl",
        description = "A classification of the diverse roles performed in the work leading to a published research output in the sciences. Its purpose to provide transparency in contributions to scholarly published work, to enable improved systems of attribution, credit, and accountability."),
    "EDAM": OntologySource(
        name = "EDAM - EMBRACE Data and Methods",
        file = "http://edamontology.org/EDAM.owl",
        description = "Bioinformatics operations, data types, formats, identifiers and topics"),
    "EFO": OntologySource(
        name = "EFO - Experimental Factor Ontology",
        file = "http://www.ebi.ac.uk/efo/efo.owl",
        description = "The Experimental Factor Ontology (EFO) provides a systematic description of many experimental variables available in EBI databases, and for external projects such as the NHGRI GWAS catalogue. It combines parts of several biological ontologies, such as anatomy, disease and chemical compounds. The scope of EFO is to support the annotation, analysis and visualization of data handled by many groups at the EBI and as the core ontology for OpenTargets.org"),
    "ExO": OntologySource(
        name = "ExO - Exposure ontology",
        file = "http://purl.obolibrary.org/obo/exo.owl",
        description = "ExO is intended to bridge the gap between exposure science and diverse environmental health disciplines including toxicology, epidemiology, disease surveillance, and epigenetics."),
    "GECKO": OntologySource(
        name = "GECKO - Genomics Cohorts Knowledge Ontology",
        file = "http://purl.obolibrary.org/obo/gecko.owl"),
    "HP": OntologySource(
        name = "HP - Human Phenotype Ontology",
        file = "http://purl.obolibrary.org/obo/hp.owl",
        description = "The Human Phenotype Ontology (HPO) provides a standardized vocabulary of phenotypic abnormalities and clinical features encountered in human disease."),
    "MI": OntologySource(
        name = "MI - Molecular Interactions Controlled Vocabulary",
        file = "http://purl.obolibrary.org/obo/mi.owl",
        description = "A structured controlled vocabulary for the annotation of experiments concerned with protein-protein interactions."),
    "MS": OntologySource(
        name = "MS - Mass spectrometry ontology",
        file = "http://purl.obolibrary.org/obo/ms.owl",
        description = "A structured controlled vocabulary for the annotation of experiments concerned with proteomics mass spectrometry."),
    "MSIO": OntologySource(
        # The name used to be a copy of the "MS" entry above; MSIO is a different ontology.
        name = "MSIO - Metabolomics Standards Initiative Ontology",
        file = "http://purl.obolibrary.org/obo/msio.owl",
        description = "MSIO aims to provide a single point of entry to support semantic markup of experiments making use of NMR and MS techniques to identify, measure and quantify small molecules known as metabolites. MSIO covers metabolite profiling, targeted or untargeted, tracer based applications. MSIO reuses a number of resources such as CHEBI, DUO, NMRCV, OBI, and STATO."),
    "NCBITAXON": OntologySource(
        name = "NCBI organismal classification",
        file = "http://purl.obolibrary.org/obo/ncbitaxon.owl",
        description = "An ontology representation of the NCBI organismal taxonomy"),
    "NCIT": OntologySource(
        name = "NCI Thesaurus OBO Edition",
        file = "http://purl.obolibrary.org/obo/ncit.owl",
        description = "The NCIt OBO Edition project aims to increase integration of the NCIt with OBO Library ontologies. NCIt is a reference terminology that includes broad coverage of the cancer domain, including cancer related diseases, findings and abnormalities. NCIt OBO Edition releases should be considered experimental."),
    "OBI": OntologySource(
        name = "OBI - Ontology for Biomedical Investigations",
        file = "http://purl.obolibrary.org/obo/obi.owl",
        description = "An integrated ontology for the description of life-science and clinical investigations"),
    "OMIABIS": OntologySource(
        name = "Ontologized MIABIS",
        file = "http://purl.obolibrary.org/obo/omiabis.owl",
        description = "An ontological version of MIABIS (Minimum Information About BIobank data Sharing)"),
    "PRIDE": OntologySource(
        name = "PRIDE Controlled Vocabulary",
        file = "http://purl.obolibrary.org/obo/pride_cv.obo",
        description = "The PRIDE PRoteomics IDEntifications (PRIDE) database is a centralized, standards compliant, public data repository for proteomics data, including protein and peptide identifications, post-translational modifications and supporting spectral evidence."),
    "UBERON": OntologySource(
        name = "Uber-anatomy ontology",
        file = "http://purl.obolibrary.org/obo/uberon.owl",
        description = "Uberon is an integrated cross-species anatomy ontology representing a variety of entities classified according to traditional anatomical criteria such as structure, function and developmental lineage. The ontology includes comprehensive relationships to taxon-specific anatomical ontologies, allowing integration of functional, phenotype and expression data.")
}


In [7]:
# Top-level ISA Investigation describing the TWOC project.
# The publication and the contact person are built first and then handed to the
# Investigation; studies are appended to `investigation.studies` further down.
twoc_preprint = Publication(
    doi="https://doi.org/10.31219/osf.io/9mz27",
    title='Mild as well as severe disease caused by COVID-19 might be part of the same problem: Machine-assisted analysis of congruent clinical observations and the underlying molecular mechanisms in order to rationalise drug repurposing',
    status=OntologyAnnotation(
        term="preprint",
        term_source=ontologies["EFO"],
        term_accession="http://www.ebi.ac.uk/efo/EFO_0010558"),
    author_list="Barend Mons, Peter-Bram 't Hoen, Dirkjan Kuijpers, Thomas Hankemeier, Gianpiero Pescarmona")

twoc_contact = Person(
    last_name="Mons",
    first_name="Barend",
    affiliation="",
    email="info@twoc.eu",
    address="http://www.twoc.eu/",
    roles=[
        OntologyAnnotation(
            term="project management role",
            term_source=ontologies["CRO"],
            term_accession="http://purl.obolibrary.org/obo/CRO_0000065")])

investigation = Investigation(
    filename="i_investigation.txt",
    identifier="",
    title="Trusted World of Corona (TWOC)",
    description="The goal of this trusted guide to the world of COVID-19 is to help clinicians, the scientific community, policy makers and politicians and the public at large to get near real time accurate, expert-annotated and specific information in a modern, user friendly and easily accessible format. The benefits will include better use of treatments, faster development of vaccines and a clearer view on factors that may negatively affect the outcomes of a COVID-19 infection and other future virus outbreaks. Website: https://www.health-holland.com/project/2020/trusted-world-of-corona",
    submission_date="",
    public_release_date="",
    # Every ontology source declared in `ontologies` is referenced by the investigation.
    ontology_source_references=list(ontologies.values()),
    publications=[twoc_preprint],
    contacts=[twoc_contact],
    studies=None,
    comments=None)

In [8]:
# ISA Study for the Su et al. (2020) multi-omics COVID-19 cohort.
# Factors, protocols, assays, sources and samples are left empty here and are filled in
# by later cells (e.g. protocols are appended to `cohort_study.protocols`).
cohort_study = Study(
    filename = "s_study.txt",
    identifier = "https://doi.org/10.1016/j.cell.2020.10.037",
    title = "Multi-Omics Resolves a Sharp Disease-State Shift between Mild and Moderate COVID-19",
    description = "Multi-omics and clinical of patients with mild, moderate and severe corona infection",
    submission_date = "",
    public_release_date = "",
    contacts = [
        Person(
            last_name = "Su",
            # Given name of the first author ("Su Y" in the author list below) is Yapeng.
            first_name = "Yapeng",
            #mid_initials = "",
            affiliation = "Institute for Systems Biology, Seattle, WA 98109, USA",
            roles = [
                OntologyAnnotation(
                    term = "author role",
                    term_source = ontologies["CRO"],
                    term_accession ="http://purl.obolibrary.org/obo/CRO_0000001")]),
        Person(
            last_name = "Heath",
            first_name = "James",
            mid_initials = "R.",
            affiliation = "Institute for Systems Biology, Seattle, WA 98109, USA",
            roles = [
                OntologyAnnotation(
                    term = "author role",
                    term_source = ontologies["CRO"],
                    term_accession ="http://purl.obolibrary.org/obo/CRO_0000001")])],
    design_descriptors = [
        OntologyAnnotation(
                term = "Multi-omics study",
                term_source = ontologies["PRIDE"],
                term_accession = "http://purl.obolibrary.org/obo/PRIDE_0000461"),
        OntologyAnnotation(
                term = "population based study design",
                term_source = ontologies["OMIABIS"],
                term_accession = "http://purl.obolibrary.org/obo/OMIABIS_0001022")],
    publications = [
        Publication(doi="https://doi.org/10.1016/j.cell.2020.10.037", pubmed_id= '33171100',
                    title='Multi-Omics Resolves a Sharp Disease-State Shift between Mild and Moderate COVID-19',
                    status=OntologyAnnotation(term="indexed in PubMed"),
                    author_list="Su Y, Chen D, Yuan D, Lausted C, Choi J, Dai CL, Voillet V, Duvvuri VR, Scherler K, Troisch P, Baloni P, Qin G, Smith B, Kornilov SA, Rostomily C, Xu A, Li J, Dong S, Rothchild A, Zhou J, Murray K, Edmark R, Hong S, Heath JE, Earls J, Zhang R, Xie J, Li S, Roper R, Jones L, Zhou Y, Rowen L, Liu R, Mackay S, O'Mahony DS, Dale CR, Wallick JA, Algren HA, Zager MA; ISB-Swedish COVID19 Biobanking Unit, Wei W, Price ND, Huang S, Subramanian N, Wang K, Magis AT, Hadlock JJ, Hood L, Aderem A, Bluestone JA, Lanier LL, Greenberg PD, Gottardo R, Davis MM, Goldman JD, Heath JR")],
    factors = None,
    protocols = None,
    assays = None,
    sources = None,
    samples = None,
    process_sequence = None,
    other_material = None,
    characteristic_categories = None,
    comments = None,
    units = None)

# Register the study with the investigation.
# NOTE(review): re-running this cell without re-creating `investigation` appends the
# study a second time; restart and run all cells in order to get a clean model.
investigation.studies.append(cohort_study)

In [9]:
# Define protocol parameters
# The parameters are keyed by the name under which the protocol definitions look them up.
# Some carry an ontology-annotated name, the others only a free-text name.


def _ontology_parameter(term, ontology_key, accession):
    """Return a ProtocolParameter whose name is an ontology-annotated term.

    term: label of the ontology term.
    ontology_key: key of the term's source in the `ontologies` dict.
    accession: accession IRI of the term.
    """
    return ProtocolParameter(
        parameter_name=OntologyAnnotation(
            term=term,
            term_source=ontologies[ontology_key],
            term_accession=accession))


protocol_params = {
    "Post Extraction": ProtocolParameter(parameter_name="Post Extraction"),
    "Derivatization": _ontology_parameter(
        "Derivatization", "MSIO", "http://purl.obolibrary.org/obo/MSIO_0000111"),
    "Chromatography Instrument": _ontology_parameter(
        "Chromatography Instrument", "OBI", "http://purl.obolibrary.org/obo/OBI_0000485"),
    "Column model": ProtocolParameter(parameter_name="Column model"),
    "Column type": ProtocolParameter(parameter_name="Column type"),
    "Scan polarity": _ontology_parameter(
        "scan polarity", "MS", "http://purl.obolibrary.org/obo/MS_1000465"),
    # TODO: no term source / accession has been chosen for this parameter yet.
    "Scan m/z range": ProtocolParameter(
        parameter_name=OntologyAnnotation(term="Scan m/z range")),
    "Instrument": _ontology_parameter(
        "Instrument", "MS", "http://purl.obolibrary.org/obo/MS_1000463"),
    "Ion source": _ontology_parameter(
        "Ion source", "CHMO", "http://purl.obolibrary.org/obo/CHMO_0000960"),
    "Mass analyzer": _ontology_parameter(
        "Mass analyzer", "MS", "http://purl.obolibrary.org/obo/MS_1000451"),
    "method reference": _ontology_parameter(
        "method reference", "MI", "http://purl.obolibrary.org/obo/MI_0357"),
    "Panel amount": ProtocolParameter(parameter_name="Panel amount"),
    "Panel size": ProtocolParameter(parameter_name="Panel size"),
    # TODO: free text for now; an earlier, disabled draft annotated this parameter with
    # OBI term http://purl.obolibrary.org/obo/OBI_0003113.
    "PEA Instrument": ProtocolParameter(parameter_name="PEA Instrument"),
    "PCR instrument": _ontology_parameter(
        "real-time PCR machine", "OBI", "http://purl.obolibrary.org/obo/OBI_0001110"),
}

In [10]:
# Define Assays
# One Assay per omics layer; each records what is measured (measurement type) and with
# which technology (technology type), both as ontology annotations.


def _assay_annotation(term, ontology_key, accession):
    """Return an OntologyAnnotation for `term` sourced from `ontologies[ontology_key]`."""
    return OntologyAnnotation(
        term=term,
        term_source=ontologies[ontology_key],
        term_accession=accession)


assays = {
    "metabolomics": Assay(
        filename="a_assay_metabolomics.txt",
        measurement_type=_assay_annotation(
            "targeted metabolite profiling",
            "MSIO",
            "http://purl.obolibrary.org/obo/MSIO_0000100"),
        technology_type=_assay_annotation(
            "liquid chromatography-mass spectrometry",
            "CHMO",
            "http://purl.obolibrary.org/obo/CHMO_0000524")),

    "proteomics": Assay(
        filename="a_assay_proteomics.txt",
        measurement_type=_assay_annotation(
            "plasma proteomics assay",
            "OBI",
            "http://purl.obolibrary.org/obo/OBI_0003229"),
        technology_type=_assay_annotation(
            "analyte assay",
            "OBI",
            "http://purl.obolibrary.org/obo/OBI_0000443")),

    # Measurement and technology type deliberately use the same term here, as two
    # separate annotation objects.
    "transcriptomics": Assay(
        filename="a_assay_transcriptomics.txt",
        measurement_type=_assay_annotation(
            "single-cell RNA sequencing assay",
            "OBI",
            "http://purl.obolibrary.org/obo/OBI_0002631"),
        technology_type=_assay_annotation(
            "single-cell RNA sequencing assay",
            "OBI",
            "http://purl.obolibrary.org/obo/OBI_0002631")),
}

In [11]:
# Define protocols
# Each entry is an isatools Protocol with an ontology-annotated protocol type, optional
# parameters (taken from `protocol_params`) and an optional free-text description.
# After the dict is built, every protocol is attached to `cohort_study`.

protocols = {

    # TODO: Blood draw; first, second draw
    # TODO: Plasma and PBMC isolation
    "sample_collection": Protocol(
        name = "sample_collection_protocol",
        protocol_type = OntologyAnnotation(
            term = "sample collection", # alternative term, original is material sampling process
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0000744"),
        description = "WOS for time of blood draw were determined by manual expert review. WOS for Figure S1A were automatically generated from data extracted from the electronic health record for hospitalized patients, and plotted for 6-hour time intervals based on end-interval grade. Automated results were compared against manual expert review for 15% of study subjects. The following data were collected from the subject’s electronic health record (EHR): complete blood count (CBC) with differential, comprehensive metabolic panel, APTT, D-dimer, fibrinogen, prothrombin time, thrombin time ,and troponin I. Lab data were extracted from the nearest time point to each blood draw, if available within a window ± two days. First blood draw (n = 76), second blood draw (n = 54). Blood draws were classified as WOS = 3-4 (n = 83) and WOS = 5-7 (n = 47). We used an unpaired Wilcoxon-test to determine the statistical difference between WOS = 3-4 and WOS = 5-7, and P values were FDR adjusted. Spearman correlation coefficient was calculated using R package ‘corrplot v0.84’ to observe the associations between EHR labs and WOS disease severity, and the correlation significance was reported as FDR adjusted P values."),

    # metabolomics protocols
    "Extraction": Protocol(
        name = "Extraction",
        protocol_type = OntologyAnnotation(
            term = "extraction",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0302884"), #also in: MSIO, etc.
        parameters = [protocol_params["Post Extraction"],
                      protocol_params["Derivatization"]],
        description = "Plasma and PBMC isolation were conducted with standard protocols from Bloodworks Northwest (Seattle, WA). Patient blood was collected in BD Vacutainer (EDTA) tubes (Becton, Dickinson and Company, Franklin Lakes, NJ). Plasma fractions were collected after centrifuged at 800 x g at 4°C for 10 min, aliquoted, and stored until use at −80°C. The rest of the blood was diluted with PBS (pH7.2) to 2X of the original volume and layered over 15 mL Ficoll (GE Healthcare, Waukesha, WI) in SepMate-50 tubes (StemCell, Vancouver, BC). After centrifuged at 800 x g for 15 min at room temperature, the PBMC layer (did not include granulocytes (such as neutrophils)) was poured into a 50 mL conical tube. The cells were washed twice with autoMACS Rinsing Solution (Miltenyi Biotec, Auburn, CA) and centrifuge at 250 x g for 10 min, at RT. PBMC pellets were gently resuspended in 5 mL Rinsing Solution and a 5 microLiter aliquot was diluted 1:10 v/v for cell counting. Cells in 18 microLiter of diluted samples were first mixed with 2 microLiter of Acridine Orange / Propidium Iodide Stain (Logos Biosystems, Annandale, VA), 10 microLiter was then loaded to a PhotonSlide (Logos Biosystems) and counted in a LUNA FL Dual Fluorescence cell counter (Logos Biosystems). Cryopreservation freeze media CryoStor CS-10 (Biolife Solutions, Bothell, WA) was slowly added to make a concentration of 2.5 million PBMC/ml. Cells were aliquoted in 2.0 mL Cryotube vials (ThermoFisher, Waltham, MA) and frozen in CoolCell LX Cell Freezing Container (Corning, Corning, NY) at −80°C for at least 2 hours before stored in liquid nitrogen until use."),

    "Labelling samples": Protocol(
        name = "Labelling samples",
        protocol_type = OntologyAnnotation(
            term = 'Labelling',
            term_source = ontologies["CHMO"],
            term_accession = 'http://purl.obolibrary.org/obo/CHMO_0001675')
        ),

    "Chromatography": Protocol(
        name = "Chromatography",
        protocol_type = OntologyAnnotation(
            term = "chromatography",
            term_source = ontologies["CHMO"],
            term_accession = "http://purl.obolibrary.org/obo/CHMO_0001000"),
        parameters = [protocol_params["Chromatography Instrument"],
                      protocol_params["Column model"],
                      protocol_params["Column type"]],
        description = "Metabolon (Morrisville, NC, USA) conducted the metabolomics assays for all participant plasma samples used in this study. Data were generated with the Global Metabolomics platform via ultra-high-performance liquid chromatography/tandem accurate mass spectrometry. 100 microLiter of plasma was aliquoted and transported on dry ice to Metabolon Inc. for analysis. Sample handling and quality control were performed by Metabolon in their CLIA-certified laboratory. Mass spectrometry was performed using Metabolon’s ultra-high-performance liquid chromatography/tandem mass spectrometry (UHPLC/MS/MS) Global Platform, which consisting of four independent UPLC–MS/MS instruments, each with a Waters ACQUITY ultra-performance liquid chromatography (UPLC) and Thermo Scientific Q-Exactive high resolution/accurate mass spectrometer interfaced with a heated electrospray ionization (HESI-II) source and Orbitrap mass analyzer at 35,000 mass resolution."),

    "Mass spectrometry": Protocol(
        name = "Mass spectrometry",
        protocol_type = OntologyAnnotation(
            term = "mass spectrometry",
            term_source = ontologies["CHMO"],
            term_accession = "http://purl.obolibrary.org/obo/CHMO_0000470"),
        parameters = [protocol_params["Scan polarity"],
                      protocol_params["Scan m/z range"],
                      protocol_params["Instrument"],
                      protocol_params["Ion source"],
                      protocol_params["Mass analyzer"],
                      protocol_params["method reference"]]
        ),

    "Data transformation": Protocol(
        name = "Data transformation",
        protocol_type = OntologyAnnotation(
            term = "data transformation",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0200000"),
        description = "Raw metabolomics data were median scaled within each batch such that the median value for each metabolite was one"
        ),

    "metabolite identification": Protocol(
        name = 'metabolite identification',
        protocol_type = OntologyAnnotation(
            term = 'metabolite identification',
            term_source = ontologies["MI"],
            term_accession = "http://purl.obolibrary.org/obo/MI_2131")
        ),

    # proteomics protocols
    "Proximity extension assay": Protocol(
        name = "Proximity extension assay",
        protocol_type = OntologyAnnotation(
            term = "Proximity extension assay",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0003113"),
        parameters = [protocol_params["PEA Instrument"],
                      protocol_params["Panel size"],
                      protocol_params["Panel amount"]]
        ),

    # NOTE(review): the protocol type below reuses accession OBI_0001110, which the
    # "PCR instrument" parameter labels "real-time PCR machine" (an instrument, not a
    # process). Consider a process/assay term for the protocol type - confirm with the
    # data curators before changing it.
    "real-time PCR": Protocol(
        name = "real-time PCR",
        protocol_type = OntologyAnnotation(
            term = "real-time PCR instrument",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0001110"),
        parameters = [protocol_params["PCR instrument"]]
        ),

    "feature annotation": Protocol(
        name = "feature annotation",
        protocol_type = OntologyAnnotation(
            term = "Annotation",
            term_source = ontologies["EDAM"],
            term_accession = "http://edamontology.org/operation_0226")
        ),

    # transcriptomics protocols
    "PBMC preparation": Protocol(
        name = "PBMC preparation",
        protocol_type = OntologyAnnotation(
            term = "staining",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0302887"),
        description = "Chromium Single Cell Kits (10x Genomics) were utilized to analyze the transcriptomic, surface protein levels and, TCR sequences simultaneously from the same cell. Experiments were performed according to the manufacturer’s instructions. Briefly, cryopreserved PBMCs were thawed and 1X red blood cell lysis solution (BioLegend) was used to lyse any remaining red blood cells in the PBMC samples. Cells were stained with a panel of TotalSeq-C human antibodies that includes hashtag multiplexing antibodies (BioLegend) detailed in Table S1.8 according to the manufacturer’s protocol."
        ),

    "Single cell sequencing": Protocol(
        name = "Single cell sequencing",
        protocol_type = OntologyAnnotation(
            term = "DNA sequencing assay",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0000626"),
        description =  "Stained cells were then loaded onto a Chromium Next GEM chip G. Cells were lysed for reverse transcription and complementary DNA (cDNA) amplification in the Chromium Controller (10X Genomics). The polyadenylated transcripts were reverse-transcribed inside each gel bead-in-emulsion afterward. Full-length cDNA along with cell barcode identifiers were PCR-amplified and sequencing libraries were prepared and normalized. The constructed library was sequenced on the Novaseq platform (Illumina)."
        ),

    "Single cell RNA-seq data qc": Protocol(
        name = "Single cell RNA-seq data qc",
        protocol_type = OntologyAnnotation(
            term = "quality control testing design",
            term_source = ontologies["OBI"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0001336"),
        description =  "Droplet-based sequencing data were aligned and quantified using the Cell Ranger Single-Cell Software Suite (version 3.0.0, 10x Genomics) against the GRCh38 human reference genome. Cells from each demultiplexed sample were first filtered for cells that expressed a minimum of 200 genes, then they were filtered based on three metrics: 1) the total number of unique molecular identifiers (UMI) counts per cell (library size) must be less than 10000; 2) the number of detected genes per cell must be less than 2500; and 3) the proportion of mitochondrial gene counts (UMIs from mitochondrial genes / total UMIs) must be less than 10%. Doublets were either simultaneously identified in sample demultiplexing or identified using scrublet (Wolock et al., 2019) and were removed prior to the aforementioned filtering. After QC metric filtering, a total of 559,583 cells were retained for downstream analysis. Scanpy (Wolf et al., 2018) was used to normalize cells via CPM normalization (UMI total count of each cell was set to 106) and log1p transformation (natural log of CPM plus one)."
        ),

    "Pseudo-bulk aggregation": Protocol(
        name = "Pseudo-bulk aggregation",
        protocol_type = OntologyAnnotation(
            term = "pseudo-bulk aggregation of single-cell expression data",
            term_source = ontologies["EFO"],
            term_accession =  "http://www.ebi.ac.uk/efo/EFO_0030053")
        # TODO: add a description
        )

    # TODO: Single cell RNA-seq cell type identification?
    # TODO: Single cell RNA-seq batch information?
    # TODO: Single cell RNA-seq gene regulatory networks?
    # TODO: Single cell RNA-seq signature score?
    # TODO: Single cell RNA-seq marker selection?

}


# append to study protocols
# NOTE(review): re-running this cell without re-creating `cohort_study` appends every
# protocol a second time; restart and run all cells in order to get a clean model.
for protocol in protocols.values():
    cohort_study.protocols.append(protocol)

In [12]:
# add samples
#
# For every row of patient_metadata this cell creates a Source (the individual),
# attaches the phenotype characteristics of matching phenotype_data rows, creates
# the blood Sample derived from that source and records the sample collection
# process on cohort_study.
#
# NOTE(review): one Source object is created per patient_metadata row, so a
# subject with several samples gets several Source objects of the same name, the
# sample collection process name is built from the subject id (not the sample
# id), and sources are not appended to cohort_study.sources here -- confirm this
# is intended.

# One entry per phenotype characteristic, in the order they are appended:
# (category term, ontology key, category term accession, phenotype_data column)
phenotype_characteristic_specs = [
    ("patient", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C16960", "Patient"),
    ("COVID-19 Infection", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C171133", "COVID-19"),
    ("Sex", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C28421", "Sex"),
    ("Age-Years", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C37908", "Age-Years"),
    ("Patient Status", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C166244", "Patient_Status"),
    ("Intensive Care Unit", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C53511", "ICU"),
    ("Ethnicity", "GECKO", "http://purl.obolibrary.org/obo/GECKO_0000061", "Ethnicity"),
    ("BMI", "ExO", "http://purl.obolibrary.org/obo/ExO_0000105", "BMI"),
    ("Smoking", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C154329", "Smoking"),
    ("COVID-19_Disease_Severity_(WHO_Ordinal)_Scale", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C178899", "COVID-19_Disease_Severity_(WHO_Ordinal)_Scale"),
    ("Charlson_Comorbidity_Index", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C176422", "Charlson_Comorbidity_Index"),
    ("APACHE_II_Score", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C121113", "APACHE_II_Score"),
    ("Mechanical_Ventilation", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C70909", "Mechanical_Ventilation"),
    ("Asthma", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C28397", "Asthma"),
    ("COPD", "HP", "http://purl.obolibrary.org/obo/HP_0006510", "COPD"),
    ("DM", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C2985", "DM"),
    ("CRP_(mg/L)", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C64548", "CRP_(mg/L)"),
    ("Ferritin_(ng/mL)", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C74737", "Ferritin_(ng/mL)"),
    ("IL6", "NCIT", "http://purl.obolibrary.org/obo/NCIT_C74834", "IL6"),
]

# Index the phenotype rows by subject once, instead of re-scanning the whole
# phenotype table for every patient_metadata row. Rows keep their original
# order; rows without a subject id can never match a source and are left out.
phenotype_rows_by_subject = {}
for _, pheno_row in phenotype_data.iterrows():
    subject_id = pheno_row["Study_Subject"]
    if pd.notna(subject_id):
        phenotype_rows_by_subject.setdefault(subject_id, []).append(pheno_row)

for index, row in patient_metadata.iterrows():

    # create source (=individual)
    source_name = row["Study_Subject_ID"]
    source = Source(
        name = source_name,
        characteristics = [
            Characteristic(
                category = OntologyAnnotation(
                    term = "Organism",
                    term_source = ontologies["OBI"],
                    term_accession = "http://purl.obolibrary.org/obo/OBI_0100026"),
                value = OntologyAnnotation(
                    term = "Homo sapiens",
                    term_source = ontologies["NCBITAXON"],
                    term_accession = "http://purl.obolibrary.org/obo/NCBITaxon_9606"))])

    # Add phenotype information (if available for this subject)
    for pheno_row in phenotype_rows_by_subject.get(source_name, []):
        for term, ontology_key, accession, column in phenotype_characteristic_specs:
            source.characteristics.append(
                Characteristic(
                    category = OntologyAnnotation(
                        term = term,
                        term_source = ontologies[ontology_key],
                        term_accession = accession),
                    value = pheno_row[column]))

    # create sample
    sample_name = row["Sample_ID"]
    blood_sample = Sample(
        name = sample_name,
        derives_from = [source])
    blood_sample.characteristics.append(
        Characteristic(
            category = OntologyAnnotation(
                term = "anatomical entity",
                term_source = ontologies["UBERON"],
                term_accession = "http://purl.obolibrary.org/obo/UBERON_0001062"),
            value = OntologyAnnotation(
                term = "blood",
                term_source = ontologies["UBERON"],
                term_accession = "http://purl.obolibrary.org/obo/UBERON_0000178")))
    cohort_study.samples.append(blood_sample)

    # Sample collection process: source -> blood sample
    sample_collection_process = Process(
        name = "samplecollection_{0}".format(row["Study_Subject_ID"]),
        executes_protocol = protocols["sample_collection"],
        inputs = [source],
        outputs = [blood_sample])
    cohort_study.process_sequence.append(sample_collection_process)

In [13]:
# Add samples to metabolomics assay

# Metabolomics subdirectory (relative to the working directory)
metabolomics_dir = os.path.join(working_directory, "metabolomics")

# Data files shared by every metabolomics sample (not all are available:
# the first three are placeholders without a real file behind them)
raw_datafile = DataFile(
    filename = "blank",
    label = "Raw Spectral Data File",
)
normalized_datafile = DataFile(
    filename = "",
    label = "Normalization Name",
)
derived_spectral_datafile = DataFile(
    filename = "",
    label = "Derived Spectral Data File",
)
transformation_datafile = DataFile(
    filename = os.path.join(metabolomics_dir, "metabolomics_Su_2020_feature-data.csv"),
    label = "Derived Transformation Name",
)
MAF = DataFile(
    filename = os.path.join(metabolomics_dir, "metabolomics_Su_2020_feature-metadata.csv"),
    label = "Metabolite Assignment File",
)

# Build the metabolomics process chain for every study sample that has
# metabolomics data:
#   extraction -> labelling -> chromatography -> mass spectrometry
#   -> data transformation -> metabolite identification
for index, row in patient_metadata.iterrows():
    # The availability flag belongs to the metadata row, not to the sample,
    # so it is checked once per row.
    if row["Metabol."] != 'Y':
        continue

    for sample in cohort_study.samples:
        if sample.name != row["Sample_ID"]:
            continue

        # Metabolomics extraction
        material_extract = Material(
            name = "extract_{0}".format(sample.name),
            type_ = "Extract Name"
        )

        Post_extraction = ParameterValue(
            category = protocol_params["Post Extraction"],
            value = "100microLiter of plasma was aliquoted and transported on dry ice"
        )

        Derivatization = ParameterValue(
            category = protocol_params["Derivatization"],
            value = "Handled by Metabolon Inc."
        ) # Metabolon?

        extraction_process = Process(
            executes_protocol = protocols["Extraction"],
            parameter_values = [Post_extraction, Derivatization],
            inputs = [sample],
            outputs = [material_extract]
        )

        ## Labeling
        material_label = Material(
            name = "labeled_{0}".format(sample.name),
            type_ = "Labeled Extract Name"
        )

        labelling_process = Process(
            executes_protocol = protocols["Labelling samples"],
            inputs = [extraction_process.outputs[0]],
            outputs = [material_label]
        )

        ## Chromatography
        chromatography_instrument = ParameterValue(
            category = protocol_params["Chromatography Instrument"],
            value = "Waters ACQUITY HPLC"
        )

        column_model = ParameterValue(
            category = protocol_params["Column model"],
            value = ""
        ) #TODO unspecified in article???

        column_type = ParameterValue(
            category = protocol_params["Column type"],
            value = ""
        ) #TODO unspecified in article???

        chromatography_process = Process(
            name = "chromatography_{0}".format(sample.name),
            executes_protocol = protocols["Chromatography"],
            parameter_values = [chromatography_instrument, column_model, column_type],
            inputs = [labelling_process.outputs[0]],
            outputs = []
        )

        ## Mass spectrometry
        scan_polarity = ParameterValue(
            category = protocol_params["Scan polarity"],
            value = ""
        ) #TODO unspecified in article

        scan_range = ParameterValue(
            category = protocol_params["Scan m/z range"],
            value = ""
        ) #TODO unspecified in article

        ms_instrument = ParameterValue(
            category = protocol_params["Instrument"],
            value = "Thermo Scientific Q-Exactive"
        )

        ion_source = ParameterValue(
            category = protocol_params["Ion source"],
            value = "HESI-II"
        )

        mass_analyzer = ParameterValue(
            category = protocol_params["Mass analyzer"],
            value = "Orbitrap"
        )

        mass_spectrometry_process = Process(
            name = "mass_spectrometry_{0}".format(sample.name),
            executes_protocol = protocols["Mass spectrometry"],
            parameter_values = [scan_polarity, scan_range, ms_instrument, ion_source, mass_analyzer],
            inputs = [],
            outputs = [raw_datafile]
        )

        ## Data transformation
        data_transformation_process = Process(
            name = "data_transformation_{0}".format(sample.name),
            executes_protocol = protocols["Data transformation"],
            inputs = [raw_datafile],
            outputs = [normalized_datafile, derived_spectral_datafile]
        )

        ## Metabolite identification
        metabolite_identification_process = Process(
            name = "metabolite_identification_{0}".format(sample.name),
            executes_protocol = protocols["metabolite identification"],
            inputs = [normalized_datafile],
            outputs = [transformation_datafile, MAF]
        )

        # Link processes
        plink(extraction_process, labelling_process)
        plink(labelling_process, chromatography_process)
        plink(chromatography_process, mass_spectrometry_process)
        plink(mass_spectrometry_process, data_transformation_process)
        plink(data_transformation_process, metabolite_identification_process)

        # Add sample, materials and data files to the metabolomics assay.
        # The matched `sample` is registered here; `blood_sample` is only the
        # last sample left over from the sample-creation cell.
        assays["metabolomics"].samples.append(sample)
        assays["metabolomics"].other_material.append(material_extract)
        assays["metabolomics"].other_material.append(material_label)
        assays["metabolomics"].data_files.append(raw_datafile)
        assays["metabolomics"].data_files.append(normalized_datafile)
        assays["metabolomics"].data_files.append(derived_spectral_datafile)
        assays["metabolomics"].data_files.append(transformation_datafile)
        assays["metabolomics"].data_files.append(MAF)

        ## Add processes to the metabolomics assay
        assays["metabolomics"].process_sequence.append(extraction_process)
        assays["metabolomics"].process_sequence.append(labelling_process)
        assays["metabolomics"].process_sequence.append(chromatography_process)
        assays["metabolomics"].process_sequence.append(mass_spectrometry_process)
        assays["metabolomics"].process_sequence.append(data_transformation_process)
        assays["metabolomics"].process_sequence.append(metabolite_identification_process)
    

In [14]:
# Add samples to proteomics assay

# Proteomics subdirectory (relative to the working directory)
proteomics_dir = os.path.join(working_directory, "proteomics")

# Data files shared by every proteomics sample (not all are available)
raw_datafile = DataFile(
    filename = os.path.join(proteomics_dir, "proteomics_Su_2020_feature-data.csv"),
    label = "Raw Spectral Data File",
)
feature_annotation_file = DataFile(
    filename = os.path.join(proteomics_dir, "proteomics_Su_2020_feature-metadata.csv"),
    label = "Feature Annotation File",
)



# Build the proteomics process chain for every study sample that has
# proteomics data:
#   extraction -> labelling -> proximity extension assay -> real-time PCR
#   -> feature annotation
for index, row in patient_metadata.iterrows():
    # The availability flag belongs to the metadata row, not to the sample,
    # so it is checked once per row.
    if row["Prote."] != 'Y':
        continue

    for sample in cohort_study.samples:
        if sample.name != row["Sample_ID"]:
            continue

        # proteomics extraction
        material_extract = Material(
            name = "extract_{0}".format(sample.name),
            type_ = "Extract Name"
        )

        Post_extraction = ParameterValue(
            category = protocol_params["Post Extraction"],
            value = "100microLiter of plasma was aliquoted and transported on dry ice"
        )

        Derivatization = ParameterValue(
            category = protocol_params["Derivatization"],
            value = ""
        ) #TODO unspecified in articles

        extraction_process = Process(
            executes_protocol = protocols["Extraction"],
            parameter_values = [Post_extraction, Derivatization],
            inputs = [sample],
            outputs = [material_extract]
        )

        ## Labeling
        material_label = Material(
            name = "labeled_{0}".format(sample.name),
            type_ = "Labeled Extract Name"
        )

        labelling_process = Process(
            executes_protocol = protocols["Labelling samples"],
            inputs = [extraction_process.outputs[0]],
            outputs = [material_label]
        )

        ## Proximity extension assay
        pea_instrument = ParameterValue(
            category = protocol_params["PEA Instrument"],
            value = "Olink Bioscience"
        )

        Panel_size = ParameterValue(
            category = protocol_params["Panel size"],
            value = "92"
        )

        Panel_amount = ParameterValue(
            category = protocol_params["Panel amount"],
            value = "5"
        )

        pea_process = Process(
            name = "PEA_{0}".format(sample.name),
            executes_protocol = protocols["Proximity extension assay"],
            parameter_values = [pea_instrument, Panel_size, Panel_amount],
            inputs = [labelling_process.outputs[0]],
            outputs = []
        )

        ## real-time PCR
        pcr_instrument = ParameterValue(
            category = protocol_params["PCR instrument"],
            value = "high-throughput microfludic real-time PCR"
        )

        rt_pcr_process = Process(
            name = "real-time PCR_{0}".format(sample.name),
            executes_protocol = protocols["real-time PCR"],
            parameter_values = [pcr_instrument],
            inputs = [],
            outputs = [raw_datafile]
        )

        ## feature annotation
        feature_annotation_process = Process(
            name = "feature_annotation_{0}".format(sample.name),
            executes_protocol = protocols["feature annotation"],
            inputs = [raw_datafile],
            outputs = [feature_annotation_file]
        )

        # Link processes
        plink(extraction_process, labelling_process)
        plink(labelling_process, pea_process)
        plink(pea_process, rt_pcr_process)
        plink(rt_pcr_process, feature_annotation_process)

        # Add sample, materials and data files to the proteomics assay.
        # The matched `sample` is registered here; `blood_sample` is only the
        # last sample left over from the sample-creation cell.
        assays["proteomics"].samples.append(sample)
        assays["proteomics"].other_material.append(material_extract)
        assays["proteomics"].other_material.append(material_label)
        assays["proteomics"].data_files.append(raw_datafile)
        assays["proteomics"].data_files.append(feature_annotation_file)

        ## Add processes to the proteomics assay
        assays["proteomics"].process_sequence.append(extraction_process)
        assays["proteomics"].process_sequence.append(labelling_process)
        assays["proteomics"].process_sequence.append(pea_process)
        assays["proteomics"].process_sequence.append(rt_pcr_process)
        assays["proteomics"].process_sequence.append(feature_annotation_process)

In [15]:
# Add samples to transcriptomics assay

# Transcriptomics subdirectory (relative to the working directory)
transcriptomics_dir = os.path.join(working_directory, "transcriptomics")

# Data files shared by every transcriptomics sample (not all are available).
# The feature-data file name must not start with a space, otherwise the
# recorded path differs from the naming used for the other feature-data files.
pseudobulked_file = DataFile(
    filename = os.path.join(transcriptomics_dir, "transcriptomics_Su_2020_feature-data.csv"),
    label = "Pseudobulk File",
)
feature_annotation_file = DataFile(
    filename = os.path.join(transcriptomics_dir, "transcriptomics_Su_2020_feature-metadata.csv"),
    label = "Feature Annotation File",
)



# Build the transcriptomics process chain for every study sample that has
# transcriptomics data:
#   extraction -> single cell sequencing -> sequencing data QC
#   -> pseudo-bulk aggregation -> feature annotation
for index, row in patient_metadata.iterrows():
    # The availability flag belongs to the metadata row, not to the sample,
    # so it is checked once per row.
    if row["Transcript."] != 'Y':
        continue

    for sample in cohort_study.samples:
        if sample.name != row["Sample_ID"]:
            continue

        # transcriptomics extraction
        material_extract = Material(
            name = "extract_{0}".format(sample.name),
            type_ = "Extract Name"
        )

        # NOTE: these two parameter values are built but currently not attached
        # to the extraction process (parameter_values is commented out below).
        Post_extraction = ParameterValue(
            category = protocol_params["Post Extraction"],
            value = " Cells were aliquoted in 2.0 mL Cryotube vials (ThermoFisher, Waltham, MA) and frozen in CoolCell LX Cell Freezing Container (Corning, Corning, NY) at −80°C for at least 2 hours before stored in liquid nitrogen until use."
        )

        Derivatization = ParameterValue(
            category = protocol_params["Derivatization"],
            value = ""
        ) #TODO unspecified in articles

        extraction_process = Process(
            executes_protocol = protocols["Extraction"],
            #parameter_values=[Post_extraction, Derivatization],
            inputs = [sample],
            outputs = [material_extract]
        )

        # Single cell transcriptomics cell preparation (disabled)
        # material_label = Material(
        #     name = "labeled_{0}".format(sample.name),
        #     type_ = "Labeled Extract Name"
        # )
        #
        # cell_preparation_process = Process(
        #     executes_protocol=protocols["Labelling samples"],
        #     inputs = [material_extract],
        #     outputs = [material_label]
        # )

        # Sequencing
        sc_sequences_raw_file = DataFile(
            filename = "scs_rawdata_{0}_R1.tsv".format(material_extract.name),
            label = "Raw Data File R1",
            generated_from = [material_extract])

        sequencing_process = Process(
            executes_protocol = protocols["Single cell sequencing"],
            # parameter_values=[feature_metadata_file],
            inputs = [material_extract],
            outputs = [sc_sequences_raw_file]
        )

        # Sequencing data QC
        sc_sequences_qc_file = DataFile(
            filename = "scs_qc_{0}.tsv".format(material_extract.name),
            label = "Single-cell Sequencing Raw Data File",
            generated_from = [sc_sequences_raw_file])

        sequencing_data_qc_process = Process(
            executes_protocol = protocols["Single cell RNA-seq data qc"],
            inputs = [sc_sequences_raw_file],
            outputs = [sc_sequences_qc_file]
        )

        # Pseudo-bulk aggregation
        pseudobulk_process = Process(
            executes_protocol = protocols["Pseudo-bulk aggregation"],
            inputs = [sc_sequences_qc_file],
            outputs = [pseudobulked_file]
        )

        ## feature annotation
        feature_annotation_process = Process(
            name = "feature_annotation_{0}".format(material_extract.name),
            executes_protocol = protocols["feature annotation"],
            inputs = [pseudobulked_file],
            outputs = [feature_annotation_file]
        )

        # Link processes
        # plink(extraction_process, cell_preparation_process)
        plink(extraction_process, sequencing_process)
        plink(sequencing_process, sequencing_data_qc_process)
        plink(sequencing_data_qc_process, pseudobulk_process)
        plink(pseudobulk_process, feature_annotation_process)

        # Add sample, materials and data files to the transcriptomics assay.
        # The matched `sample` is registered here; `blood_sample` is only the
        # last sample left over from the sample-creation cell.
        assays["transcriptomics"].samples.append(sample)
        assays["transcriptomics"].other_material.append(material_extract)
        # assays["transcriptomics"].other_material.append(material_label)
        assays["transcriptomics"].data_files.append(sc_sequences_raw_file)
        assays["transcriptomics"].data_files.append(sc_sequences_qc_file)
        assays["transcriptomics"].data_files.append(pseudobulked_file)
        assays["transcriptomics"].data_files.append(feature_annotation_file)

        ## Add processes to the transcriptomics assay
        assays["transcriptomics"].process_sequence.append(extraction_process)
        # assays["transcriptomics"].process_sequence.append(cell_preparation_process)
        assays["transcriptomics"].process_sequence.append(sequencing_process)
        assays["transcriptomics"].process_sequence.append(sequencing_data_qc_process)
        assays["transcriptomics"].process_sequence.append(pseudobulk_process)
        assays["transcriptomics"].process_sequence.append(feature_annotation_process)

In [16]:
# Attach every assay built above to the cohort study, in dict order
cohort_study.assays.extend(assays.values())

## Write ISA-Tab files

In [17]:
# Write to ISA-Tab
from isatools import isatab
# Dump the investigation into working_directory (the directory set at the top
# of the notebook).
isatab.dump(investigation, working_directory)
# Ending the cell on print() means the last expression evaluates to None --
# presumably to keep the return value of isatab.dump from being echoed as the
# cell output.
print()

2023-06-02 10:04:08,049 [INFO]: graph.py(_all_end_to_end_paths:20) >> [0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96, 99, 102, 105, 108, 111, 114, 117, 120, 123, 126, 129, 132, 135, 138, 141, 144, 147, 150, 153, 156, 159, 162, 165, 168, 171, 174, 177, 180, 183, 186, 189, 192, 195, 198, 201, 204, 207, 210, 213, 216, 219, 222, 225, 228, 231, 234, 237, 240, 243, 246, 249, 252, 255, 258, 261, 264, 267, 270, 273, 276, 279, 282, 285, 288, 291, 294, 297, 300, 303, 306, 309, 312, 315, 318, 321, 324, 327, 330, 333, 336, 339, 342, 345, 348, 351, 354, 357, 360, 363, 366, 369, 372, 375, 378, 381, 384, 387, 390, 393, 396, 399, 402, 405, 408, 411, 414, 417, 420, 423, 426, 429, 432, 435, 438, 441, 444, 447, 450, 453, 456, 459, 462, 465, 468, 471, 474, 477, 480, 483, 486, 489, 492, 495, 498, 501, 504, 507, 510, 513, 516, 519, 522, 525, 528, 531, 534, 537, 540, 543, 546, 549, 552, 555, 558, 561, 564, 567, 570, 573, 576, 57




## Write to ISA-JSON

In [18]:
import json
from isatools.isajson import ISAJSONEncoder
# Serialise the investigation to "isa.json" inside the working directory,
# using the ISA-specific JSON encoder with stable, human-readable formatting.
isa_json_path = os.path.join(working_directory, "isa.json")
isa_json_options = {
    "cls": ISAJSONEncoder,
    "sort_keys": True,
    "indent": 4,
    "separators": (',', ': '),
}
with open(isa_json_path, "w") as isa_json_file:
    json.dump(investigation, isa_json_file, **isa_json_options)