# End-to-end Workflow

### Connect to instance

In [1]:
import os, time, json
import pandas as pd
import integration_curator, study_curator, sample_curator, expression_curator

# Connection settings, exported as process environment variables.
# NOTE(review): presumably the *_curator clients read these when they are
# instantiated — confirm against the client library.
# '<token>' is a placeholder: supply a real access token before running, and
# avoid committing it with the notebook.
os.environ.update({
    'PRED_SPOT_HOST': 'occam.genestack.com',
    'PRED_SPOT_TOKEN': '<token>',
    'PRED_SPOT_VERSION': 'default-released',
})

# Template ID handed to every import call (study, samples, expression) below.
template_id = 'GSF996995'

### Import study

In [2]:
# Import the demo study from its TSV, using the shared template.
study_api = study_curator.StudySPoTApi()
study_source = study_curator.StudyGenericSource(
    template_id=template_id,
    link='https://bio-test-data.s3.amazonaws.com/odm/demo/study.tsv',
)
study = study_api.add_study(source=study_source)

# Display the response; its 'data' entry carries the new 'genestack:accession'.
study

{'data': {'genestack:accession': 'GSF997221',
  'Study ID': 'Study101',
  'Study Description': 'Transcriptomics of kidney cancer cells',
  'Therapeutic Area': 'Oncology',
  'Contributor': 'Kevin Dialdestoro',
  'Release Date': None,
  'Keywords': 'Kidney, cancer, renal, mammals',
  'Study Title': 'Kidney cancer in mammals'}}

### Import samples

In [3]:
# Import the demo samples from their TSV, using the shared template.
sample_api = sample_curator.SampleSPoTApi()
sample_source = sample_curator.SampleSource(
    template_id=template_id,
    link='https://bio-test-data.s3.amazonaws.com/odm/demo/samples.tsv',
)
samples = sample_api.add_samples(source=sample_source)

# Display the response: one entry per sample, each with a 'data' mapping.
samples

   'The value "ibprofen" for attribute "Sample Treatment / Compound" is not present in the controlled vocabulary "GSF996814".',
   'Value "Fastq Read 1 File" should have "com.genestack.api.metainfo.ExternalLink" type, but has "com.genestack.api.metainfo.StringValue" type',
   'Value "Fastq Read 2 File" should have "com.genestack.api.metainfo.ExternalLink" type, but has "com.genestack.api.metainfo.StringValue" type'],
  'data': {'genestack:accession': 'GSF997223',
   'Sample Source ID': 'Sample1',
   'Subject ID': 'Subject1',
   'Organism': 'Mus musculus',
   'Disease': None,
   'Sex': 'female',
   'Age': 34,
   'Age Unit': 'week',
   'Sample Collection Site': 'Cambridge, UK',
   'Sample Collection Date': None,
   'Tissue': 'liver',
   'Sample Treatment / Compound': 'ibprofen',
   'Sample Treatment / Dose': 2,
   'Sample Treatment / Unit': 'milligram per millilitre',
   'Experiment Instrument': 'Illumina HiSeq 2000',
   'Experiment Type': 'Tx',
   'Library Type': 'paired-end',
   'Libra

### Link samples to study


In [4]:
# Link every imported sample (source) to the imported study (target).
# `samples` must still be the add_samples response here, not the DataFrame
# that a later cell binds to the same name.
for sample in samples:
    link_api = integration_curator.StudyIntegrationApi()
    sample_accession = sample['data']['genestack:accession']
    study_target = study['data']['genestack:accession']
    link_api.create_sample_study_link(
        source_id=sample_accession,
        target_id=study_target,
    )

### Get samples from study

In [6]:
# Accession of the imported study; the query cells further down reuse it.
study_accession = study['data']['genestack:accession']

# Search for samples, filtered by the study's accession.
data = integration_curator.OmicsQueriesApi().search_samples(
    study_filter='genestack:accession=' + study_accession
)

# Collect each hit's 'metadata' mapping and tabulate them, one row per sample.
# Note: this rebinds `samples` from the import response to a DataFrame.
sample_records = [hit['metadata'] for hit in data.data]
samples = pd.DataFrame(sample_records)
samples

Unnamed: 0,genestack:accession,Sample Source ID,Subject ID,Organism,Disease,Sex,Age,Age Unit,Sample Collection Site,Sample Collection Date,...,Library Type,Library Preparation Protocol,Read Length,RNA Integrity Number,Fastq Read 1 File,Fastq Read 2 File,Pipeline ID,BAM File,Read length,Sample Source
0,GSF997223,Sample1,Subject1,Mus musculus,,female,34,week,"Cambridge, UK",,...,paired-end,NGS,,7,~/sample1_1.fq,~/sample1_2.fq,,,75,Internal
1,GSF997224,Sample2,Subject2,Mus musculus,,male,40,week,"Cambridge, UK",,...,paired-end,NGS,,8,~/sample2_1.fq,~/sample2_2.fq,,,75,Internal
2,GSF997225,Sample3,Subject3,Mus musculus,,male,2,week,"Cambridge, UK",,...,paired-end,NGS,,9,~/sample3_1.fq,~/sample3_2.fq,,,75,Internal
3,GSF997226,Sample4,Subject4,Mus musculus,,female,10,week,"Cambridge, UK",,...,paired-end,NGS,,10,~/sample4_1.fq,~/sample4_2.fq,,,150,Internal


### Additional metadata

In [7]:
# Load the secondary-processing metadata (tab-separated) and index it by
# 'Sample Source ID' so rows can be looked up per sample.
additional_metadata_url = 'https://bio-test-data.s3.amazonaws.com/odm/demo/Secondary+processing+metadata.tsv'
additional_metadata_raw = pd.read_csv(additional_metadata_url, sep='\t')
additional_metadata = additional_metadata_raw.set_index('Sample Source ID')
additional_metadata

Unnamed: 0_level_0,Pipeline ID,Pipeline Version,BAM File,% Reads with MAPQ 0-10,% Reads with MAPQ > 40
Sample Source ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sample1,STAR+RSEM,2.2,~/sample1.bam,"0,90%","99,10%"
Sample2,STAR+RSEM,2.2,~/sample2.bam,"0,89%","99,11%"
Sample3,STAR+RSEM,2.2,~/sample3.bam,"0,91%","99,09%"
Sample4,STAR+RSEM,2.2,~/sample4.bam,"0,87%","99,13%"


### Add additional metadata

In [8]:
# Push each row of the additional metadata onto the matching sample.
for sample_id in additional_metadata.index:
    sample_api = sample_curator.SampleSPoTApi()

    # Resolve the accession from the 'Sample Source ID'; the first match is
    # used, and an IndexError is raised if the ID is absent from `samples`.
    is_match = samples['Sample Source ID'] == sample_id
    matching_accessions = samples.loc[is_match, 'genestack:accession'].to_list()
    genestack_accession = matching_accessions[0]

    # Round-trip the row through JSON to obtain a plain dict for the body.
    row_as_json = additional_metadata.loc[sample_id].to_json()
    sample_api.update_sample(
        id=genestack_accession,
        body=json.loads(row_as_json),
    )

### Add processed expression data

In [9]:
# Import the gene-level TPM matrix (GCT) together with its metadata TSV.
expression_api = expression_curator.ExpressionSPoTApi()
expression_source = expression_curator.SignalSource(
    template_id=template_id,
    link='https://bio-test-data.s3.amazonaws.com/odm/demo/expression.genes.TPM.gct',
    metadata_link='https://bio-test-data.s3.amazonaws.com/odm/demo/expression.genes.gct.tsv',
)
expressions = expression_api.add_expression(source=expression_source)

# Display the response; its `runs` pair each 'Sample Source ID' with an accession.
expressions

{'experiment': 'GSF997233',
 'runs': [{'Sample Source ID': 'Sample1', 'genestack:accession': 'GSF997234'},
          {'Sample Source ID': 'Sample2', 'genestack:accession': 'GSF997235'},
          {'Sample Source ID': 'Sample3', 'genestack:accession': 'GSF997236'},
          {'Sample Source ID': 'Sample4', 'genestack:accession': 'GSF997237'}],

### Link expression data to samples

In [10]:
# Link each expression run (source) to the sample (target) that shares its
# 'Sample Source ID'. `samples` must be the DataFrame built from the sample search.
for run in expressions.runs:
    link_api = integration_curator.ExpressionIntegrationApi()
    run_accession = run['genestack:accession']

    # The first matching sample is used; IndexError if there is no match.
    same_source = samples['Sample Source ID'] == run['Sample Source ID']
    sample_accession = samples.loc[same_source, 'genestack:accession'].to_list()[0]

    link_api.create_expression_sample_link(
        source_id=run_accession,
        target_id=sample_accession,
    )

### Query expression data

In [20]:
# Query expression values for one gene, restricted to male samples of the study.
# Note: this rebinds `expressions` from the import response to the query result.
queries_api = integration_curator.OmicsQueriesApi()
gene_study_filter = 'genestack:accession=%s' % study_accession
gene_sample_filter = 'Sex=male'
gene_ex_query = 'Gene=ENSMUSG00000000001.4 MinValue=0.0'

expressions = queries_api.search_expression_data(
    study_filter=gene_study_filter,
    sample_filter=gene_sample_filter,
    ex_query=gene_ex_query,
)
expressions

{'cursor': 'GSF997221#GSF997224#GSF997224#GSF997236-ENSMUSG00000000001.4',
 'data': [{'expression': 0.0,
           'gene': 'ENSMUSG00000000001.4',
           'groupId': 'GSF997233',
           'itemId': 'GSF997235-ENSMUSG00000000001.4',
           'metadata': {'Feature Type': 'Gene',
                        'Genome Version': 'GRCm38_gencode.vM23',
                        'Normalization Method': 'TPM',
                        'Run Source ID': 'Sample2',
                        'Script ID': 'Script1',
                        'Script Version': '1.2'},
           'relationships': {'sample': 'GSF997224'},
           'runId': 'GSF997235'},
          {'expression': 0.0,
           'gene': 'ENSMUSG00000000001.4',
           'groupId': 'GSF997233',
           'itemId': 'GSF997236-ENSMUSG00000000001.4',
           'metadata': {'Feature Type': 'Gene',
                        'Genome Version': 'GRCm38_gencode.vM23',
                        'Normalization Method': 'TPM',
                        'R

### Add additional processed expression data

In [15]:
# Import the transcript-level TPM matrix (GCT) together with its metadata TSV.
transcript_api = expression_curator.ExpressionSPoTApi()
transcript_source = expression_curator.SignalSource(
    template_id=template_id,
    link='https://bio-test-data.s3.amazonaws.com/odm/demo/expression.transcripts.TPM.gct',
    metadata_link='https://bio-test-data.s3.amazonaws.com/odm/demo/expression.transcripts.gct.tsv',
)
expressions = transcript_api.add_expression(source=transcript_source)

# Link each new run (source) to the sample (target) that shares its
# 'Sample Source ID'.
for run in expressions.runs:
    link_api = integration_curator.ExpressionIntegrationApi()
    run_accession = run['genestack:accession']

    # The first matching sample is used; IndexError if there is no match.
    same_source = samples['Sample Source ID'] == run['Sample Source ID']
    sample_accession = samples.loc[same_source, 'genestack:accession'].to_list()[0]

    link_api.create_expression_sample_link(
        source_id=run_accession,
        target_id=sample_accession,
    )

### Query transcript-level expression data

In [19]:
# Query transcript-level values for male samples of the study.
# `ex_filter` narrows results to 'Feature Type' = Transcript; the transcript ID
# is still passed under the `Gene=` key of `ex_query`.
queries_api = integration_curator.OmicsQueriesApi()
transcript_study_filter = 'genestack:accession=%s' % study_accession
transcript_sample_filter = 'Sex=male'
transcript_ex_filter = '"Feature Type"=Transcript'
transcript_ex_query = 'Gene=ENSMUST00000070533.4 MinValue=0.0'

expressions = queries_api.search_expression_data(
    study_filter=transcript_study_filter,
    sample_filter=transcript_sample_filter,
    ex_filter=transcript_ex_filter,
    ex_query=transcript_ex_query,
)
expressions

{'cursor': 'GSF997221#GSF997224#GSF997224#GSF997242-ENSMUST00000070533.4',
 'data': [{'expression': 12.08136551462,
           'gene': 'ENSMUST00000070533.4',
           'groupId': 'GSF997239',
           'itemId': 'GSF997241-ENSMUST00000070533.4',
           'metadata': {'Feature Type': 'Transcript',
                        'Genome Version': 'GRCm38_gencode.vM23',
                        'Normalization Method': 'TPM',
                        'Run Source ID': 'Sample2',
                        'Script ID': 'Script1',
                        'Script Version': '1.2'},
           'relationships': {'sample': 'GSF997224'},
           'runId': 'GSF997241'},
          {'expression': 13.6785082946958,
           'gene': 'ENSMUST00000070533.4',
           'groupId': 'GSF997239',
           'itemId': 'GSF997242-ENSMUST00000070533.4',
           'metadata': {'Feature Type': 'Transcript',
                        'Genome Version': 'GRCm38_gencode.vM23',
                        'Normalization Method