# Data Loading via Script

### Load and link study, samples, expression, variant, and flow cytometry data

In [1]:
!python 'load_and_link_data_for_odm.py' --token '<token>' \
--study https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g.study.tsv \
--samples https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g.samples.tsv \
--expression_metadata https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g_tx.gct.tsv \
--expression https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g_tx.gct \
--expression_metadata https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g_tx_log2.gct.tsv \
--expression https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g_tx_log2.gct \
--expression_metadata https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g_px.gct.tsv \
--expression https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g_px.gct \
--flow_cytometry https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g.facs \
--flow_cytometry_metadata https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g.facs.tsv \
--variant https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g.vcf \
--variant_metadata https://bio-test-data.s3.amazonaws.com/1000gEFV/Test_1000g.vcf.tsv \
--template GSF991585 \
--server https://occam.genestack.com --debug


This script is used for uploading and linking the data from public FTP/HTTP. Script uses 'Default template' if nothing else is specified.
See https://odm-user-guide.readthedocs.io/en/latest/doc-odm-user-guide/import-data-using-api.html
* Make sure you have a Genestack API token first.
[31m* Mandatory! [0mYou need to provide your token via parameter -t or --token [TOKEN]. 
[31m* Mandatory! [0mDon't forget to provide the link to study file in valid format via -s or --study [URL]
  OR(!) you can provide accession of existing study via parameter -sa or --study_accession [ACCESSION].
  You're allowed to set only one of them (link to study or accession of existing study) 
[31m* Mandatory! [0mProvide link of samples file in valid format via -sm or --samples [URL]. 
* Optional. Provide expression data file and expression metadata file 
  via -e or --expression [URL] and -em or --expression_metadata [URL] respectively. 
[32m* Optional. [0mProvide variant data file and variant metadata 

### Set instance and token (to query the uploaded data via API)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import integration_curator, os, statistics, time

os.environ['PRED_SPOT_HOST'] = 'occam.genestack.com'
os.environ['PRED_SPOT_TOKEN'] = '<token>'
os.environ['PRED_SPOT_VERSION'] = 'default-released'

omics_api = integration_curator.OmicsQueriesApi()

### Get samples

In [2]:
start = time.time()
data = omics_api.search_samples(
    study_filter='genestack:accession=GSF996648', 
    sample_filter='"Population"="British" OR "Population"="Finnish"'
)
samples = pd.DataFrame.from_dict([item['metadata'] for item in data.data])
print('Time to get %s samples: %i seconds\n' % (samples.shape[0], time.time()-start))

samples.head()

Time to get 4 samples: 0 seconds



Unnamed: 0,genestack:accession,Sample Source ID,Organism,Disease,Tissue,Age,Sex,Compound / Compound,Compound / Dose,Compound / Dose Unit,My own attribute,Sample Source,Raw FACS,Raw Variant,Raw Expression,Population
0,GSF996650,HG00119,,,,,M,,,,,1000 Genomes Project,~/Flow/HG00119.facs,~/DNA/HG00119.fq,~/RNA/HG00119.fq,British
1,GSF996651,HG00121,,,,,F,,,,,1000 Genomes Project,~/Flow/HG00121.facs,~/DNA/HG00121.fq,~/RNA/HG00121.fq,British
2,GSF996652,HG00183,,,,,M,,,,,1000 Genomes Project,~/Flow/HG00183.facs,~/DNA/HG00183.fq,~/RNA/HG00183.fq,Finnish
3,GSF996653,HG00176,,,,,F,,,,,1000 Genomes Project,~/Flow/HG00176.facs,~/DNA/HG00176.fq,~/RNA/HG00176.fq,Finnish


### Get and compare genotypes across groups

In [3]:
start = time.time()
data = omics_api.search_variant_data(
    study_filter='genestack:accession=GSF996648',
    sample_filter='"Population"="British" OR "Population"="Finnish"',
    vx_query='Intervals=4:142142600-142143000',
    page_limit=20000
)

def normalise_genotype(gt): return '1|0' if gt == '0|1' else gt
genotypes = pd.DataFrame.from_dict({'genestack:accession': x['relationships']['sample'], 
                                    'Genotype': normalise_genotype(x['genotype']['GT']),
                                    'Location': '%s:%s' % (x['contig'],x['start']),
                                    'ID': ', '.join(x['variationId']),
                                    'Ref / Alt': x['reference'] + ' / ' + ', '.join(x['alteration'])
                                   } 
                                   for x in data.data)

print('Time to get %s genotypes: %i seconds\n' % (genotypes.shape[0], time.time()-start))

samples_genotypes = pd.merge(samples, genotypes)
def f(x):
    d = {}
    for group in x['Population']:
        genotypes = x.loc[x['Population'] == group, 'Genotype']
        genotypes = '|'.join(genotypes).split('|')
        ac = sum([gt == '1' for gt in genotypes])
        an = ac + sum([gt == '0' for gt in genotypes])
        af = round(ac/an, 2)
        d[group+' AF (AC/AN)'] = '%s (%s/%s)' % (af, ac, an)
    return pd.Series(d)
    
samples_genotypes.groupby(['ID', 'Location', 'Ref / Alt']).apply(f)

Time to get 8 genotypes: 0 seconds



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,British AF (AC/AN),Finnish AF (AC/AN)
ID,Location,Ref / Alt,Unnamed: 3_level_1,Unnamed: 4_level_1
rs17007017,4:142142653,G / A,0.25 (1/4),0.75 (3/4)
rs201966773,4:142142999,T / TC,0.25 (1/4),1.0 (4/4)


### Get and compare expression values across groups

In [6]:
start = time.time()
data = omics_api.search_expression_data(
    study_filter='genestack:accession=GSF996648',
    sample_filter='"Population"="British" OR "Population"="Finnish"',
    ex_query = 'Gene=ENSG00000109445 MinValue=0.0',
    ex_filter='"Data Species"="Proteomics (Mass spectrometry)"'
)

expressions = pd.DataFrame.from_dict({'genestack:accession': item['relationships']['sample'], 
                                      'expression': item['expression'],
                                     'Gene': item['gene']} for item in data.data)

print('Time to get %s expression values: %i seconds\n' % (expressions.shape[0], time.time()-start))

samples_expressions = pd.merge(samples, expressions)
def f(x):
    d = {}
    for group in x['Population']:
        exprs = x.loc[x['Population'] == group, 'expression']
        quartiles = exprs.quantile([.25, .5, .75]) 
        d[group+' Median Expression (Q1, Q3)'] = '%i (%i, %i)' % (quartiles.iloc[0], quartiles.iloc[1], quartiles.iloc[2])
    return pd.Series(d)
    
samples_expressions.groupby(['Gene']).apply(f)

Time to get 4 expression values: 0 seconds



Unnamed: 0_level_0,"British Median Expression (Q1, Q3)","Finnish Median Expression (Q1, Q3)"
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000109445,"15 (15, 16)","17 (18, 18)"


### Get and compare flow cytometry counts across groups

In [8]:
start = time.time()
data = omics_api.search_flow_cytometry_data(
    study_filter='genestack:accession=GSF996648',
    sample_filter='"Population"="British" OR "Population"="Finnish"',
    fx_query='Marker=ENSG00000109445'
)

cellcounts = pd.DataFrame.from_dict({'genestack:accession': item['relationships']['sample'], 
                                     'CellType': item['cellPopulation'],
                                     'Count': item['expression']} for item in data.data)

print('Time to get %s flow cytometry cell counts: %i seconds\n' % (cellcounts.shape[0], time.time()-start))


samples_cells = pd.merge(samples, cellcounts)
samples_cells["Count"] = samples_cells["Count"].fillna(0)


def f(x):
    d = {}
    for group in x['Population']:
        exprs = x.loc[x['Population'] == group, 'Count']
        d[group+' Mean Cell Count'] = '%i' % exprs.mean()
    return pd.Series(d)

samples_cells.groupby(['CellType']).apply(f).head()

Time to get 112 flow cytometry cell counts: 0 seconds



Unnamed: 0_level_0,British Mean Cell Count,Finnish Mean Cell Count
CellType,Unnamed: 1_level_1,Unnamed: 2_level_1
"CD45+, live/CD45+, CD3+/CD4",1244,4337
"CD45+, live/CD45+, CD3+/CD4+/CD4 CM CCR7+ CD45RA+",1157,3104
"CD45+, live/CD45+, CD3+/CD4+/CD4 CM CCR7+ CD45RA+/CD4 CM",1018,1796
"CD45+, live/CD45+, CD3+/CD4+/CD4 EM",1403,5023
"CD45+, live/CD45+, CD3+/CD4+/CD4 EM/CD4 EM1",1255,2605
