### Aggregation of federated VUS analyses.
This notebook aggregates the results of the VUS analysis, which was run remotely on three datasets, each hosted on a different platform.

See the accompanying notebooks in this folder for the individual analyses.

The DRS ids below identify the analysis result files on each of the three systems where the analyses were run.

In [1]:
# DRS ids of the per-dataset analysis results, keyed by dataset label.
# Each id is a CURIE: the part before the colon presumably selects the DRS
# service that hosts the file (confirm against the resolver's configuration).
resultsFiles = dict(
    osteosarcoma='sbcav:6188354bd2f88b031ee29d82',
    copdgene='sbbdc:61888303e6261a31b6ddaf9b',
    tcga='sbcgc:6180182b9c5e581c1b22026b',
)

In [2]:
# DRSMetaResolver (from the fasp package) is used later in this notebook, via
# getAccessURL, to turn each DRS id into a URL the results can be fetched from.
from fasp.loc import DRSMetaResolver
drsClient = DRSMetaResolver()

Searching the GA4GH registry for org.ga4gh:drs services
GA4GH registry unavailable, cannot get registered DRS services.
Continuing with locally known DRS services.


In [3]:
import requests
import os
def download(url, source, timeout=60):
    """Fetch one cooccurrence result file and flatten it to one record per VUS.

    Parameters
    ----------
    url : str
        URL from which the JSON results document is retrieved with an HTTP GET.
    source : str
        Label of the dataset the results come from; stored under the
        "source" key of every returned record.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without it a stalled connection would block the notebook indefinitely.

    Returns
    -------
    list of dict
        One dict per entry of the document's 'cooccurring vus' mapping. Each
        holds "source", "vus" (the mapping key), every item of that entry's
        'likelihood data' and 'allele frequencies' dicts, and
        "no_pathogenic_coocurrs" (the length of its 'pathogenic variants').

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the document lacks one of the keys named above.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures here; otherwise an error body would only show up
    # later as a confusing JSON decode error or KeyError.
    response.raise_for_status()
    data = response.json()

    # Flatten the nested cooccurrence output into one flat record per VUS.
    flat_vus = []
    for vus, details in data['cooccurring vus'].items():
        pathogenic_count = len(details['pathogenic variants'])
        flat_vus.append({
            "source": source,
            "vus": vus,
            **details['likelihood data'],
            **details['allele frequencies'],
            "no_pathogenic_coocurrs": pathogenic_count,
        })
    return flat_vus

In [4]:
import pandas as pd

# Collect the flattened VUS records of every dataset into one list.
flat_vus = []
for source, drsCurie in resultsFiles.items():
    # Resolve the DRS id to a downloadable URL; 's3' is presumably the access
    # method requested from the DRS service (confirm against fasp).
    url = drsClient.getAccessURL(drsCurie, 's3')
    flat_vus.extend(download(url, source))

# One row per (source, vus) record.
flat_df = pd.DataFrame(flat_vus)
flat_df



Unnamed: 0,source,vus,p1,p2,n,k,likelihood,maxPop,maxPopFreq,cohortFreq,no_pathogenic_coocurrs
0,osteosarcoma,"('13', 32315831, 'G', 'A')",0.015152,0.001,28,1,0.097015,,,0.848485,1
1,osteosarcoma,"('13', 32318080, 'C', 'T')",0.015152,0.001,33,1,0.104188,,,1.000000,1
2,osteosarcoma,"('13', 32318598, 'T', 'C')",0.015152,0.001,32,1,0.102712,,,0.969697,1
3,osteosarcoma,"('13', 32321240, 'G', 'C')",0.015152,0.001,33,1,0.104188,,,1.000000,1
4,osteosarcoma,"('13', 32325741, 'C', 'T')",0.015152,0.001,33,1,0.104188,,,1.000000,1
...,...,...,...,...,...,...,...,...,...,...,...
108,tcga,"('13', 32968591, 'G', 'A')",0.000337,0.001,1383,1,1.186443,,,0.133122,1
109,tcga,"('13', 32962045, 'A', 'C')",0.000337,0.001,204,1,2.594216,,,0.019636,1
110,tcga,"('13', 32962088, 'C', 'T')",0.000337,0.001,144,1,2.699582,,,0.013861,1
111,tcga,"('13', 32893791, 'A', 'G')",0.000337,0.001,61,1,2.852431,,,0.005872,1


In [5]:
# Put the datasets side by side: one row per VUS, and for each reported
# measure one column per source.
merged_df = flat_df.pivot(
    index='vus',
    columns='source',
    values=['n', 'cohortFreq', 'no_pathogenic_coocurrs'],
)

In [6]:
# Raise the display limit to 100 rows before showing the merged table.
pd.options.display.max_rows = 100
merged_df

Unnamed: 0_level_0,n,n,n,cohortFreq,cohortFreq,cohortFreq,no_pathogenic_coocurrs,no_pathogenic_coocurrs,no_pathogenic_coocurrs
source,copdgene,osteosarcoma,tcga,copdgene,osteosarcoma,tcga,copdgene,osteosarcoma,tcga
vus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
"('13', 32315831, 'G', 'A')",9345.0,28.0,,0.914742,0.848485,,1.0,1.0,
"('13', 32318080, 'C', 'T')",10195.0,33.0,,0.997944,1.0,,1.0,1.0,
"('13', 32318598, 'T', 'C')",9047.0,32.0,,0.885572,0.969697,,1.0,1.0,
"('13', 32319654, 'A', 'G')",8119.0,,,0.794734,,,1.0,,
"('13', 32321240, 'G', 'C')",10196.0,33.0,,0.998042,1.0,,1.0,1.0,
"('13', 32323151, 'ATT', 'A')",4106.0,,,0.401919,,,1.0,,
"('13', 32325741, 'C', 'T')",10194.0,33.0,,0.997847,1.0,,1.0,1.0,
"('13', 32331128, 'G', 'A')",10194.0,33.0,,0.997847,1.0,,1.0,1.0,
"('13', 32333969, 'A', 'G')",10194.0,33.0,,0.997847,1.0,,1.0,1.0,
"('13', 32338918, 'A', 'G')",10194.0,33.0,,0.997847,1.0,,1.0,1.0,


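Note: the TCGA locations are in GRCh37 coordinates. The cooccurrence code uses the location as the identifier of the VUS, so the TCGA variants do not share identifiers with the same variants in the other two datasets and end up on separate rows of the table above. **TODO:** match variants across reference genomes.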