In [2]:
import json
## Directory holding the JSON manifest tables
MANIFEST_DIR = '/projects/barthf/GLASS-WG/data/manifest'


def _load_manifest(filename):
    """Parse one JSON manifest file from MANIFEST_DIR and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, including when ``json.load`` raises.
    """
    with open(MANIFEST_DIR + '/' + filename) as handle:
        return json.load(handle)


CASES = _load_manifest('cases.json')
SAMPLES = _load_manifest('samples.json')
ALIQUOTS = _load_manifest('aliquots.json')
FILES = _load_manifest('files.json')
READGROUPS = _load_manifest('readgroups.json')
PAIRS = _load_manifest('pairs.json')

In [34]:
## Turn an unnamed list of dicts into a dict of dicts keyed by one of their fields
## Adapted from Stack Overflow:
## https://stackoverflow.com/questions/4391697/find-the-index-of-a-dict-within-a-list-by-matching-the-dicts-value
def build_dict(seq, key):
    """Index a sequence of dicts by the value each one stores under ``key``.

    Every entry of the result is a shallow copy of the source dict with an
    extra ``"index"`` field holding its position in ``seq``; the source dicts
    are not modified. When several dicts share the same key value, the last
    one replaces the earlier ones.

    Raises ``KeyError`` if a dict in ``seq`` has no ``key`` field.
    """
    keyed = {}
    for position, record in enumerate(seq):
        keyed[record[key]] = dict(record, index=position)
    return keyed

## CASES should be unique
## CHECK THAT ALL CASE_ID VALUES IN CASES ARE UNIQUE
## IN PROGRESS


## FILES -> FILE_UUID should be unique
## CHECK THAT ALL FILE_UUID VALUES IN FILES ARE UNIQUE
## IN PROGRESS


## PAIR -> PAIR_ID should be unique
## CHECK THAT ALL PAIR_ID VALUES IN PAIR ARE UNIQUE
## IN PROGRESS


## Index every manifest table by its identifier field
## Built in this order: CASES, FILES, ALIQUOTS, SAMPLES, PAIRS
## (pair IDs are expected to be unique, so PAIRS can be keyed by pair_id)
(
    CASES_DICT,
    FILES_DICT,
    ALIQUOTS_DICT,
    SAMPLES_DICT,
    PAIRS_DICT,
) = (
    build_dict(records, id_field)
    for records, id_field in (
        (CASES, "case_id"),
        (FILES, "file_uuid"),
        (ALIQUOTS, "aliquot_id"),
        (SAMPLES, "sample_id"),
        (PAIRS, "pair_id"),
    )
)


## Map each aliquot ID to the path of its BAM file (expected to be 1:1)
## Files in any other format are skipped; if an aliquot had several BAM
## entries, the last one in FILES would win.
ALIQUOT_TO_BAM_PATH = {}
for file in FILES:
    if file["file_format"] != "BAM":
        continue
    ALIQUOT_TO_BAM_PATH[file["aliquot_id"]] = file["file_path"]
        
## Group aliquot IDs by case and by batch (project)
## Each record in ALIQUOTS is annotated in place with the "case_id" of its
## sample and the "project_id" of that case before it is grouped.
BATCH_TO_ALIQUOT = {}
CASE_TO_ALIQUOT = {}
for aliquot in ALIQUOTS:
    aliquot["case_id"] = SAMPLES_DICT[ aliquot["sample_id"] ]["case_id"]
    aliquot["project_id"] = CASES_DICT[ aliquot["case_id"] ]["project_id"]

    ## An aliquot ID is listed at most once per case ...
    case_aliquots = CASE_TO_ALIQUOT.setdefault(aliquot["case_id"], [])
    if aliquot["aliquot_id"] not in case_aliquots:
        case_aliquots.append(aliquot["aliquot_id"])

    ## ... and at most once per batch. The membership test has to look inside
    ## the batch's own list: testing against BATCH_TO_ALIQUOT itself compares
    ## the aliquot ID with the dict keys (project IDs) and never filters
    ## duplicates.
    batch_aliquots = BATCH_TO_ALIQUOT.setdefault(aliquot["project_id"], [])
    if aliquot["aliquot_id"] not in batch_aliquots:
        batch_aliquots.append(aliquot["aliquot_id"])


## One aliquot can have many readgroups: collect the RGIDs per aliquot ID,
## in the order the readgroups appear in READGROUPS
ALIQUOT_TO_RGID = {}
for readgroup in READGROUPS:
    if readgroup["aliquot_id"] in ALIQUOT_TO_RGID:
        ## Aliquot already seen: extend its RGID list
        ALIQUOT_TO_RGID[readgroup["aliquot_id"]].append(readgroup["RGID"])
    else:
        ## First readgroup for this aliquot: start a new list
        ALIQUOT_TO_RGID[readgroup["aliquot_id"]] = [readgroup["RGID"]]
        

## Collect the distinct normal aliquot IDs of every batch (1:many)
## Each pair has one normal and one tumor, and the same normal is reused by
## several pairs of a case, so an ID is only added once per batch.
## The batch ("project_id") of a pair is looked up through its case and stored
## on both the PAIRS record and the matching PAIRS_DICT entry.
BATCH_TO_NORMAL = {}
for pair in PAIRS:
    pair["project_id"] = PAIRS_DICT[pair["pair_id"]]["project_id"] = CASES_DICT[pair["case_id"]]["project_id"]
    if pair["project_id"] in BATCH_TO_NORMAL:
        if pair["normal_aliquot_id"] not in BATCH_TO_NORMAL[pair["project_id"]]:
            BATCH_TO_NORMAL[pair["project_id"]].append(pair["normal_aliquot_id"])
    else:
        BATCH_TO_NORMAL[pair["project_id"]] = [pair["normal_aliquot_id"]]
        

## Readgroup information per aliquot
## Aliquots and RGIDs map 1:many; RGIDs are unique within an aliquot
## Aliquot IDs and FASTQ files also map 1:many
## Because FQ files are separated by readgroup as well, the dictionary of FQ
## files is built in the same pass
ALIQUOT_TO_READGROUP = {}
ALIQUOT_TO_FQ_PATH = {}
ALIQUOT_TO_SM = {} ## SM is shared across all RG, so the first one seen is kept
for readgroup in READGROUPS:
    if readgroup["aliquot_id"] not in ALIQUOT_TO_SM:
        ALIQUOT_TO_SM[readgroup["aliquot_id"]] = readgroup["RGSM"]

    ## Register the readgroup record itself (not a copy) under aliquot -> RGID
    if readgroup["aliquot_id"] in ALIQUOT_TO_READGROUP:
        ALIQUOT_TO_READGROUP[readgroup["aliquot_id"]][readgroup["RGID"]] = readgroup
    else:
        ALIQUOT_TO_READGROUP[readgroup["aliquot_id"]] = {readgroup["RGID"]: readgroup}

    ## The registered entry is this very record, so annotating it here also
    ## updates ALIQUOT_TO_READGROUP (and the record inside READGROUPS)
    readgroup["file_path"] = FILES_DICT[readgroup["file_uuid"]]["file_path"]
    readgroup["file_format"] = FILES_DICT[readgroup["file_uuid"]]["file_format"]

    ## FQ entries store their paths as one comma-separated string
    if readgroup["file_format"] == "FQ":
        if readgroup["aliquot_id"] not in ALIQUOT_TO_FQ_PATH:
            ALIQUOT_TO_FQ_PATH[readgroup["aliquot_id"]] = {}
        ALIQUOT_TO_FQ_PATH[readgroup["aliquot_id"]][readgroup["RGID"]] = readgroup["file_path"].split(",")

In [35]:
ALIQUOT_TO_SM

{'GLSS-HF-3016-NB-0T6U6R': 'LYNDA-FORD-HF-3016-10-28D',
 'GLSS-HF-3016-R1-I9CPLG': 'LYNDA-FORD-HF-3177-01-01D',
 'GLSS-HF-3016-TP-EWVA49': 'LYNDA-FORD-HF-3016-01-04D',
 'GLSS-HK-0005-TP-9DKW7W': 'GLSS-HK-0005-TP',
 'GLSS-HK-0005-NB-YNM276': 'GLSS-HK-0005-NB',
 'GLSS-HK-0005-R1-ODRHPZ': 'GLSS-HK-0005-R1',
 'GLSS-HK-0002-TP-DRX7N4': 'GLSS-HK-0002-TP',
 'GLSS-HK-0002-NB-9D0DVC': 'GLSS-HK-0002-NB',
 'GLSS-HK-0001-TP-HLH6TO': 'GLSS-HK-0001-TP',
 'GLSS-HK-0002-R1-S3QETN': 'GLSS-HK-0002-R1',
 'GLSS-HK-0004-TP-SJJS0H': 'GLSS-HK-0004-TP',
 'GLSS-HK-0001-NB-QOHVF8': 'GLSS-HK-0001-NB',
 'GLSS-HK-0001-R1-23BINE': 'GLSS-HK-0001-R1',
 'GLSS-HK-0003-TP-WAGBN9': 'GLSS-HK-0003-TP',
 'GLSS-HK-0003-NB-HR3VCU': 'GLSS-HK-0003-NB',
 'GLSS-HK-0004-NB-CCHWVE': 'GLSS-HK-0004-NB',
 'GLSS-HK-0004-R1-RYFPEB': 'GLSS-HK-0004-R1',
 'GLSS-HK-0003-R1-R7P485': 'GLSS-HK-0003-R1',
 'GLSS-MD-LP06-R1-4D3AKB': 'ROEL-JDG-RV-JDG6-01-01D',
 'GLSS-MD-LP08-NB-PUGO4Q': 'ROEL-JDG-RV-JDG8-10-01D',
 'GLSS-MD-LP09-NB-C42CZJ': 'ROEL-J

In [35]:
CASES_DICT[SAMPLES_DICT[ALIQUOTS_DICT["TCGA-DU-6397-TP-OXOUDE"]["sample_id"]]["case_id"]]["project_id"]

'TCGA-LGG'

In [13]:
CASES

[{'case_id': 'GLSS-HF-3016', 'project_id': 'GLSS-HF'},
 {'case_id': 'GLSS-HK-0005', 'project_id': 'GLSS-HK'},
 {'case_id': 'GLSS-HK-0002', 'project_id': 'GLSS-HK'},
 {'case_id': 'GLSS-HK-0001', 'project_id': 'GLSS-HK'},
 {'case_id': 'GLSS-HK-0004', 'project_id': 'GLSS-HK'},
 {'case_id': 'GLSS-HK-0003', 'project_id': 'GLSS-HK'},
 {'case_id': 'GLSS-MD-LP06', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP08', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP09', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP04', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP20', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP03', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP05', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP01', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP10', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP02', 'project_id': 'GLSS-MD'},
 {'case_id': 'GLSS-MD-LP07', 'project_id': 'GLSS-MD'},
 {'case_id': 'TCGA-DU-6397', 'project_id': 'TCGA-LGG'},
 {'case_i

In [33]:
READGROUPS

[{'file_uuid': '9dkw7wyn-9d0d-s3qe-qohv-wagbn9hr3vcu',
  'aliquot_id': 'GLSS-HF-3016-NB-0T6U6R',
  'RGID': '130713_SN1222_0206_BC2998ACXX_6',
  'RGPL': 'Illumina_HiSeq2000',
  'RGPU': 'BC2998ACXX.6',
  'RGLB': 'LYNDA-FORD-HF-3016-10-28D',
  'RGPI': 0,
  'RGDT': '2018-06-27T14:12:56+0000',
  'RGSM': 'LYNDA-FORD-HF-3016-10-28D',
  'RGCN': 'Harvard_GCC_02',
  'file_path': '/fastscratch/johnsk/hGBM/wgs_bams/LYNDA-FORD-HF-3016-10-28D_130713_SN1222_0206_BC2998ACXX_s_6_rg.sorted.recalibed.bam',
  'file_format': 'BAM'},
 {'file_uuid': 'm276odrh-vchl-tnsj-f823-cchwveryfpeb',
  'aliquot_id': 'GLSS-HF-3016-R1-I9CPLG',
  'RGID': '140325_SN208_0510_AC2PKUACXX_1',
  'RGPL': 'Illumina_HiSeq2000',
  'RGPU': 'AC2PKUACXX.1',
  'RGLB': 'LYNDA-FORD-HF-3177-01-01D',
  'RGPI': 0,
  'RGDT': '2018-06-27T14:12:56+0000',
  'RGSM': 'LYNDA-FORD-HF-3177-01-01D',
  'RGCN': 'Harvard_GCC_02',
  'file_path': '/fastscratch/johnsk/hGBM/wgs_bams/LYNDA-FORD-HF-3177-01-01D_140325_SN208_0510_AC2PKUACXX_s_1_rg.sorted.recalib

In [100]:
ALIQUOT_TO_RGID.values()

dict_values([['H233G.7', 'H233G.8'], ['H22WW.6'], ['C2V1B.5', 'C2V1B.6', 'C2VCW.8', 'C2VT4.4', 'C2W0C.4'], ['C33RU.1', 'C33RU.3', 'C3PFU.4', 'C3PFU.5'], ['C2H5W.3', 'C38R1.5', 'D2GTT.5'], ['C2U7P.6', 'C2V1N.7', 'C2V1N.8', 'C2VCW.1', 'C2VLC.7'], ['C2U7P.7', 'C2UJN.7', 'C2UJN.8', 'C2UPU.7', 'C2VCW.2'], ['C33RU.5', 'C33RU.6', 'C3PFU.7', 'C3PFU.8'], ['C2U7P.1', 'C2UJN.3', 'C2UJN.4', 'C2UPU.3', 'C2VLC.5'], ['C2LHU.1', 'C2LV3.6', 'C2V9G.4'], ['H255F.1', 'H255F.2'], ['H255F.3', 'H255F.4'], ['H22TC.3', 'H22TC.4'], ['H233G.3', 'H233G.4'], ['C2H5W.1', 'C38R1.2', 'C38R1.3'], ['H2JWT.1', 'H2JWT.2'], ['H22MT.3', 'H22MT.4'], ['H2557.1', 'H2557.2'], ['H255T.5'], ['H233G.5', 'H233G.6'], ['C2U7P.5', 'C2UJN.5', 'C2UJN.6', 'C2UPU.6', 'C2UPU.8'], ['C372V.7', 'C372V.8', 'C37V4.8', 'C3C0U.3'], ['C2U7P.3', 'C2UPU.4', 'C2V1B.1', 'C2V1B.2', 'C2VCW.3'], ['C33RU.4', 'C348L.4', 'C3PFU.2', 'C3PFU.6'], ['H22YH.2'], ['C2U7P.8', 'C2V1N.5', 'C2V1N.6', 'C2VCW.5', 'C2VLC.6'], ['H22TY.7', 'H22TY.8'], ['C33RU.2', 'C348L.3

In [94]:
list

[{'id': '1234', 'name': 'Jason'},
 {'id': '8888', 'name': 'Jason'},
 {'id': '2345', 'name': 'Tom'},
 {'id': '3456', 'name': 'Art'}]

In [95]:
rm(list)

/bin/sh: -c: line 0: syntax error near unexpected token `list'
/bin/sh: -c: line 0: `rm (list)'


In [96]:
## Remove the scratch variable that shadows the builtin `list`.
## Binding it to None would keep the builtin hidden and break later list(...) calls;
## pop() with a default is a no-op when the variable is not defined.
globals().pop("list", None)

In [97]:
list