In [6]:
import json
CASES = json.load(open('/projects/barthf/GLASS-WG/data/manifest/hongkong/cases.json'))
SAMPLES = json.load(open('/projects/barthf/GLASS-WG/data/manifest/hongkong/samples.json'))
ALIQUOTS = json.load(open('/projects/barthf/GLASS-WG/data/manifest/hongkong/aliquots.json'))
FILES = json.load(open('/projects/barthf/GLASS-WG/data/manifest/hongkong/files.json'))
READGROUPS = json.load(open('/projects/barthf/GLASS-WG/data/manifest/hongkong/readgroups.json'))
PAIRS = json.load(open('/projects/barthf/GLASS-WG/data/manifest/hongkong/pairs.json'))

In [9]:
## Turn an unnamed list of dicts into a nammed list of dicts
## Taken from stackoverflow
## https://stackoverflow.com/questions/4391697/find-the-index-of-a-dict-within-a-list-by-matching-the-dicts-value
def build_dict(seq, key):
    return dict((d[key], dict(d, index=index)) for (index, d) in enumerate(seq))

## CASES should be unique
## CHECK THAT ALL CASE_ID VALUES IN CASES ARE UNIQUE
## IN PROGRESS


## FILES -> FILE_UUID should be unique
## CHECK THAT ALL FILE_UUID VALUES IN FILES ARE UNIQUE
## IN PROGRESS


## PAIR -> PAIR_ID should be unique
## CHECK THAT ALL PAIR_ID VALUES IN PAIR ARE UNIQUE
## IN PROGRESS


## CASES -> DICT
CASES_DICT = build_dict(CASES, "case_id")


## FILES -> DICT
FILES_DICT = build_dict(FILES, "file_uuid")


## Pair IDs are unique, PAIRS -> DICT
PAIRS_DICT = build_dict(PAIRS, "pair_id")


## Aliquot IDs and BAM files map 1:1
ALIQUOT_TO_BAM_PATH = {}
for file in FILES:
    if file["file_format"] == "BAM":
        ALIQUOT_TO_BAM_PATH[ file["aliquot_id"] ] = file["file_path"]
    if file["file_format"] == "FQ":
        ALIQUOT_TO_FQ_PATH[ file["aliquot_id"] ] = file["file_path"].split(",")


## Aliquots and RGIDs map 1:many
ALIQUOT_TO_RGID = {}        
for readgroup in READGROUPS:
    if readgroup["aliquot_id"] not in ALIQUOT_TO_RGID:
        ALIQUOT_TO_RGID[ readgroup["aliquot_id"] ] = [ readgroup["RGID"] ]
    else:
        ALIQUOT_TO_RGID[ readgroup["aliquot_id"] ].append(readgroup["RGID"])


## Batches and normal aliquot IDs map 1:many
## Normal aliquot IDs are repeated across multiple pairs from same case
## Each pair has one normal and one tumor
BATCH_TO_NORMAL = {}
for pair in PAIRS:
    pair["project_id"] = CASES_DICT[ pair["case_id"] ]["project_id"]
    PAIRS_DICT[ pair["pair_id"] ]["project_id"] = pair["project_id"]
    if pair["project_id"] not in BATCH_TO_NORMAL:
        BATCH_TO_NORMAL[ pair["project_id"] ] = [ pair["normal_aliquot_id"] ]
    elif pair["normal_aliquot_id"] not in BATCH_TO_NORMAL[ pair["project_id"] ]:
        BATCH_TO_NORMAL[ pair["project_id"] ].append(pair["normal_aliquot_id"])
        

## Readgroup information and 
## Aliquots and RGIDs map 1:many
## RGIDs are unique within an aliquot
## Aliquot IDs and fastQ files map 1:many
## Because FQ files are also seperated by readgroup, create dictionary of FQ files here as well
ALIQUOT_TO_READGROUP = {} 
ALIQUOT_TO_FQ_PATH = {}
for readgroup in READGROUPS:
    if readgroup["aliquot_id"] not in ALIQUOT_TO_READGROUP:
        ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ] = { readgroup["RGID"] : readgroup }
    else:
        ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ] = readgroup
    ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ]["file_path"] = FILES_DICT[ ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ]["file_uuid"] ]["file_path"]
    ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ]["file_format"] = FILES_DICT[ ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ]["file_uuid"] ]["file_format"]
    if ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ]["file_format"] == "FQ":
        ALIQUOT_TO_FQ_PATH[ readgroup["aliquot_id"] ][ readgroup["RGID"] ] = ALIQUOT_TO_READGROUP[ readgroup["aliquot_id"] ][ readgroup["RGID"] ]["file_path"].split(",")

In [10]:
ALIQUOT_TO_FQ_PATH

{'GLSS-HK-0005-TP-9DKW7W': ['/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_02S3600_USPD16083300_HF5N3CCXY_L8_1.fq.gz',
  '/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_02S3600_USPD16083300_HF5N3CCXY_L8_2.fq.gz'],
 'GLSS-HK-0005-NB-YNM276': ['/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_04S3691_1_USPD16083295_HFCFWCCXY_L6_1.fq.gz',
  '/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_04S3691_1_USPD16083295_HFCFWCCXY_L6_2.fq.gz'],
 'GLSS-HK-0005-R1-ODRHPZ': ['/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_04S3691_USPD16083301_HFCFWCCXY_L7_1.fq.gz',
  '/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_04S3691_USPD16083301_HFCFWCCXY_L7_2.fq.gz'],
 'GLSS-HK-0002-TP-DRX7N4': ['/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_11S12329_USPD16083302_HF5N3CCXY_L7_1.fq.gz',
  '/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_11S12329_USPD16083302_HF5N3CCXY_L7_2.fq.gz'],
 'GLSS-HK-0002-NB-9D0DVC': ['/fastscratch/barthf/GLASS-WG/data/fastq/hongkong/WG_13S018500_4_U

In [89]:
ALIQUOT_TO_RGID

{'TCGA-DU-6397-TP-OXOUDE': ['H233G.7', 'H233G.8'],
 'TCGA-FG-A4MT-NB-W7DORI': ['H22WW.6'],
 'TCGA-14-1402-TP-WT36NV': ['C2V1B.5',
  'C2V1B.6',
  'C2VCW.8',
  'C2VT4.4',
  'C2W0C.4'],
 'TCGA-19-1389-R1-1NPCYQ': ['C33RU.1', 'C33RU.3', 'C3PFU.4', 'C3PFU.5'],
 'TCGA-14-1402-NB-PVBYZ1': ['C2H5W.3', 'C38R1.5', 'D2GTT.5'],
 'TCGA-06-0152-R1-EK2VYI': ['C2U7P.6',
  'C2V1N.7',
  'C2V1N.8',
  'C2VCW.1',
  'C2VLC.7'],
 'TCGA-06-0210-R1-PS45ZE': ['C2U7P.7',
  'C2UJN.7',
  'C2UJN.8',
  'C2UPU.7',
  'C2VCW.2'],
 'TCGA-06-0190-TP-BM9MHX': ['C33RU.5', 'C33RU.6', 'C3PFU.7', 'C3PFU.8'],
 'TCGA-06-0125-R1-MA69JO': ['C2U7P.1',
  'C2UJN.3',
  'C2UJN.4',
  'C2UPU.3',
  'C2VLC.5'],
 'TCGA-06-0210-NB-N1LE2J': ['C2LHU.1', 'C2LV3.6', 'C2V9G.4'],
 'TCGA-DU-6397-R1-TX5QZZ': ['H255F.1', 'H255F.2'],
 'TCGA-FG-5965-TP-YPJ8SN': ['H255F.3', 'H255F.4'],
 'TCGA-TQ-A8XE-R1-M2U7J4': ['H22TC.3', 'H22TC.4'],
 'TCGA-DU-6404-TP-7DHU4O': ['H233G.3', 'H233G.4'],
 'TCGA-14-1034-NB-S6YHXW': ['C2H5W.1', 'C38R1.2', 'C38R1.3'],
 'TCG

In [5]:
FILES_DICT

{'24c6f54a-e7a2-4148-8335-045e3c74096e': {'aliquot_id': 'TCGA-DU-6397-TP-OXOUDE',
  'file_path': '/fastscratch/barthf/GLASS-WG/download/24c6f54a-e7a2-4148-8335-045e3c74096e/G35154.TCGA-DU-6397-01A-11D-A461-08.3.bam',
  'file_name': 'G35154.TCGA-DU-6397-01A-11D-A461-08.3.bam',
  'file_uuid': '24c6f54a-e7a2-4148-8335-045e3c74096e',
  'file_size': 231594682782,
  'file_md5sum': 'a399d24d1f5631c4b7adb95bc56f8bd3',
  'file_format': 'BAM',
  'index': 0},
 'd184ed85-cb21-43cc-95f6-0619f9283b29': {'aliquot_id': 'TCGA-DU-6397-R1-TX5QZZ',
  'file_path': '/fastscratch/barthf/GLASS-WG/download/d184ed85-cb21-43cc-95f6-0619f9283b29/G35154.TCGA-DU-6397-02A-12D-A36O-08.3.bam',
  'file_name': 'G35154.TCGA-DU-6397-02A-12D-A36O-08.3.bam',
  'file_uuid': 'd184ed85-cb21-43cc-95f6-0619f9283b29',
  'file_size': 304596628085,
  'file_md5sum': '2ef77b1027dc36c0b5d6ed0078f681ba',
  'file_format': 'BAM',
  'index': 1},
 '623cc431-e17c-4b40-97ed-b5178540519c': {'aliquot_id': 'TCGA-DU-6397-NB-TCU1YQ',
  'file_path

In [100]:
ALIQUOT_TO_RGID.values()

dict_values([['H233G.7', 'H233G.8'], ['H22WW.6'], ['C2V1B.5', 'C2V1B.6', 'C2VCW.8', 'C2VT4.4', 'C2W0C.4'], ['C33RU.1', 'C33RU.3', 'C3PFU.4', 'C3PFU.5'], ['C2H5W.3', 'C38R1.5', 'D2GTT.5'], ['C2U7P.6', 'C2V1N.7', 'C2V1N.8', 'C2VCW.1', 'C2VLC.7'], ['C2U7P.7', 'C2UJN.7', 'C2UJN.8', 'C2UPU.7', 'C2VCW.2'], ['C33RU.5', 'C33RU.6', 'C3PFU.7', 'C3PFU.8'], ['C2U7P.1', 'C2UJN.3', 'C2UJN.4', 'C2UPU.3', 'C2VLC.5'], ['C2LHU.1', 'C2LV3.6', 'C2V9G.4'], ['H255F.1', 'H255F.2'], ['H255F.3', 'H255F.4'], ['H22TC.3', 'H22TC.4'], ['H233G.3', 'H233G.4'], ['C2H5W.1', 'C38R1.2', 'C38R1.3'], ['H2JWT.1', 'H2JWT.2'], ['H22MT.3', 'H22MT.4'], ['H2557.1', 'H2557.2'], ['H255T.5'], ['H233G.5', 'H233G.6'], ['C2U7P.5', 'C2UJN.5', 'C2UJN.6', 'C2UPU.6', 'C2UPU.8'], ['C372V.7', 'C372V.8', 'C37V4.8', 'C3C0U.3'], ['C2U7P.3', 'C2UPU.4', 'C2V1B.1', 'C2V1B.2', 'C2VCW.3'], ['C33RU.4', 'C348L.4', 'C3PFU.2', 'C3PFU.6'], ['H22YH.2'], ['C2U7P.8', 'C2V1N.5', 'C2V1N.6', 'C2VCW.5', 'C2VLC.6'], ['H22TY.7', 'H22TY.8'], ['C33RU.2', 'C348L.3

In [94]:
list

[{'id': '1234', 'name': 'Jason'},
 {'id': '8888', 'name': 'Jason'},
 {'id': '2345', 'name': 'Tom'},
 {'id': '3456', 'name': 'Art'}]

In [95]:
rm(list)

/bin/sh: -c: line 0: syntax error near unexpected token `list'
/bin/sh: -c: line 0: `rm (list)'


In [96]:
list = None

In [97]:
list