In [4]:
import os
import pandas as pd
from torch.utils.data import IterableDataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import requests
import json

# Gene identifiers taken from the first column of the tab-separated,
# header-less file ./pam50.txt.
gene_list = list(
    pd.read_csv("./pam50.txt", sep="\t", header=None, index_col=False)[0]
)

# Size constants (presumably layer widths for a model defined elsewhere);
# INPUT_SIZE follows the number of listed genes.
INPUT_SIZE, OUTPUT_SIZE, HIDDEN_SIZE = len(gene_list), 2, 100

In [5]:
# Fields requested from the files endpoint, sent as one comma-separated string.
fields = ",".join((
    "file_name",
    "cases.submitter_id",
    "cases.samples.sample_type",
    "cases.disease_type",
    "file_id",
))

files_endpt = "https://api.gdc.cancer.gov/legacy/files"

# Every clause is an 'in' test on one field; all clauses are nested under a
# single 'and' operator.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field_name, "value": accepted}}
        for field_name, accepted in (
            ("cases.project.program.name", ["TCGA"]),
            ("cases.project.primary_site", ["Breast", "Lung"]),
            ("files.data_category", ["Gene expression"]),
            ("files.data_type", ["Gene expression quantification"]),
            ("files.experimental_strategy", ["RNA-Seq"]),
            ("files.data_format", ["TXT"]),
        )
    ],
}

# A POST is used, so the filter parameters can be passed directly as a dict.
params = dict(filters=filters, fields=fields, format="TSV", size="20000")

# The request itself is sent by manifest_loader below, which passes these parameters to requests.post via 'json' rather than 'params'.

In [6]:
from io import StringIO
def manifest_loader(fields, filters, label):
    """Download a file manifest from the GDC files endpoint as a DataFrame.

    The request is POSTed to the module-level ``files_endpt`` URL with a JSON
    body asking for a TSV response of up to 20000 rows.

    Parameters
    ----------
    fields : str or iterable of str
        Field names to request, either already joined with commas or as an
        iterable of individual names.
    filters : dict
        Filter expression, sent unchanged in the request body.
    label : str
        Name of the requested field that will serve as the label. It must
        match one of ``fields`` exactly.

    Returns
    -------
    tuple of (pandas.DataFrame, str)
        The manifest parsed from the tab-separated response, and ``label``
        unchanged.

    Raises
    ------
    AssertionError
        If ``label`` is not one of the requested fields.
    requests.HTTPError
        If the endpoint answers with an HTTP error status.
    """
    if isinstance(fields, str):
        field_names = [name.strip() for name in fields.split(",")]
    else:
        field_names = list(fields)
        fields = ",".join(field_names)

    # Compare against whole field names: a substring test on the joined string
    # would also accept fragments such as "file".
    assert label in field_names, f"label {label!r} is not one of the requested fields"

    params = {
        "filters": filters,
        "fields": fields,
        "format": "TSV",
        "size": "20000",
    }

    response = requests.post(
        files_endpt,
        headers={"Content-Type": "application/json"},
        json=params,
    )
    # Fail here on an HTTP error instead of parsing the error body as TSV.
    response.raise_for_status()

    manifest = pd.read_csv(
        StringIO(response.content.decode("utf-8")), sep="\t", index_col=False
    )
    return manifest, label

## Naive Implementation

In [18]:
class GeneExpressionLoader(IterableDataset):
    """Stream ``(expression tensor, label index)`` pairs from downloaded files.

    The manifest returned by ``manifest_loader`` supplies one ``file_id`` and
    one label value per sample. Each sample's table is read from
    ``<data_dir>/<file_id>/<first directory entry>`` when the dataset is
    iterated.

    Parameters
    ----------
    fields, filters, label
        Forwarded unchanged to ``manifest_loader``.
    gene_list : list of str
        Gene identifiers to keep. A row is kept when the part of its
        ``gene_id`` before the first ``|`` is in this list.
    data_dir : str, optional
        Directory holding one sub-directory per ``file_id``.
    """

    def __init__(self, fields, filters, label, gene_list, data_dir="./tcga3"):
        super().__init__()
        manifest, label_column = manifest_loader(fields, filters, label)
        # Keep only the last dotted component of every column name so the
        # label column can be addressed by the last component of `label`.
        manifest = manifest.rename(columns=lambda name: name.split('.')[-1])
        label_column = label_column.split('.')[-1]
        self.file_list = list(manifest['file_id'])
        self.label_list = list(manifest[label_column])
        # Sort the distinct labels so the label -> index mapping is the same
        # on every run; key=str keeps the sort working for non-string labels
        # such as missing values.
        self.label_dict = {
            name: idx
            for idx, name in enumerate(sorted(set(self.label_list), key=str))
        }
        self.gene_list = gene_list
        self.data_dir = data_dir

    def __iter__(self):
        """Yield one ``(float32 tensor of raw counts, label index)`` per file.

        Rows keep the order they have in the file. Files whose table cannot be
        reduced to numeric ``raw_count`` values for the requested genes are
        skipped with a warning. Errors from listing or reading a file
        propagate to the caller.
        """
        import warnings  # local import: the notebook's import cell is outside this block

        for file_id, label in zip(self.file_list, self.label_list):
            sample_dir = os.path.join(self.data_dir, file_id)
            # NOTE(review): takes whatever os.listdir returns first; assumes
            # the directory's first entry is the expression table - confirm.
            table_path = os.path.join(sample_dir, os.listdir(sample_dir)[0])
            expression = pd.read_csv(table_path, sep='\t')
            label_idx = self.label_dict[label]
            try:
                symbols = expression['gene_id'].str.split('|').str[0]
                # Apply the gene filter; computing the mask without using it
                # would yield every row of the table.
                counts = expression.loc[symbols.isin(self.gene_list), 'raw_count']
                features = torch.tensor(counts.tolist(), dtype=torch.float32)
            except (KeyError, AttributeError, TypeError, ValueError) as err:
                warnings.warn(f"skipping {table_path}: {err!r}")
                continue
            yield features, label_idx

# Stream samples labelled by disease type and group them two per batch.
dataset = GeneExpressionLoader(
    fields=fields,
    filters=filters,
    label="cases.disease_type",
    gene_list=gene_list,
)
train_loader = DataLoader(dataset, batch_size=2)

In [12]:
# Print at most the first ten batches produced by the loader.
i = 0
for i, data in enumerate(train_loader, start=1):
    print(data)
    if i >= 10:
        break

OSError: [Errno 6] Device not configured: './tcga3/4e5531af-09a0-4c08-8424-bbae4f461036'