<a href="https://colab.research.google.com/github/emil-d/bioinf-proj/blob/master/CNN_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
import json
import re
import tarfile
import os
import shutil
from pathlib import Path

import pandas as pd
import numpy as np

from PIL import Image

import matplotlib
import matplotlib.pyplot as plt

In [0]:
def getData(disease_type=["Adenomas and Adenocarcinomas","Squamous Cell Neoplasms"], path=".", file_n=2, unzip=True):
    """Query the GDC API for methylation files and download them into ``path``.

    The query is equivalent to::

        cases.disease_type in <disease_type>
        and cases.primary_site in ["Bronchus and lung"]
        and files.data_category in ["DNA Methylation"]
        and files.platform in ["Illumina Human Methylation 450"]
        and cases.samples.sample_type in ["Primary Tumor","Recurrent Tumor"]

    Args:
        disease_type: Disease type name, or a list of names, to filter on.
            The default list is never mutated.
        path: Existing directory in which the download is stored.
        file_n: Maximum number of files requested from the files endpoint.
        unzip: When true, the download is passed to ``unzipFile`` and the
            per-file directories are flattened with ``extractFilesFromDirs``.

    Raises:
        requests.HTTPError: If either GDC request returns an error status.
    """
    # The "in" operator takes a list of values; accept a single name as well.
    if isinstance(disease_type, str):
        disease_type = [disease_type]
    else:
        disease_type = list(disease_type)

    def _in(field, values):
        # One "field in values" clause of the GDC filter grammar.
        return {"op": "in", "content": {"field": field, "value": values}}

    files_endpt = "https://api.gdc.cancer.gov/files"
    data_endpt = "https://api.gdc.cancer.gov/data"
    json_headers = {"Content-Type": "application/json"}

    filters = {
        "op": "and",
        "content": [
            _in("cases.disease_type", disease_type),
            _in("cases.primary_site", ["Bronchus and lung"]),
            _in("files.data_category", ["DNA Methylation"]),
            _in("files.platform", ["Illumina Human Methylation 450"]),
            _in("cases.samples.sample_type", ["Primary Tumor","Recurrent Tumor"]),
        ],
    }

    # A POST is used, so the filter parameters can be passed directly as a dict
    # through the 'json' argument rather than 'params'.
    params = {
        "filters": filters,
        "fields": "file_id",
        "format": "JSON",
        "size": file_n,
    }
    response = requests.post(files_endpt, headers=json_headers, json=params)
    # Fail here with the HTTP error instead of a confusing KeyError further down.
    response.raise_for_status()

    # Response JSON has the shape { "data": { "hits": [ {"file_id": "...", "id": "..."} ] } }
    file_uuid_list = [file_entry["file_id"] for file_entry in response.json()["data"]["hits"]]

    print("list of files to be downloaded:\n",file_uuid_list)

    if not file_uuid_list:
        # Nothing matched the query, so there is nothing to request from /data.
        print("no files matched the query, nothing downloaded")
        return

    print("download in progress...")
    # Request the data corresponding to the collected file ids.
    response = requests.post(data_endpt,
                             data=json.dumps({"ids": file_uuid_list}),
                             headers=json_headers)
    response.raise_for_status()

    response_head_cd = response.headers["Content-Disposition"]
    # The name comes from a response header: keep only its final component so
    # it cannot point outside of ``path``.
    download_name = os.path.basename(re.findall("filename=(.+)", response_head_cd)[0])
    file_name = os.path.join(path, download_name)
    with open(file_name, "wb") as output_file:
        output_file.write(response.content)
    print("data downloaded")

    if unzip:
        print("extracting data...")
        unzipFile(file_name, path)
        extractFilesFromDirs(file_uuid_list, path)
        print("data extracted")
        # NOTE: the downloaded file itself is intentionally left in ``path``.
     
        
    
def unzipFile(fname,path="."):
    """Extract the tar archive ``fname`` into the directory ``path``.

    Names ending in ``tar.gz`` are opened as gzip-compressed archives and
    names ending in ``tar`` as uncompressed ones. Any other name is left
    untouched and the function returns without doing anything.

    Args:
        fname: Path of the archive file.
        path: Directory that receives the extracted members.
    """
    if fname.endswith("tar.gz"):
        mode = "r:gz"
    elif fname.endswith("tar"):
        mode = "r:"
    else:
        return

    # The context manager closes the archive even when extraction fails.
    with tarfile.open(fname, mode) as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive was downloaded: where the interpreter supports
            # extraction filters, refuse members that escape ``path``.
            tar.extractall(path=path, filter="data")
        else:
            tar.extractall(path=path)
    
def extractFilesFromDirs(dir_list, path="."):
    """Move the files of the given sub-directories of ``path`` up into ``path``.

    For every name in ``dir_list`` that exists as a directory under ``path``,
    all files found below it (at any depth) are moved directly into ``path``.
    The directory is removed afterwards, but only when every file was moved,
    so that a failed move never destroys data. Names that do not correspond
    to an existing directory are skipped.

    Args:
        dir_list: Directory names, relative to ``path``.
        path: Directory containing the sub-directories; also the destination.
    """
    for dir_name in dir_list:
        dir_path = os.path.join(path, dir_name)
        if not os.path.isdir(dir_path):
            continue

        all_moved = True
        for root, _subdirs, files in os.walk(dir_path, topdown=False):
            for file_name in files:
                # Build the source from ``root``: the file may live in a
                # nested directory, not directly under ``dir_path``.
                source = os.path.join(root, file_name)
                try:
                    shutil.move(source, os.path.join(path, file_name))
                except OSError as err:
                    all_moved = False
                    print("could not move {}: {}".format(source, err))

        # Delete the directory once, after all of its files were handled.
        if all_moved:
            shutil.rmtree(dir_path)
                
def getDataToFolder(disease_types=["Adenomas and Adenocarcinomas","Squamous Cell Neoplasms"], path=".", file_n=2, unzip=True):
    """Download the data of each disease type into its own sub-folder of ``path``.

    For every entry of ``disease_types`` a directory ``<path>/<disease type>/``
    is created when missing and ``getData`` is called with that directory as
    destination.

    Args:
        disease_types: Disease type names; one folder is created per name.
            The default list is never mutated.
        path: Parent directory of the per-category folders.
        file_n: Forwarded to ``getData`` for each disease type.
        unzip: Forwarded to ``getData``.
    """
    for disease_type in disease_types:
        # Joined rather than str.format-ed, so braces in ``path`` are harmless.
        # The empty last component keeps the trailing separator.
        outputDir = os.path.join(os.fspath(path), disease_type, "")
        # Every category must have its own folder; exist_ok avoids the
        # check-then-create race.
        os.makedirs(outputDir, exist_ok=True)
        # getData filters with an "in" clause, so pass a one-element list.
        getData([disease_type], path=outputDir, file_n=file_n, unzip=unzip)

In [0]:
# Root directory used by the following cells for downloads and generated images.
path = Path("data")

In [0]:
# Download one file for each default disease type into the data directory.
getDataToFolder(file_n=1, path=str(path))

list of files to be downloaded:
 ['8773847f-5b07-40a7-a214-bbe194c8543f']
download in progress...
data downloaded
extracting data...
data extracted
list of files to be downloaded:
 ['bd2c1469-848e-4fc3-8fe1-9d96c073eabd']
download in progress...
data downloaded
extracting data...
data extracted


In [0]:
def import_and_filter_data(path, files):
    """Load tab-separated methylation files and keep only the ``cg*`` rows.

    Each file is read without a header row into the eleven columns listed in
    ``column_names``; the value ``"."`` is treated as missing. Rows whose
    ``Composite`` value does not start with ``"cg"`` are dropped, which also
    covers rows where ``Composite`` is missing.

    Args:
        path: Directory containing the files.
        files: File names, relative to ``path``.

    Returns:
        A list with one filtered DataFrame per entry of ``files``, in order.
    """
    na_vals = "."
    column_names = ["Composite", "Beta_value", "Chromosome", "Start", "End", "Gene_Symbol", "Gene_Type", "Transcript_ID", "Position_to_TSS", "CGI_Coordinate", "Feature_Type"]
    dfs = []
    for file in files:
        df = pd.read_csv(os.path.join(path, file), sep="\t", header=None, names=column_names, skiprows=0, na_values=na_vals)
        # Drop every row whose Composite does not start with "cg".
        # na=False gives missing values a plain False instead of NaN, which
        # could not be used as a boolean mask.
        is_cg_probe = df["Composite"].str.startswith("cg", na=False)
        dfs.append(df[is_cg_probe])
    return dfs

def create_images(dfs,path,disease_type,files):
    """Render each DataFrame's ``Beta_value`` column as a 402x402 RGB PNG.

    The values (missing ones replaced by 0) are laid out in order, padded
    with zeros up to 402*402*3 entries, reshaped to (402, 402, 3) and saved
    as ``<path>/image/<disease_type>/<file name>.png``.

    Values are scaled from [0, 1] to 8-bit intensities before the image is
    built; passing the float array straight to PIL as 'RGB' would reinterpret
    its raw bytes as pixels. Beta values are assumed to lie in [0, 1]
    (TODO confirm); anything outside is clipped.

    Args:
        dfs: DataFrames with a ``Beta_value`` column, one per image.
        path: Base output directory (``str`` or path-like).
        disease_type: Category name, used as the sub-folder name.
        files: Source file names; ``files[i]`` names the image of ``dfs[i]``.

    Raises:
        ValueError: If a DataFrame holds more values than the image can store.
    """
    w, h, d  = 402, 402, 3
    capacity = w * h * d
    category_name = disease_type #name of category
    # os.fspath lets callers pass either a string or a pathlib.Path.
    outputDir = os.path.join(os.fspath(path), "image", category_name)
    os.makedirs(outputDir, exist_ok=True)

    # Transform data to image and save
    for index, df in enumerate(dfs):
        vect = df.Beta_value.fillna(0).to_numpy().astype(np.double)
        if len(vect) > capacity:
            raise ValueError(
                "{} values do not fit in a {}x{}x{} image".format(len(vect), w, h, d)
            )
        ima = np.concatenate([vect, np.zeros(capacity - len(vect))]).reshape(w, h, d)
        # Convert to the uint8 layout PIL expects for an RGB image.
        pixels = np.rint(np.clip(ima, 0.0, 1.0) * 255).astype(np.uint8)
        img = Image.fromarray(pixels)
        img.save(os.path.join(outputDir, "{}.png".format(files[index])))

# Build one image per downloaded file, for every disease category folder.
disease_types = ["Adenomas and Adenocarcinomas","Squamous Cell Neoplasms"]
for disease_type in disease_types :
    disease_type_path = os.path.join(path, disease_type)
    # Only regular files are data candidates; sub-directories are ignored.
    files = [f for f in os.listdir(disease_type_path) if os.path.isfile(os.path.join(disease_type_path, f))]
    dfs = import_and_filter_data(disease_type_path,files)
    # ``path`` is a pathlib.Path, which does not support ``+`` concatenation
    # with str, so hand over its string form.
    create_images(dfs,str(path),disease_type,files)
    