In [None]:
import io
import json
import os
from multiprocessing.pool import ThreadPool

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [None]:
# Dataset directory, resolved relative to the current working directory.
# The trailing slash is kept on purpose: a later cell appends a relative
# path to `file_dir` without adding a separator.
file_dir = "{}/../datasets/tcga/COAD/mirna_allsamples/".format(os.getcwd())
# Lists the directory; the result is not displayed because this is not the
# last expression of the cell, but it fails early if the directory is missing.
os.listdir(file_dir)
# Column of the per-file table that is joined onto the metadata table.
label = "Subtype_Selected"

In [None]:
# Print every value of the "Metadatum 7" column of the level-0 metadata table,
# one per line. A plain loop is used instead of list(map(print, ...)) so the
# cell does not build and echo a list of None values.
for value in pd.read_csv(f"{file_dir}/trisbm/trisbm_level_0_metadata.csv")["Metadatum 7"].values:
    print(value)

In [None]:
# Per-file table; the first CSV column is used as the index.
df_files = pd.read_csv(file_dir + "/files.dat", index_col=0)

In [None]:
# Load the level-0 metadatum distribution table (indexed by its second CSV
# column, with the `i_doc` column removed) and attach the `label` column of
# `df_files`. The inner join keeps only rows whose index is in both tables.
df_metadata = (
    pd.read_csv(f"{file_dir}/trisbm/trisbm_level_0_metadatum-dist.csv", index_col=1)
    .drop(columns="i_doc")
    .join(df_files[label], how="inner")
)

In [None]:
# Mean of every metadatum column within each group of the `label` column,
# displayed with metadata as rows and groups as columns. Grouping goes through
# `label` (the same variable used for the join) instead of repeating the
# column name as a literal.
df_metadata.groupby(label).mean().T

In [None]:
# Line plot of the per-group mean of each metadatum column.
# The metadatum columns are every column except the grouping column; selecting
# them by name avoids relying on the grouping column being the last one.
metadatum_columns = df_metadata.columns.drop(label)

fig, ax = plt.subplots(figsize=(9, 8))
df_metadata.groupby(label).mean().T.plot(ax=ax, lw=10, ls="--")
# One tick per metadatum column, labelled with the column name.
ax.set_xticks(np.arange(len(metadatum_columns)))
ax.set_xticklabels(labels=metadatum_columns, rotation=45)

ax.tick_params(labelsize=25, size=10)
plt.show()

In [None]:
# Absolute deviation from the column mean, in units of the column standard
# deviation, for every numeric column of `df_metadata`.
# Only numeric columns are used: the joined label column is not numeric, and
# recent pandas versions raise when asked for the mean of such a column.
metadatum_values = df_metadata.select_dtypes(include="number")
df_metadata_centered = (
    metadatum_values
    .sub(metadatum_values.mean(axis=0), axis=1)
    .abs()
    .div(metadatum_values.std(axis=0), axis=1)
)
# Histogram of the scaled deviations for "Metadatum 7".
df_metadata_centered["Metadatum 7"].hist()

In [None]:
# Index values of the rows whose "Metadatum 7" value in `df_metadata_centered`
# is greater than 2.
case_set = df_metadata_centered.index[df_metadata_centered["Metadatum 7"] > 2].values
# Print one identifier per line (a loop avoids echoing a list of None values).
for case_id in case_set:
    print(case_id)

In [None]:
def get_survival(case):
    """Pick the survival-time field of one record.

    Args:
        case: mapping (e.g. a DataFrame row) holding the
            ``cases.0.demographic.vital_status`` field and the two day-count
            fields read below.

    Returns:
        ``cases.0.demographic.days_to_death`` when the vital status equals 1,
        otherwise ``cases.0.diagnoses.0.days_to_last_follow_up``.
    """
    use_death_field = case["cases.0.demographic.vital_status"] == 1
    field = (
        "cases.0.demographic.days_to_death"
        if use_death_field
        else "cases.0.diagnoses.0.days_to_last_follow_up"
    )
    return case[field]

# Apply get_survival to every row of df_files and store the result in a
# new "days_survival" column.
df_files["days_survival"] = df_files.apply(get_survival, axis="columns")

In [None]:
# Attach the "days_survival" column of df_files to the scaled table.
# Any existing "days_survival" column is dropped first so that re-running the
# cell does not fail with an overlapping-columns error from join().
df_metadata_centered = (
    df_metadata_centered
    .drop(columns="days_survival", errors="ignore")
    .join(df_files["days_survival"])
)

In [None]:
# Name of the metadatum column used by the survival scatter plot cells below
# (column selection, axis label and output file name).
metadatum = "Metadatum 7"

In [None]:
# x: value of the selected metadatum column minus that column's mean.
# The mean is taken on the selected column only, so the non-numeric label
# column of df_metadata never enters the computation.
x = df_metadata[metadatum] - df_metadata[metadatum].mean()
# y: the "days_survival" column joined onto df_metadata_centered.
y = df_metadata_centered["days_survival"]

In [None]:
# Scatter of the mean-subtracted metadatum value against survival, with the
# day counts in y converted to years by dividing by 365.
survival_years = y / 365.

fig, ax = plt.subplots(figsize=(9, 8))
ax.scatter(x, survival_years, s=400, c="gray")

#ax.set_title(f"{pearsonr(x[~y.isna()],y[~y.isna()])[0]}")
ax.set_xlabel(f"$\\bar{{P}}$({metadatum}|sample)", fontsize=35)
ax.set_ylabel("Survival [years]", fontsize=35)

ax.tick_params(labelsize=25, size=10)
plt.show()
# Save the figure as a PDF named after the selected metadatum.
fig.savefig(f"survival_{metadatum}.pdf")

## Search the GDC API for CNV and SSM occurrences of the selected cases

In [None]:
## CNV filters
# Filter expression: a logical "and" of "in" clauses, one clause per
# (field, accepted values) pair listed below.
# NOTE(review): the next cell of this notebook assigns `filters` again, so this
# value is overwritten when the notebook is run top to bottom; the project id
# here is TCGA-BRCA while `file_dir` points at a COAD dataset -- confirm intent.
filters = {
    "op": "and",
    "content": [
        {"op": "in", "content": {"field": field, "value": values}}
        for field, values in [
            ("files.data_type", ["Copy Number Segment"]),
            ("files.data_category", ["copy number variation"]),
            ("files.data_format", ["TXT"]),
            ("cases.project.project_id", ['TCGA-BRCA']),
            ("cases.project.program.name", ["TCGA"]),
        ]
    ],
}

In [None]:
# Filter expression with a single "in" clause: keep records whose
# `case.submitter_id` is one of the identifiers in `case_set`.
filters = {
    "op": "and",
    "content": [
        {
            "op": "in",
            "content": {
                "field": "case.submitter_id",
                "value": list(case_set),
            },
        },
    ],
}

In [None]:
# Query parameters shared by both downloads: the JSON-encoded filter, TSV
# output and an upper bound on the number of returned rows.
params = {
    "filters": json.dumps(filters),
    "format": "TSV",
    "size": "50000"
}


def _fetch_gdc_tsv(endpoint, params):
    """Download a TSV listing from the GDC API as a DataFrame of strings.

    Args:
        endpoint: path (optionally with a query string) appended to
            ``https://api.gdc.cancer.gov/``.
        params: query parameters passed to ``requests.get``.

    Returns:
        A DataFrame whose columns come from the first TSV line and whose rows
        come from the remaining non-empty lines; every cell is a string.
        An empty DataFrame when the response body has no lines.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    response = requests.get(
        f"https://api.gdc.cancer.gov/{endpoint}",
        headers={"Content-Type": "application/json"},
        params=params,
        timeout=60,  # fail instead of hanging forever on a stalled connection
    )
    # Stop here on an error status rather than parsing an error body as TSV.
    response.raise_for_status()
    lines = response.content.decode("utf-8").split("\n")
    rows = [line.replace("\r", "").split("\t") for line in lines]
    # Drop blank lines (e.g. the one after a trailing newline) without
    # assuming that the last line is always empty.
    rows = [row for row in rows if row != [""]]
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(columns=rows[0], data=rows[1:]).dropna(how="all", axis=0)


df_cnv_occurrences = _fetch_gdc_tsv("cnv_occurrences?expand=case,cnv", params)
df_ssm_occurrences = _fetch_gdc_tsv("ssm_occurrences?expand=ssm", params)

In [None]:
def get_gene_from_cnv_id(cnv):
    """Look up the gene id attached to a GDC CNV record.

    Args:
        cnv: identifier interpolated into the ``/cnvs/<id>`` URL.

    Returns:
        The ``gene_id`` of the first entry of the record's ``consequence``
        list, or ``None`` when that list is missing or empty (same convention
        as ``get_gene_from_ssm_id``).

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    response = requests.get(
        f'https://api.gdc.cancer.gov/cnvs/{cnv}?pretty=true&expand=consequence.gene',
        timeout=60,  # do not block a worker forever on a stalled connection
    )
    # Report HTTP errors directly instead of failing later on a missing key.
    response.raise_for_status()
    consequences = response.json()["data"].get("consequence") or []
    if not consequences:
        return None
    return consequences[0]["gene"]["gene_id"]

def get_gene_from_ssm_id(ssm):
    """Look up the gene named in a GDC SSM record.

    Args:
        ssm: identifier interpolated into the ``/ssms/<id>`` URL.

    Returns:
        The text before the first space of the first ``gene_aa_change`` entry,
        or ``None`` when that list is missing or empty.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    response = requests.get(
        f'https://api.gdc.cancer.gov/ssms/{ssm}?pretty=true&expand=consequence.gene',
        timeout=60,  # do not block a worker forever on a stalled connection
    )
    # Report HTTP errors directly instead of failing later on a missing key.
    response.raise_for_status()
    aa_changes = response.json()["data"].get("gene_aa_change") or []
    if len(aa_changes) < 1:
        return None
    return aa_changes[0].split(" ")[0]

In [None]:
# Number of occurrences per SSM id, stored in a "score" column and sorted from
# most to least frequent. The rename is part of the chain (no in-place edit).
df_ssm_id = (
    df_ssm_occurrences[["ssm.ssm_id", "ssm_occurrence_id"]]
    .groupby(["ssm.ssm_id"])
    .count()
    .rename(columns={"ssm_occurrence_id": "score"})
    .sort_values("score", ascending=False)
)
print(df_ssm_id["score"].max())
# Keep the SSM ids that occur more than once.
ssm_ids = df_ssm_id[df_ssm_id["score"]>1].index
# The printed label names the SSM id column (it previously said "ssm.cnv_id").
print(len(ssm_ids), "ssm.ssm_id")

In [None]:
# Number of occurrences per CNV id, stored in a "score" column and sorted from
# most to least frequent.
df_cnv_id = (
    df_cnv_occurrences[["cnv.cnv_id", "cnv_occurrence_id"]]
    .groupby(["cnv.cnv_id"])
    .count()
    .rename(columns={"cnv_occurrence_id": "score"})
    .sort_values("score", ascending=False)
)
print(df_cnv_id["score"].max())
# Keep the CNV ids that occur more than nine times.
frequent_cnv = df_cnv_id["score"] > 9
cnv_ids = df_cnv_id[frequent_cnv].index
print(len(cnv_ids), "cnv.cnv_id")

In [None]:
# Single lookup for the first (highest-score) CNV id; the result is displayed.
get_gene_from_cnv_id(cnv_ids[0])

In [None]:
# Single lookup for the first (highest-score) SSM id; the result is displayed.
get_gene_from_ssm_id(ssm_ids[0])

In [None]:
import multiprocessing as mp
from time import time

# Reference timestamp used at the end of the cell to report elapsed seconds.
start = time()

# Accumulators filled by the callbacks below: one for the CNV lookups and one
# for the SSM lookups. They are reset on every run of the cell.
gene_list, gene_list_from_ssm = [], []

def append_gene(g):
    """Append `g` to the module-level `gene_list`; `None` values are skipped."""
    if g is None:
        return
    gene_list.append(g)
    
def append_gene_from_ssm(g):
    """Append `g` to the module-level `gene_list_from_ssm`; `None` values are skipped."""
    if g is None:
        return
    gene_list_from_ssm.append(g)
    
# Run the per-id lookups concurrently, two workers per lookup kind.
# Thread pools are used instead of process pools: the lookups are HTTP calls
# (network-bound), and the worker functions are defined in this notebook, which
# worker processes cannot always import or unpickle. ThreadPool exposes the
# same apply_async/close/join interface as the process pool.
pool = ThreadPool(2)
w = [
    pool.apply_async(
        get_gene_from_cnv_id,
        args=(cnv_id,),
        callback=append_gene,
        error_callback=print,  # a failed lookup is reported and skipped
    )
    for cnv_id in cnv_ids
]
pool.close()

poolssm = ThreadPool(2)
wssm = [
    poolssm.apply_async(
        get_gene_from_ssm_id,
        args=(ssm_id,),
        callback=append_gene_from_ssm,
        error_callback=print,  # a failed lookup is reported and skipped
    )
    for ssm_id in ssm_ids
]
poolssm.close()


# Wait until every submitted lookup has finished.
pool.join()
poolssm.join()


# Elapsed seconds since `start`; displayed as the cell output.
time()-start

In [None]:
# Print the collected CNV gene ids, one per line (a loop avoids echoing a list
# of None values).
for gene in gene_list:
    print(gene)

In [None]:
from biothings_client import get_client
# Client object returned by biothings_client.get_client for the "gene" entity.
client = get_client("gene")

In [None]:
#client.getgenes(gene_list, 'name,symbol,refseq.rna,type_of_gene,bp')

In [None]:
# Print the collected CNV gene ids, one per line (a loop avoids echoing a list
# of None values).
# NOTE(review): an earlier cell already prints `gene_list`; possibly
# `gene_list_from_ssm` was intended here -- confirm.
for gene in gene_list:
    print(gene)

In [None]:
# Load the oncogrid JSON stored under the dataset directory. `file_dir` ends
# with a slash, so the relative path is appended directly.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("%strisbm|metadatum7|l0/oncogrid.json"%file_dir, "r", encoding="utf-8") as file:
    oncogrid = json.load(file)

In [None]:
# Display the top-level keys of the loaded oncogrid object.
oncogrid.keys()

In [None]:
# Table built from the "genes" entry of the oncogrid object.
# The JSON text is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated; the parsing itself is unchanged.
onco_genes = pd.read_json(io.StringIO(json.dumps(oncogrid["genes"])))
onco_genes.head()

In [None]:
# Print the "symbol" of every gene whose "_score" is greater than zero.
has_positive_score = onco_genes["_score"] > 0
for symbol in onco_genes.loc[has_positive_score, "symbol"]:
    print(symbol)

In [None]:
# Display the "cnv" column of the table built from the "cnv_occurrences" entry
# of the oncogrid object. The JSON text is wrapped in StringIO because passing
# a literal JSON string to pd.read_json is deprecated.
pd.read_json(io.StringIO(json.dumps(oncogrid["cnv_occurrences"])))["cnv"]

In [None]:
# Display the number of CNV ids selected by the score threshold.
len(cnv_ids)