In [None]:
import json

import numpy as np
import pandas as pd
import requests
import scanpy as sc

# Preprocessing

## Gene Names

In [None]:
# HGNC custom download: approved symbol plus the Ensembl gene ID, tab-separated.
hgnc_download_url = "https://www.genenames.org/cgi-bin/download/custom?col=gd_app_sym&col=md_ensembl_id&status=Approved&status=Entry%20Withdrawn&hgnc_dbtag=on&order_by=gd_app_sym_sort&format=text&submit=submit"
df_conversion = (
    pd.read_csv(hgnc_download_url, sep="\t")
    .set_index("Approved symbol")
    # Shorten the verbose HGNC column header to the name used in this notebook.
    .rename(columns={"Ensembl ID(supplied by Ensembl)": "ensg"})
)
df_conversion.head(2)

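## BioMart
Gene annotations loaded in the next cell from `mart_export.txt`; the session link below is kept for provenance and may no longer resolve:
https://www.ensembl.org/biomart/martview/cf9b27b6e78e9d6a1be079a4ea60f7fe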

In [None]:
# Read the local annotation export, then key it by the "Gene stable ID" column.
mart_table = pd.read_csv("mart_export.txt")
df_mart = mart_table.set_index("Gene stable ID")
df_mart.head(2)

# TCGA tumor

In [None]:
# Load the main table; the first CSV column becomes the row index.
df_tcga = pd.read_csv("mainTable_all.csv", index_col=0, sep=",")

In [None]:
# Show the freshly loaded table as the cell output.
df_tcga

In [None]:
# Keep protein-coding genes only.
# The inner join keeps rows whose index label is present in both df_tcga and
# df_mart (df_mart is indexed by "Gene stable ID").
# NOTE(review): assumes df_tcga is indexed by the same gene IDs — confirm.
df_tcga = (
    df_tcga.join(df_mart, how="inner")
    .loc[lambda joined: joined["Gene type"] == "protein_coding"]
    # Remove the annotation columns again so only data columns remain.
    # `columns=` is used because passing the axis positionally
    # (`drop([...], 1)`) raises TypeError from pandas 2.0 onwards.
    .drop(columns=["miRBase ID", "Gene type"])
)
# First three dash-separated fields of every remaining column label; used
# later to look rows up in the case manifest.
cases = ["-".join(case.split("-")[:3]) for case in df_tcga.columns]
df_tcga.head(2)

In [None]:
# GDC filter expression: keep only cases whose project ID is TCGA-BRCA.
filters = {
    "op": "in",
    "content": {
        "field": "cases.project.project_id",
        "value": ["TCGA-BRCA"],
    },
}

In [None]:
# Query the GDC `cases` endpoint for the selected fields and store the TSV
# reply in "files.txt" for the next cell to read.
params = {
    "filters": json.dumps(filters),
    # Comma-separated list of the fields requested from the API.
    "fields": ",".join(
        [
            "submitter_id",
            "primary_site",
            "project.project_id",
            "demographic.vital_status",
            "demographic.gender",
            "diagnoses.age_at_diagnosis",
            "diagnoses.days_to_last_follow_up",
            "demographic.days_to_death",
            "demographic.days_to_birth",
            "diagnoses.last_known_disease_status",
            "diagnoses.tumor_stage",
            "exposures.years_smoked",
            "exposures.cigarettes_per_day",
        ]
    ),
    "format": "TSV",
    "size": "50000",
}
response = requests.get(
    "https://api.gdc.cancer.gov/cases",
    headers={"Content-Type": "application/json"},
    params=params,
)
# Stop here on an HTTP error instead of saving the error body as "files.txt",
# where it would only surface later as a confusing parsing failure.
response.raise_for_status()
# The payload is decoded as UTF-8, so write it back as UTF-8 explicitly rather
# than in the platform's default encoding.
with open("files.txt", "w", encoding="utf-8") as manifest_file:
    manifest_file.write(response.content.decode("utf-8"))

In [None]:
# Build one manifest row per column of df_tcga, carrying the clinical fields
# of the matching case.
df_manifest = pd.read_csv("files.txt", sep="\t").set_index("submitter_id")
clinical_columns = list(df_manifest.columns)
# `cases` holds one entry per df_tcga column, so after this reindex the rows
# line up positionally with df_tcga.columns (cases missing from the download
# become all-NaN rows).
df_manifest = df_manifest.reindex(index=cases)
# Attach the per-column identifiers BEFORE discarding unmatched rows: they are
# assigned by position, so dropping rows first would raise a length mismatch
# as soon as a single case is missing from the download.
df_manifest["sample_submitter_id"] = ["-".join(case.split("-")[:4]) for case in df_tcga.columns]
df_manifest["file"] = df_tcga.columns
# Discard rows for which no clinical field at all was found.
df_manifest = df_manifest.dropna(how="all", subset=clinical_columns, axis=0)
df_manifest.head(2)

In [None]:
# Load the annotation table, keep the rows whose "cancer.type" is BRCA and
# index them by "pan.samplesID".
biolinks_table = pd.read_csv("TCGA_biolinks.csv", index_col=0)
is_brca = biolinks_table["cancer.type"] == "BRCA"
df_biolinks = biolinks_table[is_brca].set_index("pan.samplesID")

In [None]:
# Re-key the manifest by its "file" column, left-join the df_biolinks columns
# on that key, then flatten the index back into a regular column.
manifest_by_file = df_manifest.reset_index().set_index("file")
df_files = manifest_by_file.join(df_biolinks).reset_index()
df_files.head()

## miRNA

In [None]:
# Load the miRNA table (first CSV column is the row index) and derive, for
# each column label, its first four dash-separated fields.
df_tcga_mirna = pd.read_csv("mainTable_mirna.csv", index_col=0)
mirna_submitter_ids = [
    "-".join(label.split("-")[:4]) for label in df_tcga_mirna.columns
]
df_tcga_mirna.head(2)

In [None]:
# Load the miRNA file manifest and add a "sample_submitter_id" column built
# from the first four dash-separated fields of the aliquot submitter ID.
aliquot_column = "cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id"
df_manifest_mirna = pd.read_csv("files_miRNA.dat")
df_manifest_mirna["sample_submitter_id"] = [
    "-".join(barcode.split("-")[:4])
    for barcode in df_manifest_mirna[aliquot_column]
]
df_manifest_mirna.head(2)

In [None]:
# Show only the miRNA columns whose label is listed as a file name in the
# miRNA manifest.
in_manifest = df_tcga_mirna.columns.isin(df_manifest_mirna["file_name"])
df_tcga_mirna[df_tcga_mirna.columns[in_manifest]]

In [None]:
# Count the column labels that repeat an earlier label; a non-zero result
# means selecting a column by label can return more than one column.
df_tcga_mirna.columns.duplicated().sum()

In [None]:
# Outer-join the two manifests on "sample_submitter_id" so that samples
# present in only one of them are kept (with NaN on the other side).
mirna_by_sample = df_manifest_mirna.set_index("sample_submitter_id").rename(columns={"file_name": "file_mirna"})
files_by_sample = df_files.reset_index().set_index("sample_submitter_id")
df_all = files_by_sample.join(mirna_by_sample, how="outer")

In [None]:
# "index" appears to be the column created by `df_files.reset_index()` when
# df_all was built, so after the outer join it is non-null only for rows that
# came from df_files.
# NOTE(review): presumably a quick check of how many joined rows have an entry
# on the df_files side — confirm.
df_all["index"].dropna()

In [None]:
# Relabel the columns of both tables from file names to sample IDs, then stack
# the two tables row-wise on a common set of sample columns.
#
# Each table is relabelled with the df_all index entries of exactly the files
# that were selected. Relabelling with every unique df_all index value only
# lines up when every sample has both file types; otherwise the lengths differ.
# NOTE: this cell overwrites df_tcga and df_tcga_mirna, so it is meant to be
# run once per kernel session.

# miRNA: one file per sample (first occurrence wins when the join repeated a
# sample).
mirna_files = df_all["file_mirna"].dropna()
mirna_files = mirna_files[~mirna_files.index.duplicated(keep="first")]
df_tcga_mirna = df_tcga_mirna.reindex(columns=mirna_files)
df_tcga_mirna.columns = mirna_files.index

# Same relabelling for the table whose columns are listed in df_all["file"].
tcga_files = df_all["file"].dropna()
tcga_files = tcga_files[~tcga_files.index.duplicated(keep="first")]
df_tcga = df_tcga.reindex(columns=tcga_files)
df_tcga.columns = tcga_files.index

# Align both tables on the unique sample IDs (samples missing from one table
# become NaN columns) and concatenate their rows. `pd.concat` replaces
# `DataFrame.append`, which no longer exists in pandas 2.0.
sample_ids = df_all.index.drop_duplicates()
pd.concat(
    [
        df_tcga.reindex(columns=sample_ids),
        df_tcga_mirna.reindex(columns=sample_ids),
    ]
)

In [None]:
# Tally the fourth dash-separated field of every aliquot submitter ID in the
# FPKM manifest; returns (unique values, counts).
# NOTE(review): presumably this field is the sample-type/vial code of the
# barcode — confirm.
# numpy is imported with the other imports at the top of the notebook.
fpkm_manifest = pd.read_csv("files_fpkm.dat")
np.unique(
    [
        barcode.split("-")[3]
        for barcode in fpkm_manifest["cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id"]
    ],
    return_counts=True,
)