In [60]:
import pandas as pd
import numpy as np
import datetime as dt

In [116]:
df_meta = pd.read_csv("TCGAbiolinks_BRCA.csv", header=0, index_col=0, skiprows=0)
df_meta.dropna(how="all", axis=1)
df_meta["birth_year"] = [(dt.datetime.now()+dt.timedelta(days=d)) if d!="" else np.nan for d in df_meta["days_to_birth"].fillna("")]
df_meta["death_year"] = [(dt.datetime.now()-dt.timedelta(days=d)) if d!="" else np.nan for d in df_meta["days_to_death"].fillna("")]
df_meta["age"] = (df_meta["death_year"] - df_meta["birth_year"]).apply(lambda timestamp: timestamp/dt.timedelta(days=365))
bins = np.linspace(df_meta["age"].min(),df_meta["age"].max(), 3)
df_meta["age_binned"] = pd.cut(df_meta["age"], bins=bins, labels = bins[:-1], include_lowest=True)
df_meta.columns

Index(['Tumor.Type', 'Included_in_previous_marker_papers', 'vital_status',
       'days_to_birth', 'days_to_death', 'days_to_last_followup',
       'age_at_initial_pathologic_diagnosis', 'pathologic_stage',
       'Tumor_Grade', 'BRCA_Pathology', 'BRCA_Subtype_PAM50', 'MSI_status',
       'HPV_Status', 'tobacco_smoking_history', 'CNV Clusters',
       'Mutation Clusters', 'DNA.Methylation Clusters', 'mRNA Clusters',
       'miRNA Clusters', 'lncRNA Clusters', 'Protein Clusters',
       'PARADIGM Clusters', 'Pan-Gyn Clusters', 'birth_year', 'death_year',
       'age', 'age_binned'],
      dtype='object')

In [142]:
df_tcgabiolinks = pd.read_csv("../TCGA_biolinks.csv", index_col=0).dropna(how="all", axis=1)
df_tcgabiolinks = df_tcgabiolinks[df_tcgabiolinks["cancer.type"]=="BRCA"].set_index("pan.samplesID")
df_tcgabiolinks["Subtype_Selected_Lum"]=df_tcgabiolinks["Subtype_Selected"]
df_tcgabiolinks["Subtype_Selected_Lum"].replace("BRCA.LumA","BRCA.Lum", inplace=True)
df_tcgabiolinks["Subtype_Selected_Lum"].replace("BRCA.LumB","BRCA.Lum", inplace=True)

In [144]:
df_files=pd.read_csv("files_tcga.dat", index_col=2)
df_files = df_files.reset_index().set_index("cases.0.submitter_id")
df_files = df_files.transpose().append(df_meta.reindex(index=df_files.index).transpose()).transpose()
df_files = df_files.reset_index().set_index("cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id")
df_files = df_files.transpose().append(df_tcgabiolinks.reindex(index=df_files.index).transpose()).transpose()
df_files.fillna("unknown").set_index("file_name").to_csv("files.dat", index=True, header=True)

In [146]:
df_files.groupby(["BRCA_Subtype_PAM50","Subtype_Selected_Lum", "Subtype_Selected"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cases.0.submitter_id,cases.0.diagnoses.0.primary_diagnosis,file_name,cases.0.diagnoses.0.morphology,cases.0.diagnoses.0.tissue_or_organ_of_origin,cases.0.diagnoses.0.tumor_grade,cases.0.diagnoses.0.tumor_stage,cases.0.project.disease_type,cases.0.project.primary_site,cases.0.samples.0.longest_dimension,...,age,age_binned,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other
BRCA_Subtype_PAM50,Subtype_Selected_Lum,Subtype_Selected,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Basal,BRCA.Basal,BRCA.Basal,188,188,188,188,188,188,188,188,188,0,...,28,28,188,188,0,0,0,0,0,0
Basal,BRCA.Normal,BRCA.Normal,17,17,17,17,17,17,17,17,17,0,...,6,6,17,17,0,0,0,0,0,0
Her2,BRCA.Her2,BRCA.Her2,82,82,82,82,82,82,82,82,82,0,...,17,17,82,82,0,0,0,0,0,0
Her2,BRCA.Normal,BRCA.Normal,9,9,9,9,9,9,9,9,9,0,...,6,6,9,9,0,0,0,0,0,0
LumA,BRCA.Lum,BRCA.LumA,570,569,570,569,569,569,569,570,570,0,...,69,69,570,570,0,0,0,0,0,0
LumA,BRCA.Normal,BRCA.Normal,53,53,53,53,53,53,53,53,53,0,...,16,16,53,53,0,0,0,0,0,0
LumB,BRCA.Lum,BRCA.LumA,1,1,1,1,1,1,1,1,1,0,...,1,1,1,1,0,0,0,0,0,0
LumB,BRCA.Lum,BRCA.LumB,209,209,209,209,209,209,209,209,209,0,...,33,33,209,209,0,0,0,0,0,0
LumB,BRCA.Normal,BRCA.Normal,21,21,21,21,21,21,21,21,21,0,...,12,12,21,21,0,0,0,0,0,0
Normal,BRCA.Normal,BRCA.Normal,42,42,42,42,42,42,42,42,42,0,...,9,9,42,42,0,0,0,0,0,0


In [4]:
df=pd.read_csv("mainTable_all.csv", index_col=0, header=0).applymap(lambda x: np.log2(x+1))
df.info()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-b88ac5a1166e>", line 1, in <module>
    df=pd.read_csv("mainTable_all.csv", index_col=0, header=0).applymap(lambda x: np.log2(x+1))
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py", line 6944, in applymap
    return self.apply(infer)
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py", line 6878, in apply
    return op.get_result()
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/apply.py", line 186, in get_result
    return self.apply_standard()
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/apply.py", line 313, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/opt/conda/lib/python3.7/site-packages/pandas/core/apply.py", line 341, in apply_series_generator
    results[i] = self.f(

KeyboardInterrupt: 

In [None]:
dey = pd.read_csv("http://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt", header=None)

In [8]:
df=df.reindex(index=dey.values.ravel()[np.random.randint(0,len(dey), size=1000)])

In [9]:
full_files = [c for c in df_files['file_name'].values if c in df.columns]

In [10]:
df[full_files].dropna().astype(int).to_csv("mainTable.csv")

In [11]:
df_files[df_files['file_name'].isin(full_files)].set_index('file_name').to_csv("files.dat")