# Breast analysis

In [1]:
%load_ext watermark
%watermark -v -m  -u -n -p pandas,numpy -a Filippo_Valle -g -r -b -w

Filippo_Valle 
last updated: Wed Jul 29 2020 

CPython 3.7.6
IPython 7.15.0

pandas 1.0.4
numpy 1.18.5

compiler   : GCC 7.5.0
system     : Linux
release    : 4.19.76-linuxkit
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit
Git hash   : 902ab32961da9fa244c75895125877050ca4bd03
Git repo   : git@github.com:fvalle1/phd.git
Git branch : master
watermark 2.0.2


In [2]:
# import libraries
import pandas as pd
import numpy as np
import datetime as dt

We read the file with the GDC annotations, obtained through the GDC API.

In [3]:
# Load the BRCA annotation table; the first CSV column becomes the index.
df_meta = pd.read_csv("TCGAbiolinks_BRCA.csv", header=0, index_col=0, skiprows=0)
# dropna returns a new frame, so the result must be assigned back for the
# all-NaN columns to actually be removed.
df_meta = df_meta.dropna(how="all", axis=1)

# One shared reference instant for every row, so the derived timestamps are
# mutually consistent and their difference does not depend on when each row
# happened to be evaluated.
reference_date = dt.datetime.now()
# Despite their names, "birth_year" and "death_year" hold full timestamps;
# rows with a missing day count become NaT.
# NOTE(review): birth is placed days_to_birth *after* the reference and death
# days_to_death *before* it, so age == -(days_to_birth + days_to_death) / 365.
# Confirm this matches the sign convention of the two source columns.
df_meta["birth_year"] = reference_date + pd.to_timedelta(df_meta["days_to_birth"], unit="D")
df_meta["death_year"] = reference_date - pd.to_timedelta(df_meta["days_to_death"], unit="D")
# Age expressed in 365-day years; NaN wherever either day count is missing.
df_meta["age"] = (df_meta["death_year"] - df_meta["birth_year"]) / dt.timedelta(days=365)

# Two equal-width age bins spanning [min, max], each labelled by its left edge.
bins = np.linspace(df_meta["age"].min(), df_meta["age"].max(), 3)
df_meta["age_binned"] = pd.cut(df_meta["age"], bins=bins, labels=bins[:-1], include_lowest=True)
df_meta.columns

Index(['Tumor.Type', 'Included_in_previous_marker_papers', 'vital_status',
       'days_to_birth', 'days_to_death', 'days_to_last_followup',
       'age_at_initial_pathologic_diagnosis', 'pathologic_stage',
       'Tumor_Grade', 'BRCA_Pathology', 'BRCA_Subtype_PAM50', 'MSI_status',
       'HPV_Status', 'tobacco_smoking_history', 'CNV Clusters',
       'Mutation Clusters', 'DNA.Methylation Clusters', 'mRNA Clusters',
       'miRNA Clusters', 'lncRNA Clusters', 'Protein Clusters',
       'PARADIGM Clusters', 'Pan-Gyn Clusters', 'birth_year', 'death_year',
       'age', 'age_binned'],
      dtype='object')

We read the file from TCGA-Biolinks

In [4]:
# Load the TCGA-Biolinks subtype table, discard columns that are entirely NaN,
# keep only the BRCA rows and index them by sample ID.
df_tcgabiolinks = pd.read_csv("../TCGA_biolinks.csv", index_col=0).dropna(how="all", axis=1)
df_tcgabiolinks = df_tcgabiolinks[df_tcgabiolinks["cancer.type"] == "BRCA"].set_index("pan.samplesID")

# "Subtype_Selected_Lum" is "Subtype_Selected" with LumA and LumB merged into a
# single "BRCA.Lum" label. It is built with one assignment instead of an
# in-place replace on a column selection, which does not reliably write back
# to the parent frame (and not at all under pandas Copy-on-Write).
df_tcgabiolinks["Subtype_Selected_Lum"] = df_tcgabiolinks["Subtype_Selected"].replace(
    {"BRCA.LumA": "BRCA.Lum", "BRCA.LumB": "BRCA.Lum"}
)

In [8]:
# Tab-separated file table; the third column is used as the initial index.
df_files = pd.read_csv("files_tcga.dat", sep="\t", index_col=2)

# Attach the case-level annotations: index by case submitter ID and place the
# matching df_meta rows side by side. pd.concat(axis=1) replaces the former
# transpose/append/transpose round trip (DataFrame.append no longer exists in
# pandas >= 2.0). astype(object) keeps the all-object dtypes that round trip
# produced, so the later fillna("unknown") is valid on every column
# (including the categorical and datetime ones).
df_files = df_files.reset_index().set_index("cases.0.submitter_id")
df_files = pd.concat([df_files, df_meta.reindex(index=df_files.index)], axis=1).astype(object)

# Attach the sample-level subtypes the same way, keyed by aliquot submitter ID.
df_files = df_files.reset_index().set_index("cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id")
df_files = pd.concat([df_files, df_tcgabiolinks.reindex(index=df_files.index)], axis=1).astype(object)

# Export with missing values written as "unknown"; df_files itself keeps its NaNs.
df_files.fillna("unknown").set_index("file_name").to_csv("files.dat", index=True, header=True)

In [7]:
# Count the non-null entries of every column for each combination of the
# three subtype labels, to see how the different annotations agree.
subtype_keys = ["BRCA_Subtype_PAM50", "Subtype_Selected_Lum", "Subtype_Selected"]
files_by_subtype = df_files.groupby(subtype_keys)
files_by_subtype.count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,cases.0.submitter_id,cases.0.demographic.gender,cases.0.demographic.days_to_birth,cases.0.demographic.days_to_death,cases.0.demographic.vital_status,cases.0.diagnoses.0.age_at_diagnosis,cases.0.diagnoses.0.days_to_last_follow_up,cases.0.diagnoses.0.last_known_disease_status,cases.0.diagnoses.0.tumor_stage,cases.0.exposures.0.cigarettes_per_day,...,age,age_binned,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other
BRCA_Subtype_PAM50,Subtype_Selected_Lum,Subtype_Selected,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Basal,BRCA.Basal,BRCA.Basal,188,188,188,28,188,188,166,188,188,0,...,28,28,188,188,0,0,0,0,0,0
Basal,BRCA.Normal,BRCA.Normal,17,17,17,6,17,17,12,17,17,0,...,6,6,17,17,0,0,0,0,0,0
Her2,BRCA.Her2,BRCA.Her2,82,82,78,17,82,78,68,82,82,0,...,17,17,82,82,0,0,0,0,0,0
Her2,BRCA.Normal,BRCA.Normal,9,9,9,6,9,9,4,9,9,0,...,6,6,9,9,0,0,0,0,0,0
LumA,BRCA.Lum,BRCA.LumA,570,569,560,69,569,560,525,569,569,0,...,69,69,570,570,0,0,0,0,0,0
LumA,BRCA.Normal,BRCA.Normal,53,53,52,16,53,52,39,53,53,0,...,16,16,53,53,0,0,0,0,0,0
LumB,BRCA.Lum,BRCA.LumA,1,1,1,1,1,1,1,1,1,0,...,1,1,1,1,0,0,0,0,0,0
LumB,BRCA.Lum,BRCA.LumB,209,209,207,33,209,207,186,209,209,0,...,33,33,209,209,0,0,0,0,0,0
LumB,BRCA.Normal,BRCA.Normal,21,21,21,12,21,21,11,21,21,0,...,12,12,21,21,0,0,0,0,0,0
Normal,BRCA.Normal,BRCA.Normal,42,42,42,9,42,42,35,42,42,0,...,9,9,42,42,0,0,0,0,0,0


We read the table with all RNA-Seq samples obtained through the GDC API. The manifest download and the preprocessing can be done with [Table_Creation.ipynb](Table_Creation.ipynb) and [TCGA_GetManifest.ipynb](TCGA_GetManifest.ipynb).

In [9]:
# Load the expression table (first column is the index, first row holds the
# column names) and apply log2(x + 1) to every value. The transform is applied
# to the whole frame in one vectorized call instead of an element-wise
# applymap with a Python lambda.
df = np.log2(pd.read_csv("mainTable_all.csv", index_col=0, header=0) + 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15659 entries, ENSG00000000419 to ENSG00000273489
Columns: 1222 entries, e091bf28-5002-48ff-93df-3030c0942b3a.FPKM.txt.gz to a5dc521e-bee4-489c-8679-d4b90a327d33.FPKM.txt.gz
dtypes: float64(1222)
memory usage: 146.1+ MB


We restrict the table to genes that satisfy the filters of *Dey et al.*, [Visualizing the structure of RNA-seq expression data using grade of membership models](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1006599), and sample ~1000 of them.

In [10]:
# Single-column gene list downloaded from the count-clustering project page;
# the file has no header row.
DEY_GENES_URL = "http://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt"
dey = pd.read_csv(DEY_GENES_URL, header=None)

In [44]:
# Draw a reproducible random subset of the gene list and restrict the
# expression table to it.
SEED = 42       # fixed seed so a fresh run selects the same genes
N_GENES = 1000  # number of genes to draw from the list

rng = np.random.default_rng(SEED)
candidate_genes = dey.values.ravel()
# Sample without replacement so that no gene appears twice in the table; the
# size is capped at the list length, which replace=False requires.
sampled_genes = rng.choice(candidate_genes, size=min(N_GENES, len(candidate_genes)), replace=False)
# reindex creates all-NaN rows for sampled genes that are absent from df.
# NOTE: this overwrites df, so re-running the cell samples from the already
# reduced table; reload df first when re-executing.
df = df.reindex(index=sampled_genes)

In [11]:
# File names listed in df_files that also exist as columns of the expression
# table, in df_files order.
expression_columns = df.columns
full_files = list(filter(lambda name: name in expression_columns, df_files['file_name'].values))

In [12]:
# Keep only the columns with file metadata, drop every gene row that has a
# missing value, cast the values to int (fractional parts are truncated) and
# write the result to disk.
expression_subset = df[full_files]
complete_genes = expression_subset.dropna()
complete_genes.astype(int).to_csv("mainTable.csv")

In [13]:
# Write the metadata of the files present in the expression table, indexed by
# file name.
# NOTE(review): this overwrites the "files.dat" written when df_files was
# built, and unlike that export it does not fill missing values with
# "unknown" - confirm which variant downstream code expects.
in_main_table = df_files['file_name'].isin(full_files)
files_with_expression = df_files[in_main_table]
files_with_expression.set_index('file_name').to_csv("files.dat")