# Breast analysis

In [1]:
%load_ext watermark
%watermark -v -m  -u -n -p pandas,numpy -a Filippo_Valle -g -r -b -w

Filippo_Valle 
last updated: Wed Nov 11 2020 

CPython 3.8.6
IPython 7.19.0

pandas 1.1.4
numpy 1.19.4

compiler   : GCC 7.5.0
system     : Linux
release    : 5.4.39-linuxkit
machine    : x86_64
processor  : x86_64
CPU cores  : 2
interpreter: 64bit
Git hash   : fb57723c6276622c634991a8522d88ae164d9930
Git repo   : git@github.com:fvalle1/cancers
Git branch : develop_first_round
watermark 2.0.2


In [2]:
# import libraries
import pandas as pd
import numpy as np
import datetime as dt

We read the file with GDC annotations obtained through the GDC API.

In [3]:
# Load the annotation table; the first CSV column becomes the index.
df_meta = pd.read_csv("TCGAbiolinks_BRCA.csv", header=0, index_col=0, skiprows=0)
# dropna returns a new frame, so the result has to be assigned for the
# all-empty columns to actually be removed.
df_meta = df_meta.dropna(how="all", axis=1)

# Both offsets are applied to ONE reference instant so every row shares the
# same anchor (calling now() per row adds microsecond jitter between rows).
# NOTE(review): this follows the GDC convention that days_to_birth is a negative
# offset and days_to_death a positive offset from the same index date, so both
# are *added* to the reference — confirm against the data dictionary.
reference_date = pd.Timestamp.now()
# Missing offsets become NaT, which propagates to NaN in "age".
df_meta["birth_year"] = reference_date + pd.to_timedelta(df_meta["days_to_birth"], unit="D")
df_meta["death_year"] = reference_date + pd.to_timedelta(df_meta["days_to_death"], unit="D")
# Age at death in years (365-day years), as a float column.
df_meta["age"] = (df_meta["death_year"] - df_meta["birth_year"]) / pd.Timedelta(days=365)

# Two equal-width age bins between the observed min and max; each bin is
# labelled with its lower edge.
bins = np.linspace(df_meta["age"].min(), df_meta["age"].max(), 3)
df_meta["age_binned"] = pd.cut(df_meta["age"], bins=bins, labels=bins[:-1], include_lowest=True)

# Coarser PAM50 labelling where LumA and LumB are merged into a single "Lum" class.
# Assign the replaced Series instead of calling replace(inplace=True) on a
# selected column, which is not guaranteed to write back into the frame.
df_meta["BRCA_Subtype_PAM50_Lum"] = df_meta["BRCA_Subtype_PAM50"].replace({"LumA": "Lum", "LumB": "Lum"})
df_meta.columns

Index(['Tumor.Type', 'Included_in_previous_marker_papers', 'vital_status',
       'days_to_birth', 'days_to_death', 'days_to_last_followup',
       'age_at_initial_pathologic_diagnosis', 'pathologic_stage',
       'Tumor_Grade', 'BRCA_Pathology', 'BRCA_Subtype_PAM50', 'MSI_status',
       'HPV_Status', 'tobacco_smoking_history', 'CNV Clusters',
       'Mutation Clusters', 'DNA.Methylation Clusters', 'mRNA Clusters',
       'miRNA Clusters', 'lncRNA Clusters', 'Protein Clusters',
       'PARADIGM Clusters', 'Pan-Gyn Clusters', 'birth_year', 'death_year',
       'age', 'age_binned', 'BRCA_Subtype_PAM50_Lum'],
      dtype='object')

We read the file from TCGA-Biolinks

In [4]:
# Load the pan-cancer subtype table and drop columns that are entirely empty.
df_tcgabiolinks = pd.read_csv("../TCGA_biolinks.csv", index_col=0).dropna(how="all", axis=1)
# Keep only the BRCA rows and index them by sample identifier.
df_tcgabiolinks = df_tcgabiolinks[df_tcgabiolinks["cancer.type"]=="BRCA"].set_index("pan.samplesID")
# Coarser labelling where BRCA.LumA and BRCA.LumB are merged into "BRCA.Lum".
# Assign the replaced Series instead of calling replace(inplace=True) on a
# selected column, which is not guaranteed to write back into the frame.
df_tcgabiolinks["Subtype_Selected_Lum"] = df_tcgabiolinks["Subtype_Selected"].replace(
    {"BRCA.LumA": "BRCA.Lum", "BRCA.LumB": "BRCA.Lum"}
)

In [5]:
# Load the file manifest (tab separated) and re-key it by case submitter id
# so the per-case annotations in df_meta can be attached.
df_files = pd.read_csv("files_tcga.dat", sep="\t", index_col=2)
df_files = df_files.reset_index().set_index("cases.0.submitter_id")
# Attach the df_meta columns side by side. reindex() lines df_meta up with the
# manifest rows first (cases without annotations become NaN), so concat only
# has to glue columns together. This replaces the transpose/append/transpose
# round-trip, which relied on the removed DataFrame.append.
# astype(object) keeps every column generic (as the transposes did) so that
# fillna("unknown") below also works on the categorical "age_binned" column.
df_files = pd.concat([df_files, df_meta.reindex(index=df_files.index)], axis=1).astype(object)
# Re-key by aliquot submitter id, the key shared with df_tcgabiolinks, and
# attach the subtype columns the same way.
df_files = df_files.reset_index().set_index("cases.0.samples.0.portions.0.analytes.0.aliquots.0.submitter_id")
df_files = pd.concat([df_files, df_tcgabiolinks.reindex(index=df_files.index)], axis=1).astype(object)
# Write the annotated manifest keyed by file name; missing values are written
# as "unknown". df_files itself is left with its NaNs.
df_files.fillna("unknown").set_index("file_name").to_csv("files.dat", index=True, header=True)

In [7]:
# Cross-tabulate the three subtype labellings: number of rows with a
# non-missing case id for every observed label combination.
(
    df_files
    .groupby(by=["BRCA_Subtype_PAM50", "Subtype_Selected_Lum", "Subtype_Selected"])
    ["cases.0.submitter_id"]
    .count()
)

BRCA_Subtype_PAM50  Subtype_Selected_Lum  Subtype_Selected
Basal               BRCA.Basal            BRCA.Basal          188
                    BRCA.Normal           BRCA.Normal          17
Her2                BRCA.Her2             BRCA.Her2            82
                    BRCA.Normal           BRCA.Normal           9
LumA                BRCA.Lum              BRCA.LumA           570
                    BRCA.Normal           BRCA.Normal          53
LumB                BRCA.Lum              BRCA.LumA             1
                                          BRCA.LumB           209
                    BRCA.Normal           BRCA.Normal          21
Normal              BRCA.Normal           BRCA.Normal          42
Name: cases.0.submitter_id, dtype: int64

In [16]:
# Number of rows with a non-missing case id per PAM50 subtype.
(
    df_files
    .groupby(by=["BRCA_Subtype_PAM50"])
    ["cases.0.submitter_id"]
    .count()
)

BRCA_Subtype_PAM50
Basal     212
Her2       91
LumA      633
LumB      231
Normal     42
Name: cases.0.submitter_id, dtype: int64

In [13]:
# Number of rows with a non-missing case id per Subtype_Selected label.
(
    df_files
    .groupby(by=["Subtype_Selected"])
    ["cases.0.submitter_id"]
    .count()
)

Subtype_Selected
BRCA.Basal     188
BRCA.Her2       82
BRCA.LumA      576
BRCA.LumB      217
BRCA.Normal    142
Name: cases.0.submitter_id, dtype: int64

In [25]:
# Print one LaTeX table row per PAM50 subtype:
#   <subtype> & <PAM50 count> & <Subtype_Selected count> \\ \hline
pam50_counts = df_files.groupby("BRCA_Subtype_PAM50")["cases.0.submitter_id"].count()
selected_counts = df_files.groupby("Subtype_Selected")["cases.0.submitter_id"].count()
# Subtype_Selected labels carry a "BRCA." prefix (e.g. "BRCA.LumA"). Strip it so
# the two series are matched by label rather than by position, which would
# silently misalign the columns if the two groupings ever differed in
# ordering or in number of categories.
selected_counts = selected_counts.rename(lambda label: str(label).replace("BRCA.", "", 1))
for subtype, count_PAM in pam50_counts.items():
    # A PAM50 subtype with no Subtype_Selected counterpart is reported as 0.
    count_subtype = selected_counts.get(subtype, 0)
    # "\\hline" instead of "\hline": "\h" is not a valid escape sequence.
    print(f"{subtype} & {count_PAM} & {count_subtype} \\\\ \\hline")

Basal & 212 & 188 \\ \hline
Her2 & 91 & 82 \\ \hline
LumA & 633 & 576 \\ \hline
LumB & 231 & 217 \\ \hline
Normal & 42 & 142 \\ \hline


```latex
\begin{table}[]
    \centering
    \begin{tabular}{c|c|c}
    \hline
        subtype & PAM50 & SubtypeSelected \\ \hline
        Basal & 212 & 188 \\ \hline
        Her2 & 91 & 82 \\ \hline
        LumA & 633 & 576 \\ \hline
        LumB & 231 & 217 \\ \hline
        Normal-like & 42 & 142 \\ \hline
    \end{tabular}
\end{table}
```

We read the table with all RNA-Seq samples obtained through the GDC API. The manifest download and the preprocessing can be done with [Table_Creation.ipynb](Table_Creation.ipynb) and [TCGA_GetManifest.ipynb](TCGA_GetManifest.ipynb).

In [8]:
# Load the expression table (rows: genes, columns: files) and apply the
# log2(x + 1) transform. np.log2 works on the whole frame at once, which
# avoids calling a Python lambda on every single cell via applymap.
df = np.log2(pd.read_csv("mainTable_all.csv", index_col=0, header=0) + 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15659 entries, ENSG00000000419 to ENSG00000273489
Columns: 1222 entries, e091bf28-5002-48ff-93df-3030c0942b3a.FPKM.txt.gz to a5dc521e-bee4-489c-8679-d4b90a327d33.FPKM.txt.gz
dtypes: float64(1222)
memory usage: 146.1+ MB


We filter and sample ~1000 genes that satisfy the filters of *Dey et al.*, [Visualizing the structure of RNA-seq expression data using grade of membership models](https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1006599).

In [9]:
# Fetch the gene-name list published with the count-clustering project.
# The file has no header row, so the names end up in a single column labelled 0.
dey = pd.read_csv(
    "http://stephenslab.github.io/count-clustering/project/utilities/gene_names_all_gtex.txt",
    header=None,
)

In [10]:
# Keep the genes listed in column "0" of HDE_Breast.csv that also appear in the
# dey gene list, preserving their order in the CSV. The dey names are put in a
# set once, so each membership test is a hash lookup instead of re-flattening
# and scanning the whole array for every candidate gene.
filtered_genes = [
    gene
    for gene in pd.read_csv("HDE_Breast.csv")["0"]
    if gene in set(dey.values.ravel())
] if False else (lambda allowed: [
    gene for gene in pd.read_csv("HDE_Breast.csv")["0"] if gene in allowed
])(set(dey.values.ravel()))

In [11]:
# Restrict the expression table to the selected genes, in the order of
# filtered_genes; genes absent from the table become all-NaN rows.
df = df.reindex(filtered_genes, axis="index")

In [12]:
# File names from the manifest, in manifest order, that are also columns of
# the expression table.
full_files = list(
    filter(lambda name: name in df.columns, df_files["file_name"].values)
)

In [13]:
# Keep only the matched files, drop every gene row that has a missing value,
# cast the values to int (truncating the fractional part) and save the table.
(
    df.loc[:, full_files]
    .dropna()
    .astype(int)
    .to_csv("mainTable.csv")
)

In [14]:
# Save the manifest rows whose file is present in the expression table,
# keyed by file name. This writes to "files.dat".
(
    df_files
    .loc[df_files["file_name"].isin(full_files)]
    .set_index("file_name")
    .to_csv("files.dat")
)