## Preprocess miRNA data

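Load the downloaded miRNA data, clean the expression matrix, and curate sample IDs. Note that no mapping to Entrez gene identifiers is performed in this notebook; miRNAs keep their original names (e.g. `hsa-let-7a-5p`).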

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mpmp.config as cfg
import mpmp.utilities.tcga_utilities as tu

### Read TCGA Barcode Curation Information

Extract information from TCGA barcodes: `cancer-type` and `sample-type`. See https://github.com/cognoma/cancer-data for more details.

In [2]:
# tu.get_tcga_barcode_info() returns four objects, unpacked here in order:
# cancer type table, cancer type code lookup, sample type table,
# sample type code lookup
# NOTE(review): the exact contents of each object are defined in
# mpmp.utilities.tcga_utilities -- confirm there
barcode_info = tu.get_tcga_barcode_info()
(cancer_types_df, cancertype_codes_dict,
 sample_types_df, sampletype_codes_dict) = barcode_info

# preview the first two rows of the cancer type table
cancer_types_df.head(n=2)

Unnamed: 0,TSS Code,Source Site,Study Name,BCR,acronym
0,1,International Genomics Consortium,ovarian serous cystadenocarcinoma,IGC,OV
1,2,MD Anderson Cancer Center,glioblastoma multiforme,IGC,GBM


In [3]:
# preview the first two rows of the sample type table
sample_types_df.head(n=2)

Unnamed: 0,Code,Definition,Short Letter Code
0,1,Primary Solid Tumor,TP
1,2,Recurrent Solid Tumor,TR


### Load and process miRNA data

In [4]:
# the manifest maps each dataset name (the index) to the filename of its
# raw data file, so it has to be read before any of the raw data
manifest_file = os.path.join(cfg.data_dir, 'manifest.tsv')
manifest_df = pd.read_csv(manifest_file, sep='\t', index_col=0)

# show the entries whose name contains 'mirna': one for the sample
# information (mirna_sample) and one for the data itself (mirna)
manifest_df.filter(like='mirna', axis='index')

Unnamed: 0_level_0,id,filename,md5,size
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mirna_sample,55d9bf6f-0712-4315-b588-e6f8e295018e,PanCanAtlas_miRNA_sample_information_list.txt,02bb56712be34bcd58c50d90387aebde,553408
mirna,1c6174d9-8ffb-466e-b5ee-07b204c15cf8,pancanMiRs_EBadjOnProtocolPlatformWithoutRepsW...,c7501dc3c505ca172a6a05b611bd11c3,67167640


In [5]:
# look up the raw miRNA filename in the manifest and read the
# comma-separated matrix, using its first column as the index
raw_mirna_file = os.path.join(cfg.raw_data_dir,
                              manifest_df.loc['mirna', 'filename'])
tcga_mirna_df = pd.read_csv(raw_mirna_file, sep=',', index_col=0)

# report the dimensions, then preview the first rows
print(tcga_mirna_df.shape)
tcga_mirna_df.head()

(743, 10825)


Unnamed: 0_level_0,Correction,TCGA-C4-A0F6-01A-11R-A10V-13,TCGA-CU-A0YO-01A-11R-A10V-13,TCGA-BT-A0S7-01A-11R-A10V-13,TCGA-CU-A0YR-01A-12R-A10V-13,TCGA-BL-A0C8-01A-11R-A10V-13,TCGA-C4-A0F0-01A-12R-A10V-13,TCGA-BL-A13J-01A-11R-A10V-13,TCGA-BT-A0YX-01A-11R-A10V-13,TCGA-CU-A0YN-01A-21R-A10V-13,...,TCGA-AG-A020-01A-21R-A082-13,TCGA-AG-A01Y-01A-41R-A082-13,TCGA-AG-A01W-01A-21R-A082-13,TCGA-AG-3726-01A-02T-0906-13,TCGA-AG-3605-01A-01T-0827-13,TCGA-AG-3584-01A-01T-0822-13,TCGA-AG-3599-01A-02T-0827-13,TCGA-AG-3583-01A-01T-0822-13,TCGA-AG-3598-01A-01T-0827-13,TCGA-AG-3586-01A-02T-0822-13
Genes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsa-let-7a-2-3p,Corrected,0.980776,2.750555,7.085729,0.805977,0.803256,5.508264,4.64188,1.332116,1.964324,...,-0.468859,1.261731,1.452124,1.029965,0.120077,3.262839,2.002213,0.675954,1.948157,1.542847
hsa-let-7a-3p,Corrected,32.785934,62.595962,64.480134,17.73149,32.397995,32.065967,42.1865,23.978095,33.000636,...,58.980125,52.909443,41.337998,20.858121,23.477429,22.251037,29.156045,19.786044,25.893893,25.602462
hsa-let-7a-5p,Corrected,17787.06993,20816.36637,64187.20949,31852.2032,36701.5735,41331.26091,31584.5823,26000.62876,23612.74068,...,22642.14219,29069.86828,26665.74461,21519.34547,27242.93387,23122.91869,17890.75858,16792.88093,23153.80168,16210.1845
hsa-let-7b-3p,Corrected,6.72532,11.835721,41.238943,14.104594,18.207138,19.869096,19.113625,9.70542,28.286259,...,21.922332,8.44328,15.903616,25.569781,10.97813,20.626534,28.611334,22.441202,41.672361,14.560834
hsa-let-7b-5p,Corrected,6875.798894,6014.963519,34652.0489,7643.480998,5972.74445,16684.33574,9602.821608,7287.057385,7610.182346,...,3253.760442,3510.517285,6051.502783,4360.331995,5342.375842,4760.875509,5660.590915,7572.840852,6127.299751,6729.794141


### Process miRNA expression matrix

This involves processing sample IDs, sorting and subsetting.

In [6]:
# clean up the raw matrix and reorient it to samples x miRNAs:
#   - drop the 'Correction' label column so only sample columns remain
#   - remove miRNAs (rows) with any NA value
#   - average rows that share the same miRNA name
#   - transpose so samples are rows, then sort both axes
#   - label the row index as 'sample_id'
# NOTE: this cell overwrites tcga_mirna_df, so re-running it without
# re-running the loading cell fails ('Correction' is already gone)
tcga_mirna_df = (
    tcga_mirna_df
    .drop(columns=['Correction'])
    .dropna(axis='rows')
    .groupby(level=0).mean()
    .T
    .sort_index(axis='rows')
    .sort_index(axis='columns')
    .rename_axis('sample_id', axis='index')
)

In [7]:
# keep only the first 15 characters of each sample ID (e.g.
# 'TCGA-04-1331-01'), so that the IDs map to the clinical information
sample_ids = tcga_mirna_df.index.str[:15]
tcga_mirna_df.index = sample_ids

# several measurements of the same tumor now share an ID; keep only the
# first occurrence of each
is_repeat = tcga_mirna_df.index.duplicated(keep='first')
tcga_mirna_df = tcga_mirna_df.loc[~is_repeat, :]

In [8]:
# dimensions (samples x miRNAs) and first rows of the processed matrix
print(tcga_mirna_df.shape)
tcga_mirna_df.head(n=5)

(10818, 743)


Genes,hsa-let-7a-2-3p,hsa-let-7a-3p,hsa-let-7a-5p,hsa-let-7b-3p,hsa-let-7b-5p,hsa-let-7c-3p,hsa-let-7c-5p,hsa-let-7d-3p,hsa-let-7d-5p,hsa-let-7e-3p,...,hsa-miR-944,hsa-miR-95-3p,hsa-miR-96-3p,hsa-miR-96-5p,hsa-miR-98-3p,hsa-miR-98-5p,hsa-miR-99a-3p,hsa-miR-99a-5p,hsa-miR-99b-3p,hsa-miR-99b-5p
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-04-1331-01,4.12319,1.649276,129176.7914,212.481721,129236.7151,8.521259,34674.65283,9271.404853,931.566043,140.463337,...,0.0,2.199035,0.0,4.672949,0.0,7.146863,9.620776,899.130282,395.826232,177848.0247
TCGA-04-1336-01,0.0,1.598108,17577.58814,314.827245,89649.05552,12.784863,7734.841947,20466.96711,281.26698,147.025921,...,0.0,0.0,0.0,1.598108,0.0,9.588647,4.794324,340.39697,1022.789018,495375.0759
TCGA-04-1337-01,22.23493,2.779366,31921.02153,159.118719,48513.14328,17.371039,6931.739459,9333.806752,413.430732,203.588579,...,0.0,2.779366,0.0,0.694842,0.0,4.863891,9.03294,453.036701,275.15726,537774.0195
TCGA-04-1341-01,44.086569,2.844295,23114.87253,140.081517,30416.88829,7.110737,2886.959193,9498.522389,890.975337,204.07815,...,0.0,0.711074,0.0,3.555368,0.0,9.955032,0.711074,137.948296,790.002873,429092.4424
TCGA-04-1342-01,9.409329,0.0,25658.56904,102.830528,38008.98591,14.786089,7225.020751,9181.489161,880.444389,212.382005,...,0.0,0.672095,0.0,2.68838,0.0,6.048855,3.360475,360.242895,604.213363,461403.2671


### Process TCGA cancer type and sample type info from barcodes

Cancer-type includes `OV`, `BRCA`, `LUSC`, `LUAD`, etc. while sample-type includes `Primary`, `Metastatic`, `Solid Tissue Normal`, etc.

See https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/tissue-source-site-codes for more details.

The goal is to use this info to stratify train and test sets by cancer type and sample type. 

In [9]:
# derive per-sample information from the sample IDs in the processed
# matrix, using the sample type / cancer type code lookups loaded above
# NOTE(review): per the original comment this call also saves the result
# to a file; the output location is decided inside
# tu.get_and_save_sample_info -- confirm there
tcga_id = tu.get_and_save_sample_info(
    tcga_mirna_df,
    sampletype_codes_dict,
    cancertype_codes_dict,
    training_data='mirna',
)

# dimensions and first rows of the sample information table
print(tcga_id.shape)
tcga_id.head(n=5)

(10818, 4)


Unnamed: 0,sample_id,sample_type,cancer_type,id_for_stratification
0,TCGA-04-1331-01,Primary Solid Tumor,OV,OVPrimary Solid Tumor
1,TCGA-04-1336-01,Primary Solid Tumor,OV,OVPrimary Solid Tumor
2,TCGA-04-1337-01,Primary Solid Tumor,OV,OVPrimary Solid Tumor
3,TCGA-04-1341-01,Primary Solid Tumor,OV,OVPrimary Solid Tumor
4,TCGA-04-1342-01,Primary Solid Tumor,OV,OVPrimary Solid Tumor


In [10]:
# get cancer type counts and save to file
#
# The output columns are named explicitly via rename_axis/reset_index(name=...)
# rather than by renaming whatever reset_index() happens to produce: the
# default names of a value_counts() result differ between pandas versions
# ('index'/'cancer_type' before 2.0, 'cancer_type'/'count' from 2.0 on), and
# renaming by the old names silently mislabels the columns on newer pandas.
cancertype_count_df = (
    tcga_id.cancer_type
    .value_counts()                   # samples per cancer type, most frequent first
    .rename_axis('cancertype')        # name the cancer type labels
    .reset_index(name='n =')          # counts become the 'n =' column
)

# tab-separated, without the positional row index
file = os.path.join(cfg.sample_info_dir, 'tcga_mirna_sample_counts.tsv')
cancertype_count_df.to_csv(file, sep='\t', index=False)

cancertype_count_df.head()

Unnamed: 0,cancertype,n =
0,BRCA,1165
1,KIRC,570
2,THCA,569
3,HNSC,565
4,UCEC,556


In [11]:
# write the processed samples x miRNAs matrix as a tab-separated file
# NOTE: float_format='%.3g' keeps only 3 significant digits per value, so
# the saved file is a lossy copy of the in-memory matrix
mirna_file = os.path.join(cfg.data_dir, 'tcga_mirna_matrix_processed.tsv')
tcga_mirna_df.to_csv(path_or_buf=mirna_file, sep='\t', float_format='%.3g')