# Convert downloaded TCGA datasets into sample × gene matrices

In [1]:
import os

import pandas

## Read sample information

This file contains sample information. See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/PANCAN_clinicalMatrix&host=https://tcga.xenahubs.net) for `PANCAN_clinicalMatrix`.

In [2]:
# Load the clinical matrix and normalize the sample identifier column name
path = os.path.join('download', 'PANCAN_clinicalMatrix.tsv.bz2')
clinmat_df = pandas.read_table(path).rename(columns={'sampleID': 'sample_id'})

# Each sample_id must identify exactly one row
assert clinmat_df['sample_id'].is_unique
clinmat_df.shape

(12811, 40)

In [3]:
# Number of samples of each sample type
clinmat_df['sample_type'].value_counts()

Primary Tumor                                      10593
Solid Tissue Normal                                 1475
Metastatic                                           396
Primary Blood Derived Cancer - Peripheral Blood      200
Recurrent Tumor                                       56
Additional - New Primary                              11
Additional Metastatic                                  1
Name: sample_type, dtype: int64

## Read mutation data

This file contains mutation data (which mutations each sample contains). See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/PANCAN_mutation&host=https://tcga.xenahubs.net) for `PANCAN_mutation`. Note that duplicate mutation rows, which [occur](https://groups.google.com/d/msg/ucsc-cancer-genomics-browser/eg6nJOFSefw/Z0BM6pU9BAAJ "Message on the Xena Browser Google Group") for samples that were sequenced multiple times, are filtered.

In [4]:
# Load the per-mutation table
path = os.path.join('download', 'PANCAN_mutation.tsv.bz2')
snp_mutation_df = pandas.read_table(path)

# Normalize the sample identifier column name and discard fully duplicated rows
snp_mutation_df = (
    snp_mutation_df
    .rename(columns={'sample': 'sample_id'})
    .drop_duplicates()
)
snp_mutation_df.head(2)

Unnamed: 0,sample_id,chr,start,end,reference,alt,gene,effect,DNA_VAF,RNA_VAF,Amino_Acid_Change
0,TCGA-D8-A1J8-01,chr10,52587953,52587953,C,A,A1CF,Missense_Mutation,,,p.R236I
1,TCGA-BH-A0HP-01,chr10,52595854,52595854,G,A,A1CF,Missense_Mutation,,,p.A195V


In [5]:
# Count the distinct samples that appear in the mutation table
snp_mutation_df['sample_id'].nunique()

8510

In [6]:
# Number of mutation rows for each effect type
snp_mutation_df['effect'].value_counts().reset_index()

Unnamed: 0,index,effect
0,Missense_Mutation,1132319
1,Silent,474679
2,Nonsense_Mutation,87104
3,RNA,75134
4,Frame_Shift_Del,46991
5,Splice_Site,46477
6,Frame_Shift_Ins,21657
7,In_Frame_Del,10663
8,Translation_Start_Site,3437
9,In_Frame_Ins,2685


### Convert SNP mutations to gene mutations

The next cell specifies which mutations to preserve as gene-affecting, which were chosen according to the red & blue [mutation effects in Xena](http://xena.ucsc.edu/how-we-characterize-mutations/).

In [7]:
# Mutation effects that are kept as gene-affecting
mutations = {
    # Frame-shift and in-frame insertions / deletions
    'Frame_Shift_Del', 'Frame_Shift_Ins',
    'In_Frame_Del', 'In_Frame_Ins',
    # Missense, nonsense, and nonstop mutations
    'Missense_Mutation', 'Nonsense_Mutation', 'Nonstop_Mutation',
    # RNA, splice-site, and translation-start-site effects
    'RNA', 'Splice_Site', 'Translation_Start_Site',
}

In [8]:
# Mutation effects that were observed but not included
set(snp_mutation_df['effect'].unique()).difference(mutations)

{"3'UTR", "5'Flank", "5'UTR", 'IGR', 'Intron', 'Silent'}

In [9]:
# Count the retained (gene-affecting) mutations for every sample-gene pair.
# `size()` counts the rows of each group directly and names the result column
# in `reset_index`, replacing the per-group `apply(len)` call and the rename
# of the anonymous `0` column.
gene_mutation_df = (snp_mutation_df
    .query("effect in @mutations")
    .groupby(['sample_id', 'gene'])
    .size()
    .reset_index(name='count')
)
gene_mutation_df.head(2)

Unnamed: 0,sample_id,gene,count
0,TCGA-02-0003-01,AKAP6,1
1,TCGA-02-0003-01,ANAPC4,1


In [10]:
# Pivot the per-pair counts into a sample (rows) by gene (columns) table,
# filling sample-gene pairs that have no entry with 0
gene_mutation_mat_df = gene_mutation_df.pivot_table(
    index='sample_id',
    columns='gene',
    values='count',
    fill_value=0,
)
# Collapse the counts into a 0/1 mutation status
gene_mutation_mat_df = gene_mutation_mat_df.astype(bool).astype(int)
gene_mutation_mat_df.shape

(8508, 30236)

In [11]:
# Fraction of sample-gene pairs that are mutated. The mean is taken over the
# underlying array, which avoids materializing the long stacked Series (one
# entry per matrix cell) just to average it.
'{:.2%} sample-gene pairs are mutated'.format(
    gene_mutation_mat_df.values.mean())

'0.50% sample-gene pairs are mutated'

In [12]:
# Five genes that are mutated in the most samples
gene_mutation_df['gene'].value_counts().head(5).reset_index()

Unnamed: 0,index,gene
0,TP53,2992
1,TTN,2465
2,MUC16,1518
3,PIK3CA,1024
4,CSMD3,989


In [13]:
# Five samples with the most mutated genes
gene_mutation_df['sample_id'].value_counts().head(5).reset_index()

Unnamed: 0,index,sample_id
0,TCGA-IB-7651-01,8369
1,TCGA-FW-A3R5-06,7772
2,TCGA-AP-A0LM-01,7227
3,TCGA-AP-A059-01,6660
4,TCGA-B5-A0JY-01,6338


## Read gene expression data

This file contains gene expression data from RNA-Sequencing. See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/HiSeqV2&host=https://tcga.xenahubs.net) for `HiSeqV2`.

In [14]:
# Read the gene × sample dataset
path = os.path.join('download', 'HiSeqV2.tsv.bz2')
expr_df = pandas.read_table(path, index_col=0)

# Process the dataset in a single chain, so the frame is never mutated in
# place after it has been built
expr_df = (expr_df
    # Remove genes containing a `?`
    [~expr_df.index.str.contains('?', regex=False)]
    # Transpose so the data is sample × gene
    .transpose()
    # Sort rows and columns
    .sort_index(axis='rows')
    .sort_index(axis='columns')
    # Name the row index; the column axis name is left unchanged
    .rename_axis('sample_id', axis='rows')
)

expr_df.shape

(10459, 20501)

In [15]:
# Peek at the top-left corner of the data matrix
expr_df.iloc[0:5, 0:5]

Sample,A1BG,A1CF,A2BP1,A2LD1,A2M
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,6.9774,0.0,7.9403,5.8092,15.0538
TCGA-02-0055-01,8.6177,0.0,7.1122,6.4096,15.3879
TCGA-02-2483-01,8.092,0.0,6.8077,5.1513,14.3622
TCGA-02-2485-01,6.4084,0.0,8.012,6.9919,12.9292
TCGA-02-2486-01,6.7716,0.0,2.3973,7.5814,15.3224


## Integrate expression and mutation data

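Find samples with both mutation and expression data. We assume that if a sample was not in `PANCAN_mutation`, it was not assayed for mutation. Hence, zero-mutation cancers are excluded even if they have mutation data. Samples whose only mutations were filtered out as not gene-affecting are excluded as well, because they do not appear in the mutation matrix.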

In [16]:
# Samples present in both the mutation and the expression matrices.
# `Index.intersection` is the explicit set operation; the `&` operator on
# Index objects is deprecated as a set intersection in newer pandas releases.
sample_ids = list(gene_mutation_mat_df.index.intersection(expr_df.index))
len(sample_ids)

7706

In [17]:
# Restrict the expression (x) and mutation (y) matrices to the shared samples,
# with rows in the order given by sample_ids
x_df = expr_df.loc[sample_ids]
y_df = gene_mutation_mat_df.loc[sample_ids]

### Export matrices to TSVs

Matrices are saved as sample × gene TSVs. Subsetted matrices are also exported to allow users to quickly explore small portions of the dataset.

In [18]:
def sample_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):
    """
    Randomly subset a dataframe, preserving row and column order.

    Rows and columns are drawn without replacement by integer position and
    returned in the order in which they occur in `df`. For a dataframe whose
    (unique) row and column labels are already sorted, this is the same as
    sampling and then sorting by label.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to subset.
    nrows : int or None
        Number of rows to keep. `None` keeps all rows.
    ncols : int or None
        Number of columns to keep. `None` keeps all columns.
    row_seed : int
        Passed as `random_state` when drawing rows.
    col_seed : int
        Passed as `random_state` when drawing columns.

    Returns
    -------
    pandas.DataFrame
        A dataframe with `nrows` rows and `ncols` columns taken from `df`.
    """
    if nrows is None:
        nrows = len(df)
    if ncols is None:
        ncols = len(df.columns)
    # Draw integer positions instead of labels. Sorting positions keeps the
    # original ordering of `df`, whereas sorting labels would reorder any
    # dataframe that is not already label-sorted.
    row_positions = sorted(
        pandas.Series(range(len(df)))
        .sample(n=nrows, random_state=row_seed)
    )
    col_positions = sorted(
        pandas.Series(range(len(df.columns)))
        .sample(n=ncols, random_state=col_seed)
    )
    return df.iloc[row_positions, col_positions]

In [19]:
# Shared TSV options: tab-delimited, floats written with 3 significant digits
tsv_args = {'sep': '\t', 'float_format': '%.3g'}

# Make sure the output directories exist, so the export also works on a fresh
# checkout. Creating `data/subset` creates `data` as well.
subset_dir = os.path.join('data', 'subset')
os.makedirs(subset_dir, exist_ok=True)

for df, name in (x_df, 'expression-matrix'), (y_df, 'mutation-matrix'):

    # Save full dataset
    path = os.path.join('data', name + '.tsv.bz2')
    df.to_csv(path, **tsv_args, compression='bz2')

    # Save subsetted datasets (uncompressed); a size of None keeps that whole axis
    for sample, nrows, ncols in ('small', 50, 15), ('all-samples', None, 15), ('all-genes', 50, None):
        path = os.path.join(subset_dir, '{}-{}.tsv'.format(name, sample))
        sample_df(df, nrows=nrows, ncols=ncols).to_csv(path, **tsv_args)