## Use pre-trained models to make predictions on normal tissue samples

For some cancer types, TCGA provides samples from normal tissue in addition to the tumor samples (see `01_explore_data/normal_tissue_samples.ipynb`).

In this analysis, we want to make predictions on those samples and compare them to our tumor sample predictions.

Our assumption is that our models will predict that the normal tissue samples have a low probability of mutation (since they almost certainly do not have somatic mutations in any of the genes of interest).

In [1]:
from pathlib import Path
import pickle as pkl

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mpmp.config as cfg
import mpmp.utilities.analysis_utilities as au
import mpmp.utilities.data_utilities as du
import mpmp.utilities.plot_utilities as plu
import mpmp.utilities.tcga_utilities as tu

%load_ext autoreload
%autoreload 2

In [2]:
# directory that holds one sub-directory of results per gene
results_dir = (
    Path(cfg.results_dirs['final']) / 'pilot_genes' / 'gene'
).resolve()

# every entry that is not a regular file is taken to be a gene directory
genes = [
    entry.stem
    for entry in results_dir.iterdir()
    if not entry.is_file()
]
print(genes)

['TP53', 'EGFR', 'IDH1', 'PIK3CA', 'SETD2', 'KRAS']


### Load pre-trained model

In [3]:
# which pre-trained classifier to load: target gene and training data type
gene = 'TP53'
training_data = 'expression'

model_filename = '{}_{}_elasticnet_classify_s42_model.pkl'.format(gene, training_data)
model_path = results_dir / gene / model_filename

# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted results directory
with model_path.open('rb') as model_file:
    model_fit = pkl.load(model_file)

# show the fitted estimator and how many input features it expects
print(model_fit)
print(model_fit.feature_names_in_.shape)

SGDClassifier(alpha=0.1, class_weight='balanced', l1_ratio=0.1, loss='log',
              penalty='elasticnet', random_state=42)
(15389,)


### Load expression data and sample info

In [4]:
# sample info for the training data type; the sample_type column
# carries the tumor/normal label
sample_info_df = du.load_sample_info(training_data)
print(sample_info_df['sample_type'].unique())
sample_info_df.head()

['Primary Solid Tumor' 'Recurrent Solid Tumor' 'Solid Tissue Normal'
 'Additional - New Primary' 'Metastatic'
 'Primary Blood Derived Cancer - Peripheral Blood' 'Additional Metastatic']


Unnamed: 0_level_0,sample_type,cancer_type,id_for_stratification
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-02-0047-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-0055-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-2483-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-2485-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-2486-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor


In [5]:
# load the raw data for the same data type the model was trained on.
# Using `training_data` (instead of a hard-coded 'expression') keeps this
# cell consistent with the model file and the sample info loaded above,
# so changing `training_data` cannot silently mix data types.
data_df = du.load_raw_data(training_data, verbose=True)
print(data_df.shape)
data_df.iloc[:5, :5]

Loading expression data...


(11060, 15369)


Unnamed: 0_level_0,1,100,1000,10000,10001
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,125.0,136.0,2300.0,1300.0,272.0
TCGA-02-0055-01,392.0,222.0,1820.0,903.0,321.0
TCGA-02-2483-01,272.0,256.0,2890.0,1320.0,458.0
TCGA-02-2485-01,83.9,129.0,6970.0,10100.0,419.0
TCGA-02-2486-01,108.0,205.0,2250.0,873.0,441.0


In [6]:
# load mutation data; the result is unpacked by position into five tables
pancancer_data = du.load_pancancer_data()
sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df, mut_burden_df = (
    pancancer_data
)

### Subset expression data to get train and control samples

We want to compare predictions made using the trained model on the control samples to predictions on the (tumor-derived) data used to train the model, so we'll load both expression datasets here.

In [7]:
# cancer types that were used to train the model: the model's input feature
# names that are also cancer type labels in the sample info, kept in the
# order in which they appear among the model's features
valid_cancer_types = sample_info_df['cancer_type'].unique()
is_cancer_type_feature = np.isin(model_fit.feature_names_in_, valid_cancer_types)
train_cancer_types = model_fit.feature_names_in_[is_cancer_type_feature].tolist()
print(train_cancer_types)

['BLCA', 'BRCA', 'CESC', 'COAD', 'ESCA', 'HNSC', 'KICH', 'LGG', 'LIHC', 'LUAD', 'LUSC', 'PAAD', 'PRAD', 'READ', 'SARC', 'SKCM', 'STAD', 'UCEC', 'UCS']


In [8]:
# samples used to train the model: samples from one of the training cancer
# types that have both a row in the data matrix and a mutation burden value
in_train_cancer_type = sample_info_df['cancer_type'].isin(train_cancer_types)
train_samples = (
    sample_info_df.loc[in_train_cancer_type]
      .index
      .intersection(data_df.index)
      .intersection(mut_burden_df.index)
)

train_data_df = data_df.loc[train_samples]
print(train_data_df.shape)
train_data_df.iloc[:5, :5]

(6821, 15369)


Unnamed: 0,1,100,1000,10000,10001
TCGA-05-4244-01,26.0,150.0,4.19,948.0,412.0
TCGA-05-4249-01,120.0,73.1,7.73,174.0,349.0
TCGA-05-4250-01,50.9,266.0,79.9,760.0,651.0
TCGA-05-4382-01,146.0,288.0,82.6,477.0,475.0
TCGA-05-4384-01,127.0,75.1,11.0,305.0,411.0


In [9]:
# normal samples that we have expression data for: sample_type mentions
# 'Normal' and the sample id is present in the data matrix
is_normal_sample = sample_info_df['sample_type'].str.contains('Normal')
normal_ids = (
    sample_info_df.loc[is_normal_sample]
      .index
      .intersection(data_df.index)
)
print(len(normal_ids))
print(normal_ids[:5])

737
Index(['TCGA-06-0675-11', 'TCGA-06-0678-11', 'TCGA-06-0680-11',
       'TCGA-06-0681-11', 'TCGA-06-AABW-11'],
      dtype='object', name='sample_id')


In [10]:
# rows of the data matrix that belong to the normal samples
normal_data_df = data_df.loc[normal_ids]
print(normal_data_df.shape)
normal_data_df.iloc[:5, :5]

(737, 15369)


Unnamed: 0_level_0,1,100,1000,10000,10001
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-06-0675-11,88.5,43.6,2090.0,3250.0,312.0
TCGA-06-0678-11,58.5,101.0,2060.0,2490.0,317.0
TCGA-06-0680-11,80.8,53.7,2170.0,3090.0,291.0
TCGA-06-0681-11,141.0,148.0,1910.0,2210.0,311.0
TCGA-06-AABW-11,258.0,208.0,1560.0,786.0,251.0


### Preprocessing for normal samples

This is a bit nuanced since we don't have mutation calling information for the normal samples, so we can't generate a log(mutation burden) covariate.

For now we'll just take the mean log10 mutation burden across all tumor samples with mutation data and assign that same value to every normal sample.

In [11]:
# quick look at the mutation burden table (one log10_mut value per sample)
print(mut_burden_df.shape)
mut_burden_df.head(n=5)

(9074, 1)


Unnamed: 0_level_0,log10_mut
SAMPLE_BARCODE,Unnamed: 1_level_1
TCGA-02-0047-01,1.812913
TCGA-02-0055-01,1.70757
TCGA-02-2483-01,1.662758
TCGA-02-2485-01,1.748188
TCGA-02-2486-01,1.755875


In [12]:
# covariates for the train samples: log10 mutation burden joined (on the
# sample id index) with sample type and cancer type; cancer_type is renamed
# to DISEASE and the stratification id is dropped
train_info_df = (
    mut_burden_df.loc[train_samples]
      .merge(sample_info_df, left_index=True, right_index=True)
      .drop(columns=['id_for_stratification'])
      .rename(columns={'cancer_type': 'DISEASE'})
)
print(train_info_df.shape)
train_info_df.head()

(6821, 3)


Unnamed: 0,log10_mut,sample_type,DISEASE
TCGA-05-4244-01,2.285557,Primary Solid Tumor,LUAD
TCGA-05-4249-01,2.488551,Primary Solid Tumor,LUAD
TCGA-05-4250-01,2.502427,Primary Solid Tumor,LUAD
TCGA-05-4382-01,3.193125,Primary Solid Tumor,LUAD
TCGA-05-4384-01,2.089905,Primary Solid Tumor,LUAD


In [13]:
# mean log10 mutation burden over every sample in mut_burden_df; this is the
# placeholder value assigned to the normal samples below.
# .mean() divides by the number of non-missing values, whereas
# sum() / shape[0] would be biased low if any value were missing
# (sum skips NaN but the row count does not).
mean_mutation_burden = mut_burden_df.mean()
print(mean_mutation_burden)

log10_mut    1.834867
dtype: float64


In [14]:
# covariates for the normal samples: every sample gets the same placeholder
# mutation burden, then sample type and cancer type are joined on from the
# sample info (cancer_type renamed to DISEASE, stratification id dropped)
placeholder_mut_burden = mean_mutation_burden.values[0]
normal_info_df = (
    pd.DataFrame({'log10_mut': placeholder_mut_burden}, index=normal_ids)
      .merge(sample_info_df, left_index=True, right_index=True)
      .drop(columns=['id_for_stratification'])
      .rename(columns={'cancer_type': 'DISEASE'})
)
print(normal_info_df.shape)
normal_info_df.head()

(737, 3)


Unnamed: 0_level_0,log10_mut,sample_type,DISEASE
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-06-0675-11,1.834867,Solid Tissue Normal,GBM
TCGA-06-0678-11,1.834867,Solid Tissue Normal,GBM
TCGA-06-0680-11,1.834867,Solid Tissue Normal,GBM
TCGA-06-0681-11,1.834867,Solid Tissue Normal,GBM
TCGA-06-AABW-11,1.834867,Solid Tissue Normal,GBM


In [15]:
def add_dummies_from_model(data_df, info_df, model):
    """Build the feature matrix a pre-trained model expects for some samples.

    The returned columns are laid out as
    ``[columns of data_df, log10 mutation burden, cancer type indicators]``
    and are labelled with ``model.feature_names_in_``. The cancer type
    indicator names are read from the model itself: they are the feature
    names that follow the ``data_df.shape[1]`` data columns and the single
    mutation burden column. This keeps the function independent of any
    notebook-level variable and guarantees the column count matches the model.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Samples x features matrix. Its columns are assumed to already be in
        the same order as the leading entries of ``model.feature_names_in_``;
        this is not checked, and no scaling is applied here.
    info_df : pandas.DataFrame
        Per-sample covariates with at least the columns ``log10_mut``
        (mutation burden) and ``DISEASE`` (cancer type label). If its index
        differs from ``data_df.index`` but contains every label of it exactly
        once, rows are re-ordered to match ``data_df``. If the labels cannot
        be matched, rows are paired by position (as long as the row counts
        agree).
    model : object
        Fitted estimator exposing ``feature_names_in_``.

    Returns
    -------
    pandas.DataFrame
        Feature matrix indexed like ``data_df`` with one column per entry of
        ``model.feature_names_in_``. A sample whose ``DISEASE`` is not one of
        the model's cancer types (e.g. a cancer type absent from the training
        set) gets an all-zeros indicator row.

    Raises
    ------
    ValueError
        If the model has too few features for ``data_df`` plus the mutation
        burden column, or if ``info_df`` cannot be paired with ``data_df``.
    """
    feature_names = list(model.feature_names_in_)
    n_samples, n_data_features = data_df.shape

    # everything after the data columns and the mutation burden column is
    # treated as a cancer type indicator, in the model's own order
    n_cancer_types = len(feature_names) - n_data_features - 1
    if n_cancer_types < 0:
        raise ValueError(
            'model expects {} features but data_df alone has {} columns '
            '(plus one mutation burden column)'.format(
                len(feature_names), n_data_features)
        )
    cancer_types = feature_names[n_data_features + 1:]

    # make sure each covariate row describes the same sample as the
    # corresponding data row before stacking them side by side
    if not info_df.index.equals(data_df.index):
        can_align = (
            info_df.index.is_unique
            and data_df.index.isin(info_df.index).all()
        )
        if can_align:
            info_df = info_df.loc[data_df.index]
        elif info_df.shape[0] != n_samples:
            raise ValueError(
                'info_df has {} rows but data_df has {} rows, and their '
                'indexes cannot be aligned'.format(info_df.shape[0], n_samples)
            )

    # one-hot encode cancer type against the model's cancer types; labels the
    # model does not know simply match nothing and stay all-zeros
    disease = info_df['DISEASE'].to_numpy(dtype=object)
    known_types = np.array(cancer_types, dtype=object)
    cov_matrix = (
        disease[:, np.newaxis] == known_types[np.newaxis, :]
    ).astype(float)

    mut_burden = info_df['log10_mut'].to_numpy().reshape(-1, 1)
    feature_matrix = np.concatenate(
        (data_df.to_numpy(), mut_burden, cov_matrix),
        axis=1
    )
    return pd.DataFrame(
        feature_matrix,
        index=data_df.index.copy(),
        columns=feature_names
    )

In [16]:
# feature matrix for the train samples, laid out the way the model expects
X_train_df = add_dummies_from_model(
    train_data_df, train_info_df, model_fit
)

In [17]:
# feature matrix for the normal samples, laid out the way the model expects
X_normal_df = add_dummies_from_model(
    normal_data_df, normal_info_df, model_fit
)
print(X_normal_df.shape)
# the last 20 columns are log10_mut followed by the cancer type indicators
X_normal_df.iloc[:5, -20:]

(737, 15389)


Unnamed: 0_level_0,log10_mut,BLCA,BRCA,CESC,COAD,ESCA,HNSC,KICH,LGG,LIHC,LUAD,LUSC,PAAD,PRAD,READ,SARC,SKCM,STAD,UCEC,UCS
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
TCGA-06-0675-11,1.834867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-06-0678-11,1.834867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-06-0680-11,1.834867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-06-0681-11,1.834867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TCGA-06-AABW-11,1.834867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
