## Single-cancer holdout dimension reduction analysis

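This notebook loads the TCGA gene expression and mutation data and, for a given gene and cancer type, holds out a percentage of the samples and generates a two-dimensional PCA representation of the resulting train and test sets.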

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pancancer_evaluation.config as cfg
import pancancer_evaluation.utilities.analysis_utilities as au

In [2]:
# build the TCGA data model; constructing it loads the gene expression
# and mutation data used by the rest of the notebook
from pancancer_evaluation.data_models.tcga_data_model import TCGADataModel

tcga_data = TCGADataModel(
    seed=cfg.default_seed,
    subset_mad_genes=cfg.num_features_raw,
    verbose=True,
    debug=False,
)

Loading gene expression data...
Loading sample info...
Loading pan-cancer data from cached pickle file...


In [6]:
# for a given gene and holdout percentage, generate reduced dimensional
# representation of train/test data
from sklearn.decomposition import PCA

from pancancer_evaluation.utilities import data_utilities as du
from pancancer_evaluation.utilities import tcga_utilities as tu

# shared 2-component PCA model; it is refit on the training data every time
# generate_pca_holdout is called
pca = PCA(n_components=2)

def generate_pca_holdout(gene, cancer_type, percent_holdout):
    """Project train and test samples for a gene/cancer type onto 2 PCs.

    The module-level `pca` model is fit on the training samples only, and
    the test samples are then projected onto those same components, so the
    train and test coordinates share one basis and can be compared directly.

    Arguments
    ---------
    gene: gene name (e.g. 'TP53'), used to look up the classification and
        to build the '{gene}_{cancer_type}' identifier
    cancer_type: cancer type name (e.g. 'GBM'), second half of the identifier
    percent_holdout: forwarded unchanged to
        tcga_data.process_data_for_identifiers
        (presumably the fraction of samples to hold out - TODO confirm)

    Returns
    -------
    (train_pca, test_pca, train_labels, test_labels): the PCA-transformed
    train and test data, followed by the `status` values of
    tcga_data.y_train_df and tcga_data.y_test_df

    Note: calling this function updates the train/test attributes of the
    module-level `tcga_data` and refits the module-level `pca`.
    """
    identifier = '{}_{}'.format(gene, cancer_type)
    classification = du.get_classification(gene)
    # the same identifier and classification are passed in both positions
    # (presumably the train and test slots - confirm against TCGADataModel)
    tcga_data.process_data_for_identifiers(identifier,
                                           identifier,
                                           classification,
                                           classification,
                                           output_dir=None,
                                           shuffle_labels=False,
                                           percent_holdout=percent_holdout,
                                           holdout_class='both')
    # this step standardizes columns and filters features by MAD
    X_train, X_test = tu.preprocess_data(tcga_data.X_train_raw_df,
                                         tcga_data.X_test_raw_df,
                                         tcga_data.gene_features,
                                         tcga_data.subset_mad_genes)
    # now fit the PCA on the training data only, and reuse the fitted
    # components to project the test data; refitting on the test data would
    # place the two sets in different, non-comparable coordinate systems
    train_pca = pca.fit_transform(X_train)
    test_pca = pca.transform(X_test)
    train_labels = tcga_data.y_train_df.status.values
    test_labels = tcga_data.y_test_df.status.values
    return (train_pca, test_pca, train_labels, test_labels)

# sanity check on one gene/cancer type: keep only the training-set PCA
# coordinates. Index into the returned tuple rather than unpacking into
# _, __, ___, since assigning to those names overwrites IPython's
# output-history variables in the notebook namespace.
X = generate_pca_holdout('TP53', 'GBM', 0.5)[0]
print(X.shape)
print(X[:5, :])

(62, 2)
[[ -62401.9781104  -124221.7825539 ]
 [-204293.8407749  -142207.64138238]
 [-378674.67630148   95338.37474755]
 [-192584.37691558  196852.44023123]
 [-306156.92880978 -153121.62017862]]


(254, 16149)
(252, 16149)
(254, 5)
(252, 5)
