# Encoded representation layer recapitulates signal identified in raw data

## Visualized with t-SNE

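Perform t-SNE on the Tybalt (VAE) encoded features to visualize whether the latent layer recapitulates the relationships observed in a t-SNE of the raw data. The same embedding is also computed for ADAGE encoded features and for the zero-one scaled RNAseq data for comparison.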

In [1]:
import os
import pandas as pd
from sklearn import manifold

In [2]:
# Read the per-sample VAE feature activations from a tab-separated file,
# using the first column as the row index
encoded_file = os.path.join('data', 'encoded_rnaseq_onehidden_warmup_batchnorm.tsv')
encoded_df = pd.read_csv(encoded_file, sep='\t', index_col=0)
encoded_df.head(2)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
TCGA-02-0047-01,0.0,7.870012,0.0,0.0,0.071894,1.981403,2.580404,4.991513,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.731716,0.0,0.0,0.0,0.0
TCGA-02-0055-01,0.397321,4.752499,0.0,0.0,0.0,0.0,3.468984,0.235408,1.999857,0.172326,...,0.0,1.645057,0.289536,0.427183,0.0,3.674683,0.0,0.0,0.0,1.433402


In [3]:
# Read the per-sample ADAGE feature activations from a tab-separated file,
# using the first column as the row index
adage_file = os.path.join('data', 'encoded_adage_features.tsv')
adage_df = pd.read_csv(adage_file, sep='\t', index_col=0)
adage_df.head(2)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,91,92,93,94,95,96,97,98,99,100
TCGA-02-0047-01,5.180547,4.8496,5.285702,3.600258,-0.87799,6.722819,1.711105,-1.330625,4.856046,7.892483,...,-0.816303,4.850134,5.025827,1.708705,6.899716,1.135625,-1.314607,13.287391,-0.881876,5.524631
TCGA-02-0055-01,5.829771,4.013995,6.473931,5.275906,1.432594,7.523101,0.682273,-1.099351,4.280545,7.881772,...,-0.702914,3.743074,3.447191,2.598773,4.800836,1.523167,-1.060419,11.968166,-0.588578,5.968176


In [4]:
# Read the zero-one transformed (min-max scaled) RNAseq matrix from a
# tab-separated file, using the first column as the row index
rnaseq_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv.gz')
rnaseq_df = pd.read_csv(rnaseq_file, sep='\t', index_col=0)

# Report the matrix dimensions before previewing the first rows
print(rnaseq_df.shape)
rnaseq_df.head(2)

(10459, 5000)


Unnamed: 0,RPS4Y1,XIST,KRT5,AGR2,CEACAM5,KRT6A,KRT14,CEACAM6,DDX3Y,KDM5D,...,FAM129A,C8orf48,CDK5R1,FAM81A,C13orf18,GDPD3,SMAGP,C2orf85,POU5F1B,CHST2
TCGA-02-0047-01,0.678296,0.28991,0.03423,0.0,0.0,0.084731,0.031863,0.037709,0.746797,0.687833,...,0.44061,0.428782,0.732819,0.63434,0.580662,0.294313,0.458134,0.478219,0.168263,0.638497
TCGA-02-0055-01,0.200633,0.654917,0.181993,0.0,0.0,0.100606,0.050011,0.092586,0.103725,0.140642,...,0.620658,0.363207,0.592269,0.602755,0.610192,0.374569,0.72242,0.271356,0.160465,0.60256


In [5]:
# Perform t-SNE on VAE encoded_features, reducing them to two dimensions.
# random_state is pinned so the embedding is intended to be reproducible.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` - confirm against the installed version before upgrading.
tsne = manifold.TSNE(n_components=2, init='pca', random_state=0, perplexity=20,
                     learning_rate=300, n_iter=400)

# Label the two t-SNE axes '1' and '2' and key the rows by sample id.
# Index.rename returns a renamed copy, so naming the index 'tcga_id' here
# does not touch the index object held by encoded_df.
tsne_out = pd.DataFrame(
    tsne.fit_transform(encoded_df),
    columns=['1', '2'],
    index=encoded_df.index.rename('tcga_id'),
)

# Make sure the output directory exists so the write does not fail on a
# fresh checkout, then save the embedding as a tab-separated file
tsne_out_file = os.path.join('results', 'tybalt_tsne_features.tsv')
os.makedirs(os.path.dirname(tsne_out_file), exist_ok=True)
tsne_out.to_csv(tsne_out_file, sep='\t')
tsne_out.head(2)

Unnamed: 0_level_0,1,2
tcga_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-02-0047-01,15.205528,-7.820806
TCGA-02-0055-01,9.571207,-6.080736


In [6]:
# Perform t-SNE on ADAGE encoded_features, reducing them to two dimensions.
# random_state is pinned so the embedding is intended to be reproducible.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` - confirm against the installed version before upgrading.
tsne_adage = manifold.TSNE(n_components=2, init='pca', random_state=0, perplexity=20,
                           learning_rate=300, n_iter=400)

# Label the two t-SNE axes '1' and '2' and key the rows by sample id.
# Index.rename returns a renamed copy, so naming the index 'tcga_id' here
# does not touch the index object held by adage_df.
tsne_adage_out = pd.DataFrame(
    tsne_adage.fit_transform(adage_df),
    columns=['1', '2'],
    index=adage_df.index.rename('tcga_id'),
)

# Make sure the output directory exists so the write does not fail on a
# fresh checkout, then save the embedding as a tab-separated file
tsne_adage_out_file = os.path.join('results', 'adage_tsne_features.tsv')
os.makedirs(os.path.dirname(tsne_adage_out_file), exist_ok=True)
tsne_adage_out.to_csv(tsne_adage_out_file, sep='\t')
tsne_adage_out.head(2)

Unnamed: 0_level_0,1,2
tcga_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-02-0047-01,30.59631,-7.618701
TCGA-02-0055-01,15.811577,-5.36112


In [7]:
# Perform t-SNE on zero-one RNAseq features, reducing them to two dimensions.
# random_state is pinned so the embedding is intended to be reproducible.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` - confirm against the installed version before upgrading.
tsne_rna = manifold.TSNE(n_components=2, init='pca', random_state=0, perplexity=20,
                         learning_rate=300, n_iter=400)

# Label the two t-SNE axes '1' and '2' and key the rows by sample id.
# Index.rename returns a renamed copy, so naming the index 'tcga_id' here
# does not touch the index object held by rnaseq_df.
tsne_rna_out = pd.DataFrame(
    tsne_rna.fit_transform(rnaseq_df),
    columns=['1', '2'],
    index=rnaseq_df.index.rename('tcga_id'),
)

# Make sure the output directory exists so the write does not fail on a
# fresh checkout, then save the embedding as a tab-separated file
tsne_rna_out_file = os.path.join('results', 'rnaseq_tsne_features.tsv')
os.makedirs(os.path.dirname(tsne_rna_out_file), exist_ok=True)
tsne_rna_out.to_csv(tsne_rna_out_file, sep='\t')
tsne_rna_out.head(2)

Unnamed: 0_level_0,1,2
tcga_id,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-02-0047-01,12.082565,-4.38609
TCGA-02-0055-01,8.941544,-3.303283
