# Apply Compression Models to Simulated Data

Here, we apply the suite of compression models over a range of latent dimensionalities (k) to the simulated data.

We apply PCA, ICA, NMF, DAE, and VAE models over a range of k (k = 1, 2, 3, 4, 5, 6).

We extract the weight matrix from every fitted model (each algorithm at each k).
We will show which compressed feature captures the two groups of simulated signals.

In [1]:
import os
import random

import numpy as np
import pandas as pd
from sklearn import decomposition

from tybalt.data_models import DataModel

Using TensorFlow backend.


In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
# Seed the random number generators so repeated runs of this notebook start
# from the same state.
seed = 123

# Python's built-in generator: drives random.sample() for the train/test split
random.seed(seed)

# NumPy's global generator: presumably the source of randomness for the
# scikit-learn and Keras fits run through DataModel -- TODO confirm
np.random.seed(seed)

# NOTE(review): TensorFlow keeps its own generator, which is not seeded here,
# so the DAE/VAE weights may still vary between runs -- confirm.

In [4]:
# Sweep settings: the latent dimensionalities to fit (1 through 6, inclusive)
# and the location of the simulated input matrix
min_k, max_k = 1, 6
ks = list(range(min_k, max_k + 1))

# NOTE(review): the file name says n1000, but the shape printed after loading
# is (10000, 10) -- confirm which sample count is intended.
data_file = os.path.join("data", "simulated_signal_n1000_p10.tsv")

In [5]:
# Read the tab-separated simulated matrix, then relabel every row as "sample_<label>"
data_df = pd.read_csv(data_file, sep='\t')
data_df = data_df.rename(index=lambda row_label: "sample_{}".format(row_label))

# Report the dimensions and preview the first rows
print(data_df.shape)
data_df.head()

(10000, 10)


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10
sample_0,1.02862,1.314631,1.190183,0.490601,-1.486416,-1.523443,-2.21882,-0.903148,2.491819,-1.687863
sample_1,-0.151202,-0.348562,-0.313551,0.024991,0.56701,0.592104,0.6442,-0.006098,0.053221,-0.955201
sample_2,-1.06182,-0.883511,-1.228853,1.299053,0.297267,0.51319,0.682318,-0.904132,0.456249,-0.648057
sample_3,2.237944,2.672969,1.950427,-0.234573,0.389137,-0.080283,0.093005,-0.060453,1.577055,0.261034
sample_4,-0.582426,-0.461699,-0.208167,-0.452576,1.582997,1.563214,1.037386,-1.094187,0.622353,-1.219694


In [6]:
# Hold out a random 5% of the rows as a test set; the remainder is used for training.
# (DataModel is constructed with separate train and test frames.)
split_prop = 0.05
n_rows = data_df.shape[0]
n_test = int(n_rows * split_prop)

# Positional row numbers drawn without replacement from Python's `random` module
test_samples = random.sample(range(n_rows), n_test)

test_df = data_df.iloc[test_samples]
# Everything that was not drawn into the test set
train_df = data_df.drop(index=test_df.index)

In [7]:
# Initialize DataModel class with the input data
# (training rows go in as `df`, held-out rows as `test_df`)
dm = DataModel(df=train_df, test_df=test_df)
# The return value is not captured, so the transformed data is presumably
# stored on `dm` itself. 'zeroone' presumably rescales each feature to the
# [0, 1] range -- TODO confirm against tybalt's DataModel.transform.
dm.transform(how='zeroone')

In [8]:
# Neural network hyperparameters, chosen to resemble the real data parameter sweep

# Training schedule shared by the DAE and the VAE
epochs, batch_size = 25, 50

# Each model gets its own learning rate
vae_learning_rate, dae_learning_rate = 0.0015, 0.0005

# DAE-only settings: input noise level and sparsity (0 here)
dae_noise, dae_sparsity = 0.01, 0

In [9]:
# Fit all five algorithms at every latent dimensionality and collect their weights.
# Each collected frame is tagged with the k and algorithm that produced it.

# Keyword arguments common to both neural network fits (DAE and VAE)
nn_shared_kwargs = dict(
    loss='binary_crossentropy',
    epochs=epochs,
    batch_size=batch_size,
    verbose=False,
    transform_test_df=False,
)

sim_results = []
for k in ks:
    # 1) PCA
    dm.pca(n_components=k, transform_test_df=False)
    result = dm.pca_weights.assign(k=k, algorithm="PCA")
    sim_results.append(result)

    # 2) ICA
    dm.ica(n_components=k, transform_test_df=False)
    result = dm.ica_weights.assign(k=k, algorithm="ICA")
    sim_results.append(result)

    # 3) NMF
    dm.nmf(n_components=k, transform_test_df=False)
    result = dm.nmf_weights.assign(k=k, algorithm="NMF")
    sim_results.append(result)

    # 4) DAE (the 'adage' model, with its own learning rate, noise and sparsity)
    dm.nn(
        n_components=k,
        model='adage',
        learning_rate=dae_learning_rate,
        noise=dae_noise,
        sparsity=dae_sparsity,
        **nn_shared_kwargs
    )
    result = dm.adage_weights.assign(k=k, algorithm="DAE")
    sim_results.append(result)

    # 5) VAE (the 'tybalt' model, with its own learning rate)
    dm.nn(
        n_components=k,
        model='tybalt',
        learning_rate=vae_learning_rate,
        separate_loss=False,
        **nn_shared_kwargs
    )
    result = dm.tybalt_weights.assign(k=k, algorithm="VAE")
    sim_results.append(result)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [10]:
# Compile and output results
# Stack the per-model weight frames into one table. reset_index() moves the
# row labels into a column named "index", which is then renamed to
# "compressed_feature".
full_sim_results = (
    pd.concat(sim_results)
    .reset_index()
    .rename({"index": "compressed_feature"}, axis="columns")
)

# Create the output directory if it is missing, so that a run from a fresh
# checkout does not fail when the file is written.
out_dir = "results"
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Write the combined weights as a tab-separated file without the integer index
out_file = os.path.join(out_dir, "compression_simulation_results.tsv")
full_sim_results.to_csv(out_file, sep='\t', index=False)

# Preview the last rows of the combined table
full_sim_results.tail(10)

Unnamed: 0,compressed_feature,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,k,algorithm
95,dae_2,-0.546046,-0.51584,-0.566654,1.033809,0.380345,0.348092,0.344777,-1.065545,0.630489,0.225548,6,DAE
96,dae_3,-0.302826,-0.308572,-0.164746,-0.198026,-0.066438,-0.106798,-0.105325,0.316308,1.426418,-1.012661,6,DAE
97,dae_4,0.459738,0.465264,0.323465,0.195409,-0.665763,-0.645848,-0.764701,-0.005358,0.977908,0.257407,6,DAE
98,dae_5,-0.248916,-0.204387,-0.401755,0.029242,0.35337,0.384323,0.328022,1.238153,-0.068851,-0.712932,6,DAE
99,vae_0,-0.015214,-0.015392,-0.013931,0.030013,-0.133837,-0.130641,-0.140015,0.056671,0.006538,-0.013091,6,VAE
100,vae_1,0.01515,0.011733,0.01212,-0.12748,-0.003187,-0.004695,-0.00299,0.038137,-0.096538,-0.119031,6,VAE
101,vae_2,0.022309,0.02204,0.020235,0.115377,0.044975,0.04409,0.046921,0.1362,-0.063809,-0.009779,6,VAE
102,vae_3,-0.115922,-0.112107,-0.113667,0.057261,0.005445,0.007476,0.009798,-0.06562,-0.058584,-0.051358,6,VAE
103,vae_4,0.099305,0.096138,0.096762,0.067794,-0.020125,-0.019623,-0.022674,-0.108714,-0.061099,-0.033364,6,VAE
104,vae_5,-0.009767,-0.011479,-0.007921,-0.046626,-0.016776,-0.017521,-0.019817,-0.006435,-0.128252,0.133439,6,VAE
