# Similarity analysis

We want to determine if the different batch simulated data is able to capture the biological signal that is present in the original data:  How much of the real input data is captured in the simulated batch data?

In other words, we want to compare the representation of the real input data and the simulated batch data.  We will use **SVCCA** to compare these two representations.

Here, we apply Singular Vector Canonical Correlation Analysis [Raghu et al. 2017](https://arxiv.org/pdf/1706.05806.pdf) [(github)](https://github.com/google/svcca) to the UMAP and PCA representations of our batch 1 simulated dataset vs batch n simulated datasets.  The output of the SVCCA analysis is the SVCCA mean similarity score. This single number can be interpreted as a measure of similarity between our original data vs batched dataset.

Briefly, SVCCA uses Singular Value Decomposition (SVD) to extract the components explaining 99% of the variation. This is done to remove potential dimensions described by noise. Next, SVCCA performs a Canonical Correlation Analysis (CCA) on the SVD matrices to identify maximum correlations of linear combinations of both input matrices. The algorithm will identify the canonical correlations of highest magnitude across and within algorithms of the same dimensionality.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import ast
import pandas as pd
import numpy as np
import random
import glob
import umap
import pickle
import warnings
warnings.filterwarnings(action='ignore')

from ggplot import *
from functions import cca_core
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from numpy.random import seed
randomState = 123
seed(randomState)

In [2]:
# Load config file
config_file = "config_exp_0.txt"

d = {}
float_params = ["learning_rate", "kappa", "epsilon_std"]
str_params = ["analysis_name", "NN_architecture"]
lst_params = ["num_batches"]
with open(config_file) as f:
    for line in f:
        (name, val) = line.split()
        if name in float_params:
            d[name] = float(val)
        elif name in str_params:
            d[name] = str(val)
        elif name in lst_params:
            d[name] = ast.literal_eval(val)
        else:
            d[name] = int(val)

In [3]:
# Parameters
analysis_name = d["analysis_name"]
NN_architecture = d["NN_architecture"]
num_PCs = d["num_PCs"]
num_batches = d["num_batches"]

In [4]:
# Load data
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../.."))

simulated_data_file = os.path.join(
    base_dir,
    "data",
    "simulated",
    analysis_name,
    "simulated_data.txt.xz")

batch_dir = os.path.join(
    base_dir,
    "data",
    "batch_simulated",
    analysis_name)

umap_model_file = umap_model_file = os.path.join(
    base_dir,
    "models",  
    NN_architecture,
    "umap_model.pkl")

In [5]:
# Read in UMAP model
infile = open(umap_model_file, 'rb')
umap_model = pickle.load(infile)
infile.close()

In [6]:
# Read in data
simulated_data = pd.read_table(
    simulated_data_file,
    header=0, 
    index_col=0,
    sep='\t')

simulated_data.head(10)

Unnamed: 0,5340,339,244,1567,1827,4981,2310,3929,1498,3226,...,2641,4645,4585,1696,5218,249,2655,4782,1293,767
0,0.535602,0.503128,0.285015,0.182251,0.338972,0.563961,0.32429,0.469941,0.185681,0.09072,...,0.358896,0.654506,0.352168,0.213065,0.555343,0.346773,0.231398,0.207239,0.548592,0.61
1,0.602998,0.314449,0.170274,0.150126,0.393875,0.425789,0.359611,0.367097,0.162651,0.060858,...,0.411636,0.569583,0.461774,0.141425,0.559661,0.404262,0.19332,0.256343,0.604435,0.563223
2,0.517498,0.419739,0.182155,0.13146,0.32451,0.41385,0.320531,0.416758,0.152202,0.088437,...,0.354576,0.599023,0.410843,0.257731,0.510072,0.403116,0.19758,0.239914,0.713293,0.624423
3,0.397841,0.457606,0.323778,0.249936,0.297673,0.476715,0.381297,0.46833,0.215541,0.132116,...,0.451938,0.411013,0.378725,0.290338,0.430163,0.377278,0.377678,0.32869,0.521566,0.390877
4,0.535997,0.465947,0.277286,0.222062,0.385123,0.421842,0.330086,0.415129,0.177301,0.128083,...,0.49598,0.525113,0.432366,0.20125,0.606396,0.380384,0.209951,0.288402,0.563646,0.583976
5,0.49329,0.396501,0.249366,0.159639,0.375957,0.402583,0.275842,0.381382,0.261653,0.081041,...,0.342845,0.67007,0.407004,0.215835,0.587762,0.397528,0.236528,0.251614,0.600678,0.718417
6,0.433612,0.351818,0.241003,0.179525,0.336945,0.345377,0.321877,0.382335,0.167349,0.141031,...,0.442284,0.484055,0.40264,0.323867,0.504018,0.418877,0.241006,0.274514,0.596295,0.556387
7,0.451588,0.457565,0.33658,0.272774,0.264118,0.369123,0.27505,0.424156,0.250198,0.247813,...,0.417076,0.499973,0.297584,0.261137,0.453874,0.425273,0.294926,0.293479,0.535604,0.471286
8,0.619654,0.442385,0.281102,0.209915,0.4173,0.559756,0.303963,0.474562,0.233043,0.112126,...,0.436053,0.675932,0.392779,0.213318,0.691767,0.300163,0.260528,0.28153,0.509397,0.618331
9,0.434846,0.361911,0.255196,0.199091,0.345327,0.390971,0.34298,0.392987,0.176265,0.135802,...,0.465793,0.475744,0.384862,0.209843,0.52172,0.375902,0.241566,0.250671,0.527586,0.52244


## Calculate Similarity using high dimensional (5K) batched data

In [13]:
%%time
# Calculate similarity using SVCCA

# Store svcca scores
output_list = []

for i in num_batches:
    print('Calculating SVCCA score for 1 batch vs {} batches..'.format(i))
    
    # Get batch 1
    batch_1_file = os.path.join(
        batch_dir,
        "Batch_1.txt.xz")

    batch_1 = pd.read_table(
        batch_1_file,
        header=0,
        index_col=0,
        sep='\t')

    # Use trained model to encode expression data into SAME latent space
    original_data_df =  batch_1
    
    # All batches
    batch_other_file = os.path.join(
        batch_dir,
        "Batch_"+str(i)+".txt.xz")

    batch_other = pd.read_table(
        batch_other_file,
        header=0,
        index_col=0,
        sep='\t')
    
    print("Using batch {}".format(i))
    
    # Use trained model to encode expression data into SAME latent space
    batch_data_df =  batch_other
    
    # Check shape: ensure that the number of samples is the same between the two datasets
    if original_data_df.shape[0] != batch_data_df.shape[0]:
        diff = original_data_df.shape[0] - batch_data_df.shape[0]
        original_data_df = original_data_df.iloc[:-diff,:]
    
    # SVCCA
    svcca_results = cca_core.get_cca_similarity(original_data_df.T,
                                          batch_data_df.T,
                                          verbose=False)
    
    output_list.append(np.mean(svcca_results["cca_coef1"]))

# Convert output to pandas dataframe
svcca_raw_df = pd.DataFrame(output_list, columns=["svcca_mean_similarity"], index=num_batches)
svcca_raw_df

Calculating SVCCA score for 1 batch vs 1 batches..


LZMAError: Input format not supported by decoder

In [8]:
%%time
# Permute simulated data
shuffled_simulated_arr = []
num_samples = simulated_data.shape[0]

for i in range(num_samples):
    row = list(simulated_data.values[i])
    shuffled_simulated_row = random.sample(row, len(row))
    shuffled_simulated_arr.append(shuffled_simulated_row)

CPU times: user 4.15 s, sys: 67.6 ms, total: 4.21 s
Wall time: 4.21 s


In [9]:
%%time
# SVCCA
svcca_results = cca_core.get_cca_similarity(simulated_data.T,
                                      shuffled_simulated_data.T,
                                      verbose=False)

print(np.mean(svcca_results["cca_coef1"]))

NameError: name 'shuffled_simulated_data' is not defined

In [10]:
# Plot
svcca_raw_df.plot()

NameError: name 'svcca_raw_df' is not defined

## Calculate Similarity using PCA projection of batched data

In [None]:
"""
output_list = []

for i in num_batches:
    print('Calculating SVCCA score for 1 batch vs {} batches..'.format(i))
    
    # Get batch 1
    batch_1_file = os.path.join(
        batch_dir,
        "Batch_1.txt")

    batch_1 = pd.read_table(
        batch_1_file,
        header=0,
        sep='\t',
        index_col=0)

    # PCA projection
    pca = PCA(n_components=num_PCs)

    # Use trained model to encode expression data into SAME latent space
    original_data_PCAencoded = pca.fit_transform(batch_1)


    original_data_PCAencoded_df = pd.DataFrame(original_data_PCAencoded,
                                         index=batch_1.index
                                         )
    
    # All batches
    batch_other_file = os.path.join(
        batch_dir,
        "Batch_"+str(i)+".txt")

    batch_other = pd.read_table(
        batch_other_file,
        header=0,
        sep='\t',
        index_col=0)
    
    print("Using batch {}".format(i))
    
    # Use trained model to encode expression data into SAME latent space
    batch_data_PCAencoded = pca.fit_transform(batch_other)
    
    
    batch_data_PCAencoded_df = pd.DataFrame(batch_data_PCAencoded,
                                         index=batch_other.index
                                         )
        
    # Check shape
    if original_data_PCAencoded_df.shape[0] != batch_data_PCAencoded_df.shape[0]:
        diff = original_data_PCAencoded_df.shape[0] - batch_data_PCAencoded_df.shape[0]
        original_data_PCAencoded_df = original_data_PCAencoded_df.iloc[:-diff,:]
    
    # SVCCA
    svcca_results = cca_core.get_cca_similarity(original_data_PCAencoded_df.T,
                                          batch_data_PCAencoded_df.T,
                                          verbose=False)
    
    output_list.append(np.mean(svcca_results["cca_coef1"]))

# Convert output to pandas dataframe
svcca_pca_df = pd.DataFrame(output_list, columns=["svcca_mean_similarity"], index=num_batches)
svcca_pca_df
"""

In [None]:
"""
# Plot
svcca_pca_df.plot()"""

## Manually compute similarity by applying CCA to PC batched data

In [None]:
"""
cca = CCA(n_components=1)

output_list = []

for i in num_batches:
    print('Calculating SVCCA score for 1 batch vs {} batches..'.format(i))
    
    # Get batch 1
    batch_1_file = os.path.join(
        batch_dir,
        "Batch_1.txt")

    batch_1 = pd.read_table(
        batch_1_file,
        header=0,
        sep='\t',
        index_col=0)

    # PCA projection
    pca = PCA(n_components=num_PCs)

    # Use trained model to encode expression data into SAME latent space
    original_data_PCAencoded = pca.fit_transform(batch_1)


    original_data_PCAencoded_df = pd.DataFrame(original_data_PCAencoded,
                                         index=batch_1.index
                                         )
    
    # All batches
    batch_other_file = os.path.join(
        batch_dir,
        "Batch_"+str(i)+".txt")

    batch_other = pd.read_table(
        batch_other_file,
        header=0,
        sep='\t',
        index_col=0)
    
    print("Using batch {}".format(i))
    
    # Use trained model to encode expression data into SAME latent space
    batch_data_PCAencoded = pca.fit_transform(batch_other)
    
    
    batch_data_PCAencoded_df = pd.DataFrame(batch_data_PCAencoded,
                                         index=batch_other.index
                                         )
        
    # Check shape
    if original_data_PCAencoded_df.shape[0] != batch_data_PCAencoded_df.shape[0]:
        diff = original_data_PCAencoded_df.shape[0] - batch_data_PCAencoded_df.shape[0]
        original_data_PCAencoded_df = original_data_PCAencoded_df.iloc[:-diff,:]
    
    # CCA
    U_c, V_c = cca.fit_transform(original_data_PCAencoded_df, batch_data_PCAencoded_df)
    result = np.corrcoef(U_c.T, V_c.T)[0,1]
    
    output_list.append(result)

# Convert output to pandas dataframe
pca_cca_df = pd.DataFrame(output_list, columns=["svcca_mean_similarity"], index=num_batches)
pca_cca_df
"""

In [None]:
"""
# Plot
pca_cca_df.plot()"""