# SVCCA and affine transformations
We want to test that SVCCA is working as expected.  In other words, what is the SVCCA score when we compare two datasets that are 1) identical and 2) one is a contant scaled version of the other?

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import random
import glob
import warnings
warnings.filterwarnings("ignore")

from ggplot import *
from functions import cca_core

from numpy.random import seed
randomState = 123
seed(randomState)

In [2]:
# Parameters
analysis_name = 'experiment_0'
base_dir = os.path.abspath(os.path.join(os.getcwd(),"../.."))

In [3]:
# Load arguments
simulated_data_file = os.path.join(
    base_dir,
    "data",
    "simulated",
    analysis_name,
    "simulated_data.txt")

In [4]:
# Read in simulated gene expression data
simulated_data = pd.read_table(
    simulated_data_file,
    header=0, 
    index_col=0,
    sep='\t')

simulated_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548
0,0.690384,0.642501,0.454786,0.65065,0.374857,0.41402,0.353704,0.566726,0.447497,0.165201,...,0.375559,0.600063,0.562897,0.640035,0.661894,0.325466,0.576836,0.56733,0.708088,0.615353
1,0.69106,0.655274,0.527149,0.680636,0.371842,0.443242,0.37422,0.533293,0.502785,0.165815,...,0.290522,0.591411,0.609883,0.585498,0.596831,0.162511,0.459705,0.531669,0.710235,0.698571
2,0.826005,0.694632,0.510346,0.642764,0.611429,0.517733,0.344205,0.631357,0.663116,0.199852,...,0.589395,0.581802,0.59635,0.671666,0.753091,0.16051,0.52174,0.511157,0.747738,0.728167
3,0.600721,0.564944,0.417176,0.594936,0.382868,0.440063,0.387032,0.466111,0.402363,0.223858,...,0.34552,0.547836,0.470917,0.460431,0.564647,0.256127,0.509963,0.34822,0.579351,0.583487
4,0.621544,0.615939,0.473489,0.599652,0.401605,0.481008,0.364476,0.444714,0.447605,0.215759,...,0.450535,0.532127,0.588728,0.547413,0.578212,0.234255,0.424777,0.501985,0.704726,0.717408


## Similarity between input vs itself
We expect that the similarity of SVCCA comparing the input with itself should yeild an SVCCA score of 1.0

In [5]:
%%time
# SVCCA
svcca_results = cca_core.get_cca_similarity(simulated_data.T,
                                      simulated_data.T,
                                      verbose=False)

print(np.mean(svcca_results["cca_coef1"]))

0.8914584502967937
CPU times: user 26min 32s, sys: 5min 55s, total: 32min 27s
Wall time: 7min 7s


## Similarity between input vs scaled version of input¶
We expect that the similarity of SVCCA comparing the input with scaled version of itself to yield a high SVCCA score since this transformation is an affine transformation which SVCCA is supposed to be invariant to.

In [6]:
# Scale data by a constant
scaled_simulated_data = simulated_data.multiply(2)
scaled_simulated_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548
0,1.380768,1.285002,0.909573,1.301301,0.749713,0.828039,0.707409,1.133452,0.894995,0.330402,...,0.751118,1.200126,1.125793,1.28007,1.323789,0.650932,1.153671,1.134661,1.416177,1.230707
1,1.38212,1.310548,1.054297,1.361271,0.743683,0.886485,0.74844,1.066585,1.00557,0.331631,...,0.581045,1.182821,1.219766,1.170997,1.193662,0.325021,0.919409,1.063338,1.42047,1.397142
2,1.65201,1.389264,1.020692,1.285529,1.222859,1.035466,0.68841,1.262715,1.326231,0.399703,...,1.178791,1.163604,1.1927,1.343331,1.506182,0.321021,1.043479,1.022314,1.495475,1.456335
3,1.201442,1.129888,0.834352,1.189871,0.765736,0.880125,0.774063,0.932222,0.804726,0.447716,...,0.69104,1.095672,0.941835,0.920863,1.129293,0.512254,1.019925,0.696441,1.158701,1.166973
4,1.243089,1.231879,0.946978,1.199303,0.803211,0.962017,0.728951,0.889428,0.895209,0.431517,...,0.901071,1.064254,1.177455,1.094826,1.156425,0.46851,0.849554,1.003969,1.409452,1.434816


In [7]:
%%time
# SVCCA
svcca_results = cca_core.get_cca_similarity(simulated_data.T,
                                      scaled_simulated_data.T,
                                      verbose=False)

print(np.mean(svcca_results["cca_coef1"]))

0.8914584502967937
CPU times: user 26min 17s, sys: 5min 51s, total: 32min 9s
Wall time: 6min 55s


## Similarity between input vs permuted input¶
We will use the similarity of between the input with permuted input as a negative control.  We would expect the SVCCA score to be fairly low for this comparison.

In [8]:
%%time
# Permute simulated data
shuffled_simulated_arr = []
num_samples = simulated_data.shape[0]

for i in range(num_samples):
    row = list(simulated_data.values[i])
    shuffled_simulated_row = random.sample(row, len(row))
    shuffled_simulated_arr.append(shuffled_simulated_row)

CPU times: user 30.4 s, sys: 407 ms, total: 30.8 s
Wall time: 30.8 s


In [12]:
shuffled_simulated_data = pd.DataFrame(shuffled_simulated_arr, index=simulated_data.index, columns=simulated_data.columns)
shuffled_simulated_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548
0,0.409257,0.333294,0.386534,0.597455,0.581091,0.507824,0.291022,0.676535,0.357518,0.496841,...,0.141706,0.530225,0.64225,0.427936,0.296566,0.670648,0.505738,0.374053,0.216228,0.286875
1,0.189483,0.391035,0.297854,0.776538,0.662238,0.512074,0.197996,0.339063,0.235833,0.608897,...,0.398422,0.078064,0.284509,0.244508,0.134734,0.733355,0.312194,0.44712,0.606155,0.112393
2,0.218126,0.7924,0.32264,0.427435,0.735131,0.439997,0.758718,0.122767,0.238553,0.699463,...,0.509161,0.188042,0.484219,0.343653,0.511077,0.669245,0.598016,0.449833,0.528297,0.306722
3,0.38513,0.405205,0.29085,0.323053,0.578279,0.569818,0.411938,0.439244,0.53706,0.146141,...,0.314991,0.327365,0.39081,0.457027,0.618357,0.217448,0.382653,0.611171,0.453389,0.514274
4,0.405758,0.450224,0.258398,0.307599,0.183675,0.261187,0.433908,0.228496,0.335418,0.27291,...,0.201804,0.429378,0.440116,0.612743,0.426176,0.196929,0.533426,0.350666,0.291244,0.424676


In [11]:
%%time
# SVCCA
svcca_results = cca_core.get_cca_similarity(simulated_data.T,
                                      shuffled_simulated_data.T,
                                      verbose=False)

print(np.mean(svcca_results["cca_coef1"]))

0.8764167125489775
CPU times: user 26min 10s, sys: 5min 48s, total: 31min 58s
Wall time: 6min 56s


## Toy

In [21]:
df = pd.DataFrame({'A': range(5),'B': np.ones(5), 'C': [20,40,60,80,100], 'D': np.ones(5)*5})
df.head()

Unnamed: 0,A,B,C,D
0,0,1.0,20,5.0
1,1,1.0,40,5.0
2,2,1.0,60,5.0
3,3,1.0,80,5.0
4,4,1.0,100,5.0


In [22]:
shuffled_arr = []


for i in range(len(df.values)):
    row = list(df.values[i])
    shuffled = random.sample(row, len(row))
    shuffled_arr.append(shuffled)
shuffled_df = pd.DataFrame(shuffled_arr, index=df.index, columns=df.columns)
shuffled_df.head()

Unnamed: 0,A,B,C,D
0,1.0,0.0,5.0,20.0
1,1.0,40.0,1.0,5.0
2,60.0,5.0,1.0,2.0
3,3.0,5.0,80.0,1.0
4,5.0,1.0,100.0,4.0
