# SVCCA and dimensionality
We want to test the effect of the input data dimensions on SVCCA performance.  As we increase the number of dimensions, how does the SVCCA score change?  

In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import random
import glob
import warnings
warnings.filterwarnings("ignore")

from ggplot import *
from functions import cca_core

from numpy.random import seed
# Fixed seed shared by every random number generator used in this notebook
randomState = 123

# Seed numpy's global generator
seed(randomState)

# Seed the stdlib generator as well: the permutation cells shuffle rows with
# random.sample, which does not draw from numpy's generator
random.seed(randomState)

In [2]:
# Parameters
# Name of the experiment whose simulated data is analysed
analysis_name = 'experiment_0'

# Base directory: two levels above the current working directory
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Numbers of dimensions (columns) to evaluate; the largest value, 5549,
# matches the column labels 0..5548 shown in the data preview
num_dims = [
    10,
    100,
    1000,
    2000,
    3000,
    4000,
    5549,
]

In [3]:
# Load arguments
# Directory holding the simulated data for the selected experiment
simulated_data_dir = os.path.join(base_dir, "data", "simulated", analysis_name)

# Full path of the simulated data file read in the next cell
simulated_data_file = os.path.join(simulated_data_dir, "simulated_data.txt")

In [4]:
# Read in simulated gene expression data
# Tab-separated file: the first row supplies the column labels and the
# first column supplies the row index
simulated_data = pd.read_csv(
    simulated_data_file,
    sep='\t',
    header=0,
    index_col=0)

# Preview the first rows
simulated_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5539,5540,5541,5542,5543,5544,5545,5546,5547,5548
0,0.690384,0.642501,0.454786,0.65065,0.374857,0.41402,0.353704,0.566726,0.447497,0.165201,...,0.375559,0.600063,0.562897,0.640035,0.661894,0.325466,0.576836,0.56733,0.708088,0.615353
1,0.69106,0.655274,0.527149,0.680636,0.371842,0.443242,0.37422,0.533293,0.502785,0.165815,...,0.290522,0.591411,0.609883,0.585498,0.596831,0.162511,0.459705,0.531669,0.710235,0.698571
2,0.826005,0.694632,0.510346,0.642764,0.611429,0.517733,0.344205,0.631357,0.663116,0.199852,...,0.589395,0.581802,0.59635,0.671666,0.753091,0.16051,0.52174,0.511157,0.747738,0.728167
3,0.600721,0.564944,0.417176,0.594936,0.382868,0.440063,0.387032,0.466111,0.402363,0.223858,...,0.34552,0.547836,0.470917,0.460431,0.564647,0.256127,0.509963,0.34822,0.579351,0.583487
4,0.621544,0.615939,0.473489,0.599652,0.401605,0.481008,0.364476,0.444714,0.447605,0.215759,...,0.450535,0.532127,0.588728,0.547413,0.578212,0.234255,0.424777,0.501985,0.704726,0.717408


# Ordered sampling

## Similarity between input vs itself
We expect that comparing the input with itself should yield an SVCCA score of 1.0.

In [5]:
%%time
# SVCCA
svcca_scores_itself = []
for z in num_dims:
    subset_simulated_data = simulated_data.iloc[:,0:z]
    svcca_results = cca_core.get_cca_similarity(subset_simulated_data.T,
                                          subset_simulated_data.T,
                                          verbose=False)

    svcca_scores_itself.append(np.mean(svcca_results["cca_coef1"]))
    print("{} dimensions ... {}".format(z,np.mean(svcca_results["cca_coef1"])))
    
#print(svcca_scores_itself)

10 dimensions ... 0.9999914276551018
100 dimensions ... 0.9999601705952724
1000 dimensions ... 0.9991829455037723
2000 dimensions ... 0.9972983216828567
3000 dimensions ... 0.9916656307504228
4000 dimensions ... 0.9717492505179718
5549 dimensions ... 0.8914584502967937
CPU times: user 42min 59s, sys: 10min 46s, total: 53min 45s
Wall time: 12min 8s


## Similarity between input vs permuted input
We will use the similarity between the input and the permuted input as a negative control.  We would expect the SVCCA score to be fairly low for this comparison.

In [6]:
%%time
# Permute simulated data
shuffled_simulated_arr = []
num_samples = simulated_data.shape[0]

for i in range(num_samples):
    row = list(simulated_data.values[i])
    shuffled_simulated_row = random.sample(row, len(row))
    shuffled_simulated_arr.append(shuffled_simulated_row)
    
shuffled_simulated_data = pd.DataFrame(shuffled_simulated_arr, index=simulated_data.index, columns=simulated_data.columns)
shuffled_simulated_data.head()

CPU times: user 32.4 s, sys: 407 ms, total: 32.8 s
Wall time: 32.8 s


In [7]:
%%time
# SVCCA
svcca_scores_shuffled = []
for z in num_dims:
    subset_simulated_data = simulated_data.iloc[:,0:z]
    subset_shuffled_simulated_data = shuffled_simulated_data.iloc[:,0:z]
    svcca_results = cca_core.get_cca_similarity(subset_simulated_data.T,
                                          subset_shuffled_simulated_data.T,
                                          verbose=False)

    svcca_scores_shuffled.append(np.mean(svcca_results["cca_coef1"]))
    print("{} dimensions ... {}".format(z,np.mean(svcca_results["cca_coef1"])))
    
#print(svcca_scores_shuffled)

10 dimensions ... 0.04211572754469208
100 dimensions ... 0.11487407273233416
1000 dimensions ... 0.3579804203821832
2000 dimensions ... 0.5135871725680592
3000 dimensions ... 0.6390757992204288
4000 dimensions ... 0.7471535480590067
5549 dimensions ... 0.8764854746861425
CPU times: user 43min 59s, sys: 11min 9s, total: 55min 9s
Wall time: 12min 33s


# Random sampling

## Similarity between input vs itself and input vs permuted input
Perform the same analysis as above, this time selecting the dimensions (columns) at random instead of taking the first z dimensions in order.

In [5]:
%%time

# Store svcca scores
svcca_scores_itself = []
svcca_scores_shuffled = []

for z in num_dims:
    
    # Randomly select z dimensions 
    subset_simulated_data = simulated_data.sample(n=z, axis=1)
    print(subset_simulated_data.head())
    
    # Permute subset of data
    shuffled_simulated_arr = []
    num_samples = subset_simulated_data.shape[0]

    for i in range(num_samples):
        row = list(subset_simulated_data.values[i])
        shuffled_simulated_row = random.sample(row, len(row))
        shuffled_simulated_arr.append(shuffled_simulated_row)

    subset_shuffled_simulated_data = pd.DataFrame(shuffled_simulated_arr, 
                                           index=subset_simulated_data.index, 
                                           columns=subset_simulated_data.columns)
    
    print(subset_shuffled_simulated_data.head())
    
    # Calculate SVCCA for subset vs itself
    svcca_results = cca_core.get_cca_similarity(subset_simulated_data.T,
                                          subset_simulated_data.T,
                                          verbose=False)

    svcca_scores_itself.append(np.mean(svcca_results["cca_coef1"]))
    print("{} dimensions ... SVCCA(itself) {}".format(z,np.mean(svcca_results["cca_coef1"])))
    
    # Calculate SVCCA for subset vs permuted subset
    svcca_results = cca_core.get_cca_similarity(subset_simulated_data.T,
                                          subset_shuffled_simulated_data.T,
                                          verbose=False)

    svcca_scores_shuffled.append(np.mean(svcca_results["cca_coef1"]))
    print("{} dimensions ... SVCCA (permuted) {}".format(z,np.mean(svcca_results["cca_coef1"])))

        995      4510      4274       920      2855      3068      1727  \
0  0.522728  0.687604  0.319051  0.368393  0.647671  0.160295  0.233610   
1  0.561672  0.689917  0.297037  0.343807  0.513984  0.179211  0.212227   
2  0.375335  0.620559  0.399556  0.363385  0.598796  0.172484  0.218147   
3  0.305892  0.559677  0.320781  0.319041  0.573606  0.277945  0.330548   
4  0.587199  0.588220  0.367120  0.375171  0.557798  0.211097  0.254277   

       4503      4836      1473  
0  0.515831  0.162207  0.450435  
1  0.550672  0.148891  0.684844  
2  0.473083  0.189323  0.468893  
3  0.559733  0.238993  0.458835  
4  0.499955  0.251132  0.508542  
        995      4510      4274       920      2855      3068      1727  \
0  0.160295  0.162207  0.368393  0.515831  0.450435  0.687604  0.319051   
1  0.179211  0.684844  0.513984  0.550672  0.561672  0.212227  0.689917   
2  0.598796  0.189323  0.172484  0.218147  0.473083  0.399556  0.363385   
3  0.559677  0.305892  0.320781  0.277945  0.

       3598      1211      4295      3609      1779      4156      3962  \
0  0.177169  0.505738  0.345287  0.426841  0.450610  0.498114  0.488647   
1  0.194380  0.253997  0.257175  0.167381  0.321084  0.219802  0.263945   
2  0.538925  0.410866  0.564760  0.828311  0.385926  0.297443  0.413637   
3  0.615601  0.305014  0.694993  0.302983  0.410973  0.356919  0.336898   
4  0.161780  0.778112  0.625503  0.355854  0.501013  0.144881  0.306468   

       1410      2596      5342    ...         3805      5471      2260  \
0  0.616113  0.163710  0.239593    ...     0.548637  0.202573  0.468224   
1  0.190490  0.655274  0.477195    ...     0.184714  0.307158  0.555001   
2  0.337451  0.303799  0.554402    ...     0.546263  0.206291  0.403598   
3  0.297312  0.626720  0.490919    ...     0.559706  0.177549  0.262537   
4  0.212989  0.654149  0.627553    ...     0.352572  0.323468  0.188996   

       4630        94      4407      2397      1395       514       286  
0  0.462571  0.415634  0

        313      1892       917      4007      5119       350      2013  \
0  0.261930  0.454594  0.401539  0.671846  0.410720  0.408451  0.448048   
1  0.340914  0.199879  0.261097  0.269307  0.324222  0.508980  0.596501   
2  0.373975  0.845836  0.695463  0.434952  0.764006  0.808620  0.823411   
3  0.269253  0.735529  0.260600  0.464991  0.323053  0.512963  0.314541   
4  0.488190  0.605872  0.455512  0.462770  0.410249  0.549785  0.385814   

       3272      2056      3798    ...         1535      2201      3303  \
0  0.174529  0.425355  0.321589    ...     0.303785  0.362111  0.399346   
1  0.485056  0.499589  0.222578    ...     0.112813  0.243274  0.444935   
2  0.449340  0.260997  0.483375    ...     0.263168  0.656485  0.192830   
3  0.125290  0.316091  0.244389    ...     0.605417  0.366132  0.148158   
4  0.313456  0.311818  0.393462    ...     0.424984  0.260351  0.231984   

        586      4187      1204      3449      2235      5515      2109  
0  0.208113  0.340024  0

**Observations**
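Looks like dimensionality affects SVCCA performance.  Comparing the simulated data with itself gives the highest similarity with few dimensions, and the score decreases as we add dimensions (from ~1.0 at 10 dimensions to ~0.89 at 5549 dimensions in the ordered-sampling results above).  At the same time, the score for the permuted negative control increases with dimensionality (from ~0.04 at 10 dimensions to ~0.88 at 5549 dimensions), so at the full dimensionality the two comparisons are nearly indistinguishable.  Perhaps this indicates that the structure in the data is lost in such high dimensions.  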