## Data Analysis with Contrastive Principle Component Analysis (CPCA) 

Using d_xy value as Hyperparameter in Contrastive Principle Component Analysis (CPCA) 

The CPCA repository is here: https://github.com/abidlabs/contrastive

In [1]:
import numpy as np
import pandas as pd

In [2]:
background_control = pd.read_csv("background_control.csv")
foreground_Etop = pd.read_csv("foreground_Etop.csv")
foreground_H2O2 = pd.read_csv("foreground_H2O2.csv")
foreground_Starve = pd.read_csv("foreground_Starve.csv")

print("background shape: ", background_control.shape)
print("foreground_Etop shape: ", foreground_Etop.shape)
print("foreground_H2O2 shape: ", foreground_H2O2.shape)
print("foreground_Starve shape: ", foreground_Starve.shape)

background shape:  (11268, 47)
foreground_Etop shape:  (4315, 47)
foreground_H2O2 shape:  (5015, 47)
foreground_Starve shape:  (3007, 47)


## Compute projection matrix V from CPCA

In [3]:
import scipy
from scipy.linalg import eig

### Compute V_Etop 

In [4]:
# Normalize foreground_Etop before CPCA (centralized to have mean = 0 and standardized to have variance = 1) 

foreground_Etop = foreground_Etop - np.mean(foreground_Etop, axis=0)
foreground_Etop = foreground_Etop/ np.std(foreground_Etop, axis=0)

In [7]:
# # Check: 
# print(np.mean(foreground_Etop, axis=0))
# print(np.var(foreground_Etop, axis=0))

In [8]:
# Contruct the contrastive covariance matrix
alpha = 1 
contrastive_cov_Etop = np.cov(foreground_Etop.T) - alpha * np.cov(background_control.T)

# take the top 8 eigenvector of the contrastive_cov
d_etop = 8
eigvals, eigvecs = eig(contrastive_cov_Etop)


# sort eigenvalues in descending order and also the corresponding eigenvectors
idx = eigvals.argsort()[::-1]  
eigvals = eigvals[idx]
print("Etop top eigenvalues: ", eigvals[:d_etop])
eigvecs = eigvecs[:,idx]


Etop top eigenvalues:  [8.94042964+0.j 5.69996251+0.j 2.18679736+0.j 1.55742766+0.j
 1.4306453 +0.j 1.21906166+0.j 1.07483742+0.j 0.8079107 +0.j]


In [9]:
# Get V_Etop
V_Etop = eigvecs[:, :d_etop]
V_Etop = pd.DataFrame(V_Etop)

# enforce the first entry of each PC to be positive
for i in range(V_Etop.shape[1]):
    if V_Etop.iloc[0, i] < 0: 
        V_Etop.iloc[:, i] = -V_Etop.iloc[:, i]
        
V_Etop.to_csv("V_Etop.csv", index=False)

In [10]:
print(V_Etop.shape)

(47, 8)


### Compute V_H202

In [11]:
# Normailize 
foreground_H2O2 = foreground_H2O2 - np.mean(foreground_H2O2, axis=0)
foreground_H2O2 = foreground_H2O2/ np.std(foreground_H2O2, axis=0)
# # Check: 
# print(np.mean(foreground_H2O2, axis=0))
# print(np.var(foreground_H2O2, axis=0))

In [12]:
# Contruct the contrastive covariance matrix
alpha = 1 
contrastive_cov_H2O2 = np.cov(foreground_H2O2.T) - alpha * np.cov(background_control.T)

# take the top 8 eigenvector of the contrastive_cov
d_H2O2 = 7
eigvals, eigvecs = eig(contrastive_cov_H2O2)


# sort eigenvalues in descending order and also the corresponding eigenvectors
idx = eigvals.argsort()[::-1]  
eigvals = eigvals[idx]
print("H2O2 top eigenvalues: ", eigvals[:d_H2O2])
eigvecs = eigvecs[:,idx]


H2O2 top eigenvalues:  [6.4017715 +0.j 6.33491241+0.j 2.66078101+0.j 1.63183019+0.j
 1.19695476+0.j 1.08556933+0.j 0.77127167+0.j]


In [13]:
# Get V_Etop
V_H2O2 = eigvecs[:, :d_H2O2]
V_H2O2 = pd.DataFrame(V_H2O2)

# enforce the first entry of each PC to be positive
for i in range(V_H2O2.shape[1]):
    if V_H2O2.iloc[0, i] < 0: 
        V_H2O2.iloc[:, i] = -V_H2O2.iloc[:, i]
        
        
V_H2O2.to_csv("V_H2O2.csv", index=False)

In [14]:
print(V_H2O2.shape)

(47, 7)


### Compute V_Starve

In [15]:
# Normailize 
foreground_Starve = foreground_Starve - np.mean(foreground_Starve, axis=0)
foreground_Starve = foreground_Starve/ np.std(foreground_Starve, axis=0)
# # Check: 
# print(np.mean(foreground_Starve, axis=0))
# print(np.var(foreground_Starve, axis=0))

In [16]:
# Contruct the contrastive covariance matrix
alpha = 1 
contrastive_cov_Starve = np.cov(foreground_Starve.T) - alpha * np.cov(background_control.T)

# take the top 8 eigenvector of the contrastive_cov
d_Starve = 5
eigvals, eigvecs = eig(contrastive_cov_Starve)


# sort eigenvalues in descending order and also the corresponding eigenvectors
idx = eigvals.argsort()[::-1]  
eigvals = eigvals[idx]
print("Starve top eigenvalues: ", eigvals[:d_Starve])
eigvecs = eigvecs[:,idx]


Starve top eigenvalues:  [11.30849214+0.j  3.0809889 +0.j  2.01629979+0.j  1.59925814+0.j
  1.20332392+0.j]


In [17]:
# Get V_Etop
V_Starve = eigvecs[:, :d_Starve]
V_Starve = pd.DataFrame(V_Starve)


# enforce the first entry of each PC to be positive
for i in range(V_Starve.shape[1]):
    if V_Starve.iloc[0, i] < 0: 
        V_Starve.iloc[:, i] = -V_Starve.iloc[:, i]

        
V_Starve.to_csv("V_Starve.csv", index=False)

In [18]:
print(V_Starve.shape)

(47, 5)
