In [None]:
import os
# Move the working directory one level up, relative to the current working directory.
# NOTE(review): this is not idempotent - every re-run of the cell climbs one more level,
# so restart the kernel before executing it again.
os.chdir(os.path.pardir)

import numpy as np
import pandas as pd
import pyreadr

import matplotlib.pyplot as plt
from scipy import stats

from gglasso.helper.utils import sparsity, zero_replacement, normalize, log_transform

### Import preprocessed soil data

In [None]:
# Both inputs are comma-separated files, so the plain CSV reader is used.
# NOTE(review): the paths point into a user's home directory - adjust them to the local checkout.
soil = pd.read_csv('~/Public/GGLasso/data/soil/soil_116.csv')
ph = pd.read_csv('~/Public/GGLasso/data/soil/ph.csv')

In [None]:
# Preview the first rows of the soil table as a sanity check of the import.
soil.head()

### CLR-transformation of X

In [None]:
# Copy the table into a plain NumPy array and display its dimensions.
# NOTE(review): the orientation (samples vs. features along the rows) is not visible here - confirm.
X_soil = np.array(soil)
X_soil.shape

In [None]:
X = normalize(X_soil.T)

In [None]:
X = log_transform(pd.DataFrame(X))
X.shape

### GGLasso low-rank with lambda = 0.14447343, mu = 2.36, rank=6

In [None]:
# Load the GGLasso low-rank solution; the file carries no header row, so every line is data.
gg_lowrank = pd.read_csv("~/Downloads/gglasso_lr.csv", header=None)
gg_lowrank.shape

In [None]:
# Numerical rank of the GGLasso low-rank matrix, using NumPy's default tolerance.
gg_rank = np.linalg.matrix_rank(gg_lowrank)
gg_rank

### SE low-rank with lambda = 0.14447343, rank=6

In [None]:
# read_r returns a mapping of the objects stored in the file; the matrix is looked up
# under the key None and converted to a NumPy array in one step.
SE_lowrank = np.array(pyreadr.read_r('~/Downloads/r6.rds')[None])
SE_lowrank.shape

In [None]:
# Numerical rank of the SpiecEasi low-rank matrix, using NumPy's default tolerance.
se_rank = np.linalg.matrix_rank(SE_lowrank)
se_rank

### Compare two solutions

In [None]:
# True only if every pair of entries agrees within an absolute tolerance of 0.1
# (in addition to NumPy's default relative tolerance).
np.allclose(SE_lowrank, gg_lowrank, atol=1e-01)

In [None]:
# Entry-wise difference between the SpiecEasi and the GGLasso low-rank solutions.
pd.DataFrame(SE_lowrank - gg_lowrank)

### Robust PCA in [SE](https://github.com/zdk123/SpiecEasi/blob/ff528b23fafbd455efcca9dd356bef28951edf82/R/SparseLowRankICov.R)

In [None]:
# robustPCA <- function(X, L, inverse=TRUE) {
#   Lsvd <- svd(L)
#   ind <- Lsvd$d>1e-9
#   if (inverse) {
#     loadings <- diag(sqrt(1/Lsvd$d[ind])) %*% t(Lsvd$v[,ind])
#   } else {
#     loadings <- diag(sqrt(Lsvd$d[ind])) %*% t(Lsvd$v[,ind])
#   }

#   scores <- X %*% t(loadings)
#   return(list(scores=scores, loadings=loadings))
# }

### Robust PCA in GGLasso

In [None]:
def robust_PCA(X, L, inverse=True):
    """Compute PCA-style scores of ``X`` from the SVD of a low-rank matrix ``L``.

    Python counterpart of SpiecEasi's ``robustPCA``. The loadings are the right
    singular vectors of ``L`` whose singular values exceed ``1e-9``, each scaled
    by ``sqrt(1/sigma)`` (``inverse=True``) or ``sqrt(sigma)`` (``inverse=False``).

    Parameters
    ----------
    X : array-like or DataFrame
        Data matrix; ``X.T`` is multiplied with the transposed loadings, so the
        number of rows of ``X`` must equal the number of columns of ``L``.
    L : array-like, shape (m, n)
        Low-rank matrix to decompose.
    inverse : bool, default True
        Scale the singular vectors by the inverse square root of the singular
        values instead of the square root.

    Returns
    -------
    zu : same container type as ``X.T @ ndarray`` produces
        Scores ``X.T @ loadings.T`` with one column per retained component.
    loadings : numpy.ndarray, shape (k, n)
        Scaled singular vectors, ``k`` being the number of retained singular values.
    """
    # The reduced SVD is sufficient: only vectors paired with a singular value are used.
    _, sigma, v_T = np.linalg.svd(np.asarray(L), full_matrices=False)

    # Singular values are returned in descending order, so the mask selects the leading ones.
    keep = sigma > 1e-9
    scale = np.sqrt(1 / sigma[keep]) if inverse else np.sqrt(sigma[keep])

    # Rows of v_T are the right singular vectors (the reference implementation uses
    # t(Lsvd$v[, ind])); for a symmetric positive semi-definite L they coincide with
    # the left ones. Row-wise scaling is equivalent to diag(scale) @ v_T[keep].
    loadings = scale[:, np.newaxis] * v_T[keep, :]

    zu = X.T @ loadings.T

    return zu, loadings

### Plot SE/pH correlation

In [None]:
# Scores and loadings obtained from the SpiecEasi low-rank matrix
# (inverse=True scales the singular vectors by sqrt(1/sigma)).
zu_SE, loadings = robust_PCA(X, SE_lowrank, inverse=True)

In [None]:
# One scatter plot per learned component against pH.
# Loop over the score columns that robust_PCA actually produced: se_rank comes from
# matrix_rank, whose tolerance differs from the 1e-9 cut-off used inside robust_PCA,
# so the two counts are not guaranteed to agree.
for i in range(zu_SE.shape[1]):
    fig, ax = plt.subplots()
    # The label gives the legend an entry; without any labelled artist the legend
    # call only emits a warning.
    ax.scatter(zu_SE[i], ph, label="SE component {0}".format(i + 1))
    ax.set_xlabel("Learned low-rank PCA component")
    ax.set_ylabel("pH")
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# Rank correlation between pH and every SpiecEasi component.
# Loop over the available score columns (see robust_PCA's 1e-9 cut-off) rather than se_rank.
for i in range(zu_SE.shape[1]):
    # Evaluate the test once per component; index 0 is the correlation, index 1 the p-value.
    result = stats.spearmanr(ph, zu_SE[i])
    print("Spearman correlation between pH and {0}th component: {1}, p-value: {2}".format(i+1, result[0], result[1]))

### Plot GGLasso/pH correlation

In [None]:
zu_gg = robust_PCA(X, gg_lowrank, inverse=True)

In [None]:
for i in range(0, se_rank):
    plt.scatter(zu_SE[i], ph)
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

In [None]:
for i in range(0, se_rank):
    print("Spearman correlation between pH and {0}th component: {1}, p-value: {2}".format(i+1, stats.spearmanr(ph, zu_gg[i])[0], 
                                                                              stats.spearmanr(ph, zu_gg[i])[1]))