In [None]:
import os
# Move the working directory one level up before the remaining imports run.
# NOTE(review): presumably this lets `gglasso` resolve from the repository root — confirm.
# The path is relative, so every re-run of this cell climbs one more directory;
# restart the kernel before executing it again.
os.chdir(os.path.pardir)

import numpy as np
import pandas as pd
import pyreadr

import matplotlib.pyplot as plt
from scipy import stats

from gglasso.helper.utils import sparsity, zero_replacement, normalize, log_transform
from gglasso.problem import glasso_problem

### Import preprocessed soil data

In [None]:
# Tab-separated sample metadata; the first column becomes the row index.
metadata = pd.read_csv(
    '~/Public/GGLasso/data/soil/88soils_modified_metadata.txt',
    sep='\t',
    index_col=0,
)

In [None]:
# Read the soil table, copy its first column into 'SampleID', use that as the
# index and drop the original 'Unnamed: 0' column.
soil_raw = pd.read_csv('~/Public/GGLasso/data/soil/soil_id_116.csv', sep=',')
soil = (
    soil_raw
    .assign(SampleID=soil_raw.iloc[:, 0])
    .set_index(['SampleID'])
    .drop(columns=['Unnamed: 0'])
)
soil.head()

In [None]:
# pH column of the metadata, reordered to follow the row index of `soil`;
# ids that are absent from the metadata end up as NaN.
ph = metadata["ph"].reindex(soil.index)
ph.head()

In [None]:
# True if any pH entry is missing after the reindex
# (an unmatched sample id, or a NaN already present in the metadata)
ph.isna().to_numpy().any()

### CLR-transformation of X

In [None]:
# Plain ndarray copy of the soil table (row/column labels are dropped)
X_soil = soil.to_numpy(copy=True)
X_soil.shape

In [None]:
# Transpose the soil array, then hand it to gglasso's `normalize` helper
X = normalize(np.transpose(X_soil))
X.shape

In [None]:
# Build the transformed matrix directly from X_soil instead of from the current
# value of X: the previous `X = log_transform(pd.DataFrame(X))` fed its own
# output back into the transform whenever the cell was executed a second time.
X = log_transform(pd.DataFrame(normalize(X_soil.T)))
X.shape

### Calculate covariance and scale to correlations

In [None]:
# Biased (divide-by-N) covariance; np.cov treats every row of X as one variable.
S0 = np.cov(X.to_numpy(), bias=True)

# Scale covariances to correlations: scale[i, j] = sd[i] * sd[j]
sd = np.sqrt(np.diag(S0))
scale = np.outer(sd, sd)

S = S0 / scale
S.shape

### GGLasso low-rank with lambda = 0.14447343, mu = 2.36, rank=6

In [None]:
# Each grid holds a single value, so model selection evaluates exactly one
# (lambda1, mu1) pair.
lambda1_range = [0.14447343]
mu1_range = [2.36]
modelselect_params = dict(lambda1_range=lambda1_range, mu1_range=mu1_range)

# The second argument is the number of columns of X.
# NOTE(review): do_scaling=False presumably because S was already scaled to
# correlations in the previous section — confirm.
P = glasso_problem(S, X.shape[1], latent=True, do_scaling=False)
print(P)

P.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.25)

# regularization parameters are set to the best ones found during model selection
print(P.reg_params)

In [None]:
# Alternative kept for reference: load a previously exported matrix from disk.
# gg_lowrank = pd.read_csv("~/Downloads/gglasso_lr.csv", sep=',', header=None)
# Low-rank part of the solution held by the problem object P
gg_lowrank = P.solution.lowrank_
gg_lowrank.shape

In [None]:
# Numerical rank of the GGLasso low-rank matrix (numpy's default SVD-based tolerance)
gg_rank = np.linalg.matrix_rank(gg_lowrank)
print('Rank of low-rank component: {0}'.format(gg_rank))

### SE low-rank with lambda = 0.14447343, rank=6

In [None]:
# pyreadr returns a dict-like of data frames; the object stored in an .rds file
# is retrieved with the key None.
se_rds = pyreadr.read_r('~/Public/GGLasso/benchmarks/SE_lowrank.rds')
SE_lowrank = np.array(se_rds[None])
SE_lowrank.shape

In [None]:
# Numerical rank of the SE low-rank matrix (numpy's default SVD-based tolerance)
se_rank = np.linalg.matrix_rank(SE_lowrank)
print('Rank of low-rank component: {0}'.format(se_rank))

### Compare two solutions

In [None]:
# Entry-wise agreement within an absolute tolerance of 0.1
# (numpy's default relative tolerance is applied on top of it)
np.allclose(SE_lowrank, gg_lowrank, atol=1e-01)

In [None]:
# Entry-wise difference between the two low-rank matrices, shown as a table
pd.DataFrame(data=np.subtract(SE_lowrank, gg_lowrank))

### Robust PCA in [SE](https://github.com/zdk123/SpiecEasi/blob/ff528b23fafbd455efcca9dd356bef28951edf82/R/SparseLowRankICov.R)

In [None]:
# Reference implementation in R (SpiecEasi), kept as a comment for comparison
# with the Python port; it is not executed.
# robustPCA <- function(X, L, inverse=TRUE) {
#   Lsvd <- svd(L)
#   ind <- Lsvd$d>1e-9
#   if (inverse) {
#     loadings <- diag(sqrt(1/Lsvd$d[ind])) %*% t(Lsvd$v[,ind])
#   } else {
#     loadings <- diag(sqrt(Lsvd$d[ind])) %*% t(Lsvd$v[,ind])
#   }

#   scores <- X %*% t(loadings)
#   return(list(scores=scores, loadings=loadings))
# }

### Robust PCA in GGLasso

In [None]:
def robust_PCA(X, L, inverse=True):
    """Project the data onto the singular directions of a low-rank matrix.

    Parameters
    ----------
    X : array-like of shape (p, N)
        Data whose first axis matches the dimension of ``L``. Only ``X.T`` and
        the ``@`` operator are used, so an ndarray or a DataFrame both work.
    L : array-like of shape (p, p)
        Low-rank matrix. Singular values that are not larger than 1e-9 are
        treated as zero and their directions are dropped.
    inverse : bool, default True
        If True every retained direction is weighted by ``sqrt(1 / sigma_i)``,
        otherwise by ``sqrt(sigma_i)``.

    Returns
    -------
    Scores of shape (N, r), with r the number of retained singular values.
    For a DataFrame ``X`` the result is a DataFrame whose columns are labelled
    0..r-1, so ``zu[i]`` selects the i-th component.

    Notes
    -----
    The left singular vectors are used. For a symmetric positive semi-definite
    ``L`` they coincide with the right singular vectors used by SpiecEasi's
    ``robustPCA``.
    """
    # np.linalg.svd returns the singular values in descending order, so the
    # retained ones always belong to the leading columns of the basis.
    left_vectors, sigma, _ = np.linalg.svd(L, full_matrices=True)

    sigma_basis = sigma[sigma > 1e-9]
    rank = sigma_basis.size

    # Element-wise weights replace the explicit inverse of a diagonal matrix.
    weights = np.sqrt(1.0 / sigma_basis) if inverse else np.sqrt(sigma_basis)

    # Broadcasting scales column i by weights[i]. The result stays 2-D even for
    # rank == 0, so an all-zero L yields an (N, 0) result instead of an error.
    loadings = left_vectors[:, :rank] * weights

    zu = X.T @ loadings
    return zu

### Plot SE/pH correlation

In [None]:
# Scores of X along the singular directions of the SE low-rank matrix
# (inverse square-root weighting of the singular values)
zu_SE = robust_PCA(X, SE_lowrank, inverse=True)

In [None]:
# One figure per low-rank component of the SE solution, plotted against pH.
for i in range(se_rank):
    fig, ax = plt.subplots()
    # The scatter needs a label: calling legend() with no labelled artist only
    # emits a warning and draws an empty legend.
    ax.scatter(zu_SE[i], ph, label="Component {0}".format(i + 1))
    ax.set_title("SE low-rank component {0} vs. pH".format(i + 1))
    ax.set_xlabel("Learned low-rank PCA component")
    ax.set_ylabel("pH")
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# Rank correlation between pH and every SE low-rank component.
for i in range(se_rank):
    # Evaluate the test once and reuse both values (it used to run twice per component).
    rho, p_value = stats.spearmanr(ph, zu_SE[i])
    print("Spearman correlation between pH and {0}th component: {1}, p-value: {2}".format(i+1, rho, p_value))

### Plot GGLasso/pH correlation

In [None]:
# Scores of X along the singular directions of the GGLasso low-rank matrix
# (inverse square-root weighting of the singular values)
zu_gg = robust_PCA(X, gg_lowrank, inverse=True)

In [None]:
# One figure per low-rank component of the GGLasso solution, plotted against pH.
for i in range(gg_rank):
    fig, ax = plt.subplots()
    # Plot the GGLasso scores: this loop previously re-plotted zu_SE, so the
    # GGLasso section showed the SE components a second time.
    # The label gives legend() an artist to display.
    ax.scatter(zu_gg[i], ph, label="Component {0}".format(i + 1))
    ax.set_title("GGLasso low-rank component {0} vs. pH".format(i + 1))
    ax.set_xlabel("Learned low-rank PCA component")
    ax.set_ylabel("pH")
    ax.legend(loc='upper left')
    plt.show()

In [None]:
# Rank correlation between pH and every GGLasso low-rank component.
# Iterate over gg_rank, the rank of the matrix zu_gg was built from; the loop
# previously used se_rank, which is only correct while both ranks agree.
for i in range(gg_rank):
    # Evaluate the test once and reuse both values (it used to run twice per component).
    rho, p_value = stats.spearmanr(ph, zu_gg[i])
    print("Spearman correlation between pH and {0}th component: {1}, p-value: {2}".format(i+1, rho, p_value))