In [None]:
import os
os.chdir(os.path.pardir)

import numpy as np
import pandas as pd

import networkx as nx
import matplotlib.pyplot as plt

from gglasso.helper.data_generation import group_power_network, sample_covariance_matrix
from gglasso.helper.basic_linalg import adjacency_matrix
from gglasso.helper.utils import sparsity, zero_replacement, normalize, log_transform

from gglasso.problem import glasso_problem

from tempfile import TemporaryFile

from numpy.linalg import matrix_rank
from sklearn.decomposition import PCA
from scipy import stats
import plotly.express as px

# Import preprocessed soil data

In [None]:
soil = pd.read_table('data/soil/soil_116.csv', sep=',')
soil.head()

In [None]:
# Load the OTU table whose first column holds the sample identifiers and
# promote that column to the index under the name 'SampleID'.
soil_2 = (
    pd.read_table('data/soil/soil_id_116.csv', sep=',')
    .assign(SampleID=lambda frame: frame.iloc[:, 0])
    .set_index(['SampleID'])
    .drop(['Unnamed: 0'], axis=1)
)
soil_2.head()

In [None]:
np.all(np.array(soil_2) == np.array(soil))

# Import ph

In [None]:
metadata = pd.read_table('data/soil/88soils_modified_metadata.txt', index_col=0)

In [None]:
# pH measurements, re-ordered to follow the rows (sample IDs) of the OTU table.
ph = metadata["ph"].reindex(soil_2.index)

# Plain 1-D array of the aligned pH values. Several analysis cells below read
# `ph_1`, so it is defined here, next to `ph`, to keep the notebook runnable
# from top to bottom on a fresh kernel.
ph_1 = ph.to_numpy()

ph.head()

Compare the dataset from SPIEC-EASI with the data from our example

In [None]:
test_1 = pd.read_table('data/soil/soil_116.csv', sep=',')
test_2 = pd.read_table('data/soil/spieceasi_soil_116.csv', sep=',')

In [None]:
a = test_1.columns
b = test_2.columns[1:]
a == b

# Log-transform

In [None]:
X_soil = np.array(soil_2)
X_soil.shape

In [None]:
X = normalize(X_soil.T)
X.shape

Note: X is scaled by the geometric mean and should be a DataFrame of shape (p, N)

In [None]:
# Build the log-transformed table directly from X_soil rather than from the
# current value of X: with `X = log_transform(pd.DataFrame(X))`, re-running
# this cell would feed already transformed data back into log_transform.
# NOTE(review): assumes normalize() returns the same result when called again
# on X_soil.T — confirm it does not modify its input.
X = log_transform(pd.DataFrame(normalize(X_soil.T)))
X.shape

# Calculate covariance and scale to correlations

In [None]:
S0 = np.cov(X.values, bias = True)

In [None]:
# Turn the covariance matrix into a correlation matrix:
# entry (i, j) of `scale` is sd_i * sd_j, the product of the two standard deviations.
std_dev = np.sqrt(np.diag(S0))
scale = np.outer(std_dev, std_dev)

S = S0 / scale
S.shape

# GGLasso problem

Hyperparameters are taken from the experiments with [SpiecEasi](https://github.com/zdk123/SpiecEasi)

Gamma value for eBIC. Should be between 0 and 1. The larger gamma, the more eBIC tends to pick sparse solutions. The default is 0.1.

In [None]:
# lambda1_range = [1.14221314, 1.03975454, 0.94648667, 0.86158509, 0.78429934, 0.71394626, 0.64990398,
#                  0.59160641, 0.53853823, 0.49023037, 0.44625582, 0.40622586, 0.36978666, 0.33661612, 
#                  0.30642104, 0.27893451, 0.25391358, 0.23113707, 0.21040365, 0.19153006, 0.17434947,
#                  0.15871000, 0.14447343, 0.13151390, 0.11971686, 0.10897804, 0.09920251, 0.09030386,
#                  0.08220344, 0.07482964, 0.06811729, 0.06200704, 0.05644490, 0.05138169, 0.04677266,
#                  0.04257707, 0.03875783, 0.03528118, 0.03211639, 0.02923549, 0.02661302, 0.02422578,
#                  0.02205268, 0.02007452, 0.01827380, 0.01663460, 0.01514245, 0.01378414, 0.01254768, 0.01142213]

# mu1_range = np.linspace(0.01, 1.14, num=10)

# modelselect_params = {'lambda1_range': lambda1_range, 'mu1_range': mu1_range}

# P.model_selection(modelselect_params = modelselect_params, method = 'eBIC', gamma = 1)

# # regularization parameters are set to the best ones found during model selection
# print(P.reg_params)

Optimal lambda=0.86158509, mu = 1.14

STARS selects optimal lambda = 0.14447343 (index 23)

In [None]:
# Single-point search grid: lambda1 is the value the text above reports as
# selected by STARS; mu1 is a single fixed value.
lambda1_range = [0.14447343]
mu1_range = [2.36]
modelselect_params = dict(lambda1_range=lambda1_range, mu1_range=mu1_range)

# Graphical lasso problem with latent variables on the correlation matrix S;
# the second argument is the number of columns of X.
P_1 = glasso_problem(S, X.shape[1], latent=True, do_scaling=False)
print(P_1)

P_1.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.25)

# regularization parameters are set to the best ones found during model selection
print(P_1.reg_params)

In [None]:
P_1.modelselect_stats

Optimal lambda=0.2154434690031884, mu = 6.579332246575681

In [None]:
# Numerical rank of the low-rank part of the fitted solution.
lowrank_rank = matrix_rank(P_1.solution.lowrank_)
print('Rank of low-rank component: {0}'.format(lowrank_rank))

In [None]:
# np.savetxt("/Users/oleg.vlasovetc/Downloads/gglasso_lr.csv", P_1.solution.lowrank_, delimiter=",")

In [None]:
%ls

### SPIEC-EASI

In [None]:
import pyreadr

In [None]:
# Read the R object stored in r6.rds.
# NOTE(review): this is an absolute, user-specific path; it will not resolve on
# another machine unless the same file exists at this location.
result = pyreadr.read_r('/Users/oleg.vlasovetc/Downloads/r6.rds') # also works for Rds

In [None]:
SE_lowrank6 = result[None]
SE_lowrank = np.array(SE_lowrank6)
SE_lowrank.shape

In [None]:
gg_lowrank = P_1.solution.lowrank_
gg_lowrank.shape

In [None]:
# pd.DataFrame(data=(SE_lowrank-gg_lowrank))
np.allclose(SE_lowrank, gg_lowrank, atol=1e-01)

### Robust PCA

In [None]:
# def robust_PCA(X, L, inverse=True):
# #     assert L is symmetric and p-dimensional
#     #SVD
    
#     v_tilde, sigma, v_tilde_T  = np.linalg.svd(L, full_matrices=True)
    
#     rank = np.linalg.matrix_rank(L)
#     sigma_basis = sigma[sigma > 1e-9]
# #     rank == len(sigma_basis)
#     sigma_diag = np.diag(sigma_basis)
#     if inverse:
#         sigma_diag = np.linalg.inv(sigma_diag)
    
#     loading = []
#     for i in range(0,len(sigma_basis)):
#         loading.append(v_tilde[:, i] * np.sqrt(sigma_diag[i,i]))

#     zu = X.T @ np.array(loading).T
#     return zu

In [None]:
def robust_PCA(X, L, inverse=True, tol=1e-9):
    """Project data onto the scaled singular vectors of a low-rank matrix.

    Parameters
    ----------
    X : array-like or DataFrame with p rows
        Data matrix; its transpose is multiplied by the loadings, so its
        number of rows must equal the dimension of ``L``.
    L : array-like of shape (p, p)
        Low-rank matrix whose singular value decomposition defines the
        components.
    inverse : bool, default True
        If True, the i-th singular vector is scaled by ``sqrt(1 / sigma_i)``;
        otherwise by ``sqrt(sigma_i)``.
    tol : float, default 1e-9
        Singular values not larger than ``tol`` are treated as zero and
        their vectors are dropped.

    Returns
    -------
    zu
        ``X.T @ loadings``, with one column per retained singular value.
        When no singular value exceeds ``tol`` the result has zero columns.
    """
    # SVD of the low-rank component; only the left singular vectors are used.
    left_vectors, sigma, _ = np.linalg.svd(L, full_matrices=True)

    # Singular values are returned in descending order, so the ones above
    # the tolerance are exactly the leading `rank` entries.
    rank = int(np.count_nonzero(sigma > tol))
    sigma_basis = sigma[:rank]

    # Element-wise reciprocal: no need to build and invert a diagonal matrix.
    weights = 1.0 / sigma_basis if inverse else sigma_basis

    # Scale column i of the retained vectors by sqrt(weights[i]) via broadcasting.
    loadings = left_vectors[:, :rank] * np.sqrt(weights)

    return X.T @ loadings

In [None]:
robust_PCA(X, SE_lowrank, inverse=True)

In [None]:
zu_SE = robust_PCA(X, SE_lowrank, inverse=True)

In [None]:
zu_SE.shape, new_ph.shape

In [None]:
new_depth.values

In [None]:
# Interactive scatter of the first column of zu_SE against pH, coloured by depth.
# NOTE(review): `new_ph` and `new_depth` are not assigned in any cell visible
# in this notebook — they must be defined before this cell can run.
import plotly.express as px
fig = px.scatter(zu_SE, x=zu_SE.iloc[:,0], y=new_ph, color=new_depth)
fig.show()

In [None]:
# One figure per learned component (column i of zu_SE) plotted against pH.
# Each scatter gets a label so that plt.legend() has an entry to show;
# without a labelled artist the legend call only emits a warning.
# NOTE(review): `new_ph` is not assigned in any cell visible in this notebook.
for i in range(0, 6):
    plt.scatter(zu_SE[i], new_ph, label="component {0}".format(i))
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

$L = V \times \Sigma^{-1} \times V^T$

$L^{-1}$

In [None]:
P_1.solution.lowrank_

$L^{-1} = (V \times \Sigma^{-1} \times V^T)^{-1}$

In [None]:
# v_tilde, sigma, v_tilde_T  = np.linalg.svd(P_1.solution.lowrank_, full_matrices=True)
v_tilde, sigma, v_tilde_T  = np.linalg.svd(SE_lowrank, full_matrices=True)

In [None]:
v_tilde = v_tilde[:, :6]
v_tilde.shape

In [None]:
v_tilde_T_6 = v_tilde.T
v_tilde_T_6.shape

In [None]:
sigma_diag = np.diag(sigma)

In [None]:
sigma_diag_rank = sigma_diag[:6,:6]

In [None]:
pd.DataFrame(data = sigma_diag[:6,:6])

In [None]:
sigma_diag_rank_inv = np.linalg.inv(sigma_diag_rank)

In [None]:
pd.DataFrame(data = sigma_diag_rank_inv)

In [None]:
# Scale each of the six retained singular vectors by the matching diagonal
# entry of the inverted singular-value matrix.
emp = [v_tilde[:, k] * sigma_diag_rank_inv[k, k] for k in range(6)]

In [None]:
pd.DataFrame(data= np.array(emp).T)

In [None]:
zu_1 = X.T @ np.array(emp).T

In [None]:
zu_1

In [None]:
# One figure per component (column i of zu_1) plotted against pH.
# Each scatter gets a label so that plt.legend() has an entry to show;
# without a labelled artist the legend call only emits a warning.
for i in range(0,6):
    plt.scatter(zu_1[i], ph_1, label="component {0}".format(i))
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# Spearman rank correlation between pH and each of the six columns of zu_1.
for i in range(0,6):
    # Run the test once and unpack both values instead of calling it twice.
    rho, p_value = stats.spearmanr(ph_1, zu_1[i])
    print("Spearman correlation between pH and component {0}: {1}, p-value: {2}".format(i, rho, p_value))

In [None]:
# Same scaling as for `emp`, but with the square root of each inverted
# singular value.
emp_2 = [v_tilde[:, k] * np.sqrt(sigma_diag_rank_inv[k, k]) for k in range(6)]

In [None]:
zu_2 = X.T @ np.array(emp_2).T

In [None]:
zu_2

In [None]:
# One figure per component (column i of zu_2) plotted against pH.
# Each scatter gets a label so that plt.legend() has an entry to show;
# without a labelled artist the legend call only emits a warning.
for i in range(0,6):
    plt.scatter(zu_2[i], ph_1, label="component {0}".format(i))
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# Spearman rank correlation between pH and each of the six columns of zu_2.
for i in range(0,6):
    # Run the test once and unpack both values instead of calling it twice.
    rho, p_value = stats.spearmanr(ph_1, zu_2[i])
    print("Spearman correlation between pH and component {0}: {1}, p-value: {2}".format(i, rho, p_value))

$C := cov(S)$

$C := L^{-1};$

In [None]:
v, s, vh  = np.linalg.svd(P_1.solution.lowrank_, full_matrices=True)
v.shape, s.shape, vh.shape

In [None]:
np.diag(s) # Sigma^{-1} in the notebook's notation L = V Sigma^{-1} V^T

In [None]:
pd.DataFrame(data = vh[:1, :])

In [None]:
pd.DataFrame(data = v[:, :6])

In [None]:
v_low_rank = v[:, :6] # V tilde: the first six columns of v

In [None]:
# NOTE(review): `sd_vec` is only assigned in a later cell, and the product
# computed below is neither stored nor displayed, so its result is discarded.
# Confirm whether this cell is still needed.
for i in range(0,6):
    np.sqrt(sd_vec) * v[:, :i]

In [None]:
low_dec = X.T @ v_low_rank
low_dec
# np.array(low_dec).shape

In [None]:
pd.DataFrame(data = np.diag(s))
sd_vec = np.sqrt(s[:6])
sd_vec

In [None]:
sd_vec[0]*np.array(low_dec)[:,1]

In [None]:
# One figure per column of low_dec, scaled by sd_vec[i], plotted against pH.
# Each scatter gets a label so that plt.legend() has an entry to show;
# without a labelled artist the legend call only emits a warning.
for i in range(0,6):
    plt.scatter(sd_vec[i]*np.array(low_dec)[:,i], ph_1, label="component {0}".format(i))
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

s - vector of singular values in descending order

In [None]:
s_diag = np.diag(s)
# pd.DataFrame(data=s_diag)

Since $C = V \times \Sigma \times V^{T}$ then $L = V \times \Sigma^{-1} \times V^{T}$

In [None]:
s_inv = np.linalg.inv(s_diag)

In [None]:
s_inv.shape, vh.shape

$C := L^{-1}$ => $U^{-1} =\sqrt{(\Sigma^{-1})} \times V^{T}$ => $U =\sqrt{(\Sigma^{-1})^{-1}} \times V^{T}$

In [None]:
pd.DataFrame(data = np.sqrt(s_inv[:7, :7]))

In [None]:
# NOTE(review): vh[:, :6] keeps the first six *columns* of vh (V^T), not its
# first six rows; if the six leading singular vectors are intended, that would
# be vh[:6, :] (or v[:, :6]) — confirm which one is meant.
u = np.sqrt(s_inv[:, :]) @ vh[:,:6]
u.shape

In [None]:
v_low = vh[:,:6]

In [None]:
X.T.shape

$ZU$ is an $n \times r$ matrix (89x6)

In [None]:
zu = X.T @ u
zu.shape

In [None]:
# One figure per component (column i of zu) plotted against pH.
# Each scatter gets a label so that plt.legend() has an entry to show;
# without a labelled artist the legend call only emits a warning.
for i in range(0, 6):
    plt.scatter(zu[i], ph_1, label="component {0}".format(i))
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

In [None]:
P_1.solution.lowrank_.shape

In [None]:
X.T.shape, v.shape, s_inv[:, :6].shape

In [None]:
# NOTE(review): `u_1` is first assigned in a later cell; on a fresh kernel
# this line is reached before that assignment.
u_1.shape

In [None]:
pd.DataFrame(data = s_inv[:6, :6])

In [None]:
pd.DataFrame(data = v)

In [None]:
v.shape

In [None]:
u_1 = v @ np.sqrt(s_inv[:, :6])
zu_1 = X.T @ u_1
zu_1.shape

In [None]:
np.corrcoef(ph_1, zu_1[2])

In [None]:
# One figure per component (column i of zu_1) plotted against pH.
# Each scatter gets a label so that plt.legend() has an entry to show;
# without a labelled artist the legend call only emits a warning.
for i in range(0, 6):
    plt.scatter(zu_1[i], ph_1, label="component {0}".format(i))
    plt.xlabel("Learned low-rank PCA component")
    plt.ylabel("pH")
    plt.legend(loc='upper left')
    plt.show()

## Low-dimensional representation of OTUs table

In [None]:
pca = PCA(n_components=2)
res = pca.fit_transform(P_1.solution.lowrank_)

In [None]:
# Share of variance captured by each of the two fitted principal components.
# The second line reports explained_variance_ratio_[1], i.e. PC2.
print("{0}% of variance explained by PC1".format(round(pca.explained_variance_ratio_[0]*100, 2)))
print("{0}% of variance explained by PC2".format(round(pca.explained_variance_ratio_[1]*100, 2)))
print("Total variance explained by PC1 and PC2: {0}%".format(round(pca.explained_variance_ratio_.sum() * 100, 2)))

In [None]:
# Inner product of every row of X.T with each of the two fitted principal axes.
pc1, pc2 = (np.inner(X.T, axis) for axis in pca.components_[:2])
pc1.shape, pc2.shape

In [None]:
# Scatter of the two principal-component scores, drawn on the current axes.
axes = plt.gca()
axes.scatter(pc1, pc2, label="OTUs")
axes.set_xlabel("PC1")
axes.set_ylabel("PC2")
axes.legend(loc='upper left')
plt.show()

## pH correlation

In [None]:
# Flatten the pH values into a 1-D array. reshape(-1) infers the length from
# the data instead of hard-coding the number of samples.
ph_1 = np.array(ph).reshape(-1)
ph_1.shape

In [None]:
# Spearman rank correlation between pH and each principal-component score.
# The test is evaluated once per component and both values are unpacked.
for pc_name, pc_scores in (("PC1", pc1), ("PC2", pc2)):
    rho, p_value = stats.spearmanr(ph_1, pc_scores)
    print("Spearman correlation between pH and {0}: {1}, p-value: {2}".format(pc_name, rho, p_value))


In [None]:
np.corrcoef(ph_1, pc1)

In [None]:
# pH against the PC1 scores, drawn on the current axes.
axes = plt.gca()
axes.scatter(ph_1, pc1, label="PC1_pH")
axes.set_xlabel("pH")
axes.set_ylabel("PC1")
axes.legend(loc='upper left')
plt.show()

In [None]:
# pH against the PC2 scores; the y axis is labelled to match the plotted data.
plt.scatter(ph_1, pc2, label="PC2_pH")
plt.xlabel("pH")
plt.ylabel("PC2")
plt.legend(loc='upper left')
plt.show()

In [None]:
# Keep the estimated precision matrix and derive the adjacency matrix of the
# fitted solution.
sol = P_1.solution.precision_
P_1.solution.calc_adjacency()


plt.figure()
G1 = nx.from_numpy_array(P_1.solution.adjacency_)

# The spring layout starts from random positions; a fixed seed makes the
# figure identical on every run.
layout = nx.spring_layout(G1, seed=0)
nx.draw(G1, layout, node_color = "darkblue", edge_color = "darkblue", font_color = 'white', with_labels = True)

In [None]:
np.save("optimal_sol", sol)