In [None]:
import os
# Step up one directory; presumably so that the relative 'data/...' paths read
# further down resolve (confirm against where the notebook is stored).
# NOTE: the target is relative to the current directory, so re-running this
# cell moves up one more level each time -- restart the kernel before re-running.
os.chdir(os.path.pardir)

import numpy as np
import pandas as pd

import networkx as nx
import matplotlib.pyplot as plt

from gglasso.helper.data_generation import group_power_network, sample_covariance_matrix
from gglasso.helper.basic_linalg import adjacency_matrix
from gglasso.helper.utils import sparsity, zero_replacement, normalize, log_transform

from gglasso.problem import glasso_problem

from tempfile import TemporaryFile

from numpy.linalg import matrix_rank
from sklearn.decomposition import PCA
from scipy import stats
import plotly.express as px

# Import preprocessed soil data

In [None]:
# Read the comma-separated soil table and preview its first rows.
soil = pd.read_csv('data/soil/soil_116.csv', sep=',')
soil.head()

In [None]:
# Read the comma-separated pH table and preview its first rows.
ph = pd.read_csv('data/soil/ph.csv', sep=',')
ph.head()

In [None]:
# Independent NumPy copy of the soil table, followed by its dimensions.
X_soil = soil.to_numpy(copy=True)
X_soil.shape

Compare the dataset from SPIEC-EASI with the data from our example.

In [None]:
# Load both versions of the table (ours and the SPIEC-EASI export) for comparison.
test_1 = pd.read_csv('data/soil/soil_116.csv', sep=',')
test_2 = pd.read_csv('data/soil/spieceasi_soil_116.csv', sep=',')

In [None]:
# Column labels of both tables; the first column of the second file is skipped
# (NOTE(review): presumably an index/ID column -- confirm).
a = test_1.columns
b = test_2.columns[1:]
# Index.equals returns one bool and tolerates differing lengths, whereas the
# element-wise `a == b` raises on a length mismatch and prints one flag per column.
a.equals(b)

# Log-transform

In [None]:
# Apply the gglasso `normalize` helper to the raw array.
# NOTE(review): X_soil is passed exactly as loaded and is only transposed in the
# next code cell; confirm this is the orientation `normalize` expects.
X = normalize(X_soil)

Note: X is scaled with the geometric mean and should be a DataFrame of shape (p, N).

In [None]:
# Transpose the normalized array, wrap it in a DataFrame and apply `log_transform`.
# NOTE: `X` is overwritten here, so re-running this cell without first re-running
# the `normalize` cell would transpose and transform the already transformed data.
X = log_transform(pd.DataFrame(X.T))

# Calculate covariance and scale to correlations

In [None]:
# Covariance between the rows of X, normalized by the number of observations (bias=True).
S0 = np.cov(X.to_numpy(), bias=True)

In [None]:
# Turn the covariance matrix into a correlation matrix:
# entry (i, j) of `scale` is std_i * std_j, the product of the two standard deviations.
std_dev = np.sqrt(np.diag(S0))
scale = np.outer(std_dev, std_dev)

S = S0 / scale

# GGLasso problem

Hyperparameters are taken from the experiments with [SpiecEasi](https://github.com/zdk123/SpiecEasi)

In [None]:
# Disabled draft: the same problem is instantiated as `P_1` in the model-selection cell further down.
# P = glasso_problem(S, X.shape[1], latent = True, do_scaling = False)
# print(P)

`gamma` is the eBIC parameter and should lie between 0 and 1. The larger gamma is, the more eBIC tends to pick sparse solutions. The default is 0.1.

In [None]:
# Candidate lambda1 values: 50 entries in decreasing order
# (see the SpiecEasi note in the markdown above).
lambda1_range = [1.14221314, 1.03975454, 0.94648667, 0.86158509, 0.78429934, 0.71394626, 0.64990398,
                 0.59160641, 0.53853823, 0.49023037, 0.44625582, 0.40622586, 0.36978666, 0.33661612, 
                 0.30642104, 0.27893451, 0.25391358, 0.23113707, 0.21040365, 0.19153006, 0.17434947,
                 0.15871000, 0.14447343, 0.13151390, 0.11971686, 0.10897804, 0.09920251, 0.09030386,
                 0.08220344, 0.07482964, 0.06811729, 0.06200704, 0.05644490, 0.05138169, 0.04677266,
                 0.04257707, 0.03875783, 0.03528118, 0.03211639, 0.02923549, 0.02661302, 0.02422578,
                 0.02205268, 0.02007452, 0.01827380, 0.01663460, 0.01514245, 0.01378414, 0.01254768, 0.01142213]

# Ten evenly spaced mu1 candidates between 0.01 and 1.14.
mu1_range = np.linspace(0.01, 1.14, num=10)

# NOTE: the model selection that consumed these grids is commented out below, and
# both `lambda1_range` and `mu1_range` are reassigned in the later `P_1` cell.
# modelselect_params = {'lambda1_range': lambda1_range, 'mu1_range': mu1_range}

# P.model_selection(modelselect_params = modelselect_params, method = 'eBIC', gamma = 1)

# # regularization parameters are set to the best ones found during model selection
# print(P.reg_params)

In [None]:
# Show the ten evenly spaced values between 0.01 and 1.14.
np.linspace(start=0.01, stop=1.14, num=10)

Optimal lambda=0.86158509, mu = 1.14

In [None]:
# Search grids for the two regularization parameters: ten log-spaced values
# from 1e2 down to 1e-2 for each of them.
lambda1_range = np.logspace(2, -2, 10)
mu1_range = np.logspace(2, -2, 10)
modelselect_params = {'lambda1_range': lambda1_range, 'mu1_range': mu1_range}

# Set up the latent-variable problem on the correlation matrix S (no further scaling)
# and print its summary before running the search.
P_1 = glasso_problem(S, X.shape[1], latent=True, do_scaling=False)
print(P_1)

# Grid search scored with eBIC.
P_1.model_selection(modelselect_params=modelselect_params, method='eBIC', gamma=0.25)

# After model selection the regularization parameters hold the best values found.
print(P_1.reg_params)

In [None]:
# Display the `modelselect_stats` attribute populated by the model selection run above.
P_1.modelselect_stats

Optimal lambda=0.2154434690031884, mu = 6.579332246575681

In [None]:
# Numerical rank of the estimated low-rank component.
lowrank_rank = matrix_rank(P_1.solution.lowrank_)
print('Rank of low-rank component: {0}'.format(lowrank_rank))

### SVD

In [None]:
# Full SVD of the low-rank component, then the shapes of the three factors.
v, s, vh = np.linalg.svd(P_1.solution.lowrank_, full_matrices=True)
tuple(factor.shape for factor in (v, s, vh))

In [None]:
# Scale every right singular vector (a row of vh) by the square root of its own
# singular value. The following cell feeds `u` to np.inner, which treats each ROW
# of u as one component, so the scaling has to be applied per row.
# A plain `np.sqrt(s) * vh` broadcasts along the last axis and would rescale the
# columns of vh instead, i.e. weight the entries inside each singular vector.
u = np.sqrt(s)[:, np.newaxis] * vh
u.shape

In [None]:
# np.inner contracts the last axis of both arguments, so column i of
# `score_matrix` is the projection of the rows of X.T onto row i of `u`.
score_matrix = np.inner(X.T, u)
score_matrix.shape

In [None]:
# pH values as a flat vector with one entry per row of the `ph` table.
# Defined in this cell because it is the first one that needs `ph_1`; without it
# a fresh "Restart & Run All" stops here with a NameError.
ph_1 = np.array(ph).reshape(len(ph))

# Spearman rank correlation and p-value between pH and every column of the
# score matrix, collected as [correlation, p_value] pairs in column order.
corr_array = []
for i in range(score_matrix.shape[1]):
    spearman = stats.spearmanr(ph_1, score_matrix[:, i])
    corr_array.append([spearman[0], spearman[1]])

In [None]:
# Convert the [correlation, p_value] pairs to a 2-column array once, then
# expose the two columns as a labelled DataFrame.
corr_values = np.array(corr_array)
d = {'corr': corr_values[:, 0], 'p_value': corr_values[:, 1]}
df_corr = pd.DataFrame(data=d)
df_corr.head()

In [None]:
# Keep only the components whose correlation with pH has a p-value below 0.05.
df_corr.loc[df_corr['p_value'] < 0.05]

In [None]:
# Score on component index 2 against pH. Every point pairs one pH measurement
# with the matching row of the score matrix, so the series is labelled as
# samples rather than OTUs.
plt.scatter(score_matrix[:,2], ph_1, label="Samples")
plt.xlabel("Learned low-rank PCA component")
plt.ylabel("pH")
plt.legend(loc='upper left')
plt.show()

## Low-dimensional representation of the OTU table

In [None]:
# Fit a two-component PCA on the estimated low-rank matrix; `res` holds the
# matrix rows projected onto those two components.
pca = PCA(n_components=2)
res = pca.fit_transform(P_1.solution.lowrank_)

In [None]:
# Share of variance captured by each of the two principal components and by both
# together. The second line reports index 1 of the ratio vector, i.e. PC2.
print("{0}% of variance explained by PC1".format(round(pca.explained_variance_ratio_[0]*100, 2)))
print("{0}% of variance explained by PC2".format(round(pca.explained_variance_ratio_[1]*100, 2)))
print("Total variance explained by PC1 and PC2: {0}%".format(round(pca.explained_variance_ratio_.sum() * 100, 2)))

In [None]:
# Project the rows of X.T onto the first and second principal axes.
pc1, pc2 = (np.inner(X.T, pca.components_[k]) for k in (0, 1))
pc1.shape, pc2.shape

In [None]:
# PC1 against PC2. `pc1` and `pc2` hold one value per row of X.T (one per sample,
# as they are later correlated with the per-sample pH vector), so the series is
# labelled as samples rather than OTUs.
plt.scatter(pc1, pc2, label="Samples")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend(loc='upper left')
plt.show()

## pH correlation

In [None]:
# Flatten the pH table into a 1-D vector with one entry per row. Taking the
# length from `ph` itself avoids hard-coding the sample count (89 for this
# dataset); the reshape still fails if `ph` has more than one column.
ph_1 = np.array(ph).reshape(len(ph))
ph_1.shape

In [None]:
# Rank correlation of pH with each principal component; every test is run once
# and its statistic (index 0) and p-value (index 1) are reported.
pc1_corr = stats.spearmanr(ph_1, pc1)
print("Spearman correlation between pH and PC1: {0}, p-value: {1}".format(pc1_corr[0], pc1_corr[1]))

pc2_corr = stats.spearmanr(ph_1, pc2)
print("Spearman correlation between pH and PC2: {0}, p-value: {1}".format(pc2_corr[0], pc2_corr[1]))



In [None]:
# 2x2 Pearson correlation matrix of pH and PC1, shown alongside the Spearman values above.
np.corrcoef(ph_1, pc1)

In [None]:
# pH on the x-axis against the PC1 projection, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(ph_1, pc1, label="PC1_pH")
ax.set_xlabel("pH")
ax.set_ylabel("PC1")
ax.legend(loc='upper left')
plt.show()

In [None]:
# pH on the x-axis against the PC2 projection; the y-axis label names the
# component that is actually plotted.
plt.scatter(ph_1, pc2, label="PC2_pH")
plt.xlabel("pH")
plt.ylabel("PC2")
plt.legend(loc='upper left')
plt.show()

In [None]:
# Keep a handle on the estimated precision matrix, then let the solution object
# derive its adjacency matrix.
sol = P_1.solution.precision_
P_1.solution.calc_adjacency()

# Draw the graph defined by the adjacency matrix with a spring layout.
draw_style = {"node_color": "darkblue", "edge_color": "darkblue", "font_color": 'white', "with_labels": True}
plt.figure()
G1 = nx.from_numpy_array(P_1.solution.adjacency_)
nx.draw_spring(G1, **draw_style)

In [None]:
# Persist the precision matrix; np.save appends ".npy", so this writes
# "optimal_sol.npy" relative to the current working directory.
np.save("optimal_sol", sol)