In [1]:
# To run this notebook create a new conda environment with:
# conda create -n coda_env python=3.6 scipy numpy scikit-bio

import numpy as np
import scipy.stats
from skbio.stats.composition import *
from scipy.optimize import minimize

In [2]:
# Economic data from (H. Wang et al, Multiple linear regression for
# Compositional Data, 2013), read from a comma-separated file.
data = np.genfromtxt('wang.csv', delimiter=",")

# Columns 0-2 are the response V, columns 3-5 the first covariate U1 and
# all remaining columns the second covariate U2.
V, U1, U2 = data[:, :3], data[:, 3:6], data[:, 6:]

In [3]:
# Hand-entered observations: 16 rows of 3-part compositions for the response
# (V_obs) and the two covariates (U1_obs, U2_obs).
# NOTE(review): presumably the raw counterparts of V, U1, U2 loaded from
# wang.csv -- confirm against the file.
# Some V_obs rows sum to 1.00001 because of rounding in the fifth decimal.
V_obs = np. array(
       [[0.024  , 0.568  , 0.408  ],
       [0.023  , 0.54   , 0.437  ],
       [0.021  , 0.516  , 0.46301],
       [0.019  , 0.493  , 0.488  ],
       [0.018  , 0.474  , 0.50801],
       [0.016  , 0.46299, 0.52101],
       [0.015  , 0.461  , 0.524  ],
       [0.014  , 0.457  , 0.52901],
       [0.012  , 0.479  , 0.509  ],
       [0.01   , 0.482  , 0.508  ],
       [0.01   , 0.474  , 0.516  ],
       [0.009  , 0.47   , 0.521  ],
       [0.008  , 0.446  , 0.546  ],
       [0.008  , 0.432  , 0.56   ],
       [0.007  , 0.39899, 0.59401],
       [0.007  , 0.42   , 0.573  ]])

# First covariate composition, one row per observation.
U1_obs = np.array(
       [[0.09853, 0.54467, 0.3568 ],
       [0.12044, 0.52255, 0.35701],
       [0.12708, 0.49105, 0.38187],
       [0.12443, 0.46027, 0.4153 ],
       [0.11414, 0.46458, 0.42128],
       [0.10772, 0.4431 , 0.44918],
       [0.11004, 0.39875, 0.49121],
       [0.10154, 0.39671, 0.50175],
       [0.08626, 0.40764, 0.5061 ],
       [0.06878, 0.45351, 0.47771],
       [0.06296, 0.42429, 0.51275],
       [0.05504, 0.41676, 0.5282 ],
       [0.05243, 0.41251, 0.53505],
       [0.04688, 0.40272, 0.5504 ],
       [0.05119, 0.37391, 0.5749 ],
       [0.0393 , 0.37573, 0.58496]])

# Second covariate composition, one row per observation.
U2_obs = np.array(
       [[0.0035 , 0.29244, 0.70406],
       [0.0038 , 0.30635, 0.68985],
       [0.00192, 0.30276, 0.69531],
       [0.00117, 0.31051, 0.68832],
       [0.00297, 0.30803, 0.68901],
       [0.00308, 0.28961, 0.70732],
       [0.00377, 0.30051, 0.69573],
       [0.00264, 0.30085, 0.69651],
       [0.00192, 0.29305, 0.70503],
       [0.00078, 0.30147, 0.69775],
       [0.00176, 0.25638, 0.74185],
       [0.0042 , 0.26713, 0.72867],
       [0.0022 , 0.25783, 0.73997],
       [0.00209, 0.24019, 0.75772],
       [0.00235, 0.22705, 0.7706 ],
       [0.0036 , 0.24734, 0.74906]])

In [4]:
def KL_coeff(V, U_vec, disp=True):
    """Estimate regression coefficients by minimising a KL-type loss with BFGS.

    With ``C_i = clr(U_i)`` and ``V_tilde = exp(clr(V))`` the objective is

        L(beta) = sum(exp(sum_i beta_i * C_i)) - sum_i beta_i * <V_tilde, C_i>

    where ``<A, B>`` denotes the sum of the element-wise product (equal to
    ``trace(A.T @ B)`` for matrices).

    Parameters
    ----------
    V : array_like
        Response compositions; transformed with ``clr``.
    U_vec : sequence of array_like
        The ``k`` covariate compositions; each is transformed with ``clr``
        and must yield the same shape as ``clr(V)``.
    disp : bool, optional
        Forwarded to the solver's ``disp`` option. Defaults to ``True``,
        which keeps the original behaviour of printing a convergence report.

    Returns
    -------
    numpy.ndarray
        The coefficient vector of length ``k`` found by the solver, started
        from the zero vector.
    """
    k = len(U_vec)
    V_tilde = np.exp(clr(V))
    C_vec = [clr(U) for U in U_vec]

    # The linear part of the loss does not depend on beta, so the inner
    # products <V_tilde, C_i> are computed once instead of on every call.
    # The element-wise form also avoids building a full matrix product
    # just to read its trace.
    h_vec = np.array([np.sum(V_tilde * C) for C in C_vec])

    def clr_estimator(beta):
        # Linear predictor in clr coordinates: sum_i beta_i * clr(U_i).
        return sum(beta[i] * C_vec[i] for i in range(k))

    def kl_loss(beta):
        # beta in R^k
        return np.sum(np.exp(clr_estimator(beta))) - np.dot(beta, h_vec)

    def kl_loss_grad(beta):
        # exp(...) is shared by all k partial derivatives; evaluate it once.
        expo = np.exp(clr_estimator(beta))
        return np.array([np.sum(C * expo) for C in C_vec]) - h_vec

    return minimize(kl_loss, np.array([0.0]*k), method='BFGS', jac=kl_loss_grad, tol=1e-16,
                    options={'gtol': 1e-012, 'disp': disp}).x

In [5]:
# Fit the KL-loss coefficients of V on the covariates U1 and U2 and display them.
KL_coeff(V, [U1,U2])

Optimization terminated successfully.
         Current function value: 47.066758
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


array([0.8779684 , 0.08993897])

In [6]:
def L2_coeff(V, U_vec):
    """Least-squares regression coefficients computed in ilr coordinates.

    Solves the normal equations ``A @ beta = b`` with
    ``A[j, i] = trace(ilr(U_i).T @ ilr(U_j))`` and
    ``b[i] = trace(ilr(U_i).T @ ilr(V))``.

    Parameters
    ----------
    V : array_like
        Response compositions; transformed with ``ilr``.
    U_vec : sequence of array_like
        Covariate compositions; each is transformed with ``ilr``.

    Returns
    -------
    numpy.ndarray
        Coefficient vector with one entry per element of ``U_vec``.
    """
    indices = range(len(U_vec))
    response = ilr(V)
    regressors = [ilr(U) for U in U_vec]

    def frobenius(P, Q):
        # Frobenius inner product written as trace(P^T Q).
        return np.trace(P.T @ Q)

    gram = np.array([[frobenius(regressors[i], regressors[j]) for i in indices]
                     for j in indices])
    rhs = np.array([frobenius(regressors[i], response) for i in indices])
    return np.linalg.solve(gram, rhs)

In [7]:
# Fit the least-squares (ilr) coefficients of V on U1 and U2 and display them.
L2_coeff(V, [U1,U2])

array([0.87427064, 0.0795275 ])

In [8]:
# Store the KL-loss coefficients for the Wang data.
# NOTE(review): beta_KL is not referenced by any later cell visible here.
beta_KL = KL_coeff(V, [U1,U2])

Optimization terminated successfully.
         Current function value: 47.066758
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


In [9]:
def fitted_outputs(beta, U_vec, V_mean):
    """Turn regression coefficients into fitted compositions.

    Computes ``clr_inv(sum_i beta[i] * clr(U_vec[i]) + clr(V_mean))``.

    Parameters
    ----------
    beta : sequence of float
        One coefficient per covariate in ``U_vec``.
    U_vec : sequence of array_like
        Covariate compositions.
    V_mean : array_like
        Offset composition added (in clr coordinates) to the linear predictor.

    Returns
    -------
    The result of ``clr_inv`` applied to the shifted linear predictor.
    """
    linear_predictor = sum(beta[idx] * clr(U) for idx, U in enumerate(U_vec))
    shifted = linear_predictor + clr(V_mean)
    return clr_inv(shifted)

In [10]:
# Hard-coded offset composition, repeated for each of the 16 observations.
# NOTE(review): looks like the geometric mean of V_obs rescaled to sum to 1
# (compare the gmean cell further down) -- confirm.
V_mean = np.tile([0.01275, 0.47401, 0.51324], (16, 1))

In [11]:
# Fitted responses for the Wang data using the KL-loss coefficients
# (this re-runs the optimisation).
V_KL_fit = fitted_outputs(KL_coeff(V, [U1,U2]), [U1,U2], V_mean)

Optimization terminated successfully.
         Current function value: 47.066758
         Iterations: 9
         Function evaluations: 10
         Gradient evaluations: 10


In [12]:
# Fitted responses for the Wang data using the least-squares coefficients.
V_L2_fit = fitted_outputs(L2_coeff(V, [U1,U2]), [U1,U2], V_mean)

In [13]:
# Hard-coded composition for U1, repeated for each of the 16 observations.
# NOTE(review): presumably the closed geometric mean of U1_obs -- confirm;
# not referenced by any later cell visible here.
U1_mean = np.tile([0.08077, 0.44037, 0.47887], (16, 1))

In [14]:
# Hard-coded composition for U2, repeated for each of the 16 observations.
# NOTE(review): presumably the closed geometric mean of U2_obs -- confirm;
# not referenced by any later cell visible here.
U2_mean = np.tile([0.00240, 0.28059, 0.71701], (16, 1))

In [15]:
# Column-wise geometric mean of the observed responses (not rescaled to sum to 1).
scipy.stats.gmean(V_obs, axis=0)

array([0.01268806, 0.47162129, 0.5106566 ])

In [16]:
def L1_error(V, V_):
    """Return the sum of absolute element-wise differences between V and V_."""
    residual = V - V_
    return np.abs(residual).sum()

def L2_error(V, V_):
    """Return the sum of squared element-wise differences between V and V_.

    Note that this is the squared error; no square root is taken.
    """
    residual = V - V_
    return np.sum(residual * residual)

def Fisher_error(V, V_):
    """Sum over rows of ``arccos(sum_j sqrt(V[i, j] * V_[i, j]))``.

    Parameters
    ----------
    V, V_ : array_like
        Non-negative arrays of the same shape, one composition per row.
        A single composition may be passed as a 1-D sequence.

    Returns
    -------
    numpy.float64
        Accumulated angular distance between matching rows.

    Notes
    -----
    Only matching rows are compared, so the row-wise inner product is
    computed directly instead of forming the full ``n x n`` matrix and
    taking its trace. Rows that do not sum to exactly 1 (e.g. values rounded
    to five decimals) can give an inner product marginally above 1, for
    which ``arccos`` would return NaN; the value is therefore clipped to
    ``[-1, 1]`` first.
    """
    root_obs = np.sqrt(np.atleast_2d(V))
    root_fit = np.sqrt(np.atleast_2d(V_))
    affinity = np.sum(root_obs * root_fit, axis=-1)
    return np.sum(np.arccos(np.clip(affinity, -1.0, 1.0)))

def symKL_error(V, V_):
    """Return ``sum((V - V_) * log(V / V_))`` over all elements."""
    gap = V - V_
    log_ratio = np.log(np.divide(V, V_))
    return np.sum(gap * log_ratio)

def report_evaluation(V_obs, V_fit):
    """Print the four error measures between observed and fitted values.

    One line is printed per measure, in the order Fisher-Rao, symmetric KL,
    L2 error, L1 error. Nothing is returned.
    """
    measures = (
        ("Fisher-Rao:   ", Fisher_error),
        ("Symmetric KL: ", symKL_error),
        ("L2 error:     ", L2_error),
        ("L1 error:     ", L1_error),
    )
    for label, measure in measures:
        print(label, measure(V_obs, V_fit))

In [17]:
# Error measures of the KL-loss fit against the observed Wang responses.
report_evaluation(V_obs, V_KL_fit)

Fisher-Rao:    0.3562571916079603
Symmetric KL:  0.03852572165818842
L2 error:      0.016355809262733927
L1 error:      0.656821446002978


In [18]:
# Error measures of the least-squares fit against the observed Wang responses.
report_evaluation(V_obs, V_L2_fit)

Fisher-Rao:    0.3506587679841833
Symmetric KL:  0.03754793592198505
L2 error:      0.015857469030712668
L1 error:      0.6430839822095691


# Aitchison Dataset 17 (D17)

In [19]:
# Dataset 17 Aitchison (with V[13] adjusted due to error in book)
# Response compositions: 15 rows of 3 parts, kept as a plain nested list.
# From here on V_obs and U1_obs replace the Wang arrays defined earlier.
V_obs = [
[0.27,0.28,0.45],
[0.02,0.03,0.95],
[0.12,0.16,0.72],
[0.83,0.02,0.15],
[0.24,0.22,0.54],
[0.16,0.20,0.64],
[0.31,0.08,0.61],
[0.05,0.85,0.10],
[0.06,0.06,0.88],
[0.08,0.31,0.61],
[0.18,0.20,0.62],
[0.17,0.19,0.64],
[0.04,0.17,0.79],
[0.08,0.25,0.67],
[0.11,0.34,0.55] ]

# Covariate compositions: 15 rows of 3 parts.
# NOTE(review): row index 2 ([0.18, 0.26, 0.54]) sums to 0.98 rather than 1;
# every other row sums to 1 -- verify this entry against the source table.
U1_obs = [
[0.70,0.07,0.23],
[0.19,0.16,0.65],
[0.18,0.26,0.54],
[0.02,0.02,0.96],
[0.08,0.16,0.76],
[0.14,0.18,0.68],
[0.16,0.11,0.73],
[0.04,0.06,0.90],
[0.06,0.54,0.40],
[0.12,0.22,0.66],
[0.06,0.02,0.92],
[0.16,0.04,0.80],
[0.27,0.17,0.56],
[0.21,0.51,0.28],
[0.15,0.15,0.70] ]

In [20]:
# Centre the D17 responses with skbio's `centralize`; V now replaces the Wang V.
V = centralize(V_obs)

In [21]:
# Column-wise geometric mean of the responses, repeated once per observation.
# The row count is taken from V_obs instead of a hard-coded 15 so the cell
# stays correct if rows are added or removed.
V_mean = np.tile(scipy.stats.gmean(V_obs, axis=0), (len(V_obs), 1))

In [22]:
# Centre the D17 covariate with skbio's `centralize`; U1 now replaces the Wang U1.
U1 = centralize(U1_obs)

In [23]:
# Column-wise geometric mean of the covariate, repeated once per observation.
# The row count is taken from U1_obs instead of a hard-coded 15.
U1_mean = np.tile(scipy.stats.gmean(U1_obs, axis=0), (len(U1_obs), 1))

In [24]:
# Fitted D17 responses from the KL-loss coefficient (single covariate U1).
V_KL_fit = fitted_outputs(KL_coeff(V, [U1]), [U1], V_mean)

Optimization terminated successfully.
         Current function value: 42.089996
         Iterations: 5
         Function evaluations: 7
         Gradient evaluations: 7


In [25]:
# Fitted D17 responses from the least-squares coefficient (single covariate U1).
V_L2_fit = fitted_outputs(L2_coeff(V, [U1]), [U1], V_mean)

In [26]:
# Error measures of the KL-loss fit against the observed D17 responses.
report_evaluation(V_obs, V_KL_fit)

Fisher-Rao:    3.872214437840649
Symmetric KL:  6.116991096034899
L2 error:      1.6360709884415292
L1 error:      6.201914114037339


In [27]:
# Error measures of the least-squares fit against the observed D17 responses.
report_evaluation(V_obs, V_L2_fit)

Fisher-Rao:    3.534805032523254
Symmetric KL:  5.939364536691748
L2 error:      1.734198069079138
L1 error:      5.8500769095778855


In [28]:
# Display the KL-loss coefficient for D17 (re-runs the optimisation).
KL_coeff(V, [U1])

Optimization terminated successfully.
         Current function value: 42.089996
         Iterations: 5
         Function evaluations: 7
         Gradient evaluations: 7


array([-0.55688544])

In [29]:
# Display the least-squares coefficient for D17.
L2_coeff(V, [U1])

array([-0.18759286])

# Artificial Dataset

The data are created by drawing Gaussian samples in $\mathbb{R}^{d-1}$ and mapping them onto the simplex with the inverse ilr transform, which yields compositional data with $d$ parts.

In [30]:
# Seed NumPy's global random generator so the artificial dataset is reproducible.
np.random.seed(100)

In [31]:
n = 20  # number of observations (rows of each generated matrix)
d = 10  # number of parts; samples are drawn with d - 1 columns
k = 4   # number of covariates

In [32]:
# k covariates: each is an n x (d-1) standard-normal sample mapped through ilr_inv.
U_obs = [ ilr_inv(np.random.randn(n,d-1)) for i in range(k)]

In [33]:
# True coefficients: alternating sign (negative first) with magnitude 0.1 * k,
# i.e. [-0.4, 0.4, -0.4, 0.4] for k = 4.
beta_true = [(-1)**j * 0.1 * k for j in range(1,k+1)]

In [34]:
# Responses: linear combination of the covariates in ilr coordinates plus
# Gaussian noise scaled by 0.2, mapped back with ilr_inv.
V_obs = ilr_inv(sum( beta_true[i] * ilr(U_obs[i]) for i in range(k)) + 0.2* np.random.randn(n,d-1) )

In [35]:
# Centre the artificial responses with skbio's `centralize`.
V = centralize(V_obs)

In [36]:
# Column-wise geometric mean of the responses, repeated for each of the n rows.
V_mean = np.tile(scipy.stats.gmean(V_obs, axis=0), (n, 1))

In [37]:
# Centre each of the k artificial covariates with skbio's `centralize`.
U = [centralize(U_obs[i]) for i in range(k)]

In [38]:
# Fitted responses on the artificial data for both estimators.
# NOTE(review): the recorded output of this cell lacks the
# "Optimization terminated successfully." line printed for the earlier fits --
# check the solver's convergence status for this problem.
V_KL_fit = fitted_outputs(KL_coeff(V, U), U, V_mean)
V_L2_fit = fitted_outputs(L2_coeff(V, U), U, V_mean)

         Current function value: 128.486627
         Iterations: 14
         Function evaluations: 21
         Gradient evaluations: 20


In [39]:
# Display the KL-loss coefficients for comparison with beta_true (re-runs the optimisation).
KL_coeff(V, U)

         Current function value: 128.486627
         Iterations: 14
         Function evaluations: 21
         Gradient evaluations: 20


array([-0.41220933,  0.3948095 , -0.41851324,  0.39962675])

In [40]:
# Display the least-squares coefficients for comparison with beta_true.
L2_coeff(V, U)

array([-0.41981132,  0.38955447, -0.42349373,  0.40064569])

In [41]:
# Error measures of the KL-loss fit against the artificial responses.
report_evaluation(V_obs, V_KL_fit)

Fisher-Rao:    1.6036137096198573
Symmetric KL:  0.5597964429138013
L2 error:      0.06956712126865855
L1 error:      2.558752265366193


In [42]:
# Error measures of the least-squares fit against the artificial responses.
report_evaluation(V_obs, V_L2_fit)

Fisher-Rao:    1.6019835268393974
Symmetric KL:  0.5628690010951225
L2 error:      0.07033931601600188
L1 error:      2.5607284360699083
