In [3]:
import numpy as np
#import jax.numpy as jnp

# Parse the tab-delimited text file into an array, then view it as
# 2240 rows by 200 columns.
data = np.reshape(
    np.loadtxt(
        "/var/home/luka/proj/Papilonidae_dataset_v2/Papilionidae_aligned_new.txt",
        delimiter="\t",
    ),
    (2240, 200),
)


In [4]:
from ete3 import Tree

# Build the tree from its file (format=1) and record the leaf names in the
# order the tree's own leaf traversal yields them.
ptree = Tree(
    "/var/home/luka/proj/Papilonidae_dataset_v2/papilionidae_tree.txt",
    format=1,
)
order = [leaf.name for leaf in ptree.get_leaves()]


In [6]:
import pandas as pd

# Per-sample category labels: first column of a header-less CSV.
categories = pd.read_csv("/var/home/luka/proj/Papilonidae_dataset_v2/Papilonidae_metadata_new.txt", header=None)[0]

# One row per sample. The row count is taken from `data` itself rather than
# repeating the literal 2240 used when `data` was loaded.
df = pd.DataFrame(data.reshape(data.shape[0], -1))
df['category'] = pd.Categorical(categories, categories=categories.unique())

# Guard the alignment between data labels and tree leaves. A label that is
# not in `order` turns into NaN inside the ordered Categorical below and is
# sorted to the end, and a leaf without data is simply skipped; in both cases
# the rows of X would no longer correspond position-by-position to the tree
# leaves that later cells index by row number.
labels_in_data = set(categories.unique())
labels_in_tree = set(order)
if labels_in_data != labels_in_tree:
    raise ValueError(
        "category labels and tree leaf names differ; "
        f"in data but not in tree: {sorted(map(str, labels_in_data - labels_in_tree))}; "
        f"in tree but not in data: {sorted(map(str, labels_in_tree - labels_in_data))}"
    )

# Mean of every numeric column within each category (only categories that
# actually occur are kept because observed=True).
means = df.groupby('category', observed=True).mean()

# Tag every sample with its category as an ordered categorical whose order
# is the tree's leaf order.
df['order'] = pd.Categorical(df['category'], categories=order, ordered=True)


# Sort the per-category means into tree leaf order.
means = means.reset_index()
means['order'] = pd.Categorical(means['category'], categories=order, ordered=True)
means = means.sort_values('order')

# Strip the bookkeeping columns; X has one row per category (in tree leaf
# order) and one column per feature.
X = means.drop(columns=['category', 'order']).values

N, d = X.shape

In [7]:
# Indicator matrix with N*d rows and d columns: row i holds a single 1.0 in
# column i // N, i.e. the rows form d consecutive blocks of N rows each.
i_indices = np.arange(N * d)
j_indices = np.arange(d)

# mask[j, i] is True exactly when row i falls inside block j.
mask = (i_indices // N) == j_indices[:, None]

D = np.zeros((N * d, d))
D[mask.T] = 1.0


In [8]:
# Flatten the tree in pre-order; positions in this list act as node ids.
preorder = list(ptree.traverse("preorder"))
M = len(preorder)   # total node count, internal nodes included

dists = np.zeros(M)              # accumulated branch length per pre-order position
inds = np.zeros(N, dtype=int)    # k-th counted node -> its pre-order position
inds_r = np.zeros(M, dtype=int)  # pre-order position -> its count index (0 where never set)

leaf_count = 0
for pos, node in enumerate(preorder):
    # A node plus all of its descendants occupies the slice that starts at the
    # node's own pre-order position, so adding the node's branch length to that
    # slice pushes it down to everything beneath it.
    subtree_size = len(node.get_descendants()) + 1
    dists[pos:pos + subtree_size] += node.dist
    # Nodes whose name does not begin with 'Q' are the ones that get counted
    # (presumably the leaves, with internal nodes named 'Q...' -- confirm).
    if node.name[0] != 'Q':
        inds[leaf_count] = pos
        inds_r[pos] = leaf_count
        leaf_count += 1


In [14]:
leaves = ptree.get_leaves()

# Build the node -> pre-order position table once. Looking the ancestor up
# with preorder.index() inside the pair loop rescans the whole node list for
# every pair of leaves. Keys are id(node) so the lookup is by object identity.
preorder_pos = {id(node): pos for pos, node in enumerate(preorder)}

# Cov[i, j] is the entry of `dists` at the most recent common ancestor of
# leaves i and j; the matrix is filled symmetrically from its upper triangle.
Cov = np.zeros((N, N))

for i in range(N):
    for j in range(i, N):
        ancestor = leaves[i].get_common_ancestor(leaves[j])
        lca_ij = preorder_pos[id(ancestor)]
        Cov[i, j] = Cov[j, i] = dists[lca_ij]

In [16]:
# Estimates weighted by the inverse of Cov:
#   mle_r = (1' C^-1 1)^-1 (1' C^-1 X)                  -> shape (d,)
#   mle_R = (N - 1)^-1 (X - mle_r)' C^-1 (X - mle_r)    -> shape (d, d)
v1 = np.ones(N)
evoCov_inv = np.linalg.inv(Cov)

ones_weighted = v1.T @ evoCov_inv            # 1' C^-1
normaliser = (ones_weighted @ v1) ** -1      # scalar (1' C^-1 1)^-1
mle_r = normaliser * (ones_weighted @ X)
assert mle_r.shape == (d,)

# mle_r is one-dimensional, so it broadcasts across the rows of X.
residuals = X - mle_r
scaled_residuals_T = ((N - 1) ** -1) * residuals.T
mle_R = scaled_residuals_T @ evoCov_inv @ residuals
assert mle_R.shape == (d, d)

In [17]:
# Eigendecomposition of the symmetric matrix Cov (eigh returns eigenvalues in
# ascending order), plus the column means of X and the mean-centred copy of X.
evals, evecs = np.linalg.eigh(Cov)
X_mean = X.mean(axis=0)
X_cent = X - X_mean[np.newaxis, :]

def ppca_recon(k=2):
    """Reconstruct the centred data from the last ``k`` eigenvector columns.

    Reads the notebook globals ``evecs`` (eigenvector columns from
    ``np.linalg.eigh(Cov)``), ``X_cent`` (mean-centred data, one row per
    sample) and ``X_mean`` (per-feature means).

    Parameters
    ----------
    k : int, default 2
        Number of trailing eigenvector columns to keep. ``k == 0`` keeps
        none and returns the mean alone; values above the number of
        available columns keep all of them.

    Returns
    -------
    numpy.ndarray
        Array with one row per feature and one column per sample, i.e.
        transposed relative to ``X_cent``.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    # NOTE(review): the projection is applied along the sample axis using
    # eigenvectors of Cov -- confirm this is the intended decomposition.
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    # Slice from an explicit start index: `evecs[:, -k:]` would select *every*
    # column when k == 0, because -0 == 0.
    first_kept = max(evecs.shape[1] - k, 0)
    V_k = evecs[:, first_kept:]
    X_reduced = X_cent.T @ V_k
    X_reconstructed = X_reduced @ V_k.T + X_mean[:, None]
    return X_reconstructed

Runtime module

In [19]:
# Run the reconstruction once for every component count from 1 through N;
# the returned arrays are discarded.
for n_components in range(1, N + 1):
    ppca_recon(k=n_components)