Skip to content

Commit

Permalink
Merge pull request #355 from linsalrob/master
Browse files Browse the repository at this point in the history
Adding a cophenetic matrix method and unitest.
  • Loading branch information
jhcepas committed Mar 28, 2020
2 parents b524047 + 337647f commit c639381
Show file tree
Hide file tree
Showing 2 changed files with 178 additions and 1 deletion.
101 changes: 101 additions & 0 deletions ete3/coretype/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -2341,7 +2341,108 @@ def _resolve(node):
for n in target:
_resolve(n)

def cophenetic_matrix(self):
"""
.. versionadded: 3.1.1
Generate a cophenetic distance matrix of the treee to standard output
The `cophenetic matrix <https://en.wikipedia.org/wiki/Cophenetic>` is a matrix representation of the
distance between each node.
if we have a tree like
----A
_____________|y
| |
| ----B
________|z
| ----C
| |
|____________|x -----D
| |
|______|w
|
|
-----E
Where w,x,y,z are internal nodes.
d(A,B) = d(y,A) + d(y,B)
and
d(A, E) = d(z,A) + d(z, E) = {d(z,y) + d(y,A)} + {d(z,x) + d(x,w) + d(w,E)}
We use an idea inspired by the ete3 team: https://gist.github.com/jhcepas/279f9009f46bf675e3a890c19191158b :
For each node find its path to the root.
e.g.
A -> A, y, z
E -> E, w, x,z
and make these orderless sets. Then we XOR the two sets to only find the elements
that are in one or other sets but not both. In this case A, E, y, x, w.
The distance between the two nodes is the sum of the distances from each of those nodes
to the parent
One more optimization: since the distances are symmetric, and distance to itself is zero
we user itertools.combinations rather than itertools.permutations. This cuts our computes from theta(n^2)
1/2n^2 - n (= O(n^2), which is still not great, but in reality speeds things up for large trees).
For this tree, we will return the two dimensional array:
A B C D E
A 0 d(A-y) + d(B-y) d(A-z) + d(C-z) d(A-z) + d(D-z) d(A-z) + d(E-z)
B d(B-y) + d(A-y) 0 d(B-z) + d(C-z) d(B-z) + d(D-z) d(B-z) + d(E-z)
C d(C-z) + d(A-z) d(C-z) + d(B-z) 0 d(C-x) + d(D-x) d(C-x) + d(E-x)
D d(D-z) + d(A-z) d(D-z) + d(B-z) d(D-x) + d(C-x) 0 d(D-w) + d(E-w)
E d(E-z) + d(A-z) d(E-z) + d(B-z) d(E-x) + d(C-x) d(E-w) + d(D-w) 0
We will also return the one dimensional array with the leaves in the order in which they appear in the matrix
(i.e. the column and/or row headers).
:param filename: the optional file to write to. If not provided, output will be to standard output
:return: two-dimensional array and a one dimensional array
"""

leaves = self.get_leaves()
paths = {x: set() for x in leaves}

# get the paths going up the tree
# we get all the nodes up to the last one and store them in a set

for n in leaves:
if n.is_root():
continue
movingnode = n
while not movingnode.is_root():
paths[n].add(movingnode)
movingnode = movingnode.up

# now we want to get all pairs of nodes using itertools combinations. We need AB AC etc but don't need BA CA

leaf_distances = {x.name: {} for x in leaves}

for (leaf1, leaf2) in itertools.combinations(leaves, 2):
# figure out the unique nodes in the path
uniquenodes = paths[leaf1] ^ paths[leaf2]
distance = sum(x.dist for x in uniquenodes)
leaf_distances[leaf1.name][leaf2.name] = leaf_distances[leaf2.name][leaf1.name] = distance

allleaves = sorted(leaf_distances.keys()) # the leaves in order that we will return

output = [] # the two dimensional array that we will return

for i, n in enumerate(allleaves):
output.append([])
for m in allleaves:
if m == n:
output[i].append(0) # distance to ourself = 0
else:
output[i].append(leaf_distances[n][m])
return output, allleaves

def add_face(self, face, column, position="branch-right"):
"""
.. versionadded: 2.1
Expand Down
78 changes: 77 additions & 1 deletion ete3/test/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,7 +1296,83 @@ def test_copy(self):
self.assertEqual((t_pkl & "A").complex[0], [0,1])
self.assertEqual((t_deep & "A").testfn(), "YES")



def test_cophenetic_matrix(self):
t = Tree(nw_full)
dists, leaves = t.cophenetic_matrix()
actualdists = [
[0, 2.3662779999999994, 2.350554999999999, 2.7002369999999996, 3.527812, 3.305472, 2.424086, 2.424086,
2.432288, 2.483421, 2.3355079999999995, 2.3355079999999995, 2.389350999999999, 2.3812519999999995,
2.404005999999999, 2.3945459999999996, 2.4035289999999994, 2.3689599999999995, 2.4048339999999997,
2.6487609999999995],
[2.3662779999999994, 0, 0.079009, 1.122461, 2.38897, 2.16663, 0.47755000000000003, 0.47755000000000003,
0.4857520000000001, 0.5368850000000001, 0.320202, 0.32020200000000004, 0.133729, 0.12563,
0.14838400000000002, 0.230406, 0.168047, 0.113338, 0.16935199999999997, 0.633455],
[2.350554999999999, 0.079009, 0, 1.106738, 2.373247, 2.150907, 0.461827, 0.461827, 0.47002900000000003,
0.521162, 0.304479, 0.30447900000000006, 0.11800599999999999, 0.10990699999999998, 0.132661, 0.214683,
0.152324, 0.09761499999999998, 0.153629, 0.617732],
[2.7002369999999996, 1.122461, 1.106738, 0, 2.7229289999999997, 2.5005889999999997, 1.180269, 1.180269,
1.188471, 1.239604, 1.091691, 1.091691, 1.145534, 1.137435, 1.160189, 1.1507290000000001, 1.159712,
1.125143, 1.161017, 1.404944],
[3.527812, 2.38897, 2.373247, 2.7229289999999997, 0, 2.6926, 2.446778, 2.446778, 2.45498, 2.506113,
2.3581999999999996, 2.3581999999999996, 2.412043, 2.403944, 2.426698, 2.4172379999999998,
2.4262209999999995, 2.391652, 2.427526, 2.6714529999999996],
[3.305472, 2.16663, 2.150907, 2.5005889999999997, 2.6926, 0, 2.224438, 2.224438, 2.23264, 2.283773, 2.13586,
2.13586, 2.189703, 2.181604, 2.204358, 2.194898, 2.2038809999999995, 2.169312, 2.205186, 2.449113],
[2.424086, 0.47755000000000003, 0.461827, 1.180269, 2.446778, 2.224438, 0, 0.0, 0.01366,
0.30963300000000005, 0.44678, 0.44677999999999995, 0.5006229999999999, 0.49252399999999996, 0.515278,
0.505818, 0.5148010000000001, 0.480232, 0.5161060000000001, 0.7600329999999998],
[2.424086, 0.47755000000000003, 0.461827, 1.180269, 2.446778, 2.224438, 0.0, 0, 0.01366,
0.30963300000000005, 0.44678, 0.44677999999999995, 0.5006229999999999, 0.49252399999999996, 0.515278,
0.505818, 0.5148010000000001, 0.480232, 0.5161060000000001, 0.7600329999999998],
[2.432288, 0.4857520000000001, 0.47002900000000003, 1.188471, 2.45498, 2.23264, 0.01366, 0.01366, 0,
0.317835, 0.45498200000000005, 0.454982, 0.508825, 0.500726, 0.5234800000000001, 0.51402, 0.523003,
0.48843400000000003, 0.524308, 0.7682349999999999],
[2.483421, 0.5368850000000001, 0.521162, 1.239604, 2.506113, 2.283773, 0.30963300000000005,
0.30963300000000005, 0.317835, 0, 0.506115, 0.506115, 0.559958, 0.551859, 0.574613, 0.565153, 0.574136,
0.539567, 0.5754410000000001, 0.8193679999999999],
[2.3355079999999995, 0.320202, 0.304479, 1.091691, 2.3581999999999996, 2.13586, 0.44678, 0.44678,
0.45498200000000005, 0.506115, 0, 0.0, 0.343275, 0.33517600000000003, 0.35793, 0.34847,
0.35745299999999997, 0.322884, 0.35875799999999997, 0.531709],
[2.3355079999999995, 0.32020200000000004, 0.30447900000000006, 1.091691, 2.3581999999999996, 2.13586,
0.44677999999999995, 0.44677999999999995, 0.454982, 0.506115, 0.0, 0, 0.34327500000000005,
0.33517600000000003, 0.35793, 0.34847, 0.357453, 0.32288400000000006, 0.358758, 0.531709],
[2.389350999999999, 0.133729, 0.11800599999999999, 1.145534, 2.412043, 2.189703, 0.5006229999999999,
0.5006229999999999, 0.508825, 0.559958, 0.343275, 0.34327500000000005, 0, 0.013558999999999998, 0.021967,
0.25347900000000007, 0.19112, 0.031257, 0.192425, 0.656528],
[2.3812519999999995, 0.12563, 0.10990699999999998, 1.137435, 2.403944, 2.181604, 0.49252399999999996,
0.49252399999999996, 0.500726, 0.551859, 0.33517600000000003, 0.33517600000000003, 0.013558999999999998, 0,
0.028214, 0.24538000000000004, 0.183021, 0.023157999999999998, 0.184326, 0.648429],
[2.404005999999999, 0.14838400000000002, 0.132661, 1.160189, 2.426698, 2.204358, 0.515278, 0.515278,
0.5234800000000001, 0.574613, 0.35793, 0.35793, 0.021967, 0.028214, 0, 0.26813400000000004,
0.20577499999999999, 0.045912, 0.20708, 0.6711830000000001],
[2.3945459999999996, 0.230406, 0.214683, 1.1507290000000001, 2.4172379999999998, 2.194898, 0.505818,
0.505818, 0.51402, 0.565153, 0.34847, 0.34847, 0.25347900000000007, 0.24538000000000004,
0.26813400000000004, 0, 0.267657, 0.233088, 0.268962, 0.661723],
[2.4035289999999994, 0.168047, 0.152324, 1.159712, 2.4262209999999995, 2.2038809999999995,
0.5148010000000001, 0.5148010000000001, 0.523003, 0.574136, 0.35745299999999997, 0.357453, 0.19112,
0.183021, 0.20577499999999999, 0.267657, 0, 0.170729, 0.057269, 0.670706],
[2.3689599999999995, 0.113338, 0.09761499999999998, 1.125143, 2.391652, 2.169312, 0.480232, 0.480232,
0.48843400000000003, 0.539567, 0.322884, 0.32288400000000006, 0.031257, 0.023157999999999998, 0.045912,
0.233088, 0.170729, 0, 0.17203399999999996, 0.636137],
[2.4048339999999997, 0.16935199999999997, 0.153629, 1.161017, 2.427526, 2.205186, 0.5161060000000001,
0.5161060000000001, 0.524308, 0.5754410000000001, 0.35875799999999997, 0.358758, 0.192425, 0.184326,
0.20708, 0.268962, 0.057269, 0.17203399999999996, 0, 0.672011],
[2.6487609999999995, 0.633455, 0.617732, 1.404944, 2.6714529999999996, 2.449113, 0.7600329999999998,
0.7600329999999998, 0.7682349999999999, 0.8193679999999999, 0.531709, 0.531709, 0.656528, 0.648429,
0.6711830000000001, 0.661723, 0.670706, 0.636137, 0.672011, 0]
]

actualleaves = ['Aga0007658', 'Bta0018700', 'Cfa0016700', 'Cin0011239', 'Ddi0002240', 'Dme0014628',
'Dre0008390', 'Dre0008391', 'Dre0008392', 'Fru0004507', 'Gga0000981', 'Gga0000982',
'Hsa0000001', 'Hsa0010711', 'Hsa0010730', 'Mdo0014718', 'Mms0024821', 'Ptr0000001',
'Rno0030248', 'Xtr0044988']

for i in range(len(actualdists)):
for j in range(len(actualdists[i])):
self.assertAlmostEqual(actualdists[i][j], dists[i][j], places=4)
self.assertEqual(actualleaves, leaves)


# def test_traversing_speed(self):
# return
Expand Down

0 comments on commit c639381

Please sign in to comment.