[Drippypale](https://github.com/drippypale/ROSALIND)  
Email: drippypale@gmail.com  

Problem: **[Implement UPGMA](https://rosalind.info/problems/ba7d/)**

In [140]:
import numpy as np

In [141]:
# Read the problem input: the first line holds the matrix size n, the
# remaining lines hold the n*n distance entries separated by whitespace.
with open('ba7d.in', 'r') as f:
    lines = f.read().splitlines()

n = int(lines[0])

# Entries are parsed as floats (not ints) so that non-integer distances are
# accepted as well; the matrix is stored as float64 either way.
dist_matrix = np.array(
    [float(value) for row in lines[1:] for value in row.split()],
    dtype=np.float64,
).reshape((n, n))

In [142]:
class Cluster:
    """A group of node indices together with its id and age.

    Attributes:
        ind: identifier of the cluster.
        nodes: list of node indices that belong to the cluster.
        age: age (height) associated with the cluster; defaults to 0.
    """

    def __init__(self, ind, nodes: "list | None" = None, age = 0) -> None:
        """Create a cluster.

        Args:
            ind: identifier of the cluster.
            nodes: node indices contained in the cluster. When omitted, the
                cluster starts with its own fresh empty list.
            age: age of the cluster.
        """
        self.ind = ind
        # A ``None`` sentinel replaces the former ``[]`` default: a list
        # default is created once and would be shared by every instance
        # constructed without ``nodes``.
        self.nodes = [] if nodes is None else nodes
        self.age = age

    def add_nodes(self, nodes: list):
        """Append every element of ``nodes`` to this cluster's node list."""
        self.nodes.extend(nodes)

In [143]:
def clusters_distance(dist_matrix, c1: Cluster, c2: Cluster):
    """Return the mean of ``dist_matrix[a][b]`` over all pairs of nodes.

    Args:
        dist_matrix: two-level indexable container of distances.
        c1: first cluster; its ``nodes`` supply the row indices.
        c2: second cluster; its ``nodes`` supply the column indices.

    Returns:
        The sum of the selected entries divided by
        ``len(c1.nodes) * len(c2.nodes)``.
    """
    total = sum(dist_matrix[a][b] for a in c1.nodes for b in c2.nodes)
    pair_count = len(c1.nodes) * len(c2.nodes)
    return total / float(pair_count)

In [144]:
from itertools import combinations
def find_closest_clusters(dist_matrix, clusters: list[Cluster], current_clusters):
    """Return the pair of ids from ``current_clusters`` whose clusters are closest.

    Args:
        dist_matrix: distance container forwarded to ``clusters_distance``.
        clusters: container of clusters, indexed by the ids found in
            ``current_clusters``.
        current_clusters: ids of the clusters that are candidates for merging.

    Returns:
        A 2-tuple of ids. On ties, the first pair in
        ``combinations(current_clusters, 2)`` order wins.
    """
    def pair_distance(pair):
        first, second = pair
        return clusters_distance(dist_matrix, clusters[first], clusters[second])

    return min(combinations(current_clusters, 2), key=pair_distance)

In [145]:
# Id that the next merged cluster will receive. It starts at n and is
# incremented by merge_clusters after every merge.
new_cluster_index = n
def merge_clusters(T, dist_matrix, clusters: dict[int, Cluster], current_clusters: list, i, j):
    """Merge clusters ``i`` and ``j`` into a new cluster and record it in the tree.

    The new cluster receives the id held in the module-level
    ``new_cluster_index`` (which is incremented afterwards), the concatenated
    node lists of both children, and an age equal to half the distance
    between the two children.

    Args:
        T: adjacency mapping ``node id -> {neighbour id: edge weight}``.
            Entries for both children must already exist; it is updated in
            place.
        dist_matrix: square NumPy distance matrix.
        clusters: mapping from cluster id to ``Cluster``; the new cluster is
            added in place.
        current_clusters: ids of the clusters still to be merged; ``i`` and
            ``j`` are removed and the new id is appended in place.
        i: id of the first cluster to merge.
        j: id of the second cluster to merge.

    Returns:
        The tuple ``(T, dist_matrix, clusters, current_clusters)`` where
        ``dist_matrix`` is a new array, one row and one column larger than
        the input, holding the distances from the new cluster.
    """
    global new_cluster_index

    left, right = clusters[i], clusters[j]
    merged_id = new_cluster_index

    # Merge clusters i and j into a new cluster.
    new_cluster = Cluster(
        merged_id,
        nodes=left.nodes + right.nodes,
        age=clusters_distance(dist_matrix, left, right) / 2,
    )

    current_clusters.remove(i)
    current_clusters.remove(j)
    current_clusters.append(merged_id)
    clusters[merged_id] = new_cluster

    # Connect the new internal node to both children (in both directions).
    # Each edge weight is the age difference between parent and child.
    parent_edges = T.setdefault(merged_id, {})
    for child in (left, right):
        weight = new_cluster.age - child.age
        parent_edges[child.ind] = weight
        T[child.ind][merged_id] = weight

    # Distances from the new cluster to every cluster created so far. The new
    # cluster is already part of `clusters`, so the last entry is its
    # distance to itself as computed by clusters_distance.
    new_distances = np.array(
        [clusters_distance(dist_matrix, new_cluster, other) for other in clusters.values()],
        dtype=np.float64,
    )

    # Grow the matrix by one row and one column and fill both with the new
    # distances (replaces the deprecated np.row_stack based construction).
    size = dist_matrix.shape[0]
    grown = np.zeros((size + 1, size + 1), dtype=np.float64)
    grown[:size, :size] = dist_matrix
    grown[:, size] = new_distances
    grown[size, :] = new_distances
    dist_matrix = grown

    new_cluster_index += 1

    return T, dist_matrix, clusters, current_clusters

In [146]:
# Ids of the clusters that are still waiting to be merged; initially one per
# row of the distance matrix.
current_clusters = list(range(dist_matrix.shape[0]))

# Every starting cluster contains exactly its own index; the key is the
# index in dist_matrix.
clusters = {leaf: Cluster(leaf, [leaf]) for leaf in current_clusters}

# Adjacency mapping of the tree: node id -> {neighbour id: edge weight}.
T = {leaf: {} for leaf in current_clusters}

In [147]:
# Keep merging the two closest remaining clusters until only one is left.
while len(current_clusters) > 1:
    closest_pair = find_closest_clusters(dist_matrix, clusters, current_clusters)
    T, dist_matrix, clusters, current_clusters = merge_clusters(
        T, dist_matrix, clusters, current_clusters, *closest_pair
    )

In [148]:
# Write one "source->target:weight" line per directed edge, ordered by source
# id and then by target id, with weights printed to three decimals.
with open('ba7d.out', 'w') as f:
    f.writelines(
        f'{src}->{dst}:{T[src][dst]:.3f}\n'
        for src in sorted(T)
        for dst in sorted(T[src])
    )