[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crfield18/TMmatrix/blob/main/tmmatrix.ipynb)


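# TMmatrix
Build a pairwise TM-align score matrix for a set of uploaded PDB structures, then convert that matrix into a Newick-formatted UPGMA tree.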

##### Install Dependencies

In [None]:
# @title Set
from google.colab import files

# Create a `pdbs` directory in the current working directory.
# `!` is an IPython/Colab shell escape, so this line only works inside a notebook.
!mkdir pdbs

# Prompt for files via the Colab upload helper and keep its return value in `uploaded`.
# NOTE(review): nothing in this cell moves the uploaded files into `pdbs/` — confirm whether that is intended.
uploaded = files.upload()

##### Calculate TMAlign Matrix

In [None]:
from itertools import combinations
from math import isnan
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymol import cmd
import psico.fullinit # Needed to add tmalign to PyMOL

class TMalignMatrix():
    """Build a symmetric matrix of pairwise TM-align scores for PyMOL objects.

    The class reads two attributes:

    * ``results`` -- a mapping whose keys are model names. Each key is used
      both as the PyMOL object name and as the stem of ``<name>.pdb``. The
      mapped values are never read by this class.
    * ``tmmatrix`` -- a square ``pandas.DataFrame`` indexed by model name on
      both axes. ``NaN`` marks a pair that has not been scored yet.
    """

    def __init__(self) -> None:
        """Create an instance with no models and an empty score matrix."""
        # Define both attributes up front so every method works on a fresh
        # instance; callers populate ``results`` (and optionally a partially
        # filled ``tmmatrix``) by assignment before calculating.
        self.results = {}
        self.tmmatrix = pd.DataFrame()

    def get_sequence_length(self, object_name):
        """Return the number of CA atoms PyMOL counts in ``object_name``."""
        return cmd.count_atoms(f'{object_name} and name CA')

    def load_models(self):
        """Load ``<model>.pdb`` into PyMOL for every key in ``self.results``."""
        for model in self.results:
            cmd.load(f'{model}.pdb')

    def _ensure_matrix_covers(self, model_list):
        """Give every model in ``model_list`` a row and a column in the matrix."""
        matrix = self.tmmatrix
        if all(name in matrix.index and name in matrix.columns for name in model_list):
            return

        # Keep existing labels (and therefore existing scores) and append the
        # missing ones; dict.fromkeys de-duplicates while preserving order.
        labels = list(dict.fromkeys([*matrix.index, *matrix.columns, *model_list]))
        # reindex fills the new cells with NaN, which is the "not scored" marker.
        self.tmmatrix = matrix.reindex(index=labels, columns=labels).astype(float)

    def calculate_matrix(self, save_temp: bool):
        """Score every unique pair of models and return the filled matrix.

        Pairs whose two cells already hold non-NaN values are skipped, so a
        partially filled ``self.tmmatrix`` is completed rather than recomputed.

        Args:
            save_temp: Currently unused; accepted so existing callers keep
                working.

        Returns:
            ``self.tmmatrix`` with both triangles filled and a diagonal of 1.0.
        """
        model_list = list(self.results)
        self.load_models()
        self._ensure_matrix_covers(model_list)

        self.show_matrix()

        # Calculate TMalign scores for all unique combinations of models.
        # Writing straight into the dataframe is slower than collecting into a
        # list/dict, but it keeps the printed progress readable and the real
        # bottleneck is TMalign itself.
        for pair in combinations(model_list, 2):
            print(pair)
            first, second = pair

            unscored = (
                isnan(self.tmmatrix.loc[first, second])
                or isnan(self.tmmatrix.loc[second, first])
            )
            if not unscored:
                continue

            # Set the smaller object as the reference (target) object:
            # TMalign scores are normalised based on sequence length.
            # On a tie the first model of the pair is the mobile one.
            if self.get_sequence_length(first) >= self.get_sequence_length(second):
                mobile, target = first, second
            else:
                mobile, target = second, first

            tm = cmd.tmalign(mobile, target)

            # Store the score in both triangles so the matrix stays symmetric.
            self.tmmatrix.loc[first, second] = tm
            self.tmmatrix.loc[second, first] = tm

            self.show_matrix()

        # Fill the diagonal with TMscores of 1 (always the score for identical
        # proteins), so no calculation is needed.
        for obj in model_list:
            self.tmmatrix.loc[obj, obj] = 1.0

        self.show_matrix()
        return self.tmmatrix

    def show_matrix(self):
        """Print the current score matrix."""
        print(self.tmmatrix)

    def get_matrix(self):
        """Return the current score matrix."""
        return self.tmmatrix
    
    


##### Generate Newick-formatted phylogenetic tree

In [None]:

from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
from scipy.spatial.distance import squareform
import pandas as pd
import numpy as np
from math import isnan

# Load the pairwise TM-score matrix from CSV; the first column supplies the row labels.
# NOTE(review): `tmalign_matrtx_csv` is not assigned anywhere in the visible notebook —
# it must be defined (path or file-like object) before this cell runs.
distances_df = pd.read_csv(tmalign_matrtx_csv, index_col=0)

# Turn similarity into distance: a TM-score of 1 becomes a distance of 0.
distances_df = 1 - distances_df

# DistanceMatrix takes a lower-triangular list of lists that includes the diagonal,
# so row i keeps its first i + 1 values. Slicing by position (rather than masking
# to NaN and dropping NaN) keeps every row the right length even if data is missing.
lower_tri_lists = [
    row[:row_number + 1]
    for row_number, row in enumerate(distances_df.values.tolist())
]

# Fail early with a clear message instead of handing an incomplete matrix to Bio.Phylo.
if any(isnan(value) for row in lower_tri_lists for value in row):
    raise ValueError(
        'Distance matrix has missing values in its lower triangle; '
        'every pair of models must be scored before building a tree'
    )

test_matrix = DistanceMatrix(names=distances_df.index.values.tolist(), matrix=lower_tri_lists)

# Build the tree with UPGMA clustering over the distance matrix.
constructor = DistanceTreeConstructor()
tree = constructor.upgma(test_matrix)

# Phylo.draw(tree)
# Write the tree in Newick format to `tree.tree` in the current working directory.
Phylo.write(tree, 'tree.tree', 'newick')
