# TMmatrix
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crfield18/TMmatrix/blob/main/tmmatrix.ipynb)


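This notebook computes an all-vs-all TMalign score matrix for a set of uploaded PDB structures, saves the matrix as a CSV file, and builds a UPGMA tree from the resulting distances (1 − TMalign score).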

In [None]:
# @title Install Dependencies

# Install condacolab with pip; it is used below to set up conda in this runtime.
!pip install -q condacolab

import condacolab
# NOTE(review): condacolab.install() is documented to restart the Colab kernel —
# confirm that the `!conda install` line below still runs when it lives in the
# same cell, or move it to its own cell.
condacolab.install()

# Install PyMOL, psico (used later to add the tmalign command to PyMOL), the
# tmalign package, and the plotting / analysis packages imported by later cells.
!conda install conda-forge::pymol-open-source schrodinger::pymol-psico matplotlib pandas conda-forge::biopython bioconda::tmalign scipy

In [None]:
# @title Upload PDB files

import shutil
from pathlib import Path

from google.colab import files

# Ask the user for PDB files through Colab's upload widget. The uploads are
# expected to land in the current working directory (they are globbed from
# there below).
uploaded = files.upload()

# Make sure the input ('pdbs') and output ('results') folders exist.
cwd = Path.cwd()
for folder_name in ('pdbs', 'results'):
  (cwd / folder_name).mkdir(parents=True, exist_ok=True)

# Relocate every *.pdb file from the working directory into 'pdbs'.
# The glob is materialised first so that moving files cannot disturb the
# directory scan while it is still running.
for pdb_file in list(cwd.glob('*.pdb')):
  shutil.move(pdb_file, cwd / 'pdbs' / pdb_file.name)


In [None]:
# @title Calculate TMAlign Matrix

from itertools import combinations
from math import isnan
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymol import cmd
import psico.fullinit # Needed to add tmalign to PyMOL

from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
from scipy.spatial.distance import squareform

class TMalignMatrix():
  """Build an all-vs-all TMalign score matrix and a UPGMA tree from it.

  PDB files are read from ``<cwd>/pdbs`` and loaded into PyMOL; results
  (the score matrix CSV and the Newick tree) are written to
  ``<cwd>/results``.
  """

  def __init__(self) -> None:
    # Maps PDB file name (including the '.pdb' extension) to its path
    self.models = {}
    self.home_path = Path.cwd()
    self.pdbs_path = self.home_path.joinpath('pdbs')
    self.results_path = self.home_path.joinpath('results')
    # Square score matrix; stays empty until calculate_matrix() has run
    self.tmmatrix = pd.DataFrame()
    # File names already handed to cmd.load(), so that repeated
    # load_models() calls do not load the same file a second time
    self._loaded = set()

  def get_models(self) -> dict[str, Path]:
    """Register every ``*.pdb`` file in ``pdbs_path`` and return the mapping.

    Files are visited in sorted order so that the row/column order of the
    score matrix is the same on every run.
    """
    for model in sorted(self.pdbs_path.glob('*.pdb')):
      self.models[model.name] = model
    return self.models

  def load_models(self) -> None:
    """Load all registered PDB files into PyMOL, skipping ones already loaded.

    Discovers the models first if none have been registered yet.
    """
    if not self.models:
      self.get_models()
    for name, path in self.models.items():
      if name in self._loaded:
        continue
      # Pass a plain string: avoids relying on PyMOL accepting Path objects
      cmd.load(str(path))
      self._loaded.add(name)

  def get_sequence_length(self, object_name) -> int:
    """Return the number of CA atoms in the given PyMOL object."""
    return cmd.count_atoms(f'{object_name} and name CA')

  def calculate_matrix(self) -> pd.DataFrame:
    """Score every unique pair of models with TMalign and save the matrix.

    The symmetric matrix is stored in ``self.tmmatrix``, written to
    ``results/tmalign_score_matrix.csv`` and returned. Pairs that already
    have a score in both cells are not recalculated.
    """
    # Load all PDB files into pymol
    self.load_models()

    # Object names are the file names with the '.pdb' extension removed.
    # NOTE(review): this assumes PyMOL names each loaded object after the
    # file stem unchanged — confirm for names with spaces/special characters.
    model_list = [name[:-4] for name in self.models if name.endswith('.pdb')]

    # Initialise an empty pandas dataframe using the list of loaded pdb files
    # as column/row indices
    if self.tmmatrix.empty:
      self.tmmatrix = pd.DataFrame(index=model_list, columns=model_list)

    # Calculate TMalign scores for all unique combinations of models
    for first, second in combinations(model_list, 2):
      forward_missing = isnan(self.tmmatrix.loc[first, second])
      reverse_missing = isnan(self.tmmatrix.loc[second, first])
      if not (forward_missing or reverse_missing):
        continue

      # Set the smaller object as the reference (target) object
      # TMalign scores are normalised based on sequence length
      if self.get_sequence_length(first) >= self.get_sequence_length(second):
        mobile, target = first, second
      else:
        mobile, target = second, first

      tm = cmd.tmalign(mobile, target)
      self.tmmatrix.loc[first, second] = tm
      self.tmmatrix.loc[second, first] = tm

    # Fill the diagonal with TMalign scores of 1 (always the score for
    # identical proteins) ∴ we do not need to run any calculations
    for obj in model_list:
      self.tmmatrix.loc[obj, obj] = 1.0

    # Write the TMalign score matrix to a csv file (creating the results
    # folder first in case it has not been set up yet)
    self.results_path.mkdir(parents=True, exist_ok=True)
    self.tmmatrix.to_csv(self.results_path.joinpath('tmalign_score_matrix.csv'), index=True)

    return self.tmmatrix

  def make_tree_from_matrix(self) -> None:
    """Build a UPGMA tree from the score matrix, draw it and download it.

    Uses ``self.tmmatrix`` when populated, otherwise reads the CSV written
    by ``calculate_matrix``. The tree is written in Newick format to
    ``results/results.tree``.

    Raises:
      ValueError: if the score matrix still contains missing values.
    """
    if self.tmmatrix.empty:
      distances_df = pd.read_csv(self.results_path.joinpath('tmalign_score_matrix.csv'), index_col=0)
    else:
      distances_df = self.tmmatrix

    # Invert TMalign scores so that identical structures have distance 0
    distances_df = 1 - distances_df.astype(float)

    # Fail early with a clear message instead of handing a ragged or
    # NaN-containing matrix to the tree builder
    if distances_df.isna().values.any():
      raise ValueError('TMalign score matrix contains missing values; run calculate_matrix() first')

    # Keep only the lower triangle (including the diagonal): row i holds
    # its first i + 1 values
    lower_tri_lists = [row[:i + 1] for i, row in enumerate(distances_df.values.tolist())]

    # Labels read back from CSV may have been parsed as numbers; the
    # distance matrix is given string names
    names = [str(label) for label in distances_df.index]
    test_matrix = DistanceMatrix(names=names, matrix=lower_tri_lists)

    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(test_matrix)

    tree_path = self.results_path.joinpath('results.tree')
    Phylo.draw(tree)
    Phylo.write(tree, tree_path, 'newick')

    from google.colab import files

    # Pass a plain string: avoids relying on Colab accepting Path objects
    files.download(str(tree_path))


if __name__ == '__main__':
  # Run the full pipeline: score all model pairs, then build the tree.
  # calculate_matrix() loads the models into PyMOL itself, so a separate
  # load_models() call beforehand would only load every file a second time.
  test = TMalignMatrix()
  test.calculate_matrix()
  test.make_tree_from_matrix()
