### TMmatrix&nbsp;&nbsp;&nbsp;&nbsp;[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crfield18/TMmatrix/blob/main/tmmatrix.ipynb)

Create a phylogenetic tree that compares the three-dimensional (tertiary) structure of proteins, rather than their nucleotide or amino acid sequences. Scoring uses the [_TM-align_](https://zhanggroup.org/TM-align/) algorithm by [_Zhang and Skolnick (2005)_](https://doi.org/10.1093%2Fnar%2Fgki524). Based on [_mTM-align_](http://yanglab.nankai.edu.cn/mTM-align/) by [_Dong et al. (2018)_](https://doi.org/10.1093/nar/gky430). Made possible with [Biopython](https://biopython.org).

A score of **<0.17** indicates similarity indistinguishable from a random pair of structures, whereas a score of **≥0.50** indicates a pair with broadly the same fold ([_Xu et al., 2010_](https://doi.org/10.1093/bioinformatics/btq066)).


##### Usage

* Upload .pdb or .cif format files directly to the Colab instance by clicking the folder icon on the left, then dragging and dropping your structures

* For the best results, upload PDB/mmCIF files that contain a single subunit and with only a single position for each atom (crystallography/cryo-EM structures from the PDB can sometimes have multiple positions for some residues where a single position could not be resolved).



In [None]:
# @title Download and Compile the latest version of TM-align, Biopython and pandas

%%script bash

# Install Biopython and matplotlib
pip install biopython matplotlib
# pip install pandas # pandas is already installed on Colab. Uncomment if this changes

# Download an up-to-date copy of TMalign/USalign
git clone https://github.com/pylelab/USalign.git

# Compile TMalign from the cloned sources into a static ./TMalign binary
g++ -static -O3 -ffast-math -lm -o TMalign USalign/TMalign.cpp

In [None]:
# @title Run pairwise TM-align calculations

%%script bash

# Make results and models directories
mkdir -p results models

# Delete any PDB files that came with the USalign (TMalign) repository so they are
# not picked up as models (-f: no error if they were already removed on a previous run)
rm -f USalign/*.pdb

# Move all PDB and mmCIF files below the working directory into models/.
# models/ itself is pruned from the search, so files that are already in place are
# left alone and re-running this cell does not trigger "same file" errors.
find . -path ./models -prune -o -type f \( -name "*.pdb" -o -name "*.cif" \) -exec mv {} models/ \;

# Make a list of all PDB and mmCIF files in the models directory and write to model_list.txt in the results directory
find models/ -maxdepth 1 -type f \( -name "*.pdb" -o -name "*.cif" \) -printf "%f\n" > results/model_list.txt

# How many TM-align calculations are needed?
# Count the entries in model_list.txt (not every file in models/) so that the
# number reported matches exactly what TM-align is given below.
model_count=$(( $(wc -l < results/model_list.txt) ))
calc_count=$(( model_count * (model_count - 1) / 2 ))

echo "There are ${model_count} files in the models directory. This means ${calc_count} total calculations."

# Run pairwise (all-against-all) TM-align calculations.
# Paths are relative to the working directory, like every other path in this cell.
# The trailing slash on the -dir argument is required by TM-align.
./TMalign -dir models/ results/model_list.txt -outfmt 2 > results/tmalign_out.tsv
# ./TMalign -h # Check that TMalign compiled correctly by loading the help message
# ./TMalign -dir models/ results/model_list.txt -outfmt 2 | tee results/tmalign_out.tsv # Print TM-align output to stdout (for debugging)

In [None]:
# @title Make phylogenetic tree from TM-align results

%matplotlib inline

from math import isnan
from pathlib import Path
import pandas as pd
import numpy as np


from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix

class TMmatrix():
  """Turn pairwise TM-align output into a score matrix, a UPGMA tree and a heatmap.

  Paths are resolved relative to the current working directory:

  * ``results/model_list.txt`` - one model file name per line (read on construction)
  * ``results/tmalign_out.tsv`` - tab-separated TM-align output (``-outfmt 2``)

  Attributes:
    home_path: Working directory at construction time.
    models_path: ``home_path / 'models'``.
    results_path: ``home_path / 'results'``; all inputs and outputs live here.
    model_list: Model file names, in the order they appear in model_list.txt.
    tmmatrix: Square DataFrame of TM-align scores, or None until
      tmalign_output_to_matrix() has been called.
  """

  def __init__(self) -> None:
    # Set home, models and results paths
    self.home_path = Path.cwd()
    self.models_path = self.home_path.joinpath('models')
    self.results_path = self.home_path.joinpath('results')

    # Score matrix; populated by tmalign_output_to_matrix()
    self.tmmatrix = None

    # Get model list from results/model_list.txt.
    # Only line terminators are removed so unusual file names survive intact, and
    # blank lines are skipped because they do not name a model.
    with open(self.results_path.joinpath('model_list.txt'), 'r') as model_list_file:
      names = (line.rstrip('\r\n') for line in model_list_file)
      self.model_list = [name for name in names if name]

  def tmalign_output_to_matrix(self):
    """Read results/tmalign_out.tsv and build the symmetric TM-align score matrix.

    For every aligned pair the higher of the two TM-scores (TM1, TM2) is stored
    in both [a, b] and [b, a]; the diagonal is set to 1.0. The matrix is stored
    in self.tmmatrix and written to results/tm-align_score_matrix.csv.

    Rows whose first column starts with '#' (comment lines in the TM-align
    output) are ignored. Rows naming a model that is not in self.model_list are
    skipped with a warning instead of silently adding rows/columns to the matrix.

    Returns:
      The score matrix as a DataFrame indexed and labelled by self.model_list.
    """
    import warnings

    tmalign_output = pd.read_csv(self.results_path.joinpath('tmalign_out.tsv'), sep='\t')

    # Square matrix with the model list as both index and columns; NaN marks "no score yet"
    self.tmmatrix = pd.DataFrame(index=self.model_list, columns=self.model_list, dtype=float)

    known_models = set(self.model_list)
    unknown_models = set()

    # Populate with the highest TM-align score for each pair
    pairs = zip(
      tmalign_output['#PDBchain1'],
      tmalign_output['PDBchain2'],
      tmalign_output['TM1'],
      tmalign_output['TM2'],
    )
    for model_1, model_2, tm_1, tm_2 in pairs:
      # str() guards against non-string cells (e.g. NaN) before the prefix check
      if str(model_1).startswith('#'):
        continue

      if model_1 not in known_models or model_2 not in known_models:
        unknown_models.update({model_1, model_2} - known_models)
        continue

      max_tm = max(tm_1, tm_2)
      self.tmmatrix.at[model_1, model_2] = max_tm
      self.tmmatrix.at[model_2, model_1] = max_tm

    if unknown_models:
      warnings.warn(
        'Ignored TM-align results for models missing from model_list.txt: '
        + ', '.join(sorted(str(name) for name in unknown_models))
      )

    # Fill the diagonal with scores of 1.0 (equivalent to aligning a model to itself)
    for obj in self.model_list:
      self.tmmatrix.at[obj, obj] = 1.0

    # Write self.tmmatrix to tm-align_score_matrix.csv
    self.tmmatrix.to_csv(self.results_path.joinpath('tm-align_score_matrix.csv'), index=True)

    return self.tmmatrix

  def _require_matrix(self):
    """Raise ValueError unless a non-empty score matrix has been built."""
    if self.tmmatrix is None or self.tmmatrix.empty:
      raise ValueError(
        'No TM-align score matrix available. Run tmalign_output_to_matrix() first '
        'and check that model_list.txt is not empty.'
      )

  def _lower_triangle_distances(self):
    """Return distances (1 - TM-score) as lower-triangle rows, diagonal included.

    Row i holds i + 1 values, one per model from index 0 up to and including i,
    in self.model_list order.

    Raises:
      ValueError: If any pair in the lower triangle has no numeric score.
    """
    names = list(self.model_list)

    # Align the matrix to model_list order and force numeric values
    scores = self.tmmatrix.reindex(index=names, columns=names).apply(pd.to_numeric, errors='coerce')

    # Invert TM-align scores
    # More similar pairs of models (i.e., higher scores) have shorter distances to each other
    distances = 1.0 - scores.to_numpy(dtype=float)

    # Slice the triangle by position rather than filtering out NaN, so a missing
    # score is reported clearly instead of producing rows of the wrong length
    row_idx, col_idx = np.tril_indices(len(names))
    missing = np.isnan(distances[row_idx, col_idx])
    if missing.any():
      missing_pairs = [
        f'{names[r]} / {names[c]}' for r, c in zip(row_idx[missing], col_idx[missing])
      ]
      raise ValueError('Missing TM-align scores for: ' + ', '.join(missing_pairs))

    return [distances[i, :i + 1].tolist() for i in range(len(names))]

  def make_tree_from_matrix(self):
    """Cluster the models with UPGMA and write the tree to results/tmmatrix.tree.

    Distances are 1 - TM-score. The tree is printed as ASCII art and saved in
    Newick format.

    Returns:
      The tree produced by DistanceTreeConstructor.upgma().

    Raises:
      ValueError: If the score matrix has not been built, is empty, or lacks a
        score for any pair of models.
    """
    self._require_matrix()

    # Build the lower-triangle distance rows first so that data problems are
    # reported before any tree construction is attempted
    lower_tri_lists = self._lower_triangle_distances()

    # Generate phylogenetic tree using the UPGMA clustering method
    tm_matrix = DistanceMatrix(names=list(self.model_list), matrix=lower_tri_lists)
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(tm_matrix)

    print('\n')

    # Draw tree in ASCII for quick validation
    Phylo.draw_ascii(tree)

    # Write the tree to results/tmmatrix.tree
    Phylo.write(tree, self.results_path.joinpath('tmmatrix.tree'), 'newick')

    return tree

  # Make a heatmap of TM-align values (unused by default)
  def tmmatrix_heatmap(self):
    """Plot the score matrix as a heatmap and save it to results/tmmatrix_heatmap.png.

    Raises:
      ValueError: If the score matrix has not been built or is empty.
    """
    import matplotlib.pyplot as plt

    self._require_matrix()

    # Convert DataFrame to numeric values
    tmmatrix_numeric = self.tmmatrix.apply(pd.to_numeric, errors='coerce')

    # Create the heatmap
    plt.figure(figsize=(25, 20))  # Adjust the size as needed
    plt.imshow(tmmatrix_numeric, cmap='cividis', interpolation='nearest')

    # Set tick labels for x and y axes
    plt.xticks(ticks=range(len(tmmatrix_numeric.columns)), labels=tmmatrix_numeric.columns, rotation=90)
    plt.yticks(ticks=range(len(tmmatrix_numeric.index)), labels=tmmatrix_numeric.index)

    # Add a colorbar
    plt.colorbar(label='TM-align score')

    # Save and show the heatmap
    plt.savefig(self.results_path.joinpath('tmmatrix_heatmap.png'), dpi=300, bbox_inches='tight')
    plt.show()

# Run the pipeline: the score matrix must be built before the tree is made from it
if __name__ == '__main__':
  colab_instance = TMmatrix()
  colab_instance.tmalign_output_to_matrix()
  colab_instance.make_tree_from_matrix()
  # colab_instance.tmmatrix_heatmap() # Uncomment to create a score heatmap

In [None]:
# @title Zip and download results

import datetime
from google.colab import files
import zipfile

# Bundle everything in results/ into a timestamped ZIP archive and download it
current_dt = datetime.datetime.now()

results_path = Path.cwd().joinpath('results')
zip_prefix = 'tmmatrix_results_'
zip_path = results_path.joinpath(f'{zip_prefix}{current_dt.strftime("%Y%m%d-%H%M")}.zip')

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
  for file in sorted(results_path.rglob('*')):
    # Add only files (not directories) to the ZIP archive
    if not file.is_file():
      continue

    # Skip the archive being written and archives left over from earlier runs;
    # otherwise every new archive would contain all of the previous ones
    is_results_archive = (
      file.parent == results_path
      and file.name.startswith(zip_prefix)
      and file.suffix == '.zip'
    )
    if is_results_archive:
      continue

    zipf.write(file, arcname=file.relative_to(results_path))

# Pass a plain string path to the Colab download helper
files.download(str(zip_path))