### ColabAlign&nbsp;&nbsp;&nbsp;&nbsp;[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/crfield18/ColabAlign/blob/main/colabalign.ipynb)

Create a phylogenetic tree that compares the secondary structure of proteins, rather than nucleotide or amino acid sequence. Scoring uses the [_TM-align_](https://zhanggroup.org/TM-align/) algorithm by [_Zhang and Skolnick (2005)_](https://doi.org/10.1093%2Fnar%2Fgki524). Based on [_mTM-align_](http://yanglab.nankai.edu.cn/mTM-align/) by [_Dong et al. (2018)_](https://doi.org/10.1093/nar/gky430). Made possible with [US-align](https://github.com/pylelab/USalign) and [Biopython](https://biopython.org).

A score of **<0.17** indicates similarity indistinguishable from a random pair of structures, whereas a score of **≥0.50** indicates a pair with broadly the same fold ([_Xu et al., 2010_](https://doi.org/10.1093/bioinformatics/btq066)).


##### Usage

* For the best performance, click `Runtime` -> `Change runtime type` -> `TPU v2`

* Upload .pdb or .cif format files directly to the Colab instance by clicking the folder icon on the left, then dragging and dropping your structures

* For the best results, upload PDB/mmCIF files that contain a single subunit and with only a single position for each atom (crystallography/cryo-EM structures from the PDB can sometimes have multiple positions for some residues where a single position could not be resolved).



In [None]:
# @title Download and Compile the latest version of TM-align, Biopython and pandas

%%script bash

# Install the Python packages this notebook needs on top of the Colab defaults
pip install biopython matplotlib
# pip install pandas # pandas is already installed on Colab. Uncomment if this changes

# Download an up-to-date copy of TM-align/US-align.
# The clone is skipped when the USalign directory already exists so that
# re-running this cell does not make git abort on the existing checkout.
if [ ! -d USalign ]; then
  git clone https://github.com/pylelab/USalign.git
fi

# Compile the TM-align source from the checkout into a static ./TMalign binary
g++ -static -O3 -ffast-math -lm -o TMalign USalign/TMalign.cpp

In [None]:
# @title Set up file structure

# @markdown Make sure your .pdb and/or .cif files are uploaded before running this block.

%%script bash

# Create the working directories (-p also creates results/ for results/sub-lists)
mkdir -p models results/sub-lists

# Delete any PDB files downloaded with the US-align (TM-align) repository.
# US-align comes with 2 example PDB files and we don't want to add these to our
# own set of models. -f keeps rm quiet when the files are already gone (cell re-run).
rm -f USalign/*.pdb

# Move every .pdb/.cif file found under the working directory into models/.
# Output is discarded, including mv complaints about files that are already in models/.
find . -type f \( -name "*.pdb" -o -name "*.cif" \) -exec mv {} models/ \; > /dev/null 2>&1

# Make a list of all PDB and mmCIF files in the models directory and write to model_list.txt in the results directory
# model_list.txt is used to split work up between threads later
find models/ -maxdepth 1 \( -name "*.pdb" -o -name "*.cif" \) -printf "%f\n" > results/model_list.txt

# Count the listed models to show how many pairwise alignments are needed,
# mainly to estimate how long the run will take.
# Counting the list itself keeps the reported number in step with the files
# that will actually be aligned.
model_count=$(wc -l < results/model_list.txt)
calc_count=$(( model_count * (model_count - 1) / 2 ))
echo "There are ${model_count} files in the models directory. This means ${calc_count} total calculations."

In [None]:
# @title Run pairwise TM-align calculations

import os
import subprocess
from itertools import combinations
from concurrent.futures import ProcessPoolExecutor, as_completed

# Load the model file names, one per line
with open('results/model_list.txt', 'r') as model_list_file:
  model_list = [entry.strip() for entry in model_list_file]

# Every unordered pair of models needs exactly one alignment
all_combos = combinations(model_list, 2)

# Group the pairs by their first member: each target model maps to the list
# of partner models it still has to be aligned against
sub_lists = {}
for target, partner in all_combos:
  sub_lists.setdefault(target, []).append(partner)

# Write one partner list file per target model (one file name per line)
for target, sub_list in sub_lists.items():
  with open(f'results/sub-lists/{target}-model_list.txt', 'w') as sublist_file:
    sublist_file.write(''.join(f'{model}\n' for model in sub_list))

# Function to run TM-align for a given model
def run_tmalign(model):
  """Align one model against every model named in its sub-list file.

  Runs ./TMalign on models/<model> against the entries of
  results/sub-lists/<model>-model_list.txt (resolved relative to models/)
  and stores the tabular output in results/sub-lists/<model>-tm-align_out.tsv.

  The command is passed as an argument list rather than a shell string, so
  file names containing spaces, parentheses or other shell metacharacters
  are handed to TM-align unchanged.

  Args:
    model: File name of the target model inside models/.

  Returns:
    A (model, stdout, stderr) tuple. stdout is always empty bytes because the
    program output goes to the .tsv file; stderr holds whatever TM-align wrote
    to standard error, or the OS error text if the program could not be started.
  """
  print(f'Running TM-align for {model}')
  # Align model against models listed in corresponding model list file
  cmd = [
    './TMalign',
    f'models/{model}',
    '-dir2', 'models/',
    f'results/sub-lists/{model}-model_list.txt',
    '-outfmt', '2',
  ]
  with open(f'results/sub-lists/{model}-tm-align_out.tsv', 'wb') as output_file:
    try:
      process = subprocess.run(cmd, stdout=output_file, stderr=subprocess.PIPE)
    except OSError as error:
      # Report a launch failure through the stderr slot so callers that only
      # inspect stderr still see it
      return model, b'', str(error).encode()
  return model, b'', process.stderr

# Fan the per-target TM-align runs out over one worker process per CPU core
with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
  # Queue one job for every target model that has a sub-list
  futures = [executor.submit(run_tmalign, target) for target in sub_lists]

  # Collect the jobs as they finish and report anything written to stderr
  for finished in as_completed(futures):
    try:
      model, stdout, stderr = finished.result()
      if stderr:
        print(f"Error for models {model}: {stderr.decode()}")
    except Exception as e:
      print(f"Error occurred: {e}")

# Gather the per-target results files so they can be combined into a single table
tmalign_results_files = [name for name in os.listdir('results/sub-lists') if name.endswith('.tsv')]

with open('results/tm-align_out.tsv', 'w') as merged_tmalign_results_file:
  # The merged file gets exactly one header line
  merged_tmalign_results_file.write('#PDBchain1\tPDBchain2\tTM1\tTM2\tRMSD\tID1\tID2\tIDali\tL1\tL2\tLali\n')

  for results_name in tmalign_results_files:
    with open(os.path.join('results/sub-lists', results_name), 'r') as result_file:
      for line in result_file:
        # Lines beginning with # are the per-file header and footer; skip them
        if line.startswith('#'):
          continue

        # Keep only the file name of the first column (drops the models/ prefix)
        # and leave the rest of the line exactly as TM-align wrote it
        first_field, tab, remainder = line.partition('\t')
        merged_tmalign_results_file.write(first_field.split('/')[-1] + tab + remainder)

In [None]:
# @title Make phylogenetic tree from TM-align results

# %matplotlib inline

from math import isnan
from pathlib import Path
import pandas as pd
import numpy as np

from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix

class ColabAlign():
  """Turn merged pairwise TM-align results into a score matrix and a UPGMA tree.

  Reads results/model_list.txt and results/tm-align_out.tsv from the current
  working directory and writes tm-align_score_matrix.csv and colabalign.tree
  back into results/.
  """

  def __init__(self) -> None:
    """Set up the working paths and load the list of model file names."""
    self.home_path = Path.cwd()
    self.models_path = self.home_path.joinpath('models')
    self.results_path = self.home_path.joinpath('results')
    # Square TM-score matrix (DataFrame); filled in by tmalign_output_to_matrix()
    self.colabalign = None

    with open(self.results_path.joinpath('model_list.txt'), 'r') as model_list_file:
      self.model_list = [line.strip('\n') for line in model_list_file]

  def tmalign_output_to_matrix(self):
    """Build the symmetric TM-score matrix from results/tm-align_out.tsv.

    For each aligned pair the larger of the TM1 and TM2 columns is used.
    Pairs absent from the results file stay NaN. The matrix is stored on
    self.colabalign and written to results/tm-align_score_matrix.csv.

    Returns:
      The score matrix as a DataFrame indexed and labelled by model name.

    Raises:
      KeyError: If a results row names a model that is not in the model list.
    """
    tmalign_output = pd.read_csv(self.results_path.joinpath('tm-align_out.tsv'), sep='\t')

    # Initialize a matrix with NaN values to be populated with TM-align values
    # Probably not the most memory/time efficient method for this but I found it much easier
    # to understand how the data is handled
    model_count = len(self.model_list)
    matrix_data = np.full((model_count, model_count), np.nan)

    # Create a dictionary to map model names to indices
    model_index = {model: idx for idx, model in enumerate(self.model_list)}

    # Populate the matrix with the highest TM-align score for each pair
    pair_rows = zip(
      tmalign_output['#PDBchain1'],
      tmalign_output['PDBchain2'],
      tmalign_output['TM1'],
      tmalign_output['TM2'],
    )
    for chain1, chain2, tm1, tm2 in pair_rows:
      # Skip any stray header/footer line that made it into the table
      if chain1.startswith('#'):
        continue
      max_tm = max(tm1, tm2)
      i, j = model_index[chain1], model_index[chain2]
      matrix_data[i, j] = max_tm
      matrix_data[j, i] = max_tm

    # Fill the diagonal with scores of 1
    # We don't need to align a model to itself because this will always return a TM-score of 1
    # Calculated TM-align scores are given to 4 decimal places
    np.fill_diagonal(matrix_data, 1.0000)

    self.colabalign = pd.DataFrame(matrix_data, index=self.model_list, columns=self.model_list)
    self.colabalign.to_csv(self.results_path.joinpath('tm-align_score_matrix.csv'), index=True)

    return self.colabalign

  def make_tree_from_matrix(self):
    """Build a UPGMA tree from the TM-score matrix, print it and save it.

    The score matrix is built first if tmalign_output_to_matrix() has not
    been called yet. The tree is drawn as ASCII art and written in Newick
    format to results/colabalign.tree.

    Returns:
      The tree returned by the UPGMA constructor.

    Raises:
      ValueError: If the score matrix is empty, or if any pair of models has
        no TM-score (for example because its alignment failed).
    """
    if self.colabalign is None:
      self.tmalign_output_to_matrix()
    if self.colabalign.empty:
      raise ValueError('The TM-score matrix is empty, so there is nothing to build a tree from.')

    # Every pair must have a score: a missing value would otherwise produce a
    # ragged lower triangle and an obscure failure further down
    missing = np.tril(self.colabalign.isna().values)
    if missing.any():
      names = list(self.colabalign.index)
      missing_pairs = [f'{names[r]} / {names[c]}' for r, c in zip(*np.where(missing))]
      shown = ', '.join(missing_pairs[:5])
      raise ValueError(
        f'{len(missing_pairs)} model pair(s) have no TM-score (e.g. {shown}). '
        'Check the TM-align output for failed alignments.'
      )

    # Invert TM-align scores to make them suitable for distances on a phylogenetic tree
    # More similar pairs of models (i.e., higher TM-scores) have shorter distances to each other
    # Distance values are all rounded to 4 decimal places since all TM-align scores are also to 4 d.p.
    # Clipping is also applied to ensure no values below 0 or above 1 are present in the distance matrix
    distances_df = (1.0000 - self.colabalign).round(4).clip(lower=0.0000, upper=1.0000)

    # Convert the scores matrix to a lower triangle matrix
    # The lower and upper triangles of the matrix are identical and Bio.Phylo.TreeConstruction
    # requires a lower triangle matrix rather than the full matrix
    lower_tri_lists = [
      row[:row_number + 1]
      for row_number, row in enumerate(distances_df.values.tolist())
    ]

    # Generate phylogenetic tree using the UPGMA clustering method
    # We can safely ignore the Molecular Clock hypothesis because we are not
    # deriving evolutionary relationships between proteins
    tm_matrix = DistanceMatrix(names=self.model_list, matrix=lower_tri_lists)
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(tm_matrix)

    print('\n')
    Phylo.draw_ascii(tree)
    Phylo.write(tree, self.results_path.joinpath('colabalign.tree'), 'newick')

    return tree


# Entry point for the cell: build the TM-score matrix first, then the tree that is derived from it
if __name__ == '__main__':
  # Constructing the instance reads results/model_list.txt
  colab_instance = ColabAlign()
  # Writes results/tm-align_score_matrix.csv and stores the matrix on the instance
  colab_instance.tmalign_output_to_matrix()
  # Prints the tree and writes results/colabalign.tree
  colab_instance.make_tree_from_matrix()

In [None]:
# @title Zip and download results

import datetime
from google.colab import files
import os

# Stamp the archive name with the current date and time (to the minute) so a
# new download does not accidentally overwrite an older results file
current_dt = datetime.datetime.now()
timestamp = current_dt.strftime("%Y%m%d-%H%M")
zip_filename = f'colabalign_results_{timestamp}.zip'

# Shell out to the system zip utility rather than a python module for efficiency,
# archiving both the models and results directories, then hand the archive
# to the browser for download
zip_command = f'zip -r {zip_filename} models results '
os.system(zip_command)
files.download(zip_filename)