In [None]:
# @title Upload PDB/mmCIF files

# @markdown Alternatively, click the folder icon on the left, `right click` > `New Folder` and name it _models_ (all lower case), then drag and drop your PDB/mmCIF files into the _models_ folder.

from pathlib import Path
from google.colab import files

# Upload PDB/mmCIF files
uploaded = files.upload()

# Make results and models directories
!mkdir results models

# Move PDB and mmCIF files to the models directory (silently)
!mv *.pdb models/ 2>/dev/null
!mv *.cif models/ 2>/dev/null


In [None]:
# @title How many calculations?

%%script bash

# Count only what the alignment cell will actually read: top-level *.pdb / *.cif
# entries in models/. Counting every file would inflate the estimate whenever
# other files (or sub-directories with files) are present.
count=$(find models/ -maxdepth 1 \( -name "*.pdb" -o -name "*.cif" \) | wc -l)

# Number of unordered pairs among n models: n * (n - 1) / 2
result=$(( count * (count - 1) / 2 ))

echo "There are ${count} PDB/mmCIF files in the models directory. This means ${result} total calculations."

In [None]:
# @title Download and Compile the latest version of TM-align, Biopython and pandas

%%script bash

# Download Biopython and pandas
pip install biopython pandas

# Download an up-to-date copy of TMalign/USalign.
# git clone refuses to write into an existing non-empty directory, so on a
# re-run of this cell refresh the existing checkout instead of cloning again.
if [ -d USalign/.git ]; then
  git -C USalign pull --ff-only
else
  git clone https://github.com/pylelab/USalign.git
fi

# Compile TMalign
g++ -static -O3 -ffast-math -lm -o TMalign USalign/TMalign.cpp

In [None]:
# @title Run pairwise TM-align calculations

%%script bash

# Make the pipeline below fail when TMalign fails; without this its exit status
# is replaced by that of tee, so a crashed run would look successful.
set -o pipefail

# Make a list of all PDB and mmCIF files in the models directory and write to model_list.txt in the results directory
# Sorted because find returns entries in no guaranteed order; a stable list gives a reproducible run.
find models/ -maxdepth 1 \( -name "*.pdb" -o -name "*.cif" \) -printf "%f\n" | sort > results/model_list.txt

# Run pairwise TM-align calculations on the listed models
# (run ./TMalign -h to see every available option)
# tee saves the tab-separated output to results/tmalign_out.tsv and also prints it to stdout (for debugging)
./TMalign -dir /content/models/ /content/results/model_list.txt -outfmt 2 | tee results/tmalign_out.tsv

# Quiet alternative: write the output to the file only
# ./TMalign -dir /content/models/ /content/results/model_list.txt -outfmt 2 > results/tmalign_out.tsv

In [None]:
# @title Make phylogenetic tree from TM-align results

from math import isnan
from pathlib import Path
import pandas as pd
import numpy as np

from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix

class TMmatrix():
  """Turn pairwise TM-align output into a score matrix and a UPGMA tree.

  All inputs and outputs live in the ``results`` directory under the current
  working directory:

  * ``model_list.txt``  - one model file name per line (input)
  * ``tmalign_out.tsv`` - tab-separated TM-align output (input)
  * ``tmmatrix.tree``   - Newick tree written by make_tree_from_matrix()

  Attributes:
    home_path: working directory at construction time.
    models_path: ``home_path / 'models'``.
    results_path: ``home_path / 'results'``.
    model_list: model names read from ``model_list.txt`` (blank lines dropped).
    tmmatrix: symmetric DataFrame of TM-scores indexed by model name, or None
      until tmalign_output_to_matrix() has been called.
  """

  def __init__(self) -> None:
    # Set home, models and results paths
    self.home_path = Path.cwd()
    self.models_path = self.home_path.joinpath('models')
    self.results_path = self.home_path.joinpath('results')

    # No matrix yet; it is built by tmalign_output_to_matrix()
    self.tmmatrix = None

    # Get model list from results/model_list.txt
    with open(self.results_path.joinpath('model_list.txt'), 'r') as model_list_file:
      stripped_lines = (line.strip('\n') for line in model_list_file)
      # A blank line is not a model; keeping it would add an empty name to the matrix
      self.model_list = [name for name in stripped_lines if name]

  def tmalign_output_to_matrix(self):
    """Build the symmetric TM-score matrix from results/tmalign_out.tsv.

    For every aligned pair the larger of the TM1 and TM2 columns is stored in
    both (row, column) orientations; the diagonal is set to 1.0.

    Returns:
      The DataFrame, which is also stored in ``self.tmmatrix``. Pairs that do
      not appear in the TM-align output stay NaN.
    """
    # Load tmalign_out.tsv into a local DataFrame
    tmalign_output = pd.read_csv(self.results_path.joinpath('tmalign_out.tsv'), sep='\t')

    # Square frame with the model list as both index and columns.
    # dtype=float keeps the later arithmetic numeric (cells start as NaN).
    self.tmmatrix = pd.DataFrame(index=self.model_list, columns=self.model_list, dtype=float)

    # Populate with the highest TM-align score for each pair
    for _, row in tmalign_output.iterrows():
      chain1, chain2 = row['#PDBchain1'], row['PDBchain2']

      # Skip rows that are not alignments: lines whose first field starts with
      # '#' (comment-style lines in the output) and rows without a name at all.
      if not isinstance(chain1, str) or chain1.startswith('#'):
        continue

      max_tm = max(row['TM1'], row['TM2'])

      self.tmmatrix.at[chain1, chain2] = max_tm
      self.tmmatrix.at[chain2, chain1] = max_tm

    # Fill the diagonal with scores of 1.0 (equivalent to aligning a model to itself)
    for obj in self.model_list:
      self.tmmatrix.at[obj, obj] = 1.0

    return self.tmmatrix

  def _lower_triangle_distances(self):
    """Return 1 - TM-score as lower-triangle rows (diagonal included).

    Row ``i`` holds the distances from model ``i`` to models ``0..i``.

    Raises:
      ValueError: if any pair in the lower triangle has no score.
    """
    # Invert TM-align scores
    # More similar pairs of models (i.e., higher scores) have shorter distances to each other
    distances_df = 1 - self.tmmatrix

    # Keep the lower triangle only; everything above the diagonal becomes NaN and is dropped
    lower_tri_df = distances_df.where(np.tril(np.ones(distances_df.shape)).astype(bool))
    lower_tri_lists = [[value for value in row if not isnan(value)] for row in lower_tri_df.values.tolist()]

    # A NaN inside the triangle (a pair TM-align did not report) shortens its row;
    # report that here with the model name instead of failing later on a malformed matrix.
    for position, (name, row) in enumerate(zip(distances_df.index, lower_tri_lists)):
      if len(row) != position + 1:
        raise ValueError(
          f"Missing TM-score for at least one pair involving '{name}'; "
          "check that results/tmalign_out.tsv covers every model in results/model_list.txt"
        )

    return lower_tri_lists

  def make_tree_from_matrix(self):
    """Cluster the models with UPGMA and write the tree to results/tmmatrix.tree.

    The TM-score matrix is built first if tmalign_output_to_matrix() has not
    been called yet. The tree is printed as ASCII art and saved in Newick format.

    Returns:
      The tree object produced by the UPGMA constructor.

    Raises:
      ValueError: if there are no models, or a pair of models has no score.
    """
    # Build the matrix on demand so the method also works when called first
    if self.tmmatrix is None:
      self.tmalign_output_to_matrix()

    if self.tmmatrix.empty:
      raise ValueError('TM-score matrix is empty: no models found in results/model_list.txt')

    lower_tri_lists = self._lower_triangle_distances()

    # Generate phylogenetic tree using the UPGMA clustering method
    tm_matrix = DistanceMatrix(names=self.model_list, matrix=lower_tri_lists)
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(tm_matrix)

    print('\n')

    # Draw tree in ASCII for quick validation
    Phylo.draw_ascii(tree)

    # Write the tree to results/tmmatrix.tree
    Phylo.write(tree, self.results_path.joinpath('tmmatrix.tree'), 'newick')

    return tree

if __name__ == '__main__':
  # Run the whole step: read results/model_list.txt, build the TM-score matrix
  # from results/tmalign_out.tsv, then cluster it and write results/tmmatrix.tree
  colab_instance = TMmatrix()
  colab_instance.tmalign_output_to_matrix()
  colab_instance.make_tree_from_matrix()

In [None]:
# @title Zip and download results

from google.colab import files
import zipfile

# Imported here, like the other imports of this cell, so the cell runs on its
# own instead of depending on an import made by an earlier cell
from pathlib import Path

results_path = Path.cwd().joinpath('results')
zip_path = results_path.joinpath('results.zip')

# Mode 'w' replaces any archive left over from a previous run
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for file in results_path.rglob('*'):
        # Add only files (not directories) to the ZIP archive; the archive is
        # created inside results/, so it is skipped by name to keep it out of itself
        if file.is_file() and file.name != 'results.zip':
            # Store paths relative to results/ so the archive has no leading directories
            zipf.write(file, arcname=file.relative_to(results_path))

files.download(zip_path)