# GASS-PPI

## Section 0: Libraries, Global Variables, Utility Functions

In [68]:
import subprocess
import re
import numpy as np
from Bio.PDB import *

# Root of the local gass-ppi checkout; the other paths below are derived from it.
# NOTE(review): machine-specific absolute path - adjust it when running on another machine.
repository = "/Users/albertdarmawan/Documents/gass-ppi/"
current_directory = f"{repository}core/"
dataset_directory = f"{repository}dataset/benchmark5/structures/"

# Single Bio.PDB parser instance shared by the cells that load structures
pdb_parser = PDBParser()

# PDB IDs of the protein complexes used in this notebook.
# "aa" stands for Antibody-Antigen (the antibody is the receptor, the antigen is the ligand).
dbd5_aa_list = [
    "1AHW", "1BVK", "1DQJ", "1E6J", "1JPS", "1MLC", "1VFB",
    "1WEJ", "2FD6", "2I25", "2VIS", "2VXT", "2W9E", "3EOA",
    "3HMX", "3MXW", "3RVW", "4DN4", "4FQI", "4G6J", "4G6M",
    "4GXU", "3EO1", "3G6D", "3HI6", "3L5W", "3V6Z", "1BGX",
]

# Same IDs with the "_r_u" / "_l_u" suffix appended (r / l presumably receptor / ligand)
dbd5_aa_r_list = [f"{pdb_id}_r_u" for pdb_id in dbd5_aa_list]
dbd5_aa_l_list = [f"{pdb_id}_l_u" for pdb_id in dbd5_aa_list]
# example = pdb_parser.get_structure("1AHW_L", dataset_directory + "1AHW_l_u.pdb")

In [58]:
class Residue:
    """A residue, represented by its Last Heavy Atom (LHA).

    In genetic-algorithm terms a residue is a "gene", and an individual is a
    list of residues.

    Each instance stores two pieces of residue information, one piece of
    chain information and two pieces of atom (LHA) information.
    """

    def __init__(self, residue_name, residue_sequence_position, chain_name, lha_name, lha_coordinates):
        # Residue information
        self.residue_name, self.residue_sequence_position = residue_name, residue_sequence_position
        # Chain information
        self.chain_name = chain_name
        # Last Heavy Atom (LHA) information
        self.lha_name, self.lha_coordinates = lha_name, lha_coordinates

In [59]:
def euclidean_distance(coordinate_1, coordinate_2):
    """Return the Euclidean distance between two 3D coordinates as a Python float.

    Both arguments are indexed at positions 0, 1 and 2 (for example 1D NumPy
    arrays); only those three components are read.
    """
    delta_x = coordinate_1[0] - coordinate_2[0]
    delta_y = coordinate_1[1] - coordinate_2[1]
    delta_z = coordinate_1[2] - coordinate_2[2]
    squared_distance = (delta_x ** 2) + (delta_y ** 2) + (delta_z ** 2)
    return float(np.sqrt(squared_distance))

In [66]:
def generate_lha_file(pdb_id):
    """Run Sandro's ``pdb-preprocessing.py`` script on one PDB file of the dataset.

    The script is invoked as
    ``python pdb-preprocessing.py <dataset_directory> <pdb_id>.pdb`` and is
    expected to generate a new PDB file that contains only the Last Heavy
    Atoms (LHA). Where that file is written is decided by the script itself.

    Args:
        pdb_id: File stem of the structure inside ``dataset_directory``,
            e.g. "1AHW_l_u".

    Raises:
        RuntimeError: If the script exits with a non-zero return code.
    """
    pdb_file_name = pdb_id + ".pdb"
    # The script path is relative, so it is resolved against the current working directory.
    completed = subprocess.run(
        ["python", "pdb-preprocessing.py", dataset_directory, pdb_file_name],
        capture_output=True,
        text=True,
    )
    # The output is captured, so without this check a failed run would go unnoticed.
    if completed.returncode != 0:
        raise RuntimeError(
            f"pdb-preprocessing.py failed for {pdb_file_name} "
            f"(exit code {completed.returncode}):\n{completed.stderr}"
        )


generate_lha_file("1AHW_l_u")

In [47]:
def tmalign_structural_alignment(pdb_id_1, pdb_id_2):
    """Structurally align two PDB files with TMalign and return the best TM-score.

    TMalign (https://zhanggroup.org/TM-score/) compares the structural
    similarity of two PDB structures regardless of their rotation.
    The TM-score lies in (0, 1]: 1 indicates a perfect match, > 0.5 is similar
    enough, and < 0.17 indicates two unrelated structures.

    Args:
        pdb_id_1: File stem of the first structure inside ``dataset_directory``.
        pdb_id_2: File stem of the second structure inside ``dataset_directory``.

    Returns:
        float: The largest TM-score found in TMalign's output.

    Raises:
        ValueError: If TMalign's output contains no TM-score (for example
            because an input file could not be read).
    """
    # NOTE(review): the IDs are lower-cased here while the other helpers in this
    # notebook use them unchanged - confirm the file names on a case-sensitive filesystem.
    pdb_id_1_file_directory = dataset_directory + pdb_id_1.lower() + ".pdb"
    pdb_id_2_file_directory = dataset_directory + pdb_id_2.lower() + ".pdb"

    # Execute TMalign; the binary path is relative to the current working directory.
    tmalign_run = subprocess.run(
        ["./TMalign", pdb_id_1_file_directory, pdb_id_2_file_directory],
        capture_output=True,
        text=True,
    )

    # Raw string so "\s" is a regex class rather than an invalid string escape, and
    # "\." so only a literal decimal point is accepted. The capture group yields the number.
    tmscore_list = [
        float(value)
        for value in re.findall(r"TM-score=\s*([0-9]+\.[0-9]+)", tmalign_run.stdout)
    ]
    if not tmscore_list:
        details = tmalign_run.stderr.strip() or tmalign_run.stdout.strip()
        raise ValueError(
            f"TMalign reported no TM-score for {pdb_id_1} vs {pdb_id_2} "
            f"(exit code {tmalign_run.returncode}): {details}"
        )

    # The output may contain more than one TM-score; keep the maximum.
    return max(tmscore_list)


tmscore = tmalign_structural_alignment("1AHW_l_u", "1BVK_l_u")
print(tmscore)

0.23652


In [None]:
def get_interface(pdb_id_1, pdb_id_2, threshold=6.0):
    """Given one receptor and one ligand, infer their interface.

    The protein-protein interface is meant to be a list of ``Residue`` objects.

    NOTE(review): this function is unfinished. It currently only parses the two
    PDB files; ``threshold`` is not used yet and nothing is returned (``None``).
    """
    def parse_structure(pdb_id):
        return pdb_parser.get_structure(pdb_id, dataset_directory + pdb_id + ".pdb")

    structure_1 = parse_structure(pdb_id_1)
    structure_2 = parse_structure(pdb_id_2)


protein_interfaces = get_interface("1AHW_l_u", "1AHW_r_u")

## Section 1: Proof-of-Concept

In [76]:
# Query Protein
input_pdb_id = "1AHW_r_u"

# Step 1: Find the structural neighbour of the query protein using TMAlign Structural Alignment

# Exclude the query itself from the candidates, wherever it sits in the list
# (previously the first element was dropped, which only works for the first ID).
dataset_list = [pdb_id for pdb_id in dbd5_aa_r_list if pdb_id != input_pdb_id]

# One TM-score per candidate, in the same order as dataset_list
tmscore_list = [tmalign_structural_alignment(input_pdb_id, pdb_id) for pdb_id in dataset_list]
print(tmscore_list)

# The candidate with the highest TM-score is the structural neighbour
maximum_index = np.argmax(tmscore_list)
print(dataset_list[maximum_index])

# Step 2: Find the interface of the structural neighbour, then use it as the template


[0.96368, 0.71974, 0.89282, 0.96941, 0.86854, 0.96954, 0.66046, 0.85705, 0.76904, 0.79738, 0.66313, 0.45949, 0.71285, 0.96372, 0.97342, 0.96571, 0.95849, 0.70099, 0.66768, 0.49131, 0.79083, 0.68356, 0.60908, 0.64004, 0.953, 0.51087, 0.8124]
3MXW_r_u
