In [1]:
from Bio.PDB import PDBParser, Superimposer, PDBIO
from Bio import PDB
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import numpy as np
import mdtraj as md
import glob
import pandas as pd 
import warnings
warnings.filterwarnings("ignore") 



In [2]:
# Function to get protein sequence
def get_seq(structure):
    """Return the one-letter amino-acid sequence of a parsed structure.

    Iterates over every model and every chain of ``structure`` and translates
    each residue whose ``id[0]`` is ``' '`` (i.e. not a heteroatom such as
    water) through the module-level ``three_to_one`` table. Residue names
    missing from that table are written as ``'X'``.

    The sequence is rebuilt for each chain and only the most recent one is
    kept, so for a structure with several models or chains the result is the
    sequence of the last chain of the last model.

    Parameters
    ----------
    structure
        Iterable of models, each an iterable of chains, each an iterable of
        residues exposing ``id`` and ``resname``.

    Returns
    -------
    str
        The one-letter sequence, or ``''`` when no chain is visited.
    """
    sequence = ''  # always a str, even when the structure has no chains
    for model in structure:
        for chain in model:
            # join() instead of repeated += keeps this linear in chain length
            sequence = ''.join(
                three_to_one.get(residue.resname, 'X')  # 'X' for unknown or non-standard amino acids
                for residue in chain
                if residue.id[0] == ' '  # skip heteroatoms (like water)
            )
    return sequence

In [3]:
# Lookup table: three-letter residue name -> one-letter amino-acid code.
# Covers the 20 standard amino acids; the two sequences below are paired
# position by position (ALA->A, CYS->C, ..., TYR->Y).
three_to_one = dict(zip(
    (
        'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU',
        'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR',
    ),
    'ACDEFGHIKLMNPQRSTVWY',
))

In [4]:
# Calculating ratio of overlapping sequences for a sample file:
# the number of identically aligned residues divided by the length of the
# reference sequence.

reference_file = 'Data/TM_only/5ht1b_human_5V54_prot_TM.pdb'
parser = PDBParser()
reference_structure = parser.get_structure('reference', reference_file)
reference_sequences = get_seq(reference_structure)

sample_file = 'Data/Aligned/5HT1B_aligned.pdb'
sample_structure = parser.get_structure('sample', sample_file)
sample_sequences = get_seq(sample_structure)

# Fail early with a readable message rather than a ZeroDivisionError below.
if not reference_sequences:
    raise ValueError(f"No residues found in reference structure: {reference_file}")

# Only the first alignment is used, so ask the aligner for just one instead
# of enumerating every equally-scoring alignment.
alignments = pairwise2.align.globalxx(
    reference_sequences, sample_sequences, one_alignment_only=True
)
if not alignments:
    raise ValueError(f"No alignment produced for sample structure: {sample_file}")

# Take the two gapped sequences; slicing avoids binding `_`, which IPython
# uses for the previous cell output.
reference, sample = alignments[0][:2]

# Keep residues that are identical at the same aligned position (gaps excluded).
overlapping_sequence = ''.join(
    char_ref
    for char_ref, char in zip(reference, sample)
    if char_ref == char and char_ref != '-'
)

ratio = len(overlapping_sequence) / len(reference_sequences)
print("Ratio of overlapping sequence: ", ratio)

Ratio of overlapping sequence:  0.9955947136563876
