In [2]:
from Bio import PDB, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import math

# Get PDB data
pdb_code = "2r4r"
pdb_file = PDB.PDBList()
pdb_file.retrieve_pdb_file(pdb_code, file_format="pdb", pdir='C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files')

def three_to_one(resname):
    aa_dict = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V"
    }
    return aa_dict.get(resname, "X")  # Return 'X' for unknown residues


def extract_pdb_fasta(pdb_filepath, chain_id, output_fasta):
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_code, pdb_filepath)

    sequence = []
    observed_residues = []
    
    for model in structure:
        for chain in model:
            if chain.id == chain_id:  # Only process the target chain
                for residue in chain.get_residues():
                    if PDB.is_aa(residue):
                        sequence.append(three_to_one(residue.get_resname()))
                        observed_residues.append(residue.id[1])  # Keep track of numbering

    fasta_seq = SeqRecord(Seq("".join(sequence)), id=f"{pdb_filepath}_{chain_id}", description="")
    SeqIO.write(fasta_seq, output_fasta, "fasta")

    return observed_residues  # Return numbering for later alignment

pdb_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent"
chain_id = "A"
fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"

observed_residues = extract_pdb_fasta(pdb_filepath, chain_id, fasta_filepath)
print(f"Extracted residues: {observed_residues}")


Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb2r4r.ent' 
Extracted residues: [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 31

In [3]:
# Attempting to use DeepTMHMM through the terminal but accessing that through python
import subprocess
import os
import shutil

results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
os.makedirs(results_dir, exist_ok=True)

# Run DeepTMHMM within the results directory
process = subprocess.Popen(
    ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    cwd=results_dir 
)

stdout, stderr = process.communicate() 
print("STDOUT:", stdout)
print("STDERR:", stderr)

STDOUT: 2025-03-04 17:59:13,822 | INFO : Extracted zip file to: output/

2025-03-04 17:59:13,823 | INFO : Done in 1.84 seconds

2025-03-04 17:59:13,822 | INFO : Extracted zip file to: output/
2025-03-04 17:59:13,823 | INFO : Done in 1.84 seconds

STDERR: 


In [13]:
# Align output ranges with actual observed residues
tmh_ranges = []
tmh_result_file = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}\\biolib_results\\TMRs.gff3"

with open(tmh_result_file) as file:
    for line in file:
        if "TMhelix" in line:
            parts = line.strip().split("\t")
            start, end = int(parts[2]), int(parts[3])
            tmh_ranges.append((start, end))

print(tmh_ranges)


Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb2r4r.ent' 
Extracted residues: [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 31

In [14]:
import numpy as np
import torch
import pydssp

parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure(pdb_code, pdb_filepath)

# Select the first structure in the file
model = structure[0]

# Select the A chain
chain = model['A']

# Retrieve coordinates
coordinates = []
for residue in chain:
    if PDB.is_aa(residue):
        res_name = residue.get_resname()
        for atom in residue:
            if atom.get_name() in ['N', 'CA', 'C', 'O']:
                coordinates.append(atom.coord)

# Prepare eliments of pytorch tensor
L = sum(1 for residue in chain if PDB.is_aa(residue))
atoms = 4
xyz = 3

# Make coordinates into array first to increase efficiency
coord_array = np.array(coordinates)

# Create tensor
coord_tensor = torch.tensor(coord_array, dtype=torch.float32).reshape([L, atoms, xyz])

#dssp_hbond_matrix = pydssp.get_hbond_map(coord_tensor) > 0.5

# Use pydssp to get secondary structure
ss_data = pydssp.assign(coord_tensor, out_type='c3')

In [15]:
def calculate_desired_extensions(tmh_ranges, ss_data, max_extend=9):
    """Determine how much each TMH would like to extend based on consecutive 'H' residues."""
    desired_extensions = []

    for start, end in tmh_ranges:
        # Backward extension
        backward_extension = 0
        for i in range(1, max_extend + 1):
            if start - i >= 0 and ss_data[start - i] == "H":
                backward_extension += 1
            else:
                break  # Stop at first non-'H'

        # Forward extension
        forward_extension = 0
        for i in range(1, max_extend + 1):
            if end + i < len(ss_data) and ss_data[end + i] == "H":
                forward_extension += 1
            else:
                break  # Stop at first non-'H'

        desired_extensions.append((backward_extension, forward_extension))

    return desired_extensions

# Example usage
desired_extensions = calculate_desired_extensions(tmh_ranges, ss_data)
print("Desired Extensions:", desired_extensions)

Desired Extensions: [(0, 2), (4, 0), (0, 5), (4, 1), (9, 4), (5, 4), (6, 0)]


Available Spaces: [14, 1, 20, 1, 33, 2]


In [17]:
def calculate_available_spaces(tmh_ranges):
    """Calculate the number of residues available between consecutive TMHs."""
    available_spaces = []

    for i in range(len(tmh_ranges) - 1):
        prev_end = tmh_ranges[i][1]  # End of the current TMH
        next_start = tmh_ranges[i + 1][0]  # Start of the next TMH
        available_space = next_start - prev_end - 1  # Residues in between
        available_spaces.append(available_space)

    return available_spaces

# Example usage
available_spaces = calculate_available_spaces(tmh_ranges)
print("Available Spaces:", available_spaces)

# First establish the maximum amount any extension can be
max_extension = 9

# Edit desired_extensions so that none go over the max_extension
desired_extensions = [(min(start, max_extension), min(end, max_extension)) for start, end in desired_extensions]

# Put desired extensions in a dictionary so we can align them with available spaces and keep them in order
desired_extensions_dict = dict(enumerate(desired_extensions))

# Process each available space and ensure fair distribution
for i in range(len(available_spaces)):
    total_desired = desired_extensions_dict[i][1] + desired_extensions_dict[i + 1][0]  # Sum of forward and backward extension
    available = available_spaces[i]  # Available space between TMHs

    if total_desired > available:
        half_space = math.floor(available / 2)  # Half the space, rounded down
        
        # If the right-side extension fits within half_space, the left gets the remainder
        if half_space >= desired_extensions_dict[i][1]:
            desired_extensions_dict[i + 1] = (available - desired_extensions_dict[i][1], desired_extensions_dict[i + 1][1])
        
        # If the left-side extension fits within half_space, the right gets the remainder
        elif half_space >= desired_extensions_dict[i + 1][0]:
            desired_extensions_dict[i] = (desired_extensions_dict[i][0], available - desired_extensions_dict[i + 1][0])

        # Otherwise, split evenly
        else:
            desired_extensions_dict[i] = (desired_extensions_dict[i][0], half_space)
            desired_extensions_dict[i + 1] = (half_space, desired_extensions_dict[i + 1][1])
            
print(desired_extensions_dict)

# Apply the extensions to tmh_ranges
extended_tmh_ranges = []

for i, (start, end) in enumerate(tmh_ranges):
    left_extension = desired_extensions_dict[i][0]  # Backward extension
    right_extension = desired_extensions_dict[i][1]  # Forward extension
    
    new_start = start - left_extension  # Extend start backwards
    new_end = end + right_extension  # Extend end forward
    
    extended_tmh_ranges.append((new_start, new_end))  # Store the updated range

# Print or return the extended TMH ranges
print(extended_tmh_ranges)

# Convert TMH positions to actual PDB residue numbers as pairs
tmh_extended_pairs = [
    (observed_residues[start - 1], observed_residues[end - 1]) for start, end in extended_tmh_ranges
]

print(f"TMH start/end residue pairs in PDB numbering: {tmh_extended_pairs}")

def reorder_gpcr_tmh_ends(tmh_extended_pairs):
    """Reorder TMH ends for a GPCR assuming 14 TMH ends in the given pattern."""
    pattern = ["extra", "intra", "intra", "extra", "extra", "intra", "intra", 
               "extra", "extra", "intra", "intra", "extra", "extra", "intra"]

    reordered = []
    for i, label in enumerate(pattern):
        if label == "extra":
            reordered.append(tmh_extended_pairs[i // 2][0])  # Take start residue
        else:  # "intra"
            reordered.append(tmh_extended_pairs[i // 2][1])  # Take end residue

    return reordered

# Process
flattened_tmh = reorder_gpcr_tmh_ends(tmh_extended_pairs)
print(flattened_tmh)


{0: (0, 2), 1: (4, 0), 2: (0, 5), 3: (4, 0), 4: (0, 4), 5: (5, 1), 6: (1, 0)}
