In [34]:
from Bio import PDB, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import math

# Get PDB data
pdb_code = "3sn6"
pdb_file = PDB.PDBList()
pdb_file.retrieve_pdb_file(pdb_code, file_format="pdb", pdir='C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files')

def three_to_one(resname):
    aa_dict = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V"
    }
    return aa_dict.get(resname, "X")  # Return 'X' for unknown residues


def extract_pdb_fasta(pdb_filepath, chain_id, output_fasta):
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_code, pdb_filepath)

    sequence = []
    observed_residues = []
    
    for model in structure:
        for chain in model:
            if chain.id == chain_id:  # Only process the target chain
                for residue in chain.get_residues():
                    if PDB.is_aa(residue):
                        sequence.append(three_to_one(residue.get_resname()))
                        observed_residues.append(residue.id[1])  # Keep track of numbering

    fasta_seq = SeqRecord(Seq("".join(sequence)), id=f"{pdb_filepath}_{chain_id}", description="")
    SeqIO.write(fasta_seq, output_fasta, "fasta")

    return observed_residues  # Return numbering for later alignment

pdb_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent"
chain_id = "R"
fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"

observed_residues = extract_pdb_fasta(pdb_filepath, chain_id, fasta_filepath)
print(f"Extracted residues: {observed_residues}")


Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb3sn6.ent' 
Extracted residues: [1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141,

In [8]:
# Attempting to use DeepTMHMM through the terminal but accessing that through python
import subprocess
import os
import shutil

results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
os.makedirs(results_dir, exist_ok=True)

# Run DeepTMHMM within the results directory
process = subprocess.Popen(
    ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    cwd=results_dir 
)

stdout, stderr = process.communicate() 
print("STDOUT:", stdout)
print("STDERR:", stderr)

STDOUT: 2025-03-16 17:13:39,083 | INFO : Extracted zip file to: output/

2025-03-16 17:13:39,084 | INFO : Done in 2.88 seconds

2025-03-16 17:13:39,083 | INFO : Extracted zip file to: output/
2025-03-16 17:13:39,084 | INFO : Done in 2.88 seconds

STDERR: 


In [35]:
# Align output ranges with actual observed residues
tmh_ranges = []
tmh_result_file = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}\\biolib_results\\TMRs.gff3"

with open(tmh_result_file) as file:
    for line in file:
        if "TMhelix" in line:
            parts = line.strip().split("\t")
            start, end = int(parts[2]), int(parts[3])
            tmh_ranges.append((start, end))

print(tmh_ranges)


[(161, 170), (172, 188), (201, 225), (238, 259), (280, 301), (327, 347), (377, 400), (409, 428)]


In [36]:
import numpy as np
import torch
import pydssp

parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure(pdb_code, pdb_filepath)

# Select the first structure in the file
model = structure[0]

# Select the A chain
chain = model['R']

# Retrieve coordinates
coordinates = []
for residue in chain:
    if PDB.is_aa(residue):
        res_name = residue.get_resname()
        for atom in residue:
            if atom.get_name() in ['N', 'CA', 'C', 'O']:
                coordinates.append(atom.coord)

# Prepare eliments of pytorch tensor
L = sum(1 for residue in chain if PDB.is_aa(residue))
atoms = 4
xyz = 3

# Make coordinates into array first to increase efficiency
coord_array = np.array(coordinates)

# Create tensor
coord_tensor = torch.tensor(coord_array, dtype=torch.float32).reshape([L, atoms, xyz])

#dssp_hbond_matrix = pydssp.get_hbond_map(coord_tensor) > 0.5

# Use pydssp to get secondary structure
ss_data = pydssp.assign(coord_tensor, out_type='c3')
print(ss_data)

['-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-' 'E' '-' 'E' 'E'
 '-' '-' '-' '-' '-' 'E' 'E' 'E' '-' '-' '-' 'E' 'E' '-' 'E' '-' '-' '-'
 '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-'
 '-' 'E' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-'
 '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H'
 'H' 'H' 'H' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-' '-' '-' '-' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H' '-' '-' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-' '-' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H

In [37]:
def calculate_desired_extensions(tmh_ranges, ss_data, max_extend=9):
    """Determine how much each TMH would like to extend based on consecutive 'H' residues."""
    desired_extensions = []

    for start, end in tmh_ranges:
        # Backward extension
        backward_extension = 0
        for i in range(1, max_extend + 1):
            if start - i >= 0 and ss_data[start - i] == "H":
                backward_extension += 1
            else:
                break  # Stop at first non-'H'

        # Forward extension
        forward_extension = 0
        for i in range(1, max_extend + 1):
            if end + i < len(ss_data) and ss_data[end + i] == "H":
                forward_extension += 1
            else:
                break  # Stop at first non-'H'

        desired_extensions.append((backward_extension, forward_extension))

    return desired_extensions

# Example usage
desired_extensions = calculate_desired_extensions(tmh_ranges, ss_data)
print("Desired Extensions:", desired_extensions)

Desired Extensions: [(0, 9), (9, 1), (5, 0), (6, 6), (4, 0), (4, 9), (9, 0), (3, 0)]


In [38]:
def calculate_available_spaces(tmh_ranges):
    """Calculate the number of residues available between consecutive TMHs."""
    available_spaces = []

    for i in range(len(tmh_ranges) - 1):
        prev_end = tmh_ranges[i][1]  # End of the current TMH
        next_start = tmh_ranges[i + 1][0]  # Start of the next TMH
        available_space = next_start - prev_end - 1  # Residues in between
        available_spaces.append(available_space)

    return available_spaces

# Example usage
available_spaces = calculate_available_spaces(tmh_ranges)
print("Available Spaces:", available_spaces)

# First establish the maximum amount any extension can be
max_extension = 9

# Edit desired_extensions so that none go over the max_extension
desired_extensions = [(min(start, max_extension), min(end, max_extension)) for start, end in desired_extensions]

# Put desired extensions in a dictionary so we can align them with available spaces and keep them in order
desired_extensions_dict = dict(enumerate(desired_extensions))

# Process each available space and ensure fair distribution
for i in range(len(available_spaces)):
    total_desired = desired_extensions_dict[i][1] + desired_extensions_dict[i + 1][0]  # Sum of forward and backward extension
    available = available_spaces[i]  # Available space between TMHs

    if total_desired > available:
        half_space = math.floor(available / 2)  # Half the space, rounded down
        
        # If the right-side extension fits within half_space, the left gets the remainder
        if half_space >= desired_extensions_dict[i][1]:
            desired_extensions_dict[i + 1] = (available - desired_extensions_dict[i][1], desired_extensions_dict[i + 1][1])
        
        # If the left-side extension fits within half_space, the right gets the remainder
        elif half_space >= desired_extensions_dict[i + 1][0]:
            desired_extensions_dict[i] = (desired_extensions_dict[i][0], available - desired_extensions_dict[i + 1][0])

        # Otherwise, split evenly
        else:
            desired_extensions_dict[i] = (desired_extensions_dict[i][0], half_space)
            desired_extensions_dict[i + 1] = (half_space, desired_extensions_dict[i + 1][1])
            
print(desired_extensions_dict)

# Apply the extensions to tmh_ranges
extended_tmh_ranges = []

for i, (start, end) in enumerate(tmh_ranges):
    left_extension = desired_extensions_dict[i][0]  # Backward extension
    right_extension = desired_extensions_dict[i][1]  # Forward extension
    
    new_start = start - left_extension  # Extend start backwards
    new_end = end + right_extension  # Extend end forward
    
    extended_tmh_ranges.append((new_start, new_end))  # Store the updated range

# Print or return the extended TMH ranges
print(tmh_ranges)
print(extended_tmh_ranges)

# Convert TMH positions to actual PDB residue numbers as pairs
tmh_extended_pairs = [
    (observed_residues[start] -1, observed_residues[end] -1) for start, end in extended_tmh_ranges
]

print(f"TMH start/end residue pairs in PDB numbering: {tmh_extended_pairs}")

def reorder_gpcr_tmh_ends(tmh_extended_pairs):
    """Reorder TMH ends for a GPCR assuming 14 TMH ends in the given pattern."""
    pattern = ["extra", "intra", "intra", "extra", "extra", "intra", "intra", 
               "extra", "extra", "intra", "intra", "extra", "extra", "intra"]

    reordered = []
    for i, label in enumerate(pattern):
        if label == "extra":
            reordered.append(tmh_extended_pairs[i // 2][0])  # Take start residue
        else:  # "intra"
            reordered.append(tmh_extended_pairs[i // 2][1])  # Take end residue

    return reordered

# Process
flattened_tmh = reorder_gpcr_tmh_ends(tmh_extended_pairs)
print(flattened_tmh)


Available Spaces: [1, 12, 12, 20, 25, 29, 8]
{0: (0, 0), 1: (0, 1), 2: (5, 0), 3: (6, 6), 4: (4, 0), 5: (4, 9), 6: (9, 0), 7: (3, 0)}
[(161, 170), (172, 188), (201, 225), (238, 259), (280, 301), (327, 347), (377, 400), (409, 428)]
[(161, 170), (172, 189), (196, 225), (232, 265), (276, 301), (323, 356), (368, 400), (406, 428)]
TMH start/end residue pairs in PDB numbering: [(31, 40), (42, 59), (66, 95), (102, 135), (146, 171), (196, 229), (266, 298), (304, 326)]
[31, 40, 59, 42, 66, 95, 135, 102, 146, 171, 229, 196, 266, 298]


In [25]:
aligned_data = list(zip(observed_residues, ss_data))

# Print the results
# for res_num, sec_struct in aligned_data:
#     print(f"Residue {res_num}: {sec_struct}")

# print(desired_extensions_dict)
print(observed_residues)
print(tmh_ranges)
print(extended_tmh_ranges)

[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251

In [30]:
print(f"PDB Code: {pdb_code}")
# print(f"Observed Residues: {observed_residues.get(pdb_code, 'Not Found')}")
print(f"Extended TMH Ranges: {extended_tmh_ranges}")

for start, end in extended_tmh_ranges:
    if start < 0 or start >= len(observed_residues):
        print(f"Error: start index {start} out of range for {pdb_code}")
    if end < 0 or end >= len(observed_residues):
        print(f"Error: end index {end} out of range for {pdb_code}")

PDB Code: 3sn6
Extended TMH Ranges: [(2, 31), (38, 67), (73, 107), (118, 143), (168, 200), (366, 397), (404, 427)]
Error: start index 366 out of range for 3sn6
Error: end index 397 out of range for 3sn6
Error: start index 404 out of range for 3sn6
Error: end index 427 out of range for 3sn6


In [32]:
print(len(observed_residues))

349
