In [36]:
from Bio import PDB, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Get PDB data
pdb_code = ["3d4s","2rh1"]
pdb_file = PDB.PDBList()
pdb_file.retrieve_pdb_file(pdb_code, file_format="pdb", pdir='C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files')

def three_to_one(resname):
    aa_dict = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V"
    }
    return aa_dict.get(resname, "X")  # Return 'X' for unknown residues


def extract_pdb_fasta(pdb_filepath, chain_id, output_fasta):
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_code, pdb_filepath)

    sequence = []
    observed_residues = []
    
    for model in structure:
        for chain in model:
            if chain.id == chain_id:  # Only process the target chain
                for residue in chain.get_residues():
                    if PDB.is_aa(residue):
                        sequence.append(three_to_one(residue.get_resname()))
                        observed_residues.append(residue.id[1])  # Keep track of numbering

    fasta_seq = SeqRecord(Seq("".join(sequence)), id=f"{pdb_filepath}_{chain_id}", description="")
    SeqIO.write(fasta_seq, output_fasta, "fasta")

    return observed_residues  # Return numbering for later alignment

pdb_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent"
chain_id = "A"
fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"

observed_residues = extract_pdb_fasta(pdb_filepath, chain_id, fasta_filepath)
print(f"Extracted residues: {observed_residues}")


Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb3d4s.ent' 
Extracted residues: [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 21

In [37]:
# Attempting to use DeepTMHMM through the terminal but accessing that through python
import subprocess
import os
import shutil

results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
os.makedirs(results_dir, exist_ok=True)

# Run DeepTMHMM within the results directory
process = subprocess.Popen(
    ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    cwd=results_dir 
)

stdout, stderr = process.communicate() 
print("STDOUT:", stdout)
print("STDERR:", stderr)

STDOUT: 2025-03-03 12:21:48,529 | INFO : Extracted zip file to: output/

2025-03-03 12:21:48,529 | INFO : Done in 2.48 seconds

2025-03-03 12:21:48,529 | INFO : Extracted zip file to: output/
2025-03-03 12:21:48,529 | INFO : Done in 2.48 seconds

STDERR: 


In [45]:
# Align output ranges with actual observed residues
tmh_ranges = []
tmh_result_file = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}\\biolib_results\\TMRs.gff3"

with open(tmh_result_file) as file:
    for line in file:
        if "TMhelix" in line:
            parts = line.strip().split("\t")
            start, end = int(parts[2]), int(parts[3])
            tmh_ranges.append((start, end))

# Convert TMH positions to actual PDB residue numbers as pairs
tmh_pdb_residue_pairs = [
    (observed_residues[start - 1], observed_residues[end - 1]) for start, end in tmh_ranges
]

print(f"TMH start/end residue pairs in PDB numbering: {tmh_pdb_residue_pairs}")
print(tmh_ranges)


TMH start/end residue pairs in PDB numbering: [(34, 58), (71, 92), (108, 129), (150, 171), (198, 219), (275, 295), (307, 327)]
[(3, 27), (40, 61), (77, 98), (119, 140), (167, 188), (372, 392), (404, 424)]


In [47]:
import numpy as np
import torch
import pydssp

parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure(pdb_code, pdb_filepath)

# Select the first structure in the file
model = structure[0]

# Select the A chain
chain = model['A']

# Retrieve coordinates
coordinates = []
for residue in chain:
    if PDB.is_aa(residue):
        res_name = residue.get_resname()
        for atom in residue:
            if atom.get_name() in ['N', 'CA', 'C', 'O']:
                coordinates.append(atom.coord)

# Prepare eliments of pytorch tensor
L = sum(1 for residue in chain if PDB.is_aa(residue))
atoms = 4
xyz = 3

# Make coordinates into array first to increase efficiency
coord_array = np.array(coordinates)

# Create tensor
coord_tensor = torch.tensor(coord_array, dtype=torch.float32).reshape([L, atoms, xyz])

#dssp_hbond_matrix = pydssp.get_hbond_map(coord_tensor) > 0.5

# Use pydssp to get secondary structure
ss_data = pydssp.assign(coord_tensor, out_type='c3')
print(ss_data)

['-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H' '-' '-' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-' '-' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-'
 '-' '-' '-' '-' '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-'
 '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' '-' '-' '-' '-' '-'
 '-' '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' 'H' 'H' 'H'
 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H'
 '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' '-' '-' 'E' '-' 'E' '-' 'E'
 'E' '-' '-' '-' '-' '-' 'E' 'E' 'E' 'E' '-' '-' 'E' 'E' '-' 'E' '-' '-'
 '-' '-' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H

In [43]:
ss_data_string = "".join(ss_data)
print(ss_data_string)

-HHHHHHHHHHHHHHHHHHHHHHHHHHHH-HHH--HHHHHHHHHHHHHHHHHHH-HHHHHHHHH------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH----------HHHHHHHHHHHHHHHHHHHHHHHH--------HHHHHHHH----------HHHHHHHHHHH-HHHHHHHHHHHHHHHHHHHHH--HHHHHHHHH--E-E-EE-----EEEE--EE-E----HHHHHHHHHHHH------E--HHHHHHHHHHHHHHHHHHHHH-HHHHHHHHH--HHHHHHHHHHHHHH-HHHH---HHHHHHHH---HHHHHHHHH--HHHHH-HHHHHHHHHHHHH----------HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHH------HHHHHHHHHHHHHHHH-HHHHHHH-HHHHHHHHHH---


In [60]:
def extend_tmh_ranges(tmh_ranges, ss_data, max_extend=9):
    extended_tmh_ranges = []

    for start, end in tmh_ranges:
        # Extend start position backwards if residues are consecutive 'H'
        extended_start = start
        for i in range(1, max_extend + 1):
            prev_res = start - i
            if prev_res >= 0 and ss_data[prev_res] == "H":  # Ensure valid index
                extended_start = prev_res
            else:
                break  # Stop if a non-'H' is encountered

        # Extend end position forwards if residues are consecutive 'H'
        extended_end = end
        for i in range(1, max_extend + 1):
            next_res = end + i
            if next_res < len(ss_data) and ss_data[next_res] == "H":  # Ensure valid index
                extended_end = next_res
            else:
                break  # Stop if a non-'H' is encountered

        extended_tmh_ranges.append((extended_start, extended_end))

    return extended_tmh_ranges

extended_tmh_ranges = extend_tmh_ranges(tmh_ranges, ss_data)
print(f"Extended TMH Ranges: {extended_tmh_ranges}")


Extended TMH Ranges: [(1, 28), (35, 63), (70, 104), (115, 140), (165, 197), (363, 394), (401, 424)]


In [61]:
# Convert TMH positions to actual PDB residue numbers as pairs
tmh_extended_pairs = [
    (observed_residues[start - 1], observed_residues[end - 1]) for start, end in extended_tmh_ranges
]

print(f"TMH start/end residue pairs in PDB numbering: {tmh_extended_pairs}")

TMH start/end residue pairs in PDB numbering: [(32, 59), (66, 94), (101, 135), (146, 171), (196, 228), (266, 297), (304, 327)]


In [64]:
def reorder_gpcr_tmh_ends(tmh_extended_pairs):
    """Reorder TMH ends for a GPCR assuming 14 TMH ends in the given pattern."""
    pattern = ["extra", "intra", "intra", "extra", "extra", "intra", "intra", 
               "extra", "extra", "intra", "intra", "extra", "extra", "intra"]

    reordered = []
    for i, label in enumerate(pattern):
        if label == "extra":
            reordered.append(tmh_extended_pairs[i // 2][0])  # Take start residue
        else:  # "intra"
            reordered.append(tmh_extended_pairs[i // 2][1])  # Take end residue

    return reordered

# Process
flattened_tmh = reorder_gpcr_tmh_ends(tmh_extended_pairs)
print(flattened_tmh)


[32, 59, 94, 66, 101, 135, 171, 146, 196, 228, 297, 266, 304, 327]
