In [42]:
from Bio import PDB, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import numpy as np
import torch
import pydssp
import tempfile
import pandas as pd
import subprocess
import os
import shutil


# Get PDB data
pdb_codes = ["3d4s", "3d4s"]
pdb_file = PDB.PDBList()
for pdb_code in pdb_codes:
    pdb_file.retrieve_pdb_file(pdb_code, file_format="pdb", 
                               pdir="C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files")
def three_to_one(resname):
    aa_dict = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V"
    }
    return aa_dict.get(resname, "X")  # Return 'X' for unknown residues


def extract_pdb_fasta(pdb_filepath, chain_id, output_fasta):
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_code, pdb_filepath)

    sequence = []
    observed_residues = []
    
    for model in structure:
        for chain in model:
            if chain.id == chain_id:  # Only process the target chain
                for residue in chain.get_residues():
                    if PDB.is_aa(residue):
                        sequence.append(three_to_one(residue.get_resname()))
                        observed_residues.append(residue.id[1])  # Keep track of numbering

    fasta_seq = SeqRecord(Seq("".join(sequence)), id=f"{pdb_filepath}_{chain_id}", description="")
    SeqIO.write(fasta_seq, output_fasta, "fasta")

    return observed_residues  # Return numbering for later alignment

chain_id = "A"
all_observed_residues = {}

for pdb_code in pdb_codes:
    pdb_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent"
    fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"

    observed_residues = extract_pdb_fasta(pdb_filepath, chain_id, fasta_filepath)
    all_observed_residues[pdb_code] = observed_residues  # Store results

#    print(f"PDB: {pdb_code}, Extracted residues: {observed_residues}")

# Attempting to use DeepTMHMM through the terminal but accessing that through python


def run_deeptmhmm(pdb_code, fasta_filepath):
    results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
    os.makedirs(results_dir, exist_ok=True)
    
    # Run DeepTMHMM within the results directory
    process = subprocess.Popen(
        ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=results_dir 
    )
    
    stdout, stderr = process.communicate() 
    print(f"[{pdb_code}] STDOUT:", stdout)
    print(f"[{pdb_code}] STDERR:", stderr)

for pdb_code in pdb_codes:
    fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"
    run_deeptmhmm(pdb_code, fasta_filepath)

Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb3d4s.ent' 
Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb3d4s.ent' 
[3d4s] STDOUT: 2025-03-03 17:55:08,180 | INFO : Extracted zip file to: output/

2025-03-03 17:55:08,180 | INFO : Done in 2.70 seconds

2025-03-03 17:55:08,180 | INFO : Extracted zip file to: output/
2025-03-03 17:55:08,180 | INFO : Done in 2.70 seconds

[3d4s] STDERR: 
[3d4s] STDOUT: 2025-03-03 17:57:02,732 | INFO : Extracted zip file to: output/

2025-03-03 17:57:02,733 | INFO : Done in 2.59 seconds

2025-03-03 17:57:02,732 | INFO : Extracted zip file to: output/
2025-03-03 17:57:02,733 | INFO : Done in 2.59 seconds

[3d4s] STDERR: 


In [43]:
tmh_data = {}
pdb_data = {}

# Align output ranges with actual observed residues
for pdb_code in pdb_codes:
    tmh_ranges = []
    tmh_result_file = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}\\biolib_results\\TMRs.gff3"
    
    with open(tmh_result_file) as file:
        for line in file:
            if "TMhelix" in line:
                parts = line.strip().split("\t")
                start, end = int(parts[2]), int(parts[3])
                tmh_ranges.append((start, end))
    
    # Use DSSP
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure(pdb_code, pdb_filepath)
    
    # Select the first structure in the file and the A chain
    chain = structure[0]['A']
    
    # Retrieve coordinates
    coordinates = []
    for residue in chain:
        if PDB.is_aa(residue):
            res_name = residue.get_resname()
            for atom in residue:
                if atom.get_name() in ['N', 'CA', 'C', 'O']:
                    coordinates.append(atom.coord)
    
    # Prepare eliments of pytorch tensor
    L = sum(1 for residue in chain if PDB.is_aa(residue))
    atoms = 4
    xyz = 3
    
    # Make coordinates into array first to increase efficiency
    coord_array = np.array(coordinates)
    
    # Create tensor
    coord_tensor = torch.tensor(coord_array, dtype=torch.float32).reshape([L, atoms, xyz])
    
    # Use pydssp to get secondary structure
    ss_data = pydssp.assign(coord_tensor, out_type='c3')

    def extend_tmh_ranges(tmh_ranges, ss_data, max_extend=9):
        extended_tmh_ranges = []
    
        for start, end in tmh_ranges:
            # Extend start position backwards if residues are consecutive 'H'
            extended_start = start
            for i in range(1, max_extend + 1):
                prev_res = start - i
                if prev_res >= 0 and ss_data[prev_res] == "H":  # Ensure valid index
                    extended_start = prev_res
                else:
                    break  # Stop if a non-'H' is encountered
    
            # Extend end position forwards if residues are consecutive 'H'
            extended_end = end
            for i in range(1, max_extend + 1):
                next_res = end + i
                if next_res < len(ss_data) and ss_data[next_res] == "H":  # Ensure valid index
                    extended_end = next_res
                else:
                    break  # Stop if a non-'H' is encountered
    
            extended_tmh_ranges.append((extended_start, extended_end))
    
    
        return extended_tmh_ranges
    
    extended_tmh_ranges = extend_tmh_ranges(tmh_ranges, ss_data)
    print(extended_tmh_ranges)

    # Convert TMH positions to actual PDB residue numbers as pairs
    tmh_extended_pairs = [
        (observed_residues[start - 1], observed_residues[end - 1]) for start, end in extended_tmh_ranges
    ]

    def reorder_gpcr_tmh_ends(tmh_extended_pairs):
        pattern = ["extra", "intra", "intra", "extra", "extra", "intra", "intra", 
                   "extra", "extra", "intra", "intra", "extra", "extra", "intra"]
    
        reordered = []
        for i, label in enumerate(pattern):
            if label == "extra":
                reordered.append(tmh_extended_pairs[i // 2][0])  # Take start residue
            else:  # "intra"
                reordered.append(tmh_extended_pairs[i // 2][1])  # Take end residue
    
        return reordered
    
    flattened_tmh = reorder_gpcr_tmh_ends(tmh_extended_pairs)
    
    pdb_data[pdb_code.upper()] = {"A": flattened_tmh}


print(pdb_data)

[(1, 28), (35, 63), (70, 104), (115, 140), (165, 197), (363, 394), (401, 424)]
[(1, 28), (35, 63), (70, 104), (115, 140), (165, 197), (363, 394), (401, 424)]
{'3D4S': {'A': [32, 59, 94, 66, 101, 135, 171, 146, 196, 228, 297, 266, 304, 327]}}
