In [1]:
# Call neccesary liraries
from Bio import PDB
from Bio import SeqIO

# Get PDB data
pdb_code = "6prz"
pdb_file = PDB.PDBList()
pdb_file.retrieve_pdb_file(pdb_code, file_format="pdb", pdir='C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files')

# Parse the file
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure(pdb_code, f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent")

# Define filepaths
pdb_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent"
fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"

# Convert PDB to fasta
records = list(SeqIO.parse(pdb_filepath, "pdb-atom"))
count = SeqIO.write(records, fasta_filepath, "fasta")

# Check fasta file has been created
if count == 0:
    raise ValueError(f"FASTA file creation failed for {pdb_code}")

# Attempting to use DeepTMHMM through the terminal but accessing that through python
import subprocess
import os
import shutil

results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
os.makedirs(results_dir, exist_ok=True)

#os.makedirs(results_dir, exist_ok=True)

# Run DeepTMHMM within the results directory
result = subprocess.run(
    ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    cwd=results_dir 
)

print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)





Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb6prz.ent' 




STDOUT: 2025-03-14 10:23:18,221 | INFO : Extracted zip file to: output/

2025-03-14 10:23:18,221 | INFO : Done in 6.28 seconds


STDERR: Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Student\AppData\Local\Programs\Python\Python313\Scripts\biolib.exe\__main__.py", line 7, in <module>
    sys.exit(call_cli())
             ~~~~~~~~^^
  File "C:\Users\Student\AppData\Local\Programs\Python\Python313\Lib\site-packages\biolib\__init__.py", line 28, in call_cli
    biolib.cli.cli()
    ~~~~~~~~~~~~~~^^
  File "C:\Users\Student\AppData\Local\Programs\Python\Python313\Lib\site-packages\click\core.py", line 1161, in __call__
    return self.main(*args, **kwargs)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\Student\AppData\Local\Programs\Python\Python313\Lib\site-packages\click\core.py", line 1082, in main
    rv = self.invoke(ctx)
  File "C:\Users\Student\AppData\Local\Programs

In [69]:
# Find first residue of PDB structure to align DeepTMHMM resultant residue ranges
def get_first_residue(pdb_file):
    for model in structure:
        for chain in model:
            for residue in chain.get_residues():
                if PDB.is_aa(residue):
                    return residue.id[1]
    return None

first_residue = get_first_residue(pdb_file)

# Extract TMH residue ranges from the results
TMR_result = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}\\biolib_results\\TMRs.gff3"

tmh_ranges = []

with open(TMR_result) as file:
    for line in file:
        if "TMhelix" in line:
            parts = line.strip().split("\t")
            start, end = int(parts[2]), int (parts[3])
            tmh_ranges.append((start, end))

print("DeepTMHMM result ranges:", tmh_ranges)

result_residues = adjusted_tmh_list = [residue for start, end in tmh_ranges for residue in (start + first_residue - 1, end + first_residue - 1)]


print("Residues:", result_residues)

# Extract observed residues from PDB structure
def get_observed_residues(structure):
    observed_residues = []
    for model in structure:
        for chain in model:
            if chain.id == "A":  # Only process Chain A
                for residue in chain.get_residues():
                    if PDB.is_aa(residue):
                        observed_residues.append(residue.id[1])  
    return observed_residues

observed_residues = get_observed_residues(structure)

# Map TMH start/end indices to actual residue numbers and flatten the list
adjusted_tmh_list = []
for start, end in tmh_ranges:
    if start - 1 < len(observed_residues) and end - 1 < len(observed_residues):
        adjusted_tmh_list.extend([observed_residues[start - 1], observed_residues[end - 1]])

print("Corrected TMH residue numbers:", adjusted_tmh_list)

print(observed_residues)
if len(observed_residues) >= 289:
    print(observed_residues[28])  # 291st residue (index 290)
else:
    print("Error: Less observed residues found.")

print(len(observed_residues))

DeepTMHMM result ranges: [(2, 22), (34, 56), (72, 93), (114, 134), (164, 184), (239, 259), (273, 291)]
Residues: [38, 58, 70, 92, 108, 129, 150, 170, 200, 220, 275, 295, 309, 327]
Corrected TMH residue numbers: [38, 58, 71, 110, 126, 147, 206, 226, 277, 316]
[37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 264, 265, 266, 267, 268, 269, 270, 271, 272

In [11]:
# Input into coordinate retrieval code:

# Call libraries
from Bio import PDB
import tempfile
import pandas as pd

# Create function to extract coordinates
def get_coords(pdb_id, chain_id, residues):
    pdb_id = pdb_id.lower()
    pdbl = PDB.PDBList()
    coords = [pdb_id.upper(), chain_id] 

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        pdb_file_path = pdbl.retrieve_pdb_file(pdb_id, pdir=temp_dir, file_format="pdb")

        # Parse the file
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_file_path)

        # Extract resolution
        resolution = "Unknown"
        with open(pdb_file_path, "r") as f:
            for line in f:
                if line.startswith("REMARK   2 RESOLUTION"):
                    resolution = line.split()[3]
                    break

        coords.insert(1, resolution) 
        
        # Loop through file to extract residue coordinates
        for residue_id in residues:
            found = False
            for model in structure:
                if chain_id in model:
                    chain = model[chain_id]
                    if residue_id in chain:
                        residue = chain[residue_id]
                        residue_name = residue.get_resname()
                        for atom in residue:
                            if atom.get_name() == "CA":
                                coords.extend([f"{residue_name}{residue_id}", *atom.coord])
                                found = True
                                break
            if not found:
                coords.extend([f"Unknown{residue_id}", "NA", "NA", "NA"])

    return coords

# Select pdb code, chain and residues
pdb_data = {
    "2R4R": {"A": adjusted_tmh_ranges},
    "2R4S": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "2RH1": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "6e67": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3D4S": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "6E67": {"B": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3SN6": {"R": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3SN6": {"R": [219, 326, 207, 315, 121, 282, 268, 131]},




}

data = []

# Collect and prepare items to be processed by function
for pdb_id, chains in pdb_data.items():
    for chain_id, residues in chains.items():
        data.append(get_coords(pdb_id, chain_id, residues))

# Organise the data frame for accurate conversion to Excel
max_residues = max(len(residues) for chains in pdb_data.values() for residues in chains.values())

# Define columns dynamically
columns = ["PDB ID", "Resolution", "Chain"] + sum([["Res", "X", "Y", "Z"]] * max_residues, [])

# Convert to DataFrame
df = pd.DataFrame(data, columns=columns)

# Save as Excel output
output_file = "C:/Users/Student/OneDrive - Aston University/Documents/Biology/Project/Landmarks/Automated landmarks/Protein_coordinates.xlsx"
df.to_excel(output_file, index=False)

#Print coordinates as a test
coordinates = get_coords(pdb_id, chain_id, residues)
print(coordinates)

Downloading PDB structure '2r4r'...
Downloading PDB structure '2r4s'...
Downloading PDB structure '2rh1'...
Downloading PDB structure '6e67'...
Downloading PDB structure '3d4s'...
Downloading PDB structure '6e67'...
Downloading PDB structure '3sn6'...
Downloading PDB structure '3sn6'...
['3SN6', '3.20', 'R', 'TYR219', np.float32(8.428), np.float32(6.031), np.float32(13.925), 'TYR326', np.float32(21.614), np.float32(12.361), np.float32(13.496), 'SER207', np.float32(16.308), np.float32(0.329), np.float32(-2.121), 'GLY315', np.float32(18.342), np.float32(12.648), np.float32(-2.895), 'ILE121', np.float32(18.362), np.float32(4.453), np.float32(2.789), 'PHE282', np.float32(14.362), np.float32(11.456), np.float32(4.247), 'GLU268', np.float32(3.561), np.float32(17.853), np.float32(22.387), 'ARG131', np.float32(14.963), np.float32(3.333), np.float32(18.044)]


In [81]:
from Bio import PDB
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Parse the PDB file
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure(pdb_code, pdb_filepath)

# Extract sequence from Chain A
def get_chain_sequence(structure, chain_id="A"):
    aa_dict = {
        "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
        "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
        "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
        "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V"
    }
    
    sequence = []
    for model in structure:
        for chain in model:
            if chain.id == chain_id:
                for residue in chain:
                    res_name = residue.get_resname()
                    if res_name in aa_dict:
                        sequence.append(aa_dict[res_name])
    
    return "".join(sequence)

chain_a_sequence = get_chain_sequence(structure, "A")

# Save the sequence as a FASTA file
seq_record = SeqRecord(Seq(chain_a_sequence), id=pdb_code, description="Chain A extracted from PDB")
with open(fasta_filepath, "w") as fasta_file:
    SeqIO.write(seq_record, fasta_file, "fasta")

print(f"FASTA file created successfully: {fasta_filepath}")

# Attempting to use DeepTMHMM through the terminal but accessing that through python
import subprocess
import os
import shutil

results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
os.makedirs(results_dir, exist_ok=True)

# Run DeepTMHMM within the results directory
result = subprocess.run(
    ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True,
    cwd=results_dir 
)

print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)




FASTA file created successfully: C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\Fasta_files\2rh1.fasta
STDOUT: 2025-03-01 18:22:36,815 | INFO : Extracted zip file to: output/

2025-03-01 18:22:36,815 | INFO : Done in 2.43 seconds

2025-03-01 18:22:36,815 | INFO : Extracted zip file to: output/
2025-03-01 18:22:36,815 | INFO : Done in 2.43 seconds

STDERR: 


In [85]:
fasta_file = "C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\2rh1.fasta"
record = next(SeqIO.parse(fasta_file, "fasta"))

print("FASTA sequence used for DeepTMHMM:")
print(record.seq)

FASTA sequence used for DeepTMHMM:
DEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYAEETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSELDKAIGRNTNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRAALINMVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPNRAKRVITTFRTGTWDAYKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCL


In [86]:
def get_observed_residues(structure, chain_id="A"):
    observed_residues = []
    for model in structure:
        for chain in model:
            if chain.id == chain_id:
                for residue in chain:
                    if PDB.is_aa(residue):
                        observed_residues.append(residue.id[1])  
    return observed_residues

observed_residues = get_observed_residues(structure)
print("Observed residues in PDB:", observed_residues)

Observed residues in PDB: [29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 1002, 1003, 1004, 1005, 1006, 1007

In [87]:
adjusted_tmh_list = [observed_residues[i - 1] for start, end in tmh_ranges for i in (start, end)]
print("Adjusted TMH residues:", adjusted_tmh_list)

Adjusted TMH residues: [30, 50, 62, 84, 100, 121, 142, 162, 192, 212, 1038, 1058, 1072, 1090]


In [None]:
def run_deeptmhmm(pdb_code, fasta_filepath_wsl):
    pdb_results_dir = f"/Users/Student/OneDrive - Aston University/Documents/Biology/Project/Project_automation/Python/DeepTMHMM_results/{pdb_code}"
    os.makedirs(pdb_results_dir, exist_ok=True)

    process = subprocess.run(
        ["wsl", "/home/dan/.local/bin/biolib", "run", "--local", "DTU/DeepTMHMM:1.0.24", "--fasta", f"{fasta_filepath_wsl}"],
        text=True,
        capture_output=True,
        cwd=pdb_results_dir
    )

run_deeptmhmm("6prz", 