In [None]:
# Import necessary libraries
from Bio import PDB
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Download the PDB entry. retrieve_pdb_file returns the path of the file it
# wrote, so that path is used directly instead of a hard-coded, Windows-only
# relative path.
pdb_code = "2RH1"
pdb_file = PDB.PDBList()
pdb_path = pdb_file.retrieve_pdb_file(pdb_code, file_format="pdb")

# Parse the downloaded file
parser = PDB.PDBParser(QUIET=True)
structure = parser.get_structure(pdb_code, pdb_path)

# Select the first model in the file
model = structure[0]

# Select the A chain
chain = model['A']

# Walk the amino-acid residues of the chain, collecting the per-residue
# sequence letter and the coordinates of the backbone atoms N, CA, C and O.
coordinates = []
fasta_seq = []
for residue in chain:
    if PDB.is_aa(residue):
        res_name = residue.get_resname()
        # NOTE(review): `three_to_one` is not defined in this cell; it is used
        # like a dict (three-letter code -> one-letter code). Confirm it is
        # defined before this cell runs on a fresh kernel.
        fasta_seq.append(three_to_one.get(res_name))
        for atom in residue:
            if atom.get_name() in ['N', 'CA', 'C', 'O']:
                coordinates.append(atom.coord)

In [None]:
# Dimensions of the coordinate tensor: residues x backbone atoms x axes
L = len([residue for residue in chain if PDB.is_aa(residue)])
atoms = 4
xyz = 3

# Turn the flat list of atom coordinates into a float32 tensor, then fold it
# into shape (L, atoms, xyz)
flat_coords = torch.tensor(coordinates, dtype=torch.float32)
coord_tensor = flat_coords.reshape([L, atoms, xyz])

# Ask pydssp for the secondary-structure assignment in its 'c3' output format
ss_data = pydssp.assign(coord_tensor, out_type='c3')

# Join the per-residue labels into one string and show it
ss_data_string = "".join(ss_data)
print(ss_data_string)

In [None]:
# Run DeepTMHMM locally through the biolib command-line tool
import subprocess

# Set stdout to True for real-time progress
# NOTE(review): `biolib` is not imported in this cell, so it must already be
# imported when this runs. The flag is set in this Python process while the
# command below runs in a separate process with captured output; confirm the
# flag has the intended effect here.
biolib.utils.STREAM_STDOUT = True

# Path to your input FASTA file
fasta_file = "2rh1.fasta"

# Command you would normally run in terminal. The FASTA path comes from
# `fasta_file` so it is defined in exactly one place.
command = [
    'biolib', 'run', '--local', 'DTU/DeepTMHMM:1.0.24', '--fasta', fasta_file
]

# Run the command from Python, capturing stdout/stderr as text
result = subprocess.run(command, capture_output=True, text=True)

# Print stdout and stderr if needed
print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)

# Check if the command was successful (exit code 0)
if result.returncode == 0:
    print("DeepTMHMM ran successfully.")
else:
    print("Error running DeepTMHMM.")

In [None]:
# Call libraries
from Bio import PDB
import tempfile
import pandas as pd

# Create function to extract coordinates
def get_coords(pdb_id, chain_id, residues):
    """Download a PDB entry and collect the CA coordinates of selected residues.

    The entry is downloaded into a temporary directory that is removed again
    when the function returns.

    Parameters
    ----------
    pdb_id : str
        PDB accession code; lower-cased for the download and upper-cased in
        the returned row.
    chain_id : str
        Identifier of the chain the residues are looked up in.
    residues : iterable
        Residue identifiers used to index the chain (the notebook passes
        integer residue numbers).

    Returns
    -------
    list
        ``[PDB ID, resolution, chain_id, label, x, y, z, ...]`` with exactly
        one ``label, x, y, z`` group per requested residue. ``label`` is the
        residue name followed by the residue id. Residues that cannot be
        found, or that have no CA atom, yield
        ``"Unknown<id>", "NA", "NA", "NA"``. ``resolution`` is the fourth
        whitespace-separated token of the first ``REMARK   2 RESOLUTION``
        line, or ``"Unknown"`` when no such line exists.
    """
    pdb_id = pdb_id.lower()
    pdbl = PDB.PDBList()

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        pdb_file_path = pdbl.retrieve_pdb_file(pdb_id, pdir=temp_dir, file_format="pdb")

        # Parse the file
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_file_path)

        # Extract resolution from the raw header text
        resolution = "Unknown"
        with open(pdb_file_path, "r") as f:
            for line in f:
                if line.startswith("REMARK   2 RESOLUTION"):
                    resolution = line.split()[3]
                    break

        coords = [pdb_id.upper(), resolution, chain_id]

        # Look each requested residue up and record its CA position
        for residue_id in residues:
            entry = None
            for model in structure:
                if chain_id not in model:
                    continue
                chain = model[chain_id]
                if residue_id not in chain:
                    continue
                residue = chain[residue_id]
                ca_atom = next(
                    (atom for atom in residue if atom.get_name() == "CA"), None
                )
                if ca_atom is not None:
                    entry = [f"{residue.get_resname()}{residue_id}", *ca_atom.coord]
                    # Stop at the first model that provides the residue so a
                    # multi-model structure contributes one group, not one
                    # per model (which would shift every later column).
                    break
            if entry is None:
                entry = [f"Unknown{residue_id}", "NA", "NA", "NA"]
            coords.extend(entry)

    return coords

# Select pdb code, chain and residues
pdb_data = {
    "2R4R": {"A": [37, 58, 90, 67]},
    "2R4S": {"A": [37, 58, 90, 70]},
    "2RH1": {"A": [30, 60, 96, 67]},
    "3D4S": {"A": [33, 60, 96, 67]},
    "3KJ6": {"A": [37, 57, 90, 67]},
    "3NY8": {"A": [33, 60, 96, 67]}
}

data = []

# Run the extraction once per (structure, chain) pair; each call yields one row
for pdb_id, chains in pdb_data.items():
    for chain_id, residues in chains.items():
        data.append(get_coords(pdb_id, chain_id, residues))

# The widest residue selection decides how many Res/X/Y/Z groups are needed
max_residues = max(len(residues) for chains in pdb_data.values() for residues in chains.values())

# Define columns dynamically: three fixed columns, then one group per residue
columns = ["PDB ID", "Resolution", "Chain"] + ["Res", "X", "Y", "Z"] * max_residues

# Convert to DataFrame
df = pd.DataFrame(data, columns=columns)

# Save as Excel output
# NOTE(review): absolute, machine-specific path; consider a configurable
# output directory.
output_file = "C:/Users/Student/OneDrive - Aston University/Documents/Biology/Project/Landmarks/Automated landmarks/Protein_coordinates.xlsx"
df.to_excel(output_file, index=False)

# Print the last extracted row as a test. It is read back from `data` rather
# than re-downloading the structure, and `coordinates` is not rebound because
# the secondary-structure cell uses that name for the backbone atom list.
print(data[-1])

In [None]:
# Attempting to use DeepTMHMM through the terminal but accessing that through python
import subprocess
import os
import shutil

# Per-structure output folder for this DeepTMHMM run, created on demand
results_dir = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}"
os.makedirs(results_dir, exist_ok=True)

# Launch the biolib CLI with the results directory as working directory,
# capturing stdout and stderr as text
deeptmhmm_cmd = ["biolib", "run", "DTU/DeepTMHMM", "--fasta", fasta_filepath]
result = subprocess.run(deeptmhmm_cmd, capture_output=True, text=True, cwd=results_dir)

print("STDOUT:", result.stdout)
print("STDERR:", result.stderr)

In [None]:
# Read the DeepTMHMM result file and keep the start/end columns of every
# line that mentions a TMhelix
tmh_ranges = []
tmh_result_file = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\DeepTMHMM_results\\{pdb_code}\\biolib_results\\TMRs.gff3"

with open(tmh_result_file) as gff_handle:
    for record in gff_handle:
        if "TMhelix" not in record:
            continue
        fields = record.strip().split("\t")
        tmh_ranges.append((int(fields[2]), int(fields[3])))

# Translate each (start, end) position into the matching entries of
# observed_residues; positions are treated as 1-based, hence the "- 1"
tmh_pdb_residue_pairs = [
    (observed_residues[first - 1], observed_residues[last - 1])
    for first, last in tmh_ranges
]

def extend_tmh_ranges(tmh_ranges, ss_data, max_extend=9):
    """Grow each (start, end) range outwards over adjacent 'H' entries.

    ``start`` and ``end`` are used directly as indices into ``ss_data``.
    From each end the walk moves one index at a time, for at most
    ``max_extend`` steps, and stops at the first index that is out of bounds
    or whose entry is not ``"H"``.

    Parameters
    ----------
    tmh_ranges : iterable of (start, end)
        Index pairs into ``ss_data``.
    ss_data : sequence
        Per-index labels; only equality with ``"H"`` is tested.
    max_extend : int, optional
        Maximum number of steps taken in each direction (default 9).

    Returns
    -------
    list of (int, int)
        One ``(extended_start, extended_end)`` pair per input range.
    """

    def walk(origin, step, within_bounds):
        # Furthest index reached from `origin` while every visited entry is "H"
        reached = origin
        for offset in range(1, max_extend + 1):
            candidate = origin + step * offset
            if not (within_bounds(candidate) and ss_data[candidate] == "H"):
                break
            reached = candidate
        return reached

    return [
        (
            walk(start, -1, lambda idx: idx >= 0),
            walk(end, 1, lambda idx: idx < len(ss_data)),
        )
        for start, end in tmh_ranges
    ]

# extend_tmh_ranges uses start/end directly as indices into ss_data (0-based),
# whereas tmh_ranges holds positions that are looked up as
# observed_residues[position - 1] (1-based). Shift to 0-based indices for the
# call and back to 1-based afterwards, so the correct neighbours are checked
# and an extended start can never become 0 (which would make
# observed_residues[start - 1] wrap around to the last element).
zero_based_tmh_ranges = [(start - 1, end - 1) for start, end in tmh_ranges]
extended_tmh_ranges = [
    (start + 1, end + 1)
    for start, end in extend_tmh_ranges(zero_based_tmh_ranges, ss_data)
]

# Convert extended TMH positions to actual PDB residue numbers as pairs
tmh_extended_pairs = [
    (observed_residues[start - 1], observed_residues[end - 1]) for start, end in extended_tmh_ranges
]

In [None]:
# Determine how much each TMH would like to extend based on consecutive 'H' residues
def calculate_desired_extensions(tmh_ranges, ss_data, max_extend=9):
    """Count how many consecutive 'H' entries border each range on either side.

    ``start`` and ``end`` are used directly as indices into ``ss_data``.
    Counting begins at the index next to the range and stops at the first
    entry that is out of bounds or not ``"H"``, or after ``max_extend``
    entries.

    Parameters
    ----------
    tmh_ranges : iterable of (start, end)
        Index pairs into ``ss_data``.
    ss_data : sequence
        Per-index labels; only equality with ``"H"`` is tested.
    max_extend : int, optional
        Upper bound for each count (default 9).

    Returns
    -------
    list of (int, int)
        One ``(backward_extension, forward_extension)`` pair per input range.
    """

    def helix_run(indices, in_bounds):
        # Number of leading indices that are in bounds and labelled "H"
        run_length = 0
        for idx in indices:
            if not (in_bounds(idx) and ss_data[idx] == "H"):
                break
            run_length += 1
        return run_length

    return [
        (
            helix_run(range(start - 1, start - max_extend - 1, -1), lambda idx: idx >= 0),
            helix_run(range(end + 1, end + max_extend + 1), lambda idx: idx < len(ss_data)),
        )
        for start, end in tmh_ranges
    ]

# Example usage
# calculate_desired_extensions uses start/end directly as indices into ss_data
# (0-based), while tmh_ranges holds positions that the notebook looks up as
# observed_residues[position - 1] (1-based), so shift them before the call.
# The returned values are counts and need no shifting back.
desired_extensions = calculate_desired_extensions(
    [(start - 1, end - 1) for start, end in tmh_ranges], ss_data
)
print("Desired Extensions:", desired_extensions)



# Calculate the number of residues available between consecutive TMHs
def calculate_available_spaces(tmh_ranges):
    """Return the number of positions lying between each pair of neighbouring ranges.

    Parameters
    ----------
    tmh_ranges : sequence of (start, end)
        Ranges in the order in which neighbours should be compared.

    Returns
    -------
    list of int
        ``next_start - previous_end - 1`` for every consecutive pair, so the
        result has one element fewer than ``tmh_ranges`` (empty for zero or
        one range).
    """
    return [
        following[0] - current[1] - 1
        for current, following in zip(tmh_ranges, tmh_ranges[1:])
    ]

# Example usage
available_spaces = calculate_available_spaces(tmh_ranges)
print("Available Spaces:", available_spaces)

import math

# Upper bound on how far any single extension may reach
max_extension = 9

# Clamp every desired extension to that bound
desired_extensions = [
    (min(backward, max_extension), min(forward, max_extension))
    for backward, forward in desired_extensions
]

# Key the extensions by helix index so each gap can address its two neighbours
desired_extensions_dict = {index: pair for index, pair in enumerate(desired_extensions)}

# Gap k lies between helix k (which extends forward into it) and helix k + 1
# (which extends backward into it). Shrink their wishes when they do not fit.
for gap_index, available in enumerate(available_spaces):
    left_back, left_fwd = desired_extensions_dict[gap_index]
    right_back, right_fwd = desired_extensions_dict[gap_index + 1]

    # Both wishes fit into the gap: nothing to adjust
    if left_fwd + right_back <= available:
        continue

    half_space = math.floor(available / 2)  # Half the gap, rounded down

    if half_space >= left_fwd:
        # Helix k's forward wish fits in its half; helix k + 1 takes what is left
        desired_extensions_dict[gap_index + 1] = (available - left_fwd, right_fwd)
    elif half_space >= right_back:
        # Helix k + 1's backward wish fits in its half; helix k takes what is left
        desired_extensions_dict[gap_index] = (left_back, available - right_back)
    else:
        # Neither fits in half the gap: both sides get half
        desired_extensions_dict[gap_index] = (left_back, half_space)
        desired_extensions_dict[gap_index + 1] = (half_space, right_fwd)

print(desired_extensions_dict)

# Apply the settled extensions: move each start backwards and each end forwards
extended_tmh_ranges = [
    (
        start - desired_extensions_dict[helix_index][0],
        end + desired_extensions_dict[helix_index][1],
    )
    for helix_index, (start, end) in enumerate(tmh_ranges)
]

# Print or return the extended TMH ranges
print(extended_tmh_ranges)

In [None]:
# Download multiple PDB files in parallel
def download_all_pdbs(pdb_codes, save_dir):
    """Download several PDB entries concurrently into ``save_dir``.

    ``download_pdb(pdb_code, save_dir)`` is submitted to a thread pool once
    per code. A failed download does not abort the others; each failure is
    reported on stdout after all downloads have finished, so it no longer
    disappears silently.

    Parameters
    ----------
    pdb_codes : iterable of str
        PDB codes to download.
    save_dir : str
        Target directory; created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)
    with ThreadPoolExecutor() as executor:
        downloads = [
            (pdb_code, executor.submit(download_pdb, pdb_code, save_dir))
            for pdb_code in pdb_codes
        ]

    # Leaving the with-block has waited for every task. Exceptions are stored
    # on the futures and must be read explicitly to become visible.
    for pdb_code, future in downloads:
        error = future.exception()
        if error is not None:
            print(f"[{pdb_code}] download failed: {error!r}")

# Run DeepTMHMM for multiple PDB codes in parallel asynchronously
def run_all_deeptmhmm(pdb_codes, fasta_dir, results_dir):
    """Run DeepTMHMM for several PDB codes concurrently.

    For every code, ``run_deeptmhmm(pdb_code, fasta_path, results_dir)`` is
    submitted to a thread pool, where ``fasta_path`` is
    ``<fasta_dir>/<pdb_code>.fasta``.

    Parameters
    ----------
    pdb_codes : iterable of str
        PDB codes to process.
    fasta_dir : str
        Directory holding one ``<pdb_code>.fasta`` file per code.
    results_dir : str
        Directory for the results; created if it does not exist.

    Raises
    ------
    Exception
        Whatever a worker raised is re-raised by ``future.result()``.
    """
    os.makedirs(results_dir, exist_ok=True)

    with ThreadPoolExecutor() as executor:
        processes = {
            pdb_code: executor.submit(run_deeptmhmm, pdb_code, os.path.join(fasta_dir, f"{pdb_code}.fasta"), results_dir)
            for pdb_code in pdb_codes
        }

    # Leaving the with-block has already waited for all workers to finish
    for pdb_code, future in processes.items():
        outcome = future.result()
        # Only a process-like result still has output to collect. A worker
        # that ran the command to completion and returned something else
        # (for example the PDB code) has nothing left to read, and calling
        # communicate() on it would raise AttributeError.
        if hasattr(outcome, "communicate"):
            stdout, stderr = outcome.communicate()
            print(f"[{pdb_code}] STDOUT:", stdout)
            print(f"[{pdb_code}] STDERR:", stderr)

In [None]:
def run_deeptmhmm(pdb_code, fasta_filepath_wsl, results_dir=None):
    """Run DeepTMHMM for one structure through biolib inside WSL.

    The command is executed with a per-structure results directory as its
    working directory, and its captured stdout and stderr are printed.

    Parameters
    ----------
    pdb_code : str
        PDB code; used as the name of the per-structure results folder.
    fasta_filepath_wsl : str
        FASTA path handed to the ``--fasta`` option of the command.
    results_dir : str, optional
        Parent directory for the results. When omitted, the notebook's
        original hard-coded DeepTMHMM_results location is used.

    Returns
    -------
    str
        ``pdb_code``, whether or not the command succeeded; a non-zero exit
        code is reported on stdout.
    """
    if results_dir is None:
        pdb_results_dir = f"/Users/Student/OneDrive - Aston University/Documents/Biology/Project/Project_automation/Python/DeepTMHMM_results/{pdb_code}"
    else:
        pdb_results_dir = os.path.join(results_dir, pdb_code)
    os.makedirs(pdb_results_dir, exist_ok=True)

    process = subprocess.run(
        ["wsl", "/home/dan/.local/bin/biolib", "run", "--local", "DTU/DeepTMHMM:1.0.24", "--fasta", f"{fasta_filepath_wsl}"],
        text=True,
        capture_output=True,
        cwd=pdb_results_dir
    )

    print(process.stdout)
    print(process.stderr)

    # Make a failed run visible instead of returning as if nothing happened
    if process.returncode != 0:
        print(f"[{pdb_code}] DeepTMHMM exited with code {process.returncode}")

    return pdb_code