In [2]:
# Call libraries
from Bio import PDB
import tempfile
import pandas as pd

# Create function to extract coordinates
def get_coords(pdb_id, chain_id, residues):
    """Download a PDB entry and collect C-alpha coordinates of chosen residues.

    The entry is fetched in PDB format into a temporary directory (removed
    again when the function finishes), parsed with ``PDB.PDBParser``, and the
    resolution is read from the ``REMARK   2 RESOLUTION`` record of the file.

    Args:
        pdb_id: PDB accession code; case-insensitive (lower-cased for the
            download, upper-cased in the returned row).
        chain_id: Identifier of the chain to look the residues up in.
        residues: Iterable of residue identifiers used to index the chain.

    Returns:
        A flat list ``[PDB_ID, resolution, chain_id, label, x, y, z, ...]``
        with one ``label, x, y, z`` group per requested residue, in request
        order. ``label`` is the residue name followed by the residue id and
        ``x, y, z`` are the values unpacked from the CA atom's ``coord``.
        A residue whose CA atom cannot be found in any model contributes
        ``"Unknown<id>", "NA", "NA", "NA"`` instead. ``resolution`` is the
        numeric token of the REMARK 2 record as a string, or ``"Unknown"``
        when no numeric value is present.
    """
    pdb_id = pdb_id.lower()
    pdbl = PDB.PDBList()

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        pdb_file_path = pdbl.retrieve_pdb_file(pdb_id, pdir=temp_dir, file_format="pdb")

        # Parse the file
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_file_path)

        # Extract resolution
        resolution = "Unknown"
        with open(pdb_file_path, "r") as f:
            for line in f:
                if line.startswith("REMARK   2 RESOLUTION"):
                    fields = line.split()
                    # Only accept a numeric value: entries without a
                    # resolution carry text here (e.g. "NOT APPLICABLE."),
                    # and a truncated record may have no fourth field at all.
                    if len(fields) > 3:
                        try:
                            float(fields[3])
                        except ValueError:
                            pass
                        else:
                            resolution = fields[3]
                    break

        coords = [pdb_id.upper(), resolution, chain_id]

        # Loop through file to extract residue coordinates
        for residue_id in residues:
            entry = None
            for model in structure:
                if chain_id not in model:
                    continue
                chain = model[chain_id]
                if residue_id not in chain:
                    continue
                residue = chain[residue_id]
                ca_atom = next(
                    (atom for atom in residue if atom.get_name() == "CA"), None
                )
                if ca_atom is not None:
                    entry = [f"{residue.get_resname()}{residue_id}", *ca_atom.coord]
                    # Stop at the first model that provides the atom so that
                    # multi-model entries still yield exactly one
                    # label/x/y/z group per requested residue.
                    break
            if entry is None:
                entry = [f"Unknown{residue_id}", "NA", "NA", "NA"]
            coords.extend(entry)

    return coords

# Select pdb code, chain and residues.
# Layout: {PDB ID: {chain ID: [residue ids to extract]}}
pdb_data = {
    "2R4R": {"A": [37, 58, 90, 67]},
    "2R4S": {"A": [37, 58, 90, 70]},
    "2RH1": {"A": [30, 60, 96, 67]},
    "3D4S": {"A": [33, 60, 96, 67]},
    "3KJ6": {"A": [37, 57, 90, 67]},
    "3NY8": {"A": [33, 60, 96, 67]}
}

data = []

# Collect one row per (PDB entry, chain) pair
for pdb_id, chains in pdb_data.items():
    for chain_id, residues in chains.items():
        data.append(get_coords(pdb_id, chain_id, residues))

# Organise the data frame for accurate conversion to Excel:
# the widest row decides how many Res/X/Y/Z column groups are needed.
max_residues = max(
    (len(residues) for chains in pdb_data.values() for residues in chains.values()),
    default=0,
)

# Define columns dynamically
columns = ["PDB ID", "Resolution", "Chain"] + ["Res", "X", "Y", "Z"] * max_residues

# Convert to DataFrame
df = pd.DataFrame(data, columns=columns)

# Save as Excel output
output_file = "C:/Users/Student/OneDrive - Aston University/Documents/Biology/Project/Landmarks/Automated landmarks/Protein_coordinates.xlsx"
df.to_excel(output_file, index=False)

# Print coordinates as a test.
# Reuse the row already collected for the last entry instead of calling
# get_coords again, which would download the same structure a second time.
coordinates = data[-1] if data else []
print(coordinates)

Downloading PDB structure '2r4r'...
Downloading PDB structure '2r4s'...
Downloading PDB structure '2rh1'...
Downloading PDB structure '3d4s'...
Downloading PDB structure '3kj6'...
Downloading PDB structure '3ny8'...
Downloading PDB structure '3ny8'...
['3NY8', '2.84', 'A', 'VAL33', np.float32(14.207), np.float32(25.4), np.float32(57.341), 'LYS60', np.float32(-6.015), np.float32(19.556), np.float32(22.683), 'MET96', np.float32(5.649), np.float32(22.468), np.float32(61.567), 'VAL67', np.float32(-4.797), np.float32(4.51), np.float32(23.6)]
