In [1]:
# Call libraries
from Bio import PDB
import tempfile
import pandas as pd

# Create function to extract coordinates
def get_coords(pdb_id, chain_id, residues):
    pdb_id = pdb_id.lower()
    pdbl = PDB.PDBList()
    coords = [pdb_id.upper(), chain_id] 

    # Create temporary directory
    with tempfile.TemporaryDirectory() as temp_dir:
        pdb_file_path = pdbl.retrieve_pdb_file(pdb_id, pdir=temp_dir, file_format="pdb")

        # Parse the file
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdb_id, pdb_file_path)

        # Extract resolution
        resolution = "Unknown"
        with open(pdb_file_path, "r") as f:
            for line in f:
                if line.startswith("REMARK   2 RESOLUTION"):
                    resolution = line.split()[3]
                    break

        coords.insert(1, resolution) 
        
        # Loop through file to extract residue coordinates
        for residue_id in residues:
            found = False
            for model in structure:
                if chain_id in model:
                    chain = model[chain_id]
                    if residue_id in chain:
                        residue = chain[residue_id]
                        residue_name = residue.get_resname()
                        for atom in residue:
                            if atom.get_name() == "CA":
                                coords.extend([f"{residue_name}{residue_id}", *atom.coord])
                                found = True
                                break
            if not found:
                coords.extend([f"Unknown{residue_id}", "NA", "NA", "NA"])

    return coords

# Select pdb code, chain and residues
pdb_data = {
    "2R4R": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "2R4S": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "2RH1": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3D4S": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3D4S": {"A": [219, 326, 207, 315, 121, 282, 268, 131]},
    "6E67": {"B": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3SN6": {"R": [219, 326, 207, 315, 121, 282, 268, 131]},
    "3SN6": {"R": [219, 326, 207, 315, 121, 282, 268, 131]},




}

data = []

# Collect and prepare items to be processed by function
for pdb_id, chains in pdb_data.items():
    for chain_id, residues in chains.items():
        data.append(get_coords(pdb_id, chain_id, residues))

# Organise the data frame for accurate conversion to Excel
max_residues = max(len(residues) for chains in pdb_data.values() for residues in chains.values())

# Define columns dynamically
columns = ["PDB ID", "Resolution", "Chain"] + sum([["Res", "X", "Y", "Z"]] * max_residues, [])

# Convert to DataFrame
df = pd.DataFrame(data, columns=columns)

# Save as Excel output
output_file = "C:/Users/Student/OneDrive - Aston University/Documents/Biology/Project/Landmarks/Automated landmarks/Protein_coordinates.xlsx"
df.to_excel(output_file, index=False)

#Print coordinates as a test
coordinates = get_coords(pdb_id, chain_id, residues)
print(coordinates)

Downloading PDB structure '2r4r'...
Downloading PDB structure '2r4s'...
Downloading PDB structure '2rh1'...
Downloading PDB structure '3d4s'...
Downloading PDB structure '6e67'...
Downloading PDB structure '3sn6'...
Downloading PDB structure '3sn6'...
['3SN6', '3.20', 'R', 'TYR219', np.float32(8.428), np.float32(6.031), np.float32(13.925), 'TYR326', np.float32(21.614), np.float32(12.361), np.float32(13.496), 'SER207', np.float32(16.308), np.float32(0.329), np.float32(-2.121), 'GLY315', np.float32(18.342), np.float32(12.648), np.float32(-2.895), 'ILE121', np.float32(18.362), np.float32(4.453), np.float32(2.789), 'PHE282', np.float32(14.362), np.float32(11.456), np.float32(4.247), 'GLU268', np.float32(3.561), np.float32(17.853), np.float32(22.387), 'ARG131', np.float32(14.963), np.float32(3.333), np.float32(18.044)]


In [3]:
print(coords)

NameError: name 'coords' is not defined