In [7]:
from Bio import PDB, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Download the PDB-format entry for every structure of interest.
# PDBList reports "Structure exists" and skips the download when the
# file is already present in the target directory.
pdb_codes = ["3d4s", "2rh1"]
pdb_file = PDB.PDBList()
for pdb_code in pdb_codes:
    pdb_file.retrieve_pdb_file(
        pdb_code,
        pdir="C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files",
        file_format="pdb",
    )
# Three-letter -> one-letter codes for the 20 standard amino acids.
# Defined once at module level so the table is not rebuilt for every
# residue that gets translated.
_THREE_TO_ONE = {
    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D", "CYS": "C",
    "GLN": "Q", "GLU": "E", "GLY": "G", "HIS": "H", "ILE": "I",
    "LEU": "L", "LYS": "K", "MET": "M", "PHE": "F", "PRO": "P",
    "SER": "S", "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
}


def three_to_one(resname):
    """Return the one-letter code for a three-letter residue name.

    The lookup ignores letter case and surrounding whitespace, so
    ``"ala"`` and ``" ALA "`` are both translated to ``"A"``.

    Args:
        resname: Three-letter residue name, e.g. ``"ALA"``.

    Returns:
        The one-letter amino-acid code, or ``"X"`` when ``resname`` is not
        one of the 20 standard amino acids (or is not a string at all).
    """
    # Non-string input (e.g. None) is treated like any other unknown residue.
    if not isinstance(resname, str):
        return "X"
    return _THREE_TO_ONE.get(resname.strip().upper(), "X")


def extract_pdb_fasta(pdb_filepath, chain_id, output_fasta):
    """Write the amino-acid sequence of one chain of a PDB file as FASTA.

    Only the first model of the structure is read, so multi-model files do
    not produce a sequence repeated once per model. Residues are kept when
    ``PDB.is_aa`` accepts them; each one is translated with ``three_to_one``.

    Args:
        pdb_filepath: Path of the PDB file to parse.
        chain_id: Identifier of the chain to extract, e.g. ``"A"``.
        output_fasta: Path (or handle) the FASTA record is written to.

    Returns:
        A list with ``residue.id[1]`` (the residue number) of every residue
        that contributed to the sequence, in chain order. The list is empty
        when the chain is not present in the first model.
    """
    from pathlib import Path  # local import: only needed for the structure label

    parser = PDB.PDBParser(QUIET=True)
    # Label the structure from the file it was read from instead of relying
    # on a global loop variable that may be undefined or stale at call time.
    structure_id = Path(pdb_filepath).stem
    structure = parser.get_structure(structure_id, pdb_filepath)

    sequence = []
    observed_residues = []

    # Restrict to the first model so the chain is read exactly once.
    first_model = next(iter(structure), None)
    if first_model is not None:
        for chain in first_model:
            if chain.id != chain_id:  # Only process the target chain
                continue
            for residue in chain.get_residues():
                if PDB.is_aa(residue):
                    sequence.append(three_to_one(residue.get_resname()))
                    observed_residues.append(residue.id[1])  # Keep track of numbering

    # NOTE(review): the record id embeds the full file path, which may contain
    # spaces; kept unchanged in case downstream code relies on this header.
    fasta_seq = SeqRecord(Seq("".join(sequence)), id=f"{pdb_filepath}_{chain_id}", description="")
    SeqIO.write(fasta_seq, output_fasta, "fasta")

    return observed_residues  # Return numbering for later alignment

# Residue numbering of the extracted chain, keyed by PDB code.
all_observed_residues = {}
chain_id = "A"

for pdb_code in pdb_codes:
    # Output FASTA file and input PDB entry for this code.
    fasta_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\Fasta_files\\{pdb_code}.fasta"
    pdb_filepath = f"C:\\Users\\Student\\OneDrive - Aston University\\Documents\\Biology\\Project\\Project_automation\\Python\\PDB_files\\pdb{pdb_code}.ent"

    # Write the FASTA file and remember which residue numbers were observed.
    observed_residues = extract_pdb_fasta(pdb_filepath, chain_id, fasta_filepath)
    all_observed_residues[pdb_code] = observed_residues


Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb3d4s.ent' 
Structure exists: 'C:\Users\Student\OneDrive - Aston University\Documents\Biology\Project\Project_automation\Python\PDB_files\pdb2rh1.ent' 
