In [34]:
from Bio import SeqIO
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Dictionary to map three-letter amino acid codes to single-letter codes
three_to_one = {
    "ALA": "A", "CYS": "C", "ASP": "D", "GLU": "E", "PHE": "F",
    "GLY": "G", "HIS": "H", "ILE": "I", "LYS": "K", "LEU": "L",
    "MET": "M", "ASN": "N", "PRO": "P", "GLN": "Q", "ARG": "R",
    "SER": "S", "THR": "T", "VAL": "V", "TRP": "W", "TYR": "Y"
}

def pdb_to_fasta_specific_chains(pdb_file, output_file, dir_name, chain_list):
    # Parse the PDB file
    parser = PDBParser(QUIET=True)  # QUIET=True suppresses warnings
    structure = parser.get_structure("protein", pdb_file)
    
    # Get the first model
    model = structure[0]
    
    # List to store SeqRecord objects for specified chains
    records = []
    
    # Iterate through specified chains in the model
    for chain_id in chain_list:
        if chain_id in model:
            chain = model[chain_id]
            sequence = ""
            
            # Extract the sequence from the chain
            for residue in chain:
                if residue.get_id()[0] == " ":  # Check if it's a standard amino acid
                    resname = residue.resname  # Get three-letter code
                    if resname in three_to_one:  # Convert to single-letter code
                        sequence += three_to_one[resname]
                    else:
                        print(f"Warning: Unknown residue {resname} in chain {chain_id}")
            
            # Create a SeqRecord object for the chain
            pdb_desc = pdb_file.split(f'{dir_name}/')[-1].split('.pdb')[0]
            record = SeqRecord(
                Seq(sequence),
                id=f"{pdb_desc}_chain_{chain_id}",
                description=f""
            )
            records.append(record)
        else:
            print(f"Warning: Chain {chain_id} not found in the PDB file.")
    
    # Write all sequences to a FASTA file
    SeqIO.write(records, output_file, "fasta")


In [29]:
# Usage
pdb_file = "/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2/8ee2_2_4A_contact_pssm_semigreedy_adam_0_120_32_0.01_models_5_weights_0.0_0.19_0.0_0.0_0.0_0.0_0.19_0.19_0.19_0.19_0.048_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb"
output_file = "/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2/8ee2_2_4A_contact_pssm_semigreedy_adam_0_120_32_0.01_models_5_weights_0.0_0.19_0.0_0.0_0.0_0.0_0.19_0.19_0.19_0.19_0.048_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.fasta"
chain_list = ['A','B']  # Specify the chains you want to extract
pdb_to_fasta_specific_chains(pdb_file, output_file, '8ee2_2', chain_list)

In [30]:
import os
directory = '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2'
pdb_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pdb')]
output_files = [f"{directory}/{f.split('8ee2_2/')[-1].split('.pdb')[0]}.fasta" for f in pdb_files]
print(pdb_files)
print(output_files)

['/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2/8ee2_2_4A_contact_pssm_semigreedy_adam_0_120_32_0.01_models_5_weights_0.0_0.078_0.0_0.0_0.0_0.0_0.039_0.392_0.392_0.078_0.02_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2/8ee2_2_4A_contact_pssm_semigreedy_adam_0_120_32_0.01_models_5_weights_0.0_0.133_0.0_0.0_0.0_0.0_0.133_0.267_0.267_0.133_0.067_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2/8ee2_2_4A_contact_pssm_semigreedy_adam_0_120_32_0.01_models_5_weights_0.0_0.152_0.0_0.0_0.0_0.0_0.076_0.304_0.304_0.152_0.013_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_2/8ee2_2_4A_contact_pssm_semigreedy_adam_0_120_32_0.01_models_5_weights_0.0_0.216_0.0_0.0_0.0_0.0_0.108_0.216_0.216_0.216_0.027_c38_use_templates_True_rm_template_ic_F

In [31]:
for i in range(len(pdb_files)):
    pdb_to_fasta_specific_chains(pdb_files[i], output_files[i], '8ee2_2', ['B'])

In [32]:
directory = '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_3'
pdb_files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.pdb')]
output_files = [f"{directory}/{f.split('8ee2_3/')[-1].split('.pdb')[0]}.fasta" for f in pdb_files]
print(pdb_files)
print(len(pdb_files))
print(output_files)

['/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_3/8ee2_3_4A_contact_pssm_semigreedy_adam_0_120_32_0.001_models_3_weights_0.0_0.0_0.0_0.0_0.0_0.0_1.0_0.0_0.0_0.0_0.0_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_3/8ee2_3_4A_contact_pssm_semigreedy_adam_0_120_32_0.005_models_3_weights_0.0_1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_3/8ee2_3_4A_contact_pssm_semigreedy_adam_0_120_32_0.001_models_3_weights_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_1.0_0.0_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/users/fatma.chafra01/ColabDesign/af/examples/8ee2_3/8ee2_3_4A_contact_pssm_semigreedy_adam_0_120_32_0.001_models_3_weights_0.0_1.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_0.0_c38_use_templates_True_rm_template_ic_False_bias_True_num_recycles_3.pdb', '/usr/u

In [36]:
for i in range(len(pdb_files)):
    pdb_to_fasta_specific_chains(pdb_files[i], output_files[i], '8ee2_3', ['B'])