### Grab example PDB and convert to backbone only
- Additionally, run the ProteinMPNN package's data-preparation step to convert the backbone structure to JSONL

In [0]:
%pip install biopython
%pip install ../proteinmpnn
# Some extras are specified in a pip requirements file, which handles the extra index URLs needed for CUDA-specific builds of certain packages
# %pip install -r ../envs/requirements.txt
dbutils.library.restartPython()

In [0]:
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably so that code living one level above this notebook
# can be imported — confirm which modules actually rely on this.
sys.path.append('../')

In [0]:
import Bio.PDB as PDB
def convert_to_backbone(pdb_file_path, output_file_path, chain_id='A'):
    """Write a backbone-only, glycine-relabelled copy of one chain of a PDB file.

    The input structure is parsed with Biopython. For the chain whose id equals
    ``chain_id``, every standard residue is reduced to its backbone atoms
    (``N``, ``CA``, ``C``, ``O``), renamed to ``GLY`` and renumbered
    sequentially starting at 1. Hetero residues and waters are skipped, and
    only atoms whose ``altloc`` is blank or ``'A'`` are kept.

    Args:
        pdb_file_path: Path of the PDB file to read.
        output_file_path: Path the backbone-only PDB file is written to.
        chain_id: Id of the chain to extract; also used as the id of the
            chain in the output structure.

    Notes:
        * Every model of the input is scanned and matching residues are all
          appended to a single output chain, with numbering continuing
          across models.
        * A standard residue without any accepted backbone atom is left out
          of the output but still consumes a residue number, so it leaves a
          gap in the numbering.
        * The atom objects of the parsed structure are attached to the new
          residues directly (they are not copied).
    """
    backbone_atom_names = ('N', 'CA', 'C', 'O')
    accepted_altlocs = (' ', 'A')

    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('protein', pdb_file_path)

    # Empty structure -> model -> chain hierarchy that receives the backbone.
    new_structure = PDB.Structure.Structure('backbone')
    new_model = PDB.Model.Model(0)
    new_chain = PDB.Chain.Chain(chain_id)
    new_structure.add(new_model)
    new_model.add(new_chain)

    residue_id = 1
    for model in structure:
        for chain in model:
            if chain.id != chain_id:
                continue
            for residue in chain:
                # Skip hetero residues (non-blank hetero flag) and waters.
                if residue.id[0] != ' ' or residue.resname == 'HOH':
                    continue

                backbone_atoms = [
                    atom for atom in residue
                    if atom.altloc in accepted_altlocs
                    and atom.name in backbone_atom_names
                ]

                # Only build and add a residue when it actually has backbone
                # atoms. This replaces the earlier "compare with the last
                # added residue" check, which could hit an unbound
                # `new_residue` and rebuilt the residue list on every step.
                if backbone_atoms:
                    new_residue = PDB.Residue.Residue(
                        (' ', residue_id, ' '), 'GLY', ' '
                    )
                    for atom in backbone_atoms:
                        new_residue.add(atom)
                    new_chain.add(new_residue)

                # Advance for every standard residue, kept or not.
                residue_id += 1

    # Write the backbone-only structure to a new PDB file.
    io = PDB.PDBIO()
    io.set_structure(new_structure)
    io.save(output_file_path)

#### Now actually download a PDB and process it to backbone

In [0]:
# Download the PDB file and reduce it to a backbone-only structure
import tempfile

import requests

url = "https://files.rcsb.org/download/5yd3.pdb"

response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of parsing an error page as a PDB file.
response.raise_for_status()
init_pdb_str = response.content.decode('utf-8')

# delete=False keeps the file on disk after the handle is closed, so it can be
# re-opened by path below. The file is not removed afterwards.
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdb") as temp_file:
    temp_file.write(response.content)
    pdb_file_path = temp_file.name

# Convert only after the `with` block has closed (and therefore flushed) the
# temp file; reading it by path while the buffered handle is still open can
# yield a truncated structure.
convert_to_backbone(pdb_file_path, '../example_data/inputs/5yd3.pdb')

#### Convert the pdb backbone to JSONL format

In [0]:
import os
import shutil
import tempfile

import requests
from proteinmpnn.parse_multiple_chains import main, get_argparser

url = "https://files.rcsb.org/download/5yd3.pdb"

parser = get_argparser()

# The temporary directory is the staging folder handed to the parser as
# --input_path; it is removed when the `with` block exits.
with tempfile.TemporaryDirectory() as temp_dir:
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of parsing an error page as a PDB file.
    response.raise_for_status()

    # pre-clean the structure to a given chain, the actual removal of CA not necessary...
    # delete=False keeps the download on disk after the handle is closed so it
    # can be re-opened by path; it lives outside temp_dir and is not removed.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdb") as temp_file:
        temp_file.write(response.content)
        pdb_file_path = temp_file.name

    # Convert only after the temp file has been closed (and therefore
    # flushed); reading it by path while the buffered handle is still open can
    # yield a truncated structure.
    convert_to_backbone(pdb_file_path, '../example_data/inputs/5yd3_backbone.pdb')

    # Stage the backbone file in the input directory under the name 5yd3.pdb.
    shutil.copy(
        '../example_data/inputs/5yd3_backbone.pdb',
        os.path.join(temp_dir, "5yd3.pdb"),
    )

    arg_list = [
        '--ca_only',
        '--input_path', temp_dir,
        '--output_path', '../example_data/inputs/example_inputs.jsonl',
    ]
    args = parser.parse_args(arg_list)
    main(args)

    