# Classic pre-prediction processing tools

Howdy there you handsome Hacker! Fancy seeing you here (ah-wink-ah-wink, if ya'll know what I'm saying) - I am not drunk writing this, I'm just listening to southern folk music.

Aaaanyways, this notebook is made specifically for you to have a nice, all-in-one collection of functions that can be useful for your pre-processing of pdb files, or any other types of input files that you may need for running predictions on your favourite model.

Here you will find :

- convert your file from pdb to cif
- convert your file from cif to pdb
- convert files from pdb to FASTA for AF3 prediction from server

**N.B.**: For the following pipelines you will need to install **BioPython** and **Biotite** from your terminal. We recommend you do this in a **Python Virtual Environment (venv)**. If you're a newbie to venv, you can ask your favourite AI agent how to create one on your PC :)

If you're using Python, after activating your venv, run:

-       pip install --upgrade pip
-       pip install biopython
-       pip install biotite

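If you're on a MacBook (bash/zsh) and prefer Homebrew, run the commands below (if Homebrew cannot find these packages, use the pip commands above inside your venv instead):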

-       brew install biopython
-       brew install biotite

If any other needs for pre-processing come to your attention, please report to one of the coaches, and we'll make sure to add some code for that as well!

In the meantime, just enjoy the process, and happy protein engineering!

See ya mate

In [None]:
# ======================================================
# 1. Import all the libraries you need
# ======================================================
#!/usr/bin/env python
#!/usr/bin/env python
import os
import glob
from pathlib import Path
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import PDBIO
from Bio import SeqIO
import biotite.structure.io.pdb as pdb
import biotite.structure.io.pdbx as pdbx


In [None]:
# ======================================================
# 1. Convert CIF files to PDB format
# ======================================================

def rename_chains(structure):
    """Rename chains to single-character IDs for PDB compatibility.

    Chains whose ID is already one character long are left untouched. Every
    other chain is renamed in place (by assigning ``chain.id``): it gets the
    first character of its current ID when that character is still free,
    otherwise the next unused character from ``A-Z``, ``0-9``, ``a-z``
    (tried in that order).

    Args:
        structure: Any object exposing ``get_chains()`` that yields chain
            objects with a writable string ``id`` attribute.

    Returns:
        dict: Maps each single-character ID in use after renaming to the
        chain ID it had before the call.

    Raises:
        ValueError: If a chain needs a fallback ID but all 62 alphanumeric
            characters are already taken.
    """
    # Fallback IDs, in the order they are handed out.
    candidates = (
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789"
        "abcdefghijklmnopqrstuvwxyz"
    )

    # Materialise the chains once: IDs are reassigned inside the loop below.
    chains = list(structure.get_chains())

    # Single-character IDs are reserved up front so they are never reused.
    chainmap = {chain.id: chain.id for chain in chains if len(chain.id) == 1}

    next_candidate = 0
    for chain in chains:
        if len(chain.id) == 1:
            continue

        first = chain.id[:1]  # slicing tolerates an empty chain ID
        if first and first not in chainmap:
            new_id = first
        else:
            # Skip fallback characters that are already in use.
            while (next_candidate < len(candidates)
                   and candidates[next_candidate] in chainmap):
                next_candidate += 1
            if next_candidate >= len(candidates):
                # Running past 'z' would hand out '{', '|', ... as chain IDs,
                # so stop with an explicit error instead.
                raise ValueError(
                    f"Cannot rename chain {chain.id!r}: all "
                    f"{len(candidates)} single-character chain IDs are in use"
                )
            new_id = candidates[next_candidate]
            next_candidate += 1

        chainmap[new_id] = chain.id
        chain.id = new_id
    return chainmap

# Glob pattern selecting the CIF files to convert -- update it to your folder.
cif_pattern = "/your/cif/folder/*.cif"
cif_files = glob.glob(cif_pattern)

# Convert each CIF file on its own so that one bad file does not stop the batch.
for ciffile in cif_files:
    try:
        source = Path(ciffile)

        # The first four characters of the file name serve as the structure ID.
        structure = MMCIFParser().get_structure(
            os.path.basename(ciffile)[:4], ciffile
        )
        rename_chains(structure)

        # Results go to a "converted_pdbs" subfolder next to the input file;
        # the folder is created on first use.
        output_dir = source.parent / "converted_pdbs"
        output_dir.mkdir(exist_ok=True)
        pdbfile = output_dir / (source.stem + ".pdb")

        writer = PDBIO()
        writer.set_structure(structure)
        writer.save(str(pdbfile))  # save() receives the path as a plain string
        print(f"Success: {ciffile} -> {pdbfile}")
    except Exception as e:
        print(f"Error with {ciffile}: {e}")


# ======================================================
# 2. Convert PDB files to CIF format
# ======================================================

# === SET YOUR PATHS HERE ===
# Both values are passed to convert_pdb_to_cif() further down in this cell.
input_folder = "/your/input/path"  # Change this: folder that is searched for PDB files
output_folder = "/your/output/folder"  # Change this: folder the .cif files are written to

def convert_pdb_to_cif(input_folder: str, output_folder: str):
    """Convert every PDB file found directly in ``input_folder`` to CIF.

    Each ``<name>.pdb`` is read with Biotite and written to
    ``output_folder/<name>.cif``. Files are processed in sorted order and a
    failure on one file is printed without stopping the batch.

    Args:
        input_folder: Folder that is searched (non-recursively) for files
            whose extension is ``.pdb`` in any letter case.
        output_folder: Destination folder; created, including parents, when
            it does not exist yet.

    Returns:
        None. Progress and per-file failures are printed.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # One case-insensitive pass over the folder: globbing "*.pdb" and "*.PDB"
    # separately lists every file twice where glob matching ignores case and
    # misses mixed-case extensions such as ".Pdb".
    pdb_files = sorted(
        path for path in input_path.glob("*") if path.suffix.lower() == ".pdb"
    )
    print(f"Found {len(pdb_files)} PDB files to convert")

    for pdb_file_path in pdb_files:
        pdb_filename = pdb_file_path.stem
        output_cif_path = output_path / f"{pdb_filename}.cif"

        try:
            pdb_file = pdb.PDBFile.read(pdb_file_path)
            # Only the first model is converted. Bonds and the optional
            # annotation fields are deliberately not requested; the original
            # author disabled them to avoid strict parsing of those fields.
            structure = pdb_file.get_structure(
                model=1,
                include_bonds=False,
                extra_fields=[],
            )

            # Write as CIF
            cif_file = pdbx.CIFFile()
            pdbx.set_structure(cif_file, structure)
            cif_file.write(output_cif_path)

            print(f"✓ {pdb_file_path.name} → {output_cif_path.name}")

        except Exception as e:
            # Per-file boundary: report the (truncated) error and move on.
            print(f"✗ Failed {pdb_filename}: {str(e)[:80]}...")

# RUN CONVERSION
# Executes as soon as this cell runs, using input_folder / output_folder set above.
convert_pdb_to_cif(input_folder, output_folder)
# Printed unconditionally; per-file failures are reported by convert_pdb_to_cif itself.
print("Batch conversion complete!")

# ======================================================
# 3. Convert PDB files to FASTA format - useful for AF3
# ======================================================
# These lines do the conversion in batch: every PDB file in a folder is converted to FASTA
def batch_structures_to_fasta(input_dir: str | Path,
                              output_dir: str | Path,
                              single_output: bool = False):
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if single_output:
        merged_path = output_dir / "all_sequences.fasta"
        if merged_path.exists():
            merged_path.unlink()

    total_chains = 0
    for path in input_dir.iterdir():
        if path.suffix.lower() != ".pdb":  # Adjust if you have CIFs
            continue

        records = list(SeqIO.parse(str(path), "pdb-atom"))
        if not records:
            print(f"Skipped {path.name}: no ATOM records")
            continue

        total_chains += len(records)
        if single_output:
            for rec in records:
                rec.id = f"{path.stem}|{rec.id}"
                rec.description = ""
            with open(merged_path, "a") as handle:
                SeqIO.write(records, handle, "fasta")
        else:
            out_path = output_dir / f"{path.stem}.fasta"
            SeqIO.write(records, str(out_path), "fasta")

    print(f"Processed {total_chains} total chains")


# ---------------------------
# Example usage:
# ---------------------------
# NOTE: both calls below are live code, so running this cell executes both modes.
# Comment out the one you do not need, and replace the placeholder paths first.

# One FASTA file per input PDB (written as <pdb stem>.fasta):
batch_structures_to_fasta("PDBspath", "FASTAspath", single_output=False)
# or: every chain appended to a single all_sequences.fasta:
batch_structures_to_fasta("PDBspath", "FASTAspath", single_output=True) 
