<a href="https://colab.research.google.com/github/dharshinikbt23-crypto/Bioinformatics-5th-sem/blob/main/Metagonmics_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Biopython first
!pip install biopython

import os
import sys
import urllib.request
from Bio import PDB, SeqRecord, Seq

def download_read_pdb(pdbcode, datadir, keepfile=True):
    """
    Fetches a PDB entry, parses it, and optionally discards the local copy.
    The download is delegated to download_pdb() and the parsing to read_pdb().
    :param pdbcode: The standard PDB ID e.g. '3ICB'
    :param datadir: The directory where the downloaded file will be saved
    :param keepfile: if False, the downloaded file is removed after parsing
    :return: whatever read_pdb() returns for the downloaded file,
        or None if the download itself failed
    """
    local_path = download_pdb(pdbcode, datadir)
    if local_path is None:
        # Nothing was downloaded, so there is nothing to parse or clean up.
        return None

    parsed = read_pdb(pdbcode, local_path)

    if keepfile:
        return parsed

    # The caller does not want the file kept: remove it if it is still there.
    if local_path and os.path.exists(local_path):
        os.remove(local_path)
    return parsed

def download_pdb(pdbcode, datadir, downloadurl="https://files.rcsb.org/download/"):
    """
    Downloads a PDB file from the Internet and saves it in a data directory.

    The file is first written to a temporary '<name>.pdb.part' file and only
    moved to its final name once the transfer has completed, so a failed or
    interrupted download never leaves a truncated file behind and never
    clobbers a previously downloaded copy.

    :param pdbcode: The standard PDB ID e.g. '3ICB' or '3icb'
        (upper-cased to build the file name)
    :param datadir: The directory where the downloaded file will be saved;
        it is created if it does not exist yet
    :param downloadurl: The base PDB download URL, with or without a
        trailing slash
    :return: the full path to the downloaded PDB file or None if something went wrong
    """
    pdbfn = pdbcode.upper() + ".pdb"
    # Join base URL and file name with exactly one slash, so a base URL
    # passed without a trailing "/" still produces a valid address.
    url = downloadurl.rstrip("/") + "/" + pdbfn
    outfnm = os.path.join(datadir, pdbfn)
    partfnm = outfnm + ".part"

    try:
        # Creating the directory is inside the try block so that a failure
        # here is reported as None, like every other failure of this function.
        os.makedirs(datadir, exist_ok=True)
        urllib.request.urlretrieve(url, partfnm)
        # Move the finished file into place in a single step.
        os.replace(partfnm, outfnm)
    except Exception as err:
        print(f"Error downloading {pdbcode}: {str(err)}", file=sys.stderr)
        # Best-effort removal of an incomplete temporary file.
        try:
            if os.path.exists(partfnm):
                os.remove(partfnm)
        except OSError:
            pass
        return None

    print(f"Successfully downloaded {pdbcode} to {outfnm}")
    return outfnm

def read_pdb(pdbcode, pdbfilenm):
    """
    Parse a PDB file with Bio.PDB.PDBParser (created with QUIET=True).
    Any exception raised while parsing is reported on stderr and
    turned into a None return value.
    :param pdbcode: A PDB ID string, passed to the parser as the structure id
    :param pdbfilenm: The PDB file
    :return: the object returned by PDBParser.get_structure(),
        or None if something went wrong
    """
    try:
        parser = PDB.PDBParser(QUIET=True)
        structure = parser.get_structure(pdbcode, pdbfilenm)
    except Exception as err:
        print(f"Error reading PDB file {pdbfilenm}: {str(err)}", file=sys.stderr)
        return None
    return structure

def extract_seqrecords(pdbcode, struct):
    """
    Builds one sequence record per chain of a Bio.PDB structure.

    For every chain the polypeptides found by PPBuilder are joined into a
    single sequence. Chains that yield no polypeptides or an empty sequence
    are skipped with a warning on stderr; a chain whose processing raises
    is skipped with an error message on stderr.
    :param pdbcode: the PDB ID of the structure, used as prefix of the record ids
    :param struct: a Bio.PDB.Structure object
    :return: a list of Bio.SeqRecord objects
    """
    builder = PDB.PPBuilder()
    records = []

    for i, chain in enumerate(struct.get_chains()):
        try:
            peptides = builder.build_peptides(chain)

            # No polypeptide could be built for this chain.
            if not peptides:
                print(f"Warning: No polypeptides found in chain {chain.id}", file=sys.stderr)
                continue

            # Join the pieces in order; chain breaks produce several pieces.
            chain_seq = Seq.Seq("")
            for peptide in peptides:
                chain_seq += peptide.get_sequence()

            if len(chain_seq) == 0:
                print(f"Warning: Empty sequence for chain {chain.id}", file=sys.stderr)
                continue

            records.append(
                SeqRecord.SeqRecord(
                    chain_seq,
                    id=pdbcode + "_" + chain.id,
                    description=f"Sequence #{i+1}, chain {chain.id}",
                )
            )
        except Exception as err:
            # Report the problem and carry on with the next chain.
            print(f"Error processing chain {chain.id}: {str(err)}", file=sys.stderr)

    return records

def get_calphas(struct):
    """
    Collects the atoms of a structure whose full name is exactly " CA ".
    :param struct: an object whose get_atoms() yields atoms that provide
        get_fullname(), e.g. a Bio.PDB.Structure
    :return: a list with the matching atoms, in iteration order
    """
    def is_calpha(atom):
        # Compare the padded 4-character name, spaces included.
        return atom.get_fullname() == " CA "

    return list(filter(is_calpha, struct.get_atoms()))

# Example usage
if __name__ == "__main__":
    # Demo run: fetch one structure, then report its chains and C-alpha count.
    datadir = "./pdb_files"
    pdbcode = "3ICB"  # Change this to any PDB ID you want

    print(f"Downloading PDB structure {pdbcode}...")
    struct = download_read_pdb(pdbcode, datadir, keepfile=True)

    if not struct:
        print("Failed to download/process structure")
    else:
        print("\nStructure loaded successfully!")

        # One record per chain, printed with its length and leading residues.
        seqs = extract_seqrecords(pdbcode, struct)
        print(f"\nFound {len(seqs)} chain(s):")
        for seq in seqs:
            print(f"  {seq.id}: {len(seq.seq)} amino acids")
            print(f"  First 50 residues: {seq.seq[:50]}")

        # Count the C-alpha atoms of the whole structure.
        calphas = get_calphas(struct)
        print(f"\nTotal C-alpha atoms: {len(calphas)}")

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86
Downloading PDB structure 3ICB...
Successfully downloaded 3ICB to ./pdb_files/3ICB.pdb

Structure loaded successfully!

Found 1 chain(s):
  3ICB_A: 75 amino acids
  First 50 residues: KSPEELKGIFEKYAAKEGDPNQLSKEELKLLLQTEFPSLLKGPSTLDELF

Total C-alpha atoms: 75
