In [11]:
# Mount Google Drive at /content/drive (Colab-only API); the BASE path
# configured below in this cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

from pathlib import Path
import pandas as pd

# Project layout on the mounted Drive.
BASE = Path("/content/drive/MyDrive/biolip_gnn")
SUBSET_PATH = BASE / "out" / "subset_200.csv"
STRUCT_DIR  = BASE / "structures"

print("Subset exists:", SUBSET_PATH.exists(), SUBSET_PATH)
print("Structures dir exists:", STRUCT_DIR.exists(), STRUCT_DIR)

df = pd.read_csv(SUBSET_PATH)

# Fail fast, with a readable message, if the subset lacks a column that the
# validation cells further down read from each row.
REQUIRED_COLS = ("pdb_id", "chain", "seq_len")
missing_cols = [col for col in REQUIRED_COLS if col not in df.columns]
if missing_cols:
    raise KeyError(f"{SUBSET_PATH.name} is missing required column(s): {missing_cols}")

# pandas' default NA parsing converts cells such as "NA" or "" to NaN, so a
# chain ID literally spelled "NA" would be lost here. Warn instead of raising
# so the rest of the notebook still runs.
n_missing_chain = int(df["chain"].isna().sum())
if n_missing_chain:
    print(f"WARNING: {n_missing_chain} row(s) have a missing chain ID after CSV parsing")

print("Subset rows:", len(df))
print("Unique PDB IDs:", df["pdb_id"].nunique())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Subset exists: True /content/drive/MyDrive/biolip_gnn/out/subset_200.csv
Structures dir exists: True /content/drive/MyDrive/biolip_gnn/structures
Subset rows: 200
Unique PDB IDs: 200


In [12]:
# Install Biopython into the notebook runtime (quiet mode).
# NOTE(review): the install is unpinned; consider `%pip install -q biopython==<version>`
# so re-runs resolve the same release.
!pip -q install biopython

import gzip
import os
import shutil

import numpy as np
from Bio.PDB import MMCIFParser
from Bio.PDB.Polypeptide import is_aa

In [15]:
from pathlib import Path

# Module-level mmCIF parser instance; parse_chain() below reuses it for every structure.
parser = MMCIFParser(QUIET=True)

def _get_local_cif_path(pdb_id: str, struct_dir: Path) -> Path:
    pdb = pdb_id.strip().lower()
    gz_path = struct_dir / f"{pdb}.cif.gz"
    tmp_cif = Path("/tmp") / f"{pdb}.cif"

    # IMPORTANT: exists() and stat() are FUNCTIONS
    if tmp_cif.exists() and tmp_cif.stat().st_size > 0:
        return tmp_cif

    if gz_path.exists() and gz_path.stat().st_size > 0:
        with gzip.open(str(gz_path), "rb") as f_in:
            tmp_cif.write_bytes(f_in.read())
        return tmp_cif

    raise FileNotFoundError(f"Missing CIF for {pdb_id}. Expected: {gz_path}")

def parse_chain(pdb_id: str, chain_id: str, struct_dir: Path):
    """Parse one chain of a local mmCIF structure into per-residue records.

    The structure file is located via ``_get_local_cif_path`` and parsed with
    the module-level ``parser``. Only the first model is used, and only
    residues accepted by ``is_aa(res, standard=True)`` are kept. Requires
    ``numpy`` to be imported as ``np`` in the notebook's import cell.

    Args:
        pdb_id: PDB identifier (converted to ``str`` and stripped).
        chain_id: Chain identifier to extract (converted to ``str`` and stripped).
        struct_dir: Directory holding the ``<pdb>.cif.gz`` archives.

    Returns:
        A dict with keys:
            ``pdb_id``      - the stripped identifier,
            ``chain``       - the stripped chain identifier,
            ``n_residues``  - number of residues kept,
            ``n_ca``        - how many of those have a ``CA`` atom,
            ``residues``    - list of dicts with ``resname``, ``resseq`` (int),
                              ``icode`` (stripped str) and ``ca`` (float
                              ndarray of the CA coordinate, or ``None``).

    Raises:
        FileNotFoundError: No structure file is available for ``pdb_id``.
        ValueError: The parsed structure contains no models.
        KeyError: ``chain_id`` is not present in the first model.
    """
    pdb_id = str(pdb_id).strip()
    chain_id = str(chain_id).strip()

    cif_local = _get_local_cif_path(pdb_id, struct_dir)
    structure = parser.get_structure(pdb_id, str(cif_local))

    # An empty structure would otherwise surface as a bare StopIteration with
    # no message; raise something the validation report can display.
    model = next(structure.get_models(), None)
    if model is None:
        raise ValueError(f"No models found in {pdb_id} ({cif_local})")

    chains = [c.id for c in model.get_chains()]
    if chain_id not in chains:
        raise KeyError(f"Chain '{chain_id}' not found in {pdb_id}. Available chains: {chains}")

    chain = model[chain_id]
    residues_out, n_ca = [], 0

    for res in chain.get_residues():
        if not is_aa(res, standard=True):
            continue

        # Residue id is a (hetero-flag, sequence number, insertion code) triple.
        het, resseq, icode = res.get_id()
        ca_coord = None
        if "CA" in res:
            ca_coord = np.array(res["CA"].get_coord(), dtype=float)
            n_ca += 1

        residues_out.append({
            "resname": res.get_resname(),
            "resseq": int(resseq),
            "icode": str(icode).strip(),
            "ca": ca_coord
        })

    return {
        "pdb_id": pdb_id,
        "chain": chain_id,
        "n_residues": len(residues_out),
        "n_ca": n_ca,
        "residues": residues_out
    }


In [16]:
import pandas as pd

def _length_note(n_parsed, n_expected):
    """Label how far the parsed residue count is from the CSV sequence length."""
    if n_expected is None:
        return "no seq_len"
    gap = abs(n_parsed - n_expected)
    if gap <= 5:
        return "OK (close)"
    if gap <= 25:
        return "Moderate mismatch"
    return "Large mismatch"


# Fixed-seed sample of 10 subset rows to spot-check the parser.
sample_df = df.sample(n=10, random_state=42).reset_index(drop=True)

rows = []
for pdb_id, chain, raw_len in zip(sample_df["pdb_id"], sample_df["chain"], sample_df["seq_len"]):
    seq_len_csv = None if pd.isna(raw_len) else int(raw_len)

    try:
        parsed = parse_chain(pdb_id, chain, STRUCT_DIR)
        n_res = parsed["n_residues"]
        n_ca = parsed["n_ca"]
        outcome = {
            "n_residues_parsed": n_res,
            "n_ca_found": n_ca,
            "ca_coverage": round(n_ca / n_res, 3) if n_res else 0.0,
            "note": _length_note(n_res, seq_len_csv),
        }
    except Exception as e:
        # Keep going: a failed structure becomes a row describing the error.
        outcome = {
            "n_residues_parsed": None,
            "n_ca_found": None,
            "ca_coverage": None,
            "note": f"FAILED: {type(e).__name__}: {e}",
        }

    rows.append({
        "pdb_id": pdb_id,
        "chain": chain,
        "seq_len_csv": seq_len_csv,
        **outcome,
    })

report = pd.DataFrame(rows)
report


Unnamed: 0,pdb_id,chain,seq_len_csv,n_residues_parsed,n_ca_found,ca_coverage,note
0,6DZI,Y,103,103,103,1.0,OK (close)
1,2J8D,M,307,307,307,1.0,OK (close)
2,4G4J,A,369,369,369,1.0,OK (close)
3,8ZCR,A,329,329,329,1.0,OK (close)
4,6HDE,B,152,152,152,1.0,OK (close)
5,8JW0,i,126,126,126,1.0,OK (close)
6,5A24,A,222,222,222,1.0,OK (close)
7,6T62,A,244,244,244,1.0,OK (close)
8,2RAG,A,369,369,369,1.0,OK (close)
9,3TAW,A,349,345,345,1.0,OK (close)


In [17]:
# Rows whose note starts with "FAILED" are the ones where parsing raised.
failed_mask = report["note"].astype(str).str.startswith("FAILED")
print("Failures:", failed_mask.sum(), "/", len(report))

print("\nResidues parsed summary:")
summary_cols = ["n_residues_parsed", "n_ca_found", "ca_coverage"]
display(report[summary_cols].describe())

print("\nFull validation report:")
display(report)

Failures: 0 / 10

Residues parsed summary:


Unnamed: 0,n_residues_parsed,n_ca_found,ca_coverage
count,10.0,10.0,10.0
mean,256.6,256.6,1.0
std,102.094945,102.094945,0.0
min,103.0,103.0,1.0
25%,169.5,169.5,1.0
50%,275.5,275.5,1.0
75%,341.0,341.0,1.0
max,369.0,369.0,1.0



Full validation report:


Unnamed: 0,pdb_id,chain,seq_len_csv,n_residues_parsed,n_ca_found,ca_coverage,note
0,6DZI,Y,103,103,103,1.0,OK (close)
1,2J8D,M,307,307,307,1.0,OK (close)
2,4G4J,A,369,369,369,1.0,OK (close)
3,8ZCR,A,329,329,329,1.0,OK (close)
4,6HDE,B,152,152,152,1.0,OK (close)
5,8JW0,i,126,126,126,1.0,OK (close)
6,5A24,A,222,222,222,1.0,OK (close)
7,6T62,A,244,244,244,1.0,OK (close)
8,2RAG,A,369,369,369,1.0,OK (close)
9,3TAW,A,349,345,345,1.0,OK (close)


In [18]:
# Write the validation table under BASE/out, leaving out the DataFrame index.
REPORT_PATH = BASE.joinpath("out", "day3_validation_report.csv")
report.to_csv(REPORT_PATH, index=False)
print("Saved:", REPORT_PATH)

Saved: /content/drive/MyDrive/biolip_gnn/out/day3_validation_report.csv
