In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from pathlib import Path

SUBSET_PATH = Path("/content/drive/MyDrive/biolip_gnn/out/subset_200.csv")

# Force the ID column to text. Identifiers shaped like "1e10" look like
# scientific notation, and pandas' dtype inference may otherwise turn them
# into floats, which would corrupt the IDs used to build download URLs.
df = pd.read_csv(SUBSET_PATH, dtype={"pdb_id": str})

# Normalise once (trim whitespace, lower-case), drop missing and empty values,
# then de-duplicate. Sorting keeps the processing order reproducible.
normalized_ids = df["pdb_id"].dropna().str.strip().str.lower()
pdb_ids = sorted(pid for pid in normalized_ids.unique() if pid)

print("Subset rows:", len(df))
# Report the count of the normalised IDs so the number printed here always
# matches the list that the rest of the notebook iterates over.
print("Unique PDB IDs:", len(pdb_ids))
print("First 10 IDs:", pdb_ids[:10])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Subset rows: 200
Unique PDB IDs: 200
First 10 IDs: ['1ce1', '1ft9', '1hqv', '1izc', '1j04', '1jot', '1kmm', '1ldn', '1lq8', '1m3u']


In [None]:
# Output locations on the mounted Drive: a text log for the download summary
# and the directory that receives the structure files.
LOG_PATH = Path("/content/drive/MyDrive/biolip_gnn/download_log.txt")
STRUCT_DIR = Path("/content/drive/MyDrive/biolip_gnn/structures")

# Create the directory (and any missing parents); a no-op if it already exists.
STRUCT_DIR.mkdir(exist_ok=True, parents=True)
print(f"structures will be saved to: {STRUCT_DIR}")

structures will be saved to: /content/drive/MyDrive/biolip_gnn/structures


In [None]:
import urllib.request
import time

# Download one <pdb_id>.cif.gz per ID into STRUCT_DIR, skipping files that are
# already present and non-empty. Failures are collected rather than raised so
# a single bad ID does not abort the whole batch.
failures = []   # (pdb_id, error message) for every download that failed
downloaded = 0  # files fetched during this run
skipped = 0     # files that already existed with non-zero size

for i, pid in enumerate(pdb_ids, start=1):
    url = f"https://files.rcsb.org/download/{pid}.cif.gz"
    out = STRUCT_DIR / f"{pid}.cif.gz"

    if out.exists() and out.stat().st_size > 0:
        skipped += 1
    else:
        # Fetch into a temporary sibling file and rename only on success.
        # Otherwise an interrupted transfer could leave a truncated, non-empty
        # file at `out` that later runs would treat as already downloaded.
        # The ".part" suffix also keeps it out of "*.cif.gz" globs.
        tmp = out.with_name(out.name + ".part")
        try:
            urllib.request.urlretrieve(url, tmp)
            tmp.replace(out)
            downloaded += 1
        except Exception as e:
            # Best effort: record the failure and keep going.
            failures.append((pid, str(e)))
            if tmp.exists():
                tmp.unlink()
        # Short pause after every real request (presumably intended as
        # courtesy rate limiting for the file server).
        time.sleep(0.2)

    # Report progress on every 25th ID, whether it was skipped or fetched.
    if i % 25 == 0:
        print(f"Progress: {i}/{len(pdb_ids)} | downloaded = {downloaded} skipped={skipped} failures={len(failures)}")

print("\nDONE")
print("Downloaded:", downloaded)
print("Skipped (already existed):", skipped)
print("Failures:", len(failures))






DONE
Downloaded: 0
Skipped (already existed): 200
Failures: 0


In [None]:
# Build the log as a list of lines: a four-line summary of the download run,
# then a blank separator line.
lines = [
    f"Total requested PDB IDs: {len(pdb_ids)}",
    f"Downloaded new files: {downloaded}",
    f"Skipped existing files: {skipped}",
    f"Failures: {len(failures)}",
    "",
]

# Append one tab-separated "<id>\t<error>" line per failed download, if any.
if failures:
    lines.append("FAILED DOWNLOADS:")
    lines.extend(f"{failed_id}\t{message}" for failed_id, message in failures)

# Overwrite the log file with the newline-joined report.
log_text = "\n".join(lines)
LOG_PATH.write_text(log_text, encoding="utf-8")
print("Wrote log to:", LOG_PATH)

Wrote log to: /content/drive/MyDrive/biolip_gnn/download_log.txt


In [None]:
# Gather every *.cif.gz entry in the structures directory, then show how many
# there are and the names of the first ten as a quick sanity check.
files = [*STRUCT_DIR.glob("*.cif.gz")]
example_names = [entry.name for entry in files[:10]]
print("cif.gz files present:", len(files))
print("Example files:", example_names)

cif.gz files present: 200
Example files: ['1ce1.cif.gz', '1ft9.cif.gz', '1hqv.cif.gz', '1izc.cif.gz', '1j04.cif.gz', '1jot.cif.gz', '1kmm.cif.gz', '1ldn.cif.gz', '1lq8.cif.gz', '1m3u.cif.gz']


In [None]:
from pathlib import Path

# Final check of the artefacts this notebook relies on or produces. Paths are
# spelled out literally so the cell runs on its own after a kernel restart.
# The subset CSV and the download log should both exist.
for required_file in (
    "/content/drive/MyDrive/biolip_gnn/out/subset_200.csv",
    "/content/drive/MyDrive/biolip_gnn/download_log.txt",
):
    print(Path(required_file).exists())

# Number of *.cif.gz files currently in the structures directory.
structure_files = list(Path("/content/drive/MyDrive/biolip_gnn/structures").glob("*.cif.gz"))
print(len(structure_files))


True
True
200
