In [1]:
#!/usr/bin/env python3
import os
import sys
import tempfile
import shutil
import numpy as np
from Bio.PDB import PDBParser, MMCIFParser, PDBIO, MMCIFIO, Select
import subprocess
import os
import re

# Path to the TM-align executable. It can be overridden with the TMALIGN_BIN
# environment variable; the original hard-coded location is kept as the
# fallback so existing setups keep working unchanged.
BIN = os.environ.get(
    'TMALIGN_BIN',
    '/home/hwjang/aipd/250729/0_move/tmalign/TMalign',
)

# Three-letter residue names of the 20 standard amino acids. Residues whose
# name is not in this set are filtered out by ProteinOnlySelect.
AMINO_ACIDS = {
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS',
    'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL'
}

def detect_format(filename):
    """Classify a structure file by its (case-sensitive) filename suffix.

    Returns:
        'pdb' for names ending in '.pdb'; 'mmcif' for names ending in
        '.cif' or '.mmcif'.

    Raises:
        ValueError: if the name ends with none of those suffixes.
    """
    suffix_table = (
        ('.pdb', 'pdb'),
        ('.cif', 'mmcif'),
        ('.mmcif', 'mmcif'),
    )
    for suffix, fmt in suffix_table:
        if filename.endswith(suffix):
            return fmt
    raise ValueError("Unsupported file format: %s" % filename)

def load_structure(filename):
    """Parse a PDB or mmCIF file and return the structure.

    The parser is chosen from the filename via detect_format (which raises
    ValueError for unsupported suffixes). Parsers are created with QUIET=True
    and the structure is given the id "X".
    """
    parser_cls = PDBParser if detect_format(filename) == 'pdb' else MMCIFParser
    return parser_cls(QUIET=True).get_structure("X", filename)

def save_structure(structure, filename):
    """Write `structure` to `filename`, choosing the writer from the suffix.

    '.pdb' uses PDBIO; '.cif' and '.mmcif' use MMCIFIO.

    Raises:
        ValueError: if the filename has none of those suffixes.
    """
    if filename.endswith('.pdb'):
        writer = PDBIO()
    elif filename.endswith(('.cif', '.mmcif')):
        writer = MMCIFIO()
    else:
        raise ValueError(f"Unsupported file format: {filename}")
    writer.set_structure(structure)
    writer.save(filename)

class ProteinOnlySelect(Select):
    """Bio.PDB Select filter that keeps only residues named in AMINO_ACIDS."""

    def accept_residue(self, residue):
        """Return True when the residue's whitespace-stripped name is in AMINO_ACIDS."""
        resname = residue.get_resname().strip()
        return resname in AMINO_ACIDS

def extract_protein_only(infile, outfile):
    """Copy `infile` to `outfile`, keeping only standard amino-acid residues.

    Residues are filtered with ProteinOnlySelect. The output writer is chosen
    from `outfile`'s suffix ('.pdb' -> PDBIO, '.cif'/'.mmcif' -> MMCIFIO).

    Args:
        infile: path to a '.pdb', '.cif' or '.mmcif' structure file.
        outfile: destination path; its suffix selects the output format.

    Raises:
        ValueError: if either filename has an unsupported suffix.
    """
    # Validate the input suffix first so an unsupported input is still the
    # first error reported, as it was before.
    detect_format(infile)

    # Pick the writer before parsing so a bad output name fails without
    # paying for a full parse of the input structure.
    if outfile.endswith('.pdb'):
        io = PDBIO()
    elif outfile.endswith(('.cif', '.mmcif')):
        io = MMCIFIO()
    else:
        raise ValueError(f"Unsupported file format: {outfile}")

    io.set_structure(load_structure(infile))
    io.save(outfile, select=ProteinOnlySelect())

def read_matrix(fname):
    """Read a 3x4 transform from a TM-align matrix file.

    The file's third to fifth lines (indices 2, 3, 4) are parsed; on each
    line the first whitespace-separated token is skipped and the remaining
    tokens are converted to float.

    Returns:
        A list of three lists of floats, one per parsed line.
    """
    with open(fname) as handle:
        all_lines = handle.readlines()
    # Index each row explicitly so a too-short file raises IndexError.
    return [
        [float(token) for token in all_lines[row_idx].strip().split()[1:]]
        for row_idx in (2, 3, 4)
    ]

def transform_coord(coord, matrix):
    """Apply a 3x4 [t | U] transform to one 3-D coordinate.

    For each axis i the result is matrix[i][0] + sum_j matrix[i][j+1] * coord[j].

    Returns:
        A list of three transformed coordinate values.
    """
    moved = []
    for axis in range(3):
        row = matrix[axis]
        offset = row[0]
        rotated = sum(row[k + 1] * coord[k] for k in range(3))
        moved.append(offset + rotated)
    return moved

def transform_structure(structure, matrix):
    """Apply a 3x4 [t | U] rigid-body transform to every atom, in place.

    Each atom's coordinate x is replaced by t + U @ x, where t is column 0 and
    U is columns 1-3 of `matrix` (the layout returned by read_matrix).

    The new coordinate is stored as a float32 numpy array rather than a plain
    Python list, so later numpy-based operations on the atoms keep working.

    Args:
        structure: object exposing get_atoms(); each atom must provide
            get_coord() and set_coord().
        matrix: three rows of four numbers, [t, u0, u1, u2].
    """
    mat = np.asarray(matrix, dtype=np.float64)
    translation = mat[:3, 0]
    rotation = mat[:3, 1:4]
    for atom in structure.get_atoms():
        xyz = np.asarray(atom.get_coord(), dtype=np.float64)[:3]
        # Compute in float64, then store as float32, the dtype Bio.PDB
        # parsers use for atom coordinates.
        atom.set_coord((translation + rotation @ xyz).astype(np.float32))

def run_tmalign(ref_fn, cmp_fn, out_prefix):
    """Run TM-align on two structure files and parse RMSD and TM-score.

    The executable at BIN is invoked as `BIN cmp_fn ref_fn -m <matrix file>`,
    i.e. `cmp_fn` is the first structure and `ref_fn` the second. The matrix
    file is written to f"{out_prefix}_m.txt".

    Args:
        ref_fn: path to the reference structure.
        cmp_fn: path to the structure to be compared/aligned.
        out_prefix: path prefix for the matrix output file.

    Returns:
        (matrix_path, rmsd, score). `rmsd` comes from the last stdout line
        containing "RMSD="; `score` from the first stdout line that starts
        with "TM-score=". NOTE(review): TM-align prints one TM-score per
        chain normalisation; the first is presumably normalised by the first
        structure (cmp_fn) — confirm against the TM-align version in use.

    Raises:
        RuntimeError: if TM-align exits non-zero, does not create the matrix
            file, or its output lacks an RMSD or TM-score.
    """
    out_matrix_fn = f"{out_prefix}_m.txt"
    cmd = [BIN, cmp_fn, ref_fn, '-m', out_matrix_fn]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0 or not os.path.isfile(out_matrix_fn):
        # Surface the exit code and whatever TM-align printed; without this
        # a failure gives no hint about its cause.
        detail = (result.stderr or '').strip() or (result.stdout or '').strip()
        raise RuntimeError(
            f"TM-align failed to run: {' '.join(cmd)}"
            f" (exit code {result.returncode})"
            + (f"\n{detail}" if detail else "")
        )

    rmsd = None
    score = None
    for line in result.stdout.splitlines():
        if "RMSD=" in line:
            m = re.search(r'RMSD=\s*([0-9.]+)', line)
            if m:
                rmsd = float(m.group(1))
        elif line.startswith("TM-score=") and score is None:
            # Only the first TM-score line is kept.
            m = re.search(r'TM-score=\s*([0-9.]+)', line)
            if m:
                score = float(m.group(1))
    if rmsd is None or score is None:
        raise RuntimeError("TM-align failed to find RMSD or TM-score in the output.")
    return out_matrix_fn, rmsd, score

def align_and_transform(ref_fn, cmp_fn, out_fn):
    """Align `cmp_fn` onto `ref_fn` with TM-align and write the moved structure.

    Protein-only copies of both inputs are written to a temporary directory
    and passed to run_tmalign. The resulting matrix is then applied to the
    full `cmp_fn` structure (not the protein-only copy), which is saved to
    `out_fn`. The RMSD and TM-score are printed. The temporary directory is
    removed even if a step fails.
    """
    workdir = tempfile.mkdtemp()
    try:
        ref_ext = ref_fn.rsplit('.', 1)[-1]
        cmp_ext = cmp_fn.rsplit('.', 1)[-1]
        ref_only = os.path.join(workdir, f"ref_protein.{ref_ext}")
        cmp_only = os.path.join(workdir, f"cmp_protein.{cmp_ext}")

        extract_protein_only(ref_fn, ref_only)
        extract_protein_only(cmp_fn, cmp_only)

        matrix_path, rmsd, score = run_tmalign(
            ref_only, cmp_only, os.path.join(workdir, "tm")
        )
        print(f"RMSD: {rmsd}, TM-score: {score}")

        transform = read_matrix(matrix_path)
        moved = load_structure(cmp_fn)
        transform_structure(moved, transform)
        save_structure(moved, out_fn)
    finally:
        shutil.rmtree(workdir)

def align_structures_to_ref(ref_fn, cmp_fn, out_dir):
    """Align `cmp_fn` onto `ref_fn` and save the result under `out_dir`.

    The output file is named after the reference file's basename with its last
    extension replaced by "_align.cif", so the result is always written as
    mmCIF. `out_dir` is created if it does not exist.
    """
    # exist_ok avoids the check-then-create race when several processes
    # target the same directory.
    os.makedirs(out_dir, exist_ok=True)
    stem = os.path.basename(ref_fn).rsplit('.', 1)[0]
    out_fn = os.path.join(out_dir, stem + "_align.cif")
    align_and_transform(ref_fn, cmp_fn, out_fn)
    print(f"Saved: {out_fn}")

In [2]:
import os
import shutil
import tempfile
import pandas as pd
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from functools import partial

def tmalign_worker(row, model, col='lmpnn'):
    """Align one row's `model` structure onto its `col` reference structure.

    Reads the reference path from row[(col, 'path')] and the structure to
    align from row[(model, 'path')]. The transformed structure is written to
    'tmaligned_<model>/<reference stem>_align.cif' relative to the current
    working directory.

    Returns:
        (rmsd, score, absolute path of the written file).
    """
    target_dir = 'tmaligned_' + model
    ref_fn = row[(col, 'path')]
    cmp_fn = row[(model, 'path')]
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir, exist_ok=True)

    ref_stem = os.path.basename(ref_fn).rsplit('.', 1)[0]
    out_fn = os.path.join(target_dir, ref_stem + "_align.cif")

    workdir = tempfile.mkdtemp()
    try:
        # TM-align is run on protein-only copies kept in a scratch directory.
        ref_only = os.path.join(workdir, f"ref_protein.{ref_fn.rsplit('.', 1)[-1]}")
        cmp_only = os.path.join(workdir, f"cmp_protein.{cmp_fn.rsplit('.', 1)[-1]}")
        extract_protein_only(ref_fn, ref_only)
        extract_protein_only(cmp_fn, cmp_only)

        matrix_path, rmsd, score = run_tmalign(
            ref_only, cmp_only, os.path.join(workdir, "tm")
        )

        # The transform is applied to the full input structure, not the
        # protein-only copy.
        transform = read_matrix(matrix_path)
        moved = load_structure(cmp_fn)
        transform_structure(moved, transform)
        save_structure(moved, out_fn)
        return rmsd, score, os.path.abspath(out_fn)
    finally:
        shutil.rmtree(workdir)

def process(df, model, col='lmpnn', pool_size=cpu_count()):
    """Run tmalign_worker on every row of `df` in parallel and attach the results.

    Adds three columns, (model, 'tm_rmsd'), (model, 'tm_score') and
    (model, 'tm_path'), in row order. `df` is modified in place and also
    returned.

    Args:
        df: DataFrame whose rows provide (col, 'path') and (model, 'path').
        model: first-level column key of the structures to align.
        col: first-level column key of the reference structures.
        pool_size: number of worker processes; the default is evaluated once,
            when the function is defined.

    Returns:
        The same DataFrame with the three result columns set.
    """
    rows = [row for _, row in df.iterrows()]
    if rows:
        worker = partial(tmalign_worker, model=model, col=col)
        with Pool(pool_size) as pool:
            # imap (unlike imap_unordered) yields results in input order, so
            # they line up with df's rows.
            results = list(tqdm(pool.imap(worker, rows), total=len(rows)))
        tm_rmsd, tm_score, tm_path = (list(values) for values in zip(*results))
    else:
        # zip(*[]) yields nothing to unpack; with no rows, skip the pool and
        # still create the (empty) result columns.
        tm_rmsd, tm_score, tm_path = [], [], []
    df[(model, 'tm_rmsd')] = tm_rmsd
    df[(model, 'tm_score')] = tm_score
    df[(model, 'tm_path')] = tm_path
    return df

# Load the first-stage table, then add TM-align columns for each predicted
# model, aligning against the 'lmpnn' paths with 8 worker processes.
path = 'rmsd_1st.parquet'
df = pd.read_parquet(path)

for tm_model in ('af3', 'boltz'):
    df = process(df, model=tm_model, col='lmpnn', pool_size=8)

100%|██████████| 36/36 [00:01<00:00, 30.76it/s]
100%|██████████| 36/36 [00:01<00:00, 29.87it/s]


In [3]:
# Persist the table with the added TM-align columns, then show it as the
# cell's output.
df.to_parquet(path='rmsd_2nd.parquet')
df

Unnamed: 0_level_0,diffusion,diffusion,lmpnn,lmpnn,lmpnn,lmpnn,lmpnn,lmpnn,lmpnn,lmpnn,...,af3,af3,boltz,boltz,af3,af3,af3,boltz,boltz,boltz
Unnamed: 0_level_1,id,batch,tag,ddg,fa_rep,res_totalscore,totalscore,seq,path,relaxed_path,...,ca_rmsd,sc_rmsd,ca_rmsd,sc_rmsd,tm_rmsd,tm_score,tm_path,tm_rmsd,tm_score,tm_path
0,result_7_packed_3_1,pht_demo,result_7_packed_3_1,-27.749462,95.805969,-1.583578,-199.530838,SLEEIIAKIRASDPATVDWGAHFREFCKAAGVAEVTPEERALAEKA...,../3_lmpnn/output/packed/result_7_packed_3_1.pdb,../3_lmpnn/output/packed/result_7_packed_3_1_b...,...,1.091246,1.289126,12.82891,6.293877,1.09,0.94133,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,2.34,0.59119,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
1,result_61_packed_8_1,pht_demo,result_61_packed_8_1,-26.640675,104.494186,-1.441779,-181.664185,SEELLAAIKAAFRKIAGDLLTDRVDLDELAQFILDTLTLSEEERAR...,../3_lmpnn/output/packed/result_61_packed_8_1.pdb,../3_lmpnn/output/packed/result_61_packed_8_1_...,...,14.882061,7.540875,4.103964,3.004177,3.83,0.44248,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,2.64,0.76126,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
2,result_7_packed_8_1,pht_demo,result_7_packed_8_1,-25.789537,98.276131,-1.57081,-197.922104,SLAEILAEIRAADPATVDWEAHFRRFCEAAGVEAVTPEERELAARA...,../3_lmpnn/output/packed/result_7_packed_8_1.pdb,../3_lmpnn/output/packed/result_7_packed_8_1_b...,...,2.475292,3.455196,11.608226,5.500517,2.48,0.83348,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,3.88,0.43797,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
3,result_7_packed_2_1,pht_demo,result_7_packed_2_1,-25.547806,102.521637,-1.61018,-202.882706,SLAELIQEIRDADPKTIDWEAFFRRFAEAAGVAAVTPEQRALAARM...,../3_lmpnn/output/packed/result_7_packed_2_1.pdb,../3_lmpnn/output/packed/result_7_packed_2_1_b...,...,10.356954,7.56935,5.136358,2.721634,3.95,0.48064,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,3.1,0.64034,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
4,result_16_packed_4_1,pht_demo,result_16_packed_4_1,-25.318375,91.409203,-1.448796,-160.816406,ALSDEVKAMLRRMAPAAERLGTEGLLRRMQELGVVPEVTPDLLKAF...,../3_lmpnn/output/packed/result_16_packed_4_1.pdb,../3_lmpnn/output/packed/result_16_packed_4_1_...,...,0.899024,1.500364,1.442684,2.219334,0.9,0.95299,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,1.44,0.88678,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
5,result_16_packed_7_1,pht_demo,result_16_packed_7_1,-24.98737,88.052048,-1.478883,-164.156021,MLSETVKNMLKRLAPAAERLGTEGLLRRMIEAGVIPEVTPELLKAL...,../3_lmpnn/output/packed/result_16_packed_7_1.pdb,../3_lmpnn/output/packed/result_16_packed_7_1_...,...,1.203721,1.934954,1.422562,1.649351,1.2,0.92023,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,1.42,0.8988,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
6,result_7_packed_4_1,pht_demo,result_7_packed_4_1,-23.823254,103.83667,-1.429071,-180.062973,SLAEILAEIRASDPATADWLALARRFAEAAGVDEVTPEERELAAKA...,../3_lmpnn/output/packed/result_7_packed_4_1.pdb,../3_lmpnn/output/packed/result_7_packed_4_1_b...,...,8.709874,4.62572,2.397142,2.042227,3.39,0.60854,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,2.01,0.84026,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
7,result_29_packed_7_1,pht_demo,result_29_packed_7_1,-22.68211,75.388725,-1.675448,-177.597519,SAAFRAILRAMCEAFAELAPGLTLSDEELELVLNPDDEELRKRLNV...,../3_lmpnn/output/packed/result_29_packed_7_1.pdb,../3_lmpnn/output/packed/result_29_packed_7_1_...,...,5.64005,3.841254,7.460195,2.651895,1.61,0.76861,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,1.63,0.80194,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
8,result_59_packed_1_1,pht_demo,result_59_packed_1_1,-22.405994,70.00029,-1.624849,-180.358276,LATEAFLRTFIQSAEALELMRARGTAAAAEIAALVLAALKAKGVSS...,../3_lmpnn/output/packed/result_59_packed_1_1.pdb,../3_lmpnn/output/packed/result_59_packed_1_1_...,...,10.367922,5.525122,6.260994,3.597751,4.08,0.41832,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,4.09,0.5282,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
9,result_59_packed_6_1,pht_demo,result_59_packed_6_1,-21.15624,66.255737,-1.786232,-198.271744,SATEAFLRLVIASPEALELMRTRGTAAADEIAALMLAALEAKGISA...,../3_lmpnn/output/packed/result_59_packed_6_1.pdb,../3_lmpnn/output/packed/result_59_packed_6_1_...,...,19.843605,19.514959,4.15033,2.965234,4.83,0.28555,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...,3.53,0.62361,/home/hwjang/aipd/noahs_ark/piplines/8_rmsd_fi...
