# TM-Score Comparison Between PDB Folders

This notebook compares structures between two folders:
- **Generated PDBs folder**: Contains predicted/generated structures
- **Original PDBs folder**: Contains reference/original structures

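It runs the external `TMscore` program on each pair of structures and saves the resulting TM-scores to a CSV file. Pairs are matched by file name: a generated file `NAME.pdb` is compared against `NAME_peptide.pdb` in the original folder.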

In [9]:
import subprocess
import os
import pandas as pd
import re
import glob
from datetime import datetime

# Confirm the import cell ran and stamp the output so saved runs can be dated.
print(
    "TM-Score Comparison Tool Loaded",
    f"Timestamp: {datetime.now()}",
    sep="\n",
)

TM-Score Comparison Tool Loaded
Timestamp: 2025-09-05 04:23:33.255582


In [10]:
# Configuration - edit these four values to match your environment.
# Relative paths are resolved against the notebook's working directory.
GENERATED_PDB_FOLDER = "PMPNN_Results"  # generated/predicted structures
ORIGINAL_PDB_FOLDER = "original_pdbs"    # reference structures (<name>_peptide.pdb)
OUTPUT_CSV = "PMPNN_TMscore.csv"      # where the per-structure results are written
TMSCORE_EXECUTABLE = "./TMscore"         # TMscore binary to invoke

# Echo the active settings so they are recorded alongside the cell output.
print(
    "\n".join(
        (
            f"Generated PDB folder: {GENERATED_PDB_FOLDER}",
            f"Original PDB folder: {ORIGINAL_PDB_FOLDER}",
            f"Output CSV: {OUTPUT_CSV}",
            f"TMscore executable: {TMSCORE_EXECUTABLE}",
        )
    )
)

Generated PDB folder: PMPNN_Results
Original PDB folder: original_pdbs
Output CSV: PMPNN_TMscore.csv
TMscore executable: ./TMscore


In [11]:
# Regexes tried in order against TMscore's stdout; group 1 holds the score text.
_TM_SCORE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r"TM-score\s*=\s*([\d.]+)",            # standard report line
        r"TM-score\s*:\s*([\d.]+)",
        r"TM\s*=\s*([\d.]+)",
        r"TM-score\s+([\d.]+)",
        r"TM\s*score\s*=\s*([\d.]+)",
        r"([\d.]+)\s*\(normalized by length",  # score printed before this text
    )
)

# Number of unparseable TMscore outputs that are dumped in full for debugging.
_MAX_PARSE_FAILURE_DUMPS = 10


def _parse_tm_score(output):
    """Return the first TM-score found in TMscore's stdout as a float, or None."""
    for pattern in _TM_SCORE_PATTERNS:
        match = pattern.search(output)
        if match is None:
            continue
        try:
            return float(match.group(1))
        except ValueError:
            # [\d.]+ can capture text such as "." or "1.2.3"; try the next pattern
            # instead of letting the conversion error abort this comparison.
            continue
    return None


def _build_tmscore_env(extra_library_path):
    """Copy os.environ, prepending extra_library_path to LD_LIBRARY_PATH if given."""
    env = os.environ.copy()
    if extra_library_path:
        # Join only non-empty parts: a trailing ":" would add the current
        # directory to the dynamic loader's search path.
        parts = [extra_library_path, env.get("LD_LIBRARY_PATH", "")]
        env["LD_LIBRARY_PATH"] = ":".join(part for part in parts if part)
    return env


def run_tm_score_folder_comparison(
    generated_folder,
    original_folder,
    output_csv,
    tmscore_executable=None,
    extra_library_path="/home/ribodiffusion/.conda/envs/ribodiffusionenv/lib",
    timeout=120,
):
    """
    Compare all PDB files between two folders using TM-score

    Every ``<name>.pdb`` in ``generated_folder`` is paired with
    ``<name>_peptide.pdb`` in ``original_folder`` and the TMscore executable is
    run on the pair. One row per generated file is recorded, whether or not the
    comparison succeeded, and the table is written to ``output_csv`` sorted by
    TM-score (descending, missing scores last).

    Args:
        generated_folder: Path to folder containing generated/predicted PDBs
        original_folder: Path to folder containing original/reference PDBs
        output_csv: Path to output CSV file
        tmscore_executable: TMscore binary to run. Defaults to the notebook-level
            ``TMSCORE_EXECUTABLE`` setting when None.
        extra_library_path: Directory prepended to ``LD_LIBRARY_PATH`` for the
            TMscore subprocess; pass None or "" to leave the environment as is.
        timeout: Seconds allowed for each TMscore invocation.

    Returns:
        DataFrame with columns ``pdb_id``, ``tm_score``, ``original_pdb_path``
        and ``status``, or None when a folder is missing or the generated
        folder contains no ``*.pdb`` files. ``status`` is one of ``success``,
        ``original_not_found``, ``no_common_residues``, ``parse_failed``,
        ``tmscore_failed``, ``timeout`` or ``error: <message>``.
    """

    # Check if folders exist
    if not os.path.exists(generated_folder):
        print(f"Error: Generated PDB folder not found: {generated_folder}")
        return None

    if not os.path.exists(original_folder):
        print(f"Error: Original PDB folder not found: {original_folder}")
        return None

    # Get all PDB files from generated folder
    generated_pdbs = sorted(glob.glob(os.path.join(generated_folder, "*.pdb")))

    print(f"Found {len(generated_pdbs)} PDB files in generated folder")

    if not generated_pdbs:
        print("No PDB files found in generated folder!")
        return None

    # Show first few filenames for debugging
    print("\nFirst 5 generated PDB files:")
    for i, pdb in enumerate(generated_pdbs[:5]):
        base_name = os.path.splitext(os.path.basename(pdb))[0]
        print(f"  {i+1}. {os.path.basename(pdb)} -> looking for: {base_name}_peptide.pdb")

    # Check what's actually in the original folder
    original_pdbs = sorted(glob.glob(os.path.join(original_folder, "*.pdb")))
    print(f"\nFound {len(original_pdbs)} PDB files in original folder")
    print("First 5 original PDB files:")
    for i, pdb in enumerate(original_pdbs[:5]):
        print(f"  {i+1}. {os.path.basename(pdb)}")

    print("\nExpected mapping pattern: generated_name.pdb -> generated_name_peptide.pdb")
    print("If this doesn't look right, we need to adjust the mapping logic.\n")

    # Resolve the executable only now so the early exits above never depend on
    # the notebook-level setting being defined.
    if tmscore_executable is None:
        tmscore_executable = TMSCORE_EXECUTABLE

    # The subprocess environment is identical for every file; build it once.
    env = _build_tmscore_env(extra_library_path)

    # Store all results
    all_results = []
    successful_comparisons = 0
    failed_comparisons = 0
    parse_failures_dumped = 0

    print("\nStarting TM-score comparisons...")
    print("=" * 50)

    for i, generated_pdb in enumerate(generated_pdbs):
        # Extract base name for matching
        base_name = os.path.splitext(os.path.basename(generated_pdb))[0]

        # Look for corresponding original PDB with _peptide suffix
        original_pdb = os.path.join(original_folder, f"{base_name}_peptide.pdb")

        print(f"\nProcessing {i+1}/{len(generated_pdbs)}: {base_name}")
        print(f"  Generated: {generated_pdb}")
        print(f"  Looking for: {original_pdb}")

        tm_score = None

        if not os.path.exists(original_pdb):
            print(f"  ⚠️  Original PDB not found: {original_pdb}")
            status = 'original_not_found'
        else:
            try:
                # Run TM-score comparison
                completed = subprocess.run(
                    [tmscore_executable, generated_pdb, original_pdb],
                    capture_output=True,
                    text=True,
                    timeout=timeout,
                    env=env,
                )
            except subprocess.TimeoutExpired:
                print("  ⏱️  TMscore command timed out")
                status = 'timeout'
            except Exception as e:
                # Deliberately broad: one unrunnable pair (missing binary,
                # undecodable output, ...) must not abort the whole batch.
                print(f"  💥 Error: {str(e)}")
                status = f'error: {str(e)[:50]}'
            else:
                if completed.returncode != 0:
                    print(f"  ❌ TMscore command failed (return code: {completed.returncode})")
                    print(f"     Error: {completed.stderr[:200]}...")
                    status = 'tmscore_failed'
                else:
                    output = completed.stdout

                    # Print output for debugging (first 3 files only)
                    if i < 3:
                        print("  Debug - TMscore output preview:")
                        print(f"    {output[:200]}...")

                    if "no common residues" in output.lower():
                        # TMscore reports this on stdout with a zero exit code.
                        print("  ⚠️  No common residues found between structures")
                        status = 'no_common_residues'
                    else:
                        tm_score = _parse_tm_score(output)
                        if tm_score is not None:
                            print(f"  ✅ TM-score: {tm_score:.4f}")
                            status = 'success'
                        else:
                            print("  ❌ Could not parse TM-score from output")
                            # Dump the full output for the first few *parse*
                            # failures only, tracked separately from the
                            # overall failure count.
                            if parse_failures_dumped < _MAX_PARSE_FAILURE_DUMPS:
                                print(f"    DEBUG - Full TMscore output for {base_name}:")
                                print(f"    {output}")
                                print("    END DEBUG OUTPUT")
                                parse_failures_dumped += 1
                            status = 'parse_failed'

        if status == 'success':
            successful_comparisons += 1
        else:
            failed_comparisons += 1

        all_results.append({
            'pdb_id': base_name,
            'tm_score': tm_score,
            'original_pdb_path': original_pdb,
            'status': status,
        })

    # generated_pdbs is non-empty here, so there is always at least one row.
    results_df = pd.DataFrame(all_results)

    # Sort by TM-score (descending, with NaN values last)
    results_df = results_df.sort_values('tm_score', ascending=False, na_position='last')

    # Save to CSV
    results_df.to_csv(output_csv, index=False)

    print(f"\n{'='*50}")
    print("COMPARISON COMPLETE!")
    print(f"{'='*50}")
    print(f"Total comparisons: {len(all_results)}")
    print(f"Successful: {successful_comparisons}")
    print(f"Failed: {failed_comparisons}")
    print(f"Success rate: {(successful_comparisons/len(all_results))*100:.1f}%")

    # Calculate statistics for successful comparisons
    successful_results = results_df[results_df['status'] == 'success']
    if len(successful_results) > 0:
        scores = successful_results['tm_score']
        print("\nTM-Score Statistics:")
        print(f"Average: {scores.mean():.4f}")
        print(f"Median: {scores.median():.4f}")
        print(f"Best: {scores.max():.4f}")
        print(f"Worst: {scores.min():.4f}")
        print(f"Std Dev: {scores.std():.4f}")

    print(f"\n📄 Results saved to: {output_csv}")

    return results_df

In [12]:
# Kick off the folder-wide comparison using the settings from the config cell.
startup_notes = (
    "Starting TM-score comparison between folders...",
    f"Make sure {TMSCORE_EXECUTABLE} is available and has execute permissions",
    "Use: chmod +x TMscore",
    "",  # trailing blank line before the function's own output
)
print("\n".join(startup_notes))

# `results` is the per-structure DataFrame (or None) consumed by the cells below.
results = run_tm_score_folder_comparison(
    generated_folder=GENERATED_PDB_FOLDER,
    original_folder=ORIGINAL_PDB_FOLDER,
    output_csv=OUTPUT_CSV,
)

Starting TM-score comparison between folders...
Make sure ./TMscore is available and has execute permissions
Use: chmod +x TMscore

Found 505 PDB files in generated folder

First 5 generated PDB files:
  1. 1a1r_C.pdb -> looking for: 1a1r_C_peptide.pdb
  2. 1a1u_A.pdb -> looking for: 1a1u_A_peptide.pdb
  3. 1a2c_L.pdb -> looking for: 1a2c_L_peptide.pdb
  4. 1a2x_B.pdb -> looking for: 1a2x_B_peptide.pdb
  5. 1a38_P.pdb -> looking for: 1a38_P_peptide.pdb

Found 505 PDB files in original folder
First 5 original PDB files:
  1. 1a1r_C_peptide.pdb
  2. 1a1u_A_peptide.pdb
  3. 1a2c_L_peptide.pdb
  4. 1a2x_B_peptide.pdb
  5. 1a38_P_peptide.pdb

Expected mapping pattern: generated_name.pdb -> generated_name_peptide.pdb
If this doesn't look right, we need to adjust the mapping logic.


Starting TM-score comparisons...

Processing 1/505: 1a1r_C
  Generated: PMPNN_Results/1a1r_C.pdb
  Looking for: original_pdbs/1a1r_C_peptide.pdb
  Debug - TMscore output preview:
     There is no common residues 

In [None]:
# Display the ten best-scoring successful comparisons
if results is not None:
    print("\n🏆 TOP 10 RESULTS BY TM-SCORE:")
    print("=" * 60)

    # Sort explicitly rather than relying on the order in which `results`
    # happens to arrive, so the ranking stays correct if the frame is
    # re-indexed, filtered or reloaded from the CSV in between.
    top_results = (
        results[results['status'] == 'success']
        .sort_values('tm_score', ascending=False)
        .head(10)
    )

    if top_results.empty:
        # Make an all-failed run visible instead of printing a bare header.
        print("No successful comparisons to rank")

    for rank, row in enumerate(top_results.itertuples(index=False), 1):
        print(f"{rank:2d}. {row.pdb_id:20s} | TM-score: {row.tm_score:.4f}")
else:
    print("No results to display")

In [None]:
# Optional: print a per-status tally and write a plain-text summary report
if results is not None:
    # Number of comparisons that ended in each status
    status_summary = results['status'].value_counts()

    print("\n📊 SUMMARY BY STATUS:")
    print("=" * 30)
    for status, count in status_summary.items():
        print(f"{status:20s}: {count:3d}")

    # Derive the report name from the CSV's stem. str.replace('.csv', ...) would
    # rewrite every ".csv" in the path and, for a name without that extension,
    # return the CSV path unchanged, so the report would overwrite the results.
    summary_file = os.path.splitext(OUTPUT_CSV)[0] + '_summary.txt'

    successful_results = results[results['status'] == 'success']

    # Explicit encoding keeps the report identical across platforms.
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("TM-Score Comparison Summary\n")
        f.write(f"Generated at: {datetime.now()}\n")
        f.write("=" * 50 + "\n")
        f.write(f"Generated folder: {GENERATED_PDB_FOLDER}\n")
        f.write(f"Original folder: {ORIGINAL_PDB_FOLDER}\n")
        f.write(f"Total comparisons: {len(results)}\n")
        f.write("\nStatus Summary:\n")
        for status, count in status_summary.items():
            f.write(f"{status}: {count}\n")

        # Score statistics only make sense when at least one comparison succeeded.
        if len(successful_results) > 0:
            scores = successful_results['tm_score']
            f.write("\nTM-Score Statistics:\n")
            f.write(f"Average: {scores.mean():.4f}\n")
            f.write(f"Median: {scores.median():.4f}\n")
            f.write(f"Best: {scores.max():.4f}\n")
            f.write(f"Worst: {scores.min():.4f}\n")
            f.write(f"Std Dev: {scores.std():.4f}\n")

    print(f"\n📄 Summary saved to: {summary_file}")
else:
    print("No summary to create")