In [1]:
### Generates array of CV values for CHARMM36m/TIP3P trajectory ###
### 1 Aug 2025 ###

import os
import sys
import numpy as np
import pyemma
from pyemma.util.contexts import settings

# Configuration parameters
FORCE_FIELD = 'CHARMM36m_TIP3P'
FEATURE_NAME = 'inv_HON'
PRODUCTION_DIR = '/project/dinner/ianjefab/papers/202508_ELP/data/trajs/CHARMM36m_TIP3P/2_prod'
STRUCTURE_FILE = f'{PRODUCTION_DIR}/VPG_solv_ions.gro'

def setup_molecular_featurizer(structure_file):
    """
    Build a PyEMMA featurizer from a structure/topology file.

    Args:
        structure_file: Path passed unchanged to
            ``pyemma.coordinates.featurizer``.

    Returns:
        The featurizer object created by PyEMMA; this function adds no
        features to it.
    """
    return pyemma.coordinates.featurizer(structure_file)

def select_protein_atoms(featurizer):
    """
    Run the three backbone atom selections used for the H-bond style features.

    Args:
        featurizer: Object exposing ``select(selection_string)``.

    Returns:
        Dict with keys ``'nitrogen'``, ``'oxygen'`` and ``'hydrogen'`` (in that
        order), each holding whatever ``featurizer.select`` returned for the
        atom names ``N``, ``O`` and ``H`` restricted to ``protein``.
    """
    amide_n = featurizer.select('protein and name N')  # amide N
    amide_o = featurizer.select('protein and name O')  # amide O
    amide_h = featurizer.select('protein and name H')  # amide H
    return {'nitrogen': amide_n, 'oxygen': amide_o, 'hydrogen': amide_h}

def get_residue_indices(featurizer, atom_indices):
    """
    Map each atom index to the index of the residue that contains it.

    Args:
        featurizer: Object whose ``topology.atom(i)`` returns an atom with a
            ``residue.index`` attribute.
        atom_indices: Iterable of atom indices to look up.

    Returns:
        List of residue indices, one per entry of ``atom_indices`` and in the
        same order.
    """
    top = featurizer.topology
    resids = []
    for idx in atom_indices:
        resids.append(top.atom(idx).residue.index)
    return resids

def create_non_adjacent_pairs(atom_selections, featurizer):
    """
    Build the list of N-H and O-H atom pairs between well-separated residues.

    A pair is kept only when the two residue indices differ by more than 1,
    so pairs within the same residue and between directly neighbouring
    residues are both dropped. This keeps the focus on long-range contacts.

    Args:
        atom_selections: Dict with ``'nitrogen'``, ``'oxygen'`` and
            ``'hydrogen'`` entries holding atom indices.
        featurizer: Featurizer used to look up the residue of each atom.

    Returns:
        List of ``[heavy_atom, hydrogen_atom]`` pairs: all N-H pairs first,
        then all O-H pairs, each group ordered by heavy atom and then by
        hydrogen.
    """
    # Residue index of every selected atom, keyed like atom_selections
    resid_of = {
        kind: get_residue_indices(featurizer, indices)
        for kind, indices in atom_selections.items()
    }

    pairs = []

    # Same filter for both heavy-atom types; nitrogen first, then oxygen
    for heavy_kind in ('nitrogen', 'oxygen'):
        heavy_with_res = zip(atom_selections[heavy_kind], resid_of[heavy_kind])
        for heavy_atom, heavy_res in heavy_with_res:
            hydrogens_with_res = zip(
                atom_selections['hydrogen'], resid_of['hydrogen']
            )
            # Only include pairs from residues at least 2 residues apart
            pairs.extend(
                [heavy_atom, h_atom]
                for h_atom, h_res in hydrogens_with_res
                if abs(heavy_res - h_res) > 1
            )

    return pairs

# Main execution
# Echo the run configuration (FORCE_FIELD / FEATURE_NAME are set at the top of this cell)
print("Setting up MD feature extraction...")
print(f"Force field: {FORCE_FIELD}")
print(f"Feature type: {FEATURE_NAME}")

# Initialize featurizer and select atoms
featurizer = setup_molecular_featurizer(STRUCTURE_FILE)
atom_selections = select_protein_atoms(featurizer)

# Report how many atoms each selection matched
print(f"Found {len(atom_selections['nitrogen'])} nitrogen atoms")
print(f"Found {len(atom_selections['oxygen'])} oxygen atoms")
print(f"Found {len(atom_selections['hydrogen'])} hydrogen atoms")

# Create atom pairs and add inverse distance features
atom_pairs = create_non_adjacent_pairs(atom_selections, featurizer)
print(f"Created {len(atom_pairs)} atom pairs for inverse distance calculation")

# Registers the pairs on the featurizer; this mutates `featurizer`, so
# re-running only this part of the cell would add the pairs a second time.
featurizer.add_inverse_distances(atom_pairs)

# Get feature descriptions
# `feature_descriptions` is reused by the next cell to look up column indices by label.
feature_descriptions = featurizer.describe()
print(f"\nGenerated {len(feature_descriptions)} features")
print("\nFeature descriptions:")
# Print every label with its position so target features can be located by eye
for i, desc in enumerate(feature_descriptions):
    print(f"{i}: {desc}")

Setting up MD feature extraction...
Force field: CHARMM36m_TIP3P
Feature type: inv_HON
Found 8 nitrogen atoms
Found 8 oxygen atoms
Found 7 hydrogen atoms
Created 74 atom pairs for inverse distance calculation

Generated 74 features

Feature descriptions:
0: INVDIST: GLY 1 N 0 - GLY 3 H 26
1: INVDIST: GLY 1 N 0 - VAL 4 H 33
2: INVDIST: GLY 1 N 0 - GLY 6 H 63
3: INVDIST: GLY 1 N 0 - VAL 7 H 70
4: INVDIST: GLY 1 N 0 - GLY 8 H 86
5: INVDIST: VAL 2 N 9 - VAL 4 H 33
6: INVDIST: VAL 2 N 9 - GLY 6 H 63
7: INVDIST: VAL 2 N 9 - VAL 7 H 70
8: INVDIST: VAL 2 N 9 - GLY 8 H 86
9: INVDIST: GLY 3 N 25 - GLY 1 H 1
10: INVDIST: GLY 3 N 25 - GLY 6 H 63
11: INVDIST: GLY 3 N 25 - VAL 7 H 70
12: INVDIST: GLY 3 N 25 - GLY 8 H 86
13: INVDIST: VAL 4 N 32 - GLY 1 H 1
14: INVDIST: VAL 4 N 32 - VAL 2 H 10
15: INVDIST: VAL 4 N 32 - GLY 6 H 63
16: INVDIST: VAL 4 N 32 - VAL 7 H 70
17: INVDIST: VAL 4 N 32 - GLY 8 H 86
18: INVDIST: PRO 5 N 48 - GLY 1 H 1
19: INVDIST: PRO 5 N 48 - VAL 2 H 10
20: INVDIST: PRO 5 N 48 - G

In [2]:
def find_feature_indices(feature_descriptions, target_features):
    """
    Look up the position of each requested feature label.

    Targets that are absent from ``feature_descriptions`` are reported with a
    printed warning and skipped, so the returned list can be shorter than
    ``target_features``.

    Args:
        feature_descriptions: List of feature description strings
        target_features: List of target feature strings to find

    Returns:
        List of indices (first occurrence) of the targets that were found,
        in the order the targets were given
    """
    print("Searching for target features...")

    hits = []
    for wanted in target_features:
        try:
            position = feature_descriptions.index(wanted)
        except ValueError:
            print(f"WARNING: '{wanted}' not found in feature list")
            continue
        hits.append(position)
        print(f"Found '{wanted}' at index {position}")

    return hits

# Define target inverse distance features to extract
# Each string must match an entry of `feature_descriptions` exactly
# (residue name/number, atom name and atom index), because the lookup
# below uses exact string equality.
TARGET_INVERSE_DISTANCES = [
    'INVDIST: VAL 2 N 9 - VAL 4 H 33',
    'INVDIST: VAL 2 O 24 - GLY 6 H 63',
    'INVDIST: VAL 4 O 47 - VAL 7 H 70',
    'INVDIST: PRO 5 N 48 - VAL 7 H 70'
]

# Find indices of target features
# Missing targets are skipped by find_feature_indices, so compare the two
# lengths printed below to confirm that every target was located.
target_feature_indices = find_feature_indices(feature_descriptions, TARGET_INVERSE_DISTANCES)

print(f"\nTarget feature indices: {target_feature_indices}")
print(f"Successfully found {len(target_feature_indices)} out of {len(TARGET_INVERSE_DISTANCES)} target features")


Searching for target features...
Found 'INVDIST: VAL 2 N 9 - VAL 4 H 33' at index 5
Found 'INVDIST: VAL 2 O 24 - GLY 6 H 63' at index 43
Found 'INVDIST: VAL 4 O 47 - VAL 7 H 70' at index 53
Found 'INVDIST: PRO 5 N 48 - VAL 7 H 70' at index 21

Target feature indices: [5, 43, 53, 21]
Successfully found 4 out of 4 target features


In [3]:
def load_trajectory_columns(file_pattern, num_files, target_columns):
    """
    Load specific columns from multiple trajectory files.

    Files are opened memory-mapped so that only the requested columns are
    copied into RAM instead of the whole feature matrix.

    Args:
        file_pattern: String pattern for filenames (should contain {} for file number).
            A pattern without a placeholder is accepted only when num_files is 1.
        num_files: Number of trajectory files to load; files are numbered 1..num_files
        target_columns: List of column indices to extract

    Returns:
        List of numpy arrays, each containing selected columns from one file

    Raises:
        ValueError: If the pattern expands to the same filename more than once,
            which would load one file repeatedly and duplicate its frames.
    """
    filenames = [file_pattern.format(file_num) for file_num in range(1, num_files + 1)]

    # A pattern lacking '{}' yields identical names for every file number;
    # fail loudly rather than silently returning duplicated data.
    if len(set(filenames)) != len(filenames):
        raise ValueError(
            f"file_pattern {file_pattern!r} does not produce {num_files} distinct "
            "filenames; include a '{}' placeholder for the file number"
        )

    trajectory_data = []

    print(f"Loading {num_files} trajectory files...")
    for filename in filenames:
        print(f"Loading {filename}...")

        # Memory-map the file; nothing is read until it is indexed
        full_data = np.load(filename, mmap_mode='r')

        # Indexing with a list of columns copies just those columns into memory;
        # asarray makes sure the result is a plain ndarray, not a memmap subclass
        selected_data = np.asarray(full_data[:, target_columns])
        trajectory_data.append(selected_data)

        # Drop the mapping before opening the next file
        del full_data

        print(f"  Selected columns shape: {selected_data.shape}")

    return trajectory_data

def concatenate_and_invert(trajectory_list):
    """
    Stack per-file arrays along the frame axis and take the reciprocal.

    Args:
        trajectory_list: List of numpy arrays to concatenate along axis 0

    Returns:
        Tuple ``(distance_data, concatenated_data)``: the element-wise
        reciprocal of the stacked array, followed by the stacked array itself
    """
    print("Concatenating trajectory data...")
    stacked = np.concatenate(trajectory_list, axis=0)

    print("Converting inverse distances to distances...")
    reciprocal = np.true_divide(1, stacked)

    return reciprocal, stacked

def save_processed_data(data, filename, force_field):
    """
    Save processed data as ``<filename>_<force_field>.npy``.

    Args:
        data: Array to write with ``np.save``
        filename: Stem of the output file (no extension), e.g. ``'CV_array'``
        force_field: Label appended to the stem to identify the force field
    """
    # Build the name from the caller-supplied stem instead of a hard-coded one
    output_filename = f'{filename}_{force_field}.npy'
    np.save(output_filename, data)
    print(f"Saved processed data to: {output_filename}")

# Configuration for trajectory loading
# Single input file, so the name carries no '{}' file-number placeholder
TRAJECTORY_PATTERN = f'spec_traj_{FEATURE_NAME}.npy'
NUM_TRAJECTORY_FILES = 1

# Use the indices computed by the feature lookup in the previous cell rather
# than a hand-copied list, so the selected columns always follow the labels
# in TARGET_INVERSE_DISTANCES.
TARGET_COLUMN_INDICES = list(target_feature_indices)

# The lookup skips labels it cannot find; refuse to continue with a partial
# set, since the output columns would no longer line up with the targets.
if len(TARGET_COLUMN_INDICES) != len(TARGET_INVERSE_DISTANCES):
    raise ValueError(
        f"Only {len(TARGET_COLUMN_INDICES)} of {len(TARGET_INVERSE_DISTANCES)} "
        "target features were found; cannot select trajectory columns"
    )

# Load and process trajectory data
print("Processing trajectory data...")
print(f"Target column indices: {TARGET_COLUMN_INDICES}")

# Load specific columns from all trajectory files
trajectory_data = load_trajectory_columns(
    TRAJECTORY_PATTERN,
    NUM_TRAJECTORY_FILES,
    TARGET_COLUMN_INDICES
)

# Concatenate and convert inverse distances to distances
processed_data, original_concatenated = concatenate_and_invert(trajectory_data)

# Display results
print("\nProcessing complete!")
print(f"Final data shape: {processed_data.shape}")
print(f"Data type: {processed_data.dtype}")
print(f"Sample distance values (first row): {processed_data[0, :]}")
print(f"Original inverse distance values: {original_concatenated[0, :]}")

# Make data available for future cells
# NOTE: despite the name, this array holds distances (reciprocals of the
# loaded inverse distances), not inverse distances.
inv_HON_data = processed_data
print(f"\nArray 'inv_HON_data' is now available for use in future cells")

# Save processed data
save_processed_data(inv_HON_data, 'CV_array', FORCE_FIELD)

# Display final shape (equivalent to original np.shape() call)
print(f"Final shape: {inv_HON_data.shape}")

Processing trajectory data...
Target column indices: [5, 43, 53, 21]
Loading 1 trajectory files...
Loading spec_traj_inv_HON.npy...
  Selected columns shape: (25000000, 4)
Concatenating trajectory data...
Converting inverse distances to distances...

Processing complete!
Final data shape: (25000000, 4)
Data type: float32
Sample distance values (first row): [0.71263325 0.5616556  0.6114557  0.7289863 ]
Original inverse distance values: [1.4032463 1.7804506 1.6354415 1.3717679]

Array 'inv_HON_data' is now available for use in future cells
Saved processed data to: CV_array_CHARMM36m_TIP3P.npy
Final shape: (25000000, 4)
