In [1]:
### GENERATES ARRAY OF 4-STATE POPULATIONS FOR EACH 100-ps SEGMENT ###
### 1 Aug 2025 ###

import numpy as np

# --- Run configuration ---
FORCE_FIELD = 'CHARMM27_TIP3P'   # force-field label; used to build input/output filenames
CV1_THRESH = 0.605               # nm, cutoff applied to CV1
CV2_THRESH = 0.485               # nm, cutoff applied to CV2
CHUNK_SIZE = 5_000               # frames per window (100 ps at a 20 fs save interval)

# --- State labels ---
# Integer state code -> short name. The codes are the ones classify_states() assigns:
#   1 = C  (CV1 long,  CV2 short)    2 = E (CV1 long,  CV2 long)
#   3 = NC (CV1 short, CV2 short)    4 = N (CV1 short, CV2 long)
STATES = {1: 'C', 2: 'E', 3: 'NC', 4: 'N'}

def load_cv_data(force_field):
    """
    Read the collective-variable array saved for one force field.

    The file name is built as ``CV_array_<force_field>.npy``; being a relative
    path, it is resolved against the current working directory.

    Args:
        force_field: String name of FF used in simulation

    Returns:
        The array read by np.load from that file (expected to hold distance
        data with shape (n_frames, n_features))
    """
    return np.load(f'CV_array_{force_field}.npy')

def classify_states(cv_data, cv1_thresh, cv2_thresh):
    """
    Classify each frame into one of the four states.

    Uses a 2D classification scheme based on two key distance measurements:
    - CV1 (column 0)
    - CV2 (column 2)

    A distance counts as "long" when it is strictly greater than its threshold
    and as "short" when it is less than or equal to it. A value lying exactly
    on a threshold is therefore assigned to the short side rather than being
    left unclassified. Frames in which either distance is NaN match no rule
    and keep the code 0.

    Args:
        cv_data: 2D array of collective variable data (distances), one row per
            frame, with at least three columns
        cv1_thresh: Threshold distance for CV1
        cv2_thresh: Threshold distance for CV2

    Returns:
        Integer array with one state code per frame: 1-4 for classified
        frames, 0 only for frames with a NaN in CV1 or CV2
    """
    # Extract the two key distance measurements from the data
    cv1 = cv_data[:, 0]  # column 0
    cv2 = cv_data[:, 2]  # column 2

    # Evaluate each threshold once. "short" is inclusive (<=) so that the
    # long/short pair covers every non-NaN value; NaN fails both comparisons.
    cv1_long = cv1 > cv1_thresh
    cv1_short = cv1 <= cv1_thresh
    cv2_long = cv2 > cv2_thresh
    cv2_short = cv2 <= cv2_thresh

    # Start from 0 ("unclassified") and overwrite with the matching state code.
    # The four masks are mutually exclusive, so assignment order is irrelevant.
    states = np.zeros(len(cv_data), dtype=int)
    states[cv1_long & cv2_short] = 1   # State 1 (C):  CV1 long  AND CV2 short
    states[cv1_long & cv2_long] = 2    # State 2 (E):  CV1 long  AND CV2 long
    states[cv1_short & cv2_short] = 3  # State 3 (NC): CV1 short AND CV2 short
    states[cv1_short & cv2_long] = 4   # State 4 (N):  CV1 short AND CV2 long

    return states

def calculate_proportions(states, chunk_size):
    """
    Calculate state proportions for discrete time windows.

    The state array is cut into consecutive windows of ``chunk_size`` frames.
    If the number of frames is not a multiple of ``chunk_size``, the final
    window is shorter and is normalized by its own length. Frames carrying the
    code 0 (unclassified) are not reported as a column but still count in the
    denominator, so a row sums to less than 1 when its window contains any.

    Args:
        states: 1D integer array of state classifications for each frame
            (codes 0-4)
        chunk_size: Number of consecutive frames per time window; must be
            positive

    Returns:
        2D float array with shape (n_chunks, 4) containing the fraction of
        frames in states 1-4 for each time window; shape (0, 4) when
        ``states`` is empty

    Raises:
        ValueError: If ``chunk_size`` is not positive
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    n_states = 4

    # Without this guard an empty trajectory would yield a 1D array of shape
    # (0,), which breaks callers that index or average over the state axis.
    if len(states) == 0:
        return np.empty((0, n_states), dtype=float)

    # Split the trajectory into consecutive time windows
    # Example: 25,000 frames -> [frames 0-4999, 5000-9999, 10000-14999, ...]
    windows = (
        states[start:start + chunk_size]
        for start in range(0, len(states), chunk_size)
    )

    # For each window: count states -> drop the count for code 0 -> divide by
    # the window length to get fractions
    # Example: window [1,1,2,1,3] -> counts [3,1,1,0] -> props [0.6, 0.2, 0.2, 0.0]
    return np.array([
        np.bincount(window, minlength=n_states + 1)[1:] / len(window)
        for window in windows
    ])

def save_and_report(proportions, force_field):
    """
    Write the per-window proportions to disk and print a population summary.

    The array is saved with np.save as
    ``proportions_array_0100ps_<force_field>_PAPER.npy``. The summary prints
    the mean of each state column over all windows, using the module-level
    STATES mapping for labels and CHUNK_SIZE for the reported window length.

    Args:
        proportions: 2D array of state proportions for each time window
        force_field: String name of FF for filename
    """
    # Persist the array first, so the file exists before anything is printed
    target = f'proportions_array_0100ps_{force_field}_PAPER.npy'
    np.save(target, proportions)

    # Average each state column across windows
    averages = np.mean(proportions, axis=0)

    print(f"--- State Populations for {force_field} ---")
    print(f"Analyzed {len(proportions)} time windows of {CHUNK_SIZE} frames each")

    # Column `position` of the array corresponds to the position-th entry of STATES
    for position, code in enumerate(STATES):
        label = STATES[code]
        print(f"State {code} ({label:2s}): {averages[position]:6.2%}")

    print("\n" + 40 * "=")
    print(f"Shape of saved proportions array: {proportions.shape}")
    print(f"Saved to: {target}")

def main():
    """
    Run the full pipeline for the configured force field.

    Steps: load the CV array, assign a state code to every frame, compute
    per-window state proportions, then save the result and print a summary.
    All settings come from the module-level configuration constants.
    """
    # Step 1: read the CV array from disk
    print("Loading CV data...")
    distances = load_cv_data(FORCE_FIELD)
    print(f"Loaded data with shape: {distances.shape}")

    # Step 2: per-frame state assignment
    print(f"\nClassifying states (CV1: {CV1_THRESH} nm, CV2: {CV2_THRESH} nm)...")
    frame_states = classify_states(distances, CV1_THRESH, CV2_THRESH)

    # Step 3: populations within each window of CHUNK_SIZE frames
    print(f"Calculating proportions for {CHUNK_SIZE}-frame windows...")
    window_props = calculate_proportions(frame_states, CHUNK_SIZE)

    # Step 4: write the array and print the summary
    save_and_report(window_props, FORCE_FIELD)

# Entry point: run the pipeline only when executed directly, not when imported
if __name__ == "__main__":
    main()

Loading CV data...
Loaded data with shape: (25000000, 4)

Classifying states (CV1: 0.605 nm, CV2: 0.485 nm)...
Calculating proportions for 5000-frame windows...
--- State Populations for CHARMM27_TIP3P ---
Analyzed 5000 time windows of 5000 frames each
State 1 (C ): 18.17%
State 2 (E ): 16.02%
State 3 (NC): 34.28%
State 4 (N ): 31.54%

Shape of saved proportions array: (5000, 4)
Saved to: proportions_array_0100ps_CHARMM27_TIP3P_PAPER.npy
