# MD Analysis - 264THM + PPARG (50 ns Full Trajectory)

**Purpose:** Re-analyze the complete 50 ns trajectory by concatenating all checkpoint files

**Input:** Output from the completed MD simulation notebook

**Accelerator:** CPU is sufficient (no GPU needed for analysis)

---

## Step 1: Install GROMACS

In [None]:
%%bash
# Install GROMACS with apt and confirm that the gmx binary starts.
set -o errexit   # stop the cell at the first failing command
apt-get -qq update
apt-get -qq -y install gromacs
# Only the first three lines of the version banner are shown.
gmx --version | head -n 3
echo 'GROMACS: OK'

## Step 2: Setup Paths and Copy Files

In [None]:
import os
import shutil
from pathlib import Path

# Run-wide settings shared by the cells below.
CONFIG = dict(
    complex_name='264THM_PPARG',   # used to build directory names and plot titles
    total_time_ns=50,              # upper bound of the checkpoint scan (inclusive)
    checkpoint_interval_ns=10,     # step between consecutive checkpoint folders
)

# Locations: the dataset holding the finished MD run, and the writable
# directory that receives everything this notebook produces.
INPUT_DIR = Path('/kaggle/input/264thm-pparg-md-analysis')
WORK_DIR = Path('/kaggle/working') / CONFIG['complex_name']
ANALYSIS_DIR = WORK_DIR.joinpath('analysis_full')

# Create the full output tree up front; a no-op when it already exists.
ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

print(f'Input: {INPUT_DIR}', f'Output: {ANALYSIS_DIR}', sep='\n')

In [None]:
# Print an overview of the input dataset: every top-level entry, plus a
# preview of at most five children for each sub-directory.
if not INPUT_DIR.exists():
    print('ERROR: Input dataset not found!')
    print('Please add the output from the MD simulation as a dataset.')
else:
    for entry in sorted(INPUT_DIR.iterdir()):
        if not entry.is_dir():
            print(f'📄 {entry.name}')
            continue
        print(f'📁 {entry.name}/')
        children = sorted(entry.iterdir())
        for child in children[:5]:
            print(f'   - {child.name}')
        # Signal truncation instead of listing large folders in full.
        if len(children) > 5:
            print('   ... and more')

In [None]:
%%bash
# Unzip checkpoints if uploaded as zip
# The archive is addressed by absolute path so the test does not depend on a
# successful `cd` into the input dataset.
ZIP_FILE=/kaggle/input/264thm-pparg-md-analysis/checkpoints.zip
if [ -f "$ZIP_FILE" ]; then
    echo 'Extracting checkpoints.zip...'
    # -o: overwrite existing files without prompting, so the cell can be re-run.
    unzip -q -o "$ZIP_FILE" -d /kaggle/working/
    echo 'Done!'
    ls -la /kaggle/working/checkpoints/
else
    echo 'No zip file, using direct folders'
fi

## Step 3: Concatenate All Trajectory Files

Merge all checkpoint `.xtc` files into one complete 50 ns trajectory

In [None]:
# Find all XTC files from checkpoints.
# Result: `checkpoint_xtcs`, a list of path strings ordered by checkpoint time.
checkpoint_xtcs = []

# Check extracted checkpoints first
extracted_dir = Path('/kaggle/working/checkpoints')
if extracted_dir.exists():
    print('Using extracted checkpoints from zip')
    base_dir = extracted_dir
else:
    base_dir = INPUT_DIR

interval_ns = CONFIG['checkpoint_interval_ns']
for ns in range(interval_ns, CONFIG['total_time_ns'] + 1, interval_ns):
    # Try multiple path patterns; the first one that exists wins.
    patterns = [
        base_dir / f'checkpoint_{ns}ns/md.xtc',
        INPUT_DIR / f'checkpoints/checkpoint_{ns}ns/md.xtc',
        INPUT_DIR / f'{CONFIG["complex_name"]}/checkpoints/checkpoint_{ns}ns/md.xtc',
    ]
    xtc_path = next((candidate for candidate in patterns if candidate.exists()), None)
    if xtc_path is None:
        print(f'❌ Missing: checkpoint_{ns}ns')
        continue
    checkpoint_xtcs.append(str(xtc_path))
    print(f'✅ Found: {xtc_path.name} at {xtc_path.parent}')

print(f'\nTotal XTC files found: {len(checkpoint_xtcs)}')

In [None]:
# Report whether the run's main md/md.xtc is present in the dataset, and its
# size when it is. Purely informational: nothing is copied or modified here.
main_xtc = INPUT_DIR / CONFIG['complex_name'] / 'md' / 'md.xtc'
if not main_xtc.exists():
    print('Main md/md.xtc not found, will use checkpoint files')
else:
    print(f'✅ Main trajectory found: {main_xtc}')
    main_xtc_size = main_xtc.stat().st_size / 1e6  # bytes -> MB (decimal)
    print(f'   Size: {main_xtc_size:.1f} MB')

In [None]:
# Find all XTC files from checkpoints and write their paths, one per line, to
# /kaggle/working/xtc_files.txt, which the concatenation cell reads.
checkpoint_xtcs = []

# Check extracted checkpoints first
extracted_dir = Path('/kaggle/working/checkpoints')
if extracted_dir.exists():
    print('Using extracted checkpoints from zip')
    base_dir = extracted_dir
else:
    base_dir = INPUT_DIR

interval_ns = CONFIG['checkpoint_interval_ns']
for ns in range(interval_ns, CONFIG['total_time_ns'] + 1, interval_ns):
    # Try multiple path patterns; the first one that exists wins.
    patterns = [
        base_dir / f'checkpoint_{ns}ns/md.xtc',
        INPUT_DIR / f'checkpoints/checkpoint_{ns}ns/md.xtc',
        INPUT_DIR / f'{CONFIG["complex_name"]}/checkpoints/checkpoint_{ns}ns/md.xtc',
    ]
    xtc_path = next((candidate for candidate in patterns if candidate.exists()), None)
    if xtc_path is None:
        print(f'❌ Missing: checkpoint_{ns}ns')
        continue
    checkpoint_xtcs.append(str(xtc_path))
    print(f'✅ Found: {xtc_path.name} at {xtc_path.parent}')

print(f'\nTotal XTC files found: {len(checkpoint_xtcs)}')

# Persist the ordered list for the bash cell. An empty file is written when
# nothing was found, so that cell can report the problem itself.
xtc_list_file = Path('/kaggle/working/xtc_files.txt')
xtc_list_file.write_text(''.join(f'{path}\n' for path in checkpoint_xtcs))
print(f'XTC list written to: {xtc_list_file}')

In [None]:
%%bash
# Merge the trajectories listed in xtc_files.txt (one path per line) into a
# single file for the analysis cells. Fails the cell when no input is listed.
set -e
cd /kaggle/working

echo "=== Concatenating trajectories ==="

# Get XTC files
LIST_FILE=xtc_files.txt
XTC_FILES=()
if [ -f "$LIST_FILE" ]; then
    # One array element per non-blank line, so every path reaches gmx as
    # exactly one argument regardless of the characters it contains.
    mapfile -t XTC_FILES < <(grep -v '^[[:space:]]*$' "$LIST_FILE" || true)
fi
echo "Files: ${XTC_FILES[*]}"

# Stop here with a failing status: without a merged trajectory every later
# analysis cell would fail with a less obvious error.
if [ "${#XTC_FILES[@]}" -eq 0 ]; then
    echo "ERROR: No XTC files found!"
    exit 1
fi

# Concatenate all XTC files into one
mkdir -p 264THM_PPARG/analysis_full
# NOTE(review): -cat keeps frames with duplicate time stamps. If consecutive
# checkpoint files overlap in time, confirm this is intended.
gmx trjcat -f "${XTC_FILES[@]}" -o 264THM_PPARG/analysis_full/full_trajectory.xtc -cat
echo ""
echo "=== Full trajectory created ==="
ls -lh 264THM_PPARG/analysis_full/full_trajectory.xtc

In [None]:
# Place a copy of the run's md.tpr in the analysis directory, where the
# analysis cells expect it. The input dataset is searched recursively and the
# first match is used.
tpr_source = [*INPUT_DIR.rglob('md.tpr')]
if not tpr_source:
    print('❌ TPR file not found!')
else:
    first_tpr = tpr_source[0]
    shutil.copy(first_tpr, ANALYSIS_DIR / 'md.tpr')
    print(f'✅ Copied TPR from: {first_tpr}')

## Step 4: Run Full 50 ns Analysis

In [None]:
%%bash
# Print a short summary of the merged trajectory.
set -o errexit
cd /kaggle/working/264THM_PPARG/analysis_full

echo "=== Checking trajectory info ==="
# stderr is merged into stdout and only the last 20 lines are kept.
gmx check -f full_trajectory.xtc 2>&1 | tail -n 20

In [None]:
%%bash
# Core analyses on the merged trajectory: backbone RMSD, per-residue RMSF and
# radius of gyration. Group selections are fed to each tool on stdin.
set -o errexit
cd /kaggle/working/264THM_PPARG/analysis_full

echo "=== RMSD Analysis (Backbone) ==="
# Group 4 = Backbone (selected twice: one answer per prompt of gmx rms)
gmx rms -s md.tpr -f full_trajectory.xtc -o rmsd_backbone_50ns.xvg -tu ns <<< $'4\n4'

echo
echo "=== RMSF Analysis ==="
gmx rmsf -s md.tpr -f full_trajectory.xtc -o rmsf_50ns.xvg -res <<< '4'

echo
echo "=== Radius of Gyration ==="
# NOTE(review): group 1 is presumably Protein in the default index - confirm.
gmx gyrate -s md.tpr -f full_trajectory.xtc -o gyrate_50ns.xvg <<< '1'

echo
echo "=== Analysis Complete ==="
ls -la *.xvg

## Step 5: Generate Plots

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def parse_xvg(filename):
    """Parse a GROMACS XVG file into a NumPy array.

    Lines that are blank, or that start (after optional leading whitespace)
    with ``#`` (comment), ``@`` (plot directive) or ``&`` (data-set
    separator), are skipped. Every remaining line is split on whitespace and
    each token is converted to ``float``.

    Args:
        filename: Path of the XVG file to read.

    Returns:
        ``np.ndarray`` with one row per data line and one column per value.
        A file without data lines yields an empty array.

    Raises:
        ValueError: If a data line contains a token that is not a number.
    """
    data = []
    with open(filename, 'r') as f:
        for line in f:
            # Strip first so indented directives and separators are recognised
            # instead of being handed to float().
            stripped = line.strip()
            if not stripped or stripped.startswith(('#', '@', '&')):
                continue
            data.append([float(token) for token in stripped.split()])
    return np.array(data)

# Three-panel summary figure: backbone RMSD, per-residue RMSF and radius of
# gyration. The loaded arrays (rmsd, rmsf, gyrate) stay available afterwards.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle(f"{CONFIG['complex_name']} - MD Analysis (FULL 50 ns)", fontsize=14, fontweight='bold')
ax_rmsd, ax_rmsf, ax_rg = axes

# Panel 1 - RMSD (plotted against the file's time column as-is)
rmsd = parse_xvg(str(ANALYSIS_DIR / 'rmsd_backbone_50ns.xvg'))
ax_rmsd.plot(rmsd[:, 0], rmsd[:, 1], color='#2E86AB', linewidth=0.8)
ax_rmsd.set(xlabel='Time (ns)', ylabel='RMSD (nm)', title='Backbone RMSD')
ax_rmsd.grid(True, alpha=0.3)
ax_rmsd.set_xlim(0, 50)

# Panel 2 - RMSF (first column is used as the residue axis)
rmsf = parse_xvg(str(ANALYSIS_DIR / 'rmsf_50ns.xvg'))
ax_rmsf.plot(rmsf[:, 0], rmsf[:, 1], color='#2E86AB', linewidth=0.8)
ax_rmsf.set(xlabel='Residue', ylabel='RMSF (nm)', title='RMSF per Residue')
ax_rmsf.grid(True, alpha=0.3)

# Panel 3 - Radius of Gyration
# NOTE(review): the division by 1000 presumably converts ps to ns - confirm
# the time unit written by gmx gyrate.
gyrate = parse_xvg(str(ANALYSIS_DIR / 'gyrate_50ns.xvg'))
ax_rg.plot(gyrate[:, 0] / 1000, gyrate[:, 1], color='#F18F01', linewidth=0.8)
ax_rg.set(xlabel='Time (ns)', ylabel='Rg (nm)', title='Radius of Gyration')
ax_rg.grid(True, alpha=0.3)
ax_rg.set_xlim(0, 50)

plt.tight_layout()
figure_path = ANALYSIS_DIR / f'{CONFIG["complex_name"]}_analysis_50ns.png'
plt.savefig(str(figure_path), dpi=300, bbox_inches='tight')
plt.show()

print('\n=== Plot saved ===')

In [None]:
# Print summary statistics for the series loaded by the plotting cell
# (rmsd, gyrate, rmsf).
banner = '=' * 60
print(banner)
print('STATISTICS - Full 50 ns Simulation')
print(banner)

rmsd_time = rmsd[:, 0]
rmsd_values = rmsd[:, 1]

# Overall stats
print(f'\nTime range: 0 - {rmsd_time[-1]:.1f} ns')
print(f'Total frames: {len(rmsd)}')

# RMSD stats
print('\n--- RMSD (Backbone) ---')
print(f'Average: {rmsd_values.mean():.3f} ± {rmsd_values.std():.3f} nm')
print(f'Min: {rmsd_values.min():.3f} nm')
print(f'Max: {rmsd_values.max():.3f} nm')

# Equilibration check: frames at or after the 30 ns mark (the last 20 ns)
tail_values = rmsd_values[rmsd_time >= 30]
if tail_values.size > 0:
    print('\n--- RMSD (Last 20 ns, equilibrated) ---')
    print(f'Average: {tail_values.mean():.3f} ± {tail_values.std():.3f} nm')

# Rg stats
rg_values = gyrate[:, 1]
print('\n--- Radius of Gyration ---')
print(f'Average: {rg_values.mean():.3f} ± {rg_values.std():.3f} nm')

# RMSF - the ten rows with the largest RMSF, in descending order
print('\n--- RMSF Hotspots (Top 10 flexible residues) ---')
top_rows = np.argsort(rmsf[:, 1])[::-1][:10]
for residue, fluctuation in rmsf[top_rows][:, :2]:
    print(f'Residue {int(residue)}: {fluctuation:.3f} nm')

## Step 6: Additional Analysis

In [None]:
%%bash
# Optional extra analyses. Each tool is best-effort: its stderr is discarded
# and a short message is printed instead when it fails.
cd /kaggle/working/264THM_PPARG/analysis_full

echo "=== Hydrogen Bonds Analysis ==="
# Protein-Protein H-bonds
printf '1\n1\n' | gmx hbond -s md.tpr -f full_trajectory.xtc -num hbnum_protein.xvg 2>/dev/null || echo "H-bond analysis skipped"

# bash's echo does not expand "\n" without -e, so the blank separator line is
# printed with its own echo.
echo ""
echo "=== Secondary Structure (DSSP-like) ==="
# This may not work without DSSP, just attempt
printf '1\n' | gmx do_dssp -s md.tpr -f full_trajectory.xtc -o ss.xpm 2>/dev/null || echo "DSSP not available, skipped"

echo ""
echo "=== Available analysis files ==="
ls -la *.xvg *.png 2>/dev/null || echo "No files yet"

In [None]:
# Plot the H-bond count over time together with a running average, provided
# the optional H-bond analysis produced its output file.
hbond_file = ANALYSIS_DIR / 'hbnum_protein.xvg'
if not hbond_file.exists():
    print('H-bond analysis file not found')
else:
    hbonds = parse_xvg(str(hbond_file))
    # NOTE(review): the division by 1000 presumably converts ps to ns - confirm.
    time_ns = hbonds[:, 0] / 1000
    counts = hbonds[:, 1]

    plt.figure(figsize=(10, 4))
    plt.plot(time_ns, counts, color='#28A745', linewidth=0.5, alpha=0.7)

    # Running average: window is a tenth of the frames, capped at 100.
    window = min(100, len(hbonds) // 10)
    if window > 1:
        running_avg = np.convolve(counts, np.ones(window) / window, mode='valid')
        # 'valid' mode shortens the series; shift by half a window so each
        # average is drawn near the middle of the frames it covers.
        half = window // 2
        time_avg = time_ns[half:half + len(running_avg)]
        plt.plot(time_avg, running_avg, color='#155724', linewidth=2, label='Running avg')

    plt.xlabel('Time (ns)')
    plt.ylabel('Number of H-bonds')
    plt.title(f'{CONFIG["complex_name"]} - Protein H-bonds (50 ns)')
    plt.grid(True, alpha=0.3)
    plt.xlim(0, 50)
    plt.legend()
    plt.tight_layout()
    plt.savefig(str(ANALYSIS_DIR / 'hbonds_50ns.png'), dpi=300)
    plt.show()

    print(f'Average H-bonds: {counts.mean():.1f} ± {counts.std():.1f}')

## Step 7: Save Results

In [None]:
# Copy the analysis directory into /kaggle/working/output and list every file
# found there with its size.
OUTPUT_DIR = Path('/kaggle/working/output')
OUTPUT_DIR.mkdir(exist_ok=True)

# dirs_exist_ok lets a re-run copy into the folder left by a previous run.
export_dir = OUTPUT_DIR / f'{CONFIG["complex_name"]}_analysis_50ns'
shutil.copytree(ANALYSIS_DIR, export_dir, dirs_exist_ok=True)

print('=== Output Files ===')
for exported in sorted(OUTPUT_DIR.rglob('*')):
    if not exported.is_file():
        continue
    size_kb = exported.stat().st_size / 1024
    print(f'{exported.relative_to(OUTPUT_DIR)}: {size_kb:.1f} KB')

print('\n=== FULL 50 NS ANALYSIS COMPLETE ===')