# Tactile Manipulation - RL Fine-tuning

This notebook fine-tunes a pre-trained behavior-cloning (BC) policy with reinforcement learning, using PPO from Stable-Baselines3.

## 1. Setup Environment

In [None]:
# Check GPU
!nvidia-smi
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f}GB")

In [None]:
# Install dependencies
!pip install -q mujoco h5py tensorboard matplotlib tqdm
!pip install -q stable-baselines3[extra] gymnasium

## 2. Clone Repository and Setup

In [ ]:
# Check for duplicate names in the XML: print, with line numbers, every line of
# the fixed demo scene that contains name="home" (several hits suggest a duplicate)
!grep -n "name=\"home\"" tactile-rl/franka_emika_panda/panda_demo_scene_fixed.xml

# Also check which XML files exist (keeps only listing lines that mention
# "panda_demo" or "scene")
!ls -la tactile-rl/franka_emika_panda/*.xml | grep -E "(panda_demo|scene)"

In [ ]:
%%writefile tactile-rl/scripts/train_rl_minimal.py
#!/usr/bin/env python3
"""
Minimal RL training that works with the simplest environment setup.
"""

import torch
import numpy as np
import os
import argparse
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import gymnasium as gym
import mujoco

class MinimalPandaEnv(gym.Env):
    """Minimal MuJoCo Panda environment used to smoke-test RL training.

    Observation: 52-dim float32 vector. Entries 0-6 hold the first (up to) 7
    ``qpos`` values, entries 7-13 the first (up to) 7 ``qvel`` values; every
    other entry stays zero.
    Action: 8-dim vector in [-1, 1] written directly into ``data.ctrl``.
    Reward: ``-0.01 * sum(|qvel[:7]|)``, i.e. a small penalty on motion.
    Episodes are truncated after ``max_timesteps`` steps.

    Args:
        base_path: Directory searched for the candidate scene XML files.

    Raises:
        ValueError: If none of the candidate XML files could be loaded.
    """

    # Candidate scenes, tried in this order; the first one that loads wins.
    XML_FILES = ("panda_demo_scene.xml", "panda.xml", "panda_tactile_grasp.xml")
    OBS_DIM = 52
    ACTION_DIM = 8

    def __init__(self, base_path="/content/TactileManipulation/tactile-rl/franka_emika_panda/"):
        super().__init__()

        self.model = None
        for xml in self.XML_FILES:
            xml_path = os.path.join(base_path, xml)
            if not os.path.exists(xml_path):
                continue
            print(f"Trying to load: {xml}")
            try:
                self.model = mujoco.MjModel.from_xml_path(xml_path)
            except Exception as e:
                # Best effort: a scene that fails to load must not stop the
                # search, so report it and move on to the next candidate.
                print(f"❌ Failed to load {xml}: {e}")
                continue
            print(f"✅ Successfully loaded: {xml}")
            break

        if self.model is None:
            raise ValueError("Could not load any XML file!")

        self.data = mujoco.MjData(self.model)

        # Spaces
        self.observation_space = gym.spaces.Box(
            -np.inf, np.inf, shape=(self.OBS_DIM,), dtype=np.float32
        )
        self.action_space = gym.spaces.Box(
            -1, 1, shape=(self.ACTION_DIM,), dtype=np.float32
        )

        self.timestep = 0
        self.max_timesteps = 200

    def reset(self, seed=None, options=None):
        """Reset the simulation to the home pose and return ``(obs, info)``."""
        # Seed the environment's own RNG (Gymnasium convention) instead of
        # reseeding NumPy's process-wide generator.
        super().reset(seed=seed)

        mujoco.mj_resetData(self.model, self.data)

        # Set initial joint positions (only when the model has arm + gripper DoFs)
        if self.model.nq >= 9:
            self.data.qpos[:7] = [0.0, -0.785, 0.0, -2.356, 0.0, 1.571, 0.785]
            self.data.qpos[7:9] = [0.04, 0.04]  # Gripper

        # Step to settle
        for _ in range(10):
            mujoco.mj_step(self.model, self.data)

        self.timestep = 0
        return self._get_obs(), {}

    def step(self, action):
        """Apply ``action``, advance one simulation step, return the 5-tuple."""
        # Write as many action components as the model has actuators. The
        # previous guard dropped the whole action whenever it was longer than
        # ``ctrl``, leaving the policy with no control at all.
        action = np.asarray(action, dtype=np.float64).ravel()
        n_ctrl = min(action.size, len(self.data.ctrl))
        self.data.ctrl[:n_ctrl] = action[:n_ctrl]

        # Step simulation
        mujoco.mj_step(self.model, self.data)
        self.timestep += 1

        obs = self._get_obs()
        reward = self._get_reward()

        # Hitting the step budget is a time limit, not a task outcome, so it is
        # reported as truncation rather than termination.
        truncated = self.timestep >= self.max_timesteps
        return obs, reward, False, truncated, {}

    def _get_obs(self):
        """Build the zero-padded observation vector from qpos/qvel."""
        obs = np.zeros(self.OBS_DIM, dtype=np.float32)
        # Fill with available data; models with fewer than 7 DoFs leave zeros
        n_pos = min(7, self.model.nq)
        n_vel = min(7, self.model.nv)
        obs[:n_pos] = self.data.qpos[:n_pos]
        obs[7:7 + n_vel] = self.data.qvel[:n_vel]
        return obs

    def _get_reward(self):
        """Return the velocity penalty as a plain Python float."""
        # Simple reward: negative joint velocities (encourage stability)
        return float(-np.sum(np.abs(self.data.qvel[:7])) * 0.01)

def train_minimal(bc_checkpoint, save_dir, total_steps=50000):
    """Run a minimal PPO training loop on ``MinimalPandaEnv``.

    Args:
        bc_checkpoint: Optional path to a BC checkpoint. Its presence is only
            reported; the weights are not loaded into the PPO policy.
        save_dir: Directory that receives the model, the VecNormalize
            statistics and the TensorBoard logs. Created if missing.
        total_steps: Number of environment timesteps to train for.
    """
    os.makedirs(save_dir, exist_ok=True)

    print("Creating environment...")
    env = DummyVecEnv([MinimalPandaEnv])
    env = VecNormalize(env, norm_obs=True, norm_reward=True)

    print("Creating PPO model...")

    # Try to load BC weights
    if bc_checkpoint and os.path.exists(bc_checkpoint):
        print(f"BC checkpoint found at: {bc_checkpoint}")
        # For now, we'll just note it exists - full integration would require matching architectures

    # Fall back to CPU when no GPU is attached instead of failing on 'cuda'.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = PPO(
        'MlpPolicy',
        env,
        learning_rate=3e-4,
        n_steps=512,
        batch_size=64,
        verbose=1,
        device=device,
        tensorboard_log=os.path.join(save_dir, 'tensorboard')
    )

    print(f"\nStarting training for {total_steps} steps...")
    print("This is a minimal test - expect basic behavior only!")

    model.learn(total_timesteps=total_steps, progress_bar=True)

    # Save the policy together with the normalisation statistics it was trained with
    model.save(os.path.join(save_dir, 'minimal_rl_model'))
    env.save(os.path.join(save_dir, 'vec_normalize.pkl'))

    print(f"\n✅ Training complete! Saved to {save_dir}")

if __name__ == "__main__":
    # Command-line flags, declared in the order they appear in --help
    cli = argparse.ArgumentParser()
    for flag, options in (
        ('--bc_checkpoint', dict(type=str, help='BC checkpoint path')),
        ('--save_dir', dict(type=str, default='./rl_minimal')),
        ('--total_steps', dict(type=int, default=50000)),
    ):
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    # The output directory must exist before training starts writing into it
    os.makedirs(opts.save_dir, exist_ok=True)
    train_minimal(opts.bc_checkpoint, opts.save_dir, opts.total_steps)

# Create run directory
# NOTE(review): rl_checkpoint_dir is only assigned in the Drive-mount cell further
# down, so that cell has to be executed before this code.
# NOTE(review): no cell header precedes this code; if it shares a cell with the
# %%writefile above, it is written into the script instead of being run — confirm.
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
rl_run_dir = f"{rl_checkpoint_dir}/rl_run_{timestamp}"

# Run the FIXED RL training (this will work now!)
# NOTE(review): scripts/train_rl_fixed.py is not created by any cell visible here;
# presumably it ships with the repository — confirm it exists.
!cd tactile-rl && python scripts/train_rl_fixed.py \
    --episodes 1000 \
    --learning_rate 3e-4 \
    --batch_size 32 \
    --save_dir {rl_run_dir}

# For longer training with more episodes:
# !cd tactile-rl && python scripts/train_rl_fixed.py \
#     --episodes 5000 \
#     --save_dir {rl_run_dir}

In [None]:
import os

from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Every RL run stores its checkpoints beneath this Drive folder
rl_checkpoint_dir = '/content/drive/MyDrive/tactile_manipulation_rl_checkpoints'
os.makedirs(rl_checkpoint_dir, exist_ok=True)
print(f"RL checkpoints will be saved to: {rl_checkpoint_dir}")

## 4. Load Pre-trained BC Policy

In [None]:
# Load the BC policy from Drive
bc_model_path = '/content/drive/MyDrive/bc_policy_best.pt'

if os.path.exists(bc_model_path):
    print(f"Found BC model at: {bc_model_path}")

    # Load and verify.
    # map_location='cpu': only metadata is inspected here, and this lets a
    # checkpoint saved on a GPU open on a CPU-only runtime as well.
    # NOTE: weights_only=False unpickles arbitrary objects - only load
    # checkpoints from a source you trust.
    checkpoint = torch.load(bc_model_path, map_location='cpu', weights_only=False)
    print(f"BC model trained for {checkpoint['epoch']} epochs")
    print(f"BC validation loss: {checkpoint['val_loss']:.4f}")
else:
    print("⚠️ BC model not found! Please upload it first.")

## 5. Create RL Training Script

In [None]:
%%writefile tactile-rl/scripts/train_rl_policy.py
#!/usr/bin/env python3
"""
RL fine-tuning for tactile manipulation using PPO.
Loads pre-trained BC policy and fine-tunes with shaped rewards.
"""

import torch
import torch.nn as nn
import numpy as np
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import argparse
import os
import sys
from datetime import datetime

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from environments.panda_demo_env_fixed import PandaDemoEnvFixed


class BCInitializedPolicy(BaseFeaturesExtractor):
    """Features extractor whose encoder can be warm-started from a BC checkpoint.

    The encoder is a two-layer MLP (Linear -> ReLU -> LayerNorm -> Dropout,
    twice). Its layer order mirrors the BC model so that the BC ``encoder.*``
    parameters line up index-for-index with this ``nn.Sequential``.

    Args:
        observation_space: Observation space of the environment; its first
            dimension sets the encoder input width.
        features_dim: Width of the features handed to the policy/value heads.
        bc_checkpoint: Optional path to a checkpoint containing a
            ``model_state_dict`` with ``encoder.``-prefixed keys.

    Raises:
        RuntimeError: From ``load_state_dict`` when the checkpoint's encoder
            parameters do not match this encoder (missing keys or shapes).
    """

    # Prefix that the BC model puts in front of its encoder parameters
    _BC_ENCODER_PREFIX = 'encoder.'

    def __init__(self, observation_space, features_dim=256, bc_checkpoint=None):
        super().__init__(observation_space, features_dim)

        obs_dim = int(observation_space.shape[0])

        # Match BC architecture.
        # NOTE(review): Dropout stays for index parity with the BC encoder, but
        # it is active whenever the policy is in train mode - confirm that this
        # is wanted during PPO updates.
        self.encoder = nn.Sequential(
            nn.Linear(obs_dim, 256),
            nn.ReLU(),
            nn.LayerNorm(256),
            nn.Dropout(0.1),
            # The output width must equal the features_dim given to the base class
            nn.Linear(256, features_dim),
            nn.ReLU(),
            nn.LayerNorm(features_dim),
            nn.Dropout(0.1)
        )

        # Load BC weights if provided
        if bc_checkpoint:
            print("Loading BC weights...")
            # Load onto CPU so a GPU-saved checkpoint also works on CPU-only
            # machines; the module is moved to its final device later.
            # NOTE: weights_only=False unpickles arbitrary objects - only use
            # trusted checkpoints.
            checkpoint = torch.load(bc_checkpoint, map_location='cpu', weights_only=False)

            # Extract encoder weights from BC model
            bc_state_dict = checkpoint['model_state_dict']

            # self.encoder is the Sequential itself, so its keys look like
            # '0.weight', not 'encoder.0.weight'; the prefix must be stripped
            # or load_state_dict rejects every key.
            prefix = self._BC_ENCODER_PREFIX
            encoder_state_dict = {
                key[len(prefix):]: value
                for key, value in bc_state_dict.items()
                if key.startswith(prefix)
            }

            self.encoder.load_state_dict(encoder_state_dict)
            print("✅ BC weights loaded successfully!")

    def forward(self, observations):
        """Encode a batch of observations into feature vectors."""
        return self.encoder(observations)


def make_env(rank, seed=0):
    """Return a zero-argument factory that builds one seeded Panda environment.

    The environment itself is only constructed when the returned callable is
    invoked; it is then reset once with ``seed + rank``.
    """
    env_settings = dict(
        xml_path="franka_emika_panda/panda_demo_scene_fixed.xml",
        enable_tactile=True,
        control_frequency=20,
        horizon=200,
        reward_type="shaped",  # shaped rewards are what RL trains on
        terminate_on_success=True,
        early_termination=True,
    )

    def _init():
        panda_env = PandaDemoEnvFixed(**env_settings)
        # A different seed per rank keeps parallel environments decorrelated
        panda_env.reset(seed=seed + rank)
        return panda_env

    return _init


def train_rl(args):
    """Train a PPO policy whose features extractor is initialised from BC.

    Builds normalised training and evaluation environments, creates the PPO
    model, trains it with periodic evaluation and checkpointing, and finally
    saves the model together with the VecNormalize statistics.

    Args:
        args: Parsed command-line namespace. Reads ``n_envs``, ``seed``,
            ``bc_checkpoint``, ``lr``, ``n_steps``, ``batch_size``,
            ``n_epochs``, ``gamma``, ``ent_coef``, ``tb_log_dir``,
            ``save_dir``, ``eval_freq``, ``n_eval_episodes``,
            ``checkpoint_freq`` and ``total_timesteps``.
    """

    # Create vectorized environments
    print(f"Creating {args.n_envs} parallel environments...")
    env = DummyVecEnv([make_env(i, args.seed) for i in range(args.n_envs)])

    # Normalize observations and rewards
    env = VecNormalize(
        env,
        norm_obs=True,
        norm_reward=True,
        clip_obs=10.0,
        clip_reward=10.0
    )

    # Create eval environment (offset seed; raw rewards so results are comparable)
    eval_env = DummyVecEnv([make_env(0, args.seed + 1000)])
    eval_env = VecNormalize(
        eval_env,
        norm_obs=True,
        norm_reward=False,
        training=False,
        norm_obs_keys=env.norm_obs_keys
    )

    # Setup policy kwargs with BC initialization
    policy_kwargs = {
        'features_extractor_class': BCInitializedPolicy,
        'features_extractor_kwargs': {
            'features_dim': 256,
            'bc_checkpoint': args.bc_checkpoint
        },
        # Plain dict form; the list-wrapped [dict(...)] form is deprecated
        'net_arch': dict(pi=[128, 128], vf=[128, 128]),  # Smaller heads
        'activation_fn': nn.ReLU
    }

    # Create PPO model
    model = PPO(
        'MlpPolicy',
        env,
        learning_rate=args.lr,
        n_steps=args.n_steps,
        batch_size=args.batch_size,
        n_epochs=args.n_epochs,
        gamma=args.gamma,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=args.ent_coef,
        vf_coef=0.5,
        max_grad_norm=0.5,
        policy_kwargs=policy_kwargs,
        verbose=1,
        tensorboard_log=args.tb_log_dir,
        device='cuda' if torch.cuda.is_available() else 'cpu'
    )

    # Callbacks count calls to the vectorised env.step(), each of which
    # advances n_envs timesteps. Dividing by n_envs keeps the command-line
    # frequencies expressed in timesteps, like total_timesteps.
    steps_per_call = max(args.n_envs, 1)
    eval_every = max(args.eval_freq // steps_per_call, 1)
    checkpoint_every = max(args.checkpoint_freq // steps_per_call, 1)

    # Setup callbacks
    eval_callback = EvalCallback(
        eval_env,
        best_model_save_path=os.path.join(args.save_dir, 'best_model'),
        log_path=os.path.join(args.save_dir, 'eval_logs'),
        eval_freq=eval_every,
        n_eval_episodes=args.n_eval_episodes,
        deterministic=True,
        render=False
    )

    checkpoint_callback = CheckpointCallback(
        save_freq=checkpoint_every,
        save_path=os.path.join(args.save_dir, 'checkpoints'),
        name_prefix='rl_model'
    )

    # Train
    print(f"\nStarting RL training for {args.total_timesteps} timesteps...")
    # Rough guide only: roughly 3-6 hours per million timesteps on a T4.
    # The previous total_timesteps / 3600 assumed one timestep per second and
    # printed ~278 hours for 1M steps.
    millions = args.total_timesteps / 1_000_000
    print(f"This will take approximately {3 * millions:.1f}-{6 * millions:.1f} hours on T4")

    # learn() takes the keyword `callback` (a single callback or a list);
    # `callbacks` is not accepted and raises TypeError.
    model.learn(
        total_timesteps=args.total_timesteps,
        callback=[eval_callback, checkpoint_callback],
        progress_bar=True
    )

    # Save final model plus the normalisation statistics needed to run it
    model.save(os.path.join(args.save_dir, 'final_model'))
    env.save(os.path.join(args.save_dir, 'vec_normalize.pkl'))

    print("\n✅ Training complete!")
    print(f"Models saved to: {args.save_dir}")


def main():
    """Parse the command line, create the output directory and start training."""
    # (flag, type, help text, extra add_argument keywords), in --help order
    argument_table = [
        # Paths
        ('--bc_checkpoint', str, 'Path to BC checkpoint', {'required': True}),
        ('--save_dir', str, 'Directory to save results', {'default': './rl_results'}),
        ('--tb_log_dir', str, 'Tensorboard log directory', {'default': './tensorboard_logs'}),
        # Training hyperparameters
        ('--total_timesteps', int, 'Total training timesteps', {'default': 1_000_000}),
        ('--n_envs', int, 'Number of parallel environments', {'default': 8}),
        ('--n_steps', int, 'Steps per environment per update', {'default': 256}),
        ('--batch_size', int, 'Minibatch size', {'default': 64}),
        ('--n_epochs', int, 'Number of epochs per update', {'default': 10}),
        ('--lr', float, 'Learning rate', {'default': 3e-4}),
        ('--gamma', float, 'Discount factor', {'default': 0.99}),
        ('--ent_coef', float, 'Entropy coefficient', {'default': 0.01}),
        # Evaluation
        ('--eval_freq', int, 'Evaluation frequency', {'default': 10000}),
        ('--n_eval_episodes', int, 'Number of evaluation episodes', {'default': 10}),
        ('--checkpoint_freq', int, 'Checkpoint save frequency', {'default': 50000}),
        # Other
        ('--seed', int, 'Random seed', {'default': 42}),
    ]

    parser = argparse.ArgumentParser()
    for flag, arg_type, help_text, extra in argument_table:
        parser.add_argument(flag, type=arg_type, help=help_text, **extra)
    cli_args = parser.parse_args()

    # Results are written below save_dir, so make sure it exists first
    os.makedirs(cli_args.save_dir, exist_ok=True)

    train_rl(cli_args)


if __name__ == "__main__":
    main()

## 6. Start RL Training

In [None]:
# Create run directory with timestamp
# (rl_checkpoint_dir and bc_model_path must already be set by the Drive-mount
# and BC-loading cells)
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
rl_run_dir = f"{rl_checkpoint_dir}/rl_run_{timestamp}"

# Start RL fine-tuning
# Note: This will take 3-6 hours on T4 for 1M timesteps
# The command runs from inside tactile-rl/ and writes models and TensorBoard
# logs beneath rl_run_dir.
!cd tactile-rl && python scripts/train_rl_policy.py \
    --bc_checkpoint {bc_model_path} \
    --save_dir {rl_run_dir} \
    --tb_log_dir {rl_run_dir}/tensorboard \
    --total_timesteps 1000000 \
    --n_envs 8 \
    --n_steps 256 \
    --batch_size 64 \
    --lr 3e-4 \
    --eval_freq 10000 \
    --checkpoint_freq 50000

## 7. Monitor Training Progress

In [None]:
# Launch TensorBoard inside the notebook, pointed at the directory the training
# command received as --tb_log_dir
%load_ext tensorboard
%tensorboard --logdir {rl_run_dir}/tensorboard

## 8. Evaluate Final Policy

In [None]:
# Load and evaluate the best model
import os
import sys

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# make_env lives in the training script that %%writefile put on disk; it is not
# defined in the notebook namespace, so import it from there.
_scripts_dir = os.path.abspath("tactile-rl/scripts")
if _scripts_dir not in sys.path:
    sys.path.append(_scripts_dir)
from train_rl_policy import make_env

# Load model
best_model_path = f"{rl_run_dir}/best_model/best_model.zip"
vec_norm_path = f"{rl_run_dir}/vec_normalize.pkl"

# vec_normalize.pkl is only written once training has finished, so both files
# have to be present before evaluation can start.
if os.path.exists(best_model_path) and os.path.exists(vec_norm_path):
    model = PPO.load(best_model_path)

    # Create test environment. Training ran with tactile-rl/ as working
    # directory and make_env passes a relative XML path, so build the
    # environment from that directory and then switch back.
    _previous_cwd = os.getcwd()
    os.chdir("tactile-rl")
    try:
        test_env = DummyVecEnv([make_env(0, seed=9999)])
    finally:
        os.chdir(_previous_cwd)

    # Reuse the training normalisation statistics, frozen, with raw rewards
    test_env = VecNormalize.load(vec_norm_path, test_env)
    test_env.training = False
    test_env.norm_reward = False

    # Run evaluation
    n_eval = 100
    successes = []
    rewards = []

    for i in range(n_eval):
        obs = test_env.reset()
        done = False
        episode_reward = 0.0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, dones, info = test_env.step(action)
            episode_reward += reward[0]
            # The vectorised env returns arrays; there is exactly one env here
            done = bool(dones[0])

        successes.append(info[0].get('success', False))
        rewards.append(episode_reward)

        if (i + 1) % 20 == 0:
            print(f"Evaluated {i + 1}/{n_eval} episodes...")

    # Results
    success_rate = np.mean(successes) * 100
    avg_reward = np.mean(rewards)

    print(f"\n🎯 Final Results:")
    print(f"Success Rate: {success_rate:.1f}%")
    print(f"Average Reward: {avg_reward:.2f}")

    # Compare with BC baseline
    print(f"\nImprovement over BC:")
    print(f"BC expected: ~70-80% success")
    print(f"RL achieved: {success_rate:.1f}% success")
else:
    print("Model not found. Training may still be in progress.")

## 9. Save Final Model

In [None]:
# Download the final RL policy
# (best_model_path and vec_norm_path are the variables set by the evaluation cell)
from google.colab import files

if os.path.exists(best_model_path):
    print("Downloading RL policy...")
    # The policy archive and its normalisation statistics are downloaded as a pair
    files.download(best_model_path)
    files.download(vec_norm_path)
    
    # Also save to Drive
    !cp {best_model_path} /content/drive/MyDrive/rl_policy_best.zip
    !cp {vec_norm_path} /content/drive/MyDrive/rl_vec_normalize.pkl
    print("\nRL policy saved to Google Drive!")

## Training Tips

1. **Expected Timeline**:
   - 1M timesteps: 3-6 hours on T4
   - 5M timesteps: 15-30 hours (use Colab Pro)

2. **Monitor Progress**:
   - Check TensorBoard for learning curves
   - Success rate should improve from ~70-80% (BC) to 85-95% (RL)

3. **Hyperparameter Tuning**:
   - Start with default values
   - Increase `n_envs` for faster training (if memory allows)
   - Adjust `ent_coef` if exploration is too high/low

4. **Curriculum Learning** (Optional):
   - Start with easier initial positions
   - Gradually increase difficulty
   - Modify environment reset distribution