<a href="https://colab.research.google.com/github/felipe-parodi/imitation-learning-cis522/blob/main/Behavioral_Cloning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Behavioral Cloning for PyBullet Humanoid-v0**

### Link your Google Drive if using Colab. Run locally to visualize the agent.

In [None]:
# SESSION PARAMETERS
# These three flags steer the rest of the notebook; set them before running any other cell.
use_colab    = False    # True: running on Google Colab (mounts Drive below, runs the Colab-only installs, disables on-screen drawing)
train_model  = 1    # 0: evaluate only; non-zero: load the expert data and train
new_training = 0    # 0: load the latest saved model and continue training; 1: start from scratch (only honored when train_model is set)

if use_colab:
  # Imported only on Colab; the google.colab module is not expected to be available elsewhere.
  from google.colab import drive
  # Makes Drive visible under /content/gdrive, which is where project_path points by default.
  drive.mount('/content/gdrive')

In [None]:
# @markdown ### Install pybullet (Colab only)
if use_colab:
  !pip install pybullet
  !apt-get update
  !apt -q install imagemagick
  !apt install -q xvfb
  !pip install -q pyvirtualdisplay

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/4a/25/91b51ceffabb44859b081aa2417c9b535742e094681e34bf5285b6b8703b/pybullet-3.1.6-cp37-cp37m-manylinux1_x86_64.whl (89.3MB)
[K     |████████████████████████████████| 89.3MB 51kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.1.6
Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [53.9 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Relea

In [None]:
# @markdown ### Import dependencies
import time
import os
import glob
os.environ['MESA_GL_VERSION_OVERRIDE'] = '3.3'
os.environ['MESA_GLSL_VERSION_OVERRIDE'] = '330'
import json
import numpy as np
import pandas as pd
import random
import sys
import time
from builtins import super
from collections import namedtuple
from tqdm.auto import tqdm
from copy import deepcopy
from typing import NamedTuple
import pdb
import scipy.signal

# Pybullet
import pybullet as p
if use_colab:
  p.connect(p.DIRECT)
import pybullet_data
from   pybullet_utils.arg_parser import ArgParser
from   pybullet_envs.deep_mimic.env.pybullet_deep_mimic_env import PyBulletDeepMimicEnv
from pybullet_envs.deep_mimic.learning.path import *
  
# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from   torch.autograd import Variable as V
from   torch.distributions import Normal
from   torch.optim import Adam

# Plotting 
import matplotlib.pyplot as plt
from matplotlib import pylab
from matplotlib import animation

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp37-none-any.whl size=7411 sha256=aaac71e8847380aadbda208fd29cf128f303f24a53d82b04a5c5b564f7052456
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
numGPUs= 1
Using GPU hardware (eglRenderer)
Collecting numpngw
  Downloading https://files.pythonhosted.org/packages/48/99/a2482bbf4d3a663042f496e9a23fb68b068e8768baf0183293f3e5f9aaad/numpngw-0.0.8-py3-none-any.whl
Installing collected packages: numpngw
Successfully installed numpngw-0.0.8


### Build environment

In [None]:
# Define some parameters
timestep     = 1. / 240.   # simulation step handed to env.update() for every action
animating    = True
step         = False
total_reward = 0
steps        = 0

# Build arg parser
arg_parser   = ArgParser()
arg_file     = "run_humanoid3d_walk_args.txt"
argpath      = pybullet_data.getDataPath() + "/args/" + arg_file
succ         = arg_parser.load_file(argpath)
if not succ:
  # Stop here instead of building an environment from arguments that failed to load.
  raise RuntimeError("Could not load DeepMimic argument file: " + argpath)

# Folder holding the expert data and the model checkpoints.
# Defaults to the Colab/Drive location; set the BC_PROJECT_PATH environment
# variable to point somewhere else (e.g. when running locally).
project_path = os.environ.get(
    "BC_PROJECT_PATH",
    "/content/gdrive/MyDrive/Colab Notebooks/Penn Deep Learning Course/RL Project")
expert_data  = os.path.join(project_path,"state_action_deepmimic_850reward.npz")

# Build env (drawing is enabled only when not running on Colab)
env = PyBulletDeepMimicEnv(arg_parser, enable_draw = not use_colab)

Initialization strategy: InitializationStrategy.RANDOM
motion_file= data/motions/humanoid3d_walk.txt
LOADING humanoid!


### Replay buffer for storing (state, action) pairs, the behavioral cloning agent, and the environment step helper

In [None]:
# NOTE: these classes are defined regardless of train_model, because the agent is
# also constructed when the notebook only evaluates a saved model (train_model = 0).

class ReplayBuffer:
  """Fixed-capacity ring buffer holding expert (state, action) pairs.

  Rows are stored as float32 tensors on the notebook-level `device`. Once
  `buffer_size` pairs have been added, new pairs overwrite the oldest ones.
  """

  def __init__(self, state_dim, act_dim, buffer_size):
    """Allocate zero-filled storage.

    Args:
      state_dim: number of features in one state vector.
      act_dim: number of features in one action vector.
      buffer_size: maximum number of pairs kept at once.
    """
    self.buffer_size = buffer_size
    self.ptr = 0          # slot that the next add() writes to
    self.n_samples = 0    # number of valid slots, capped at buffer_size
    self.state  = torch.zeros(buffer_size, state_dim, dtype=torch.float32, device=device)
    self.action = torch.zeros(buffer_size, act_dim, dtype=torch.float32, device=device)

  def add(self, state, action):
    """Store one (state, action) pair, overwriting the oldest pair when full."""
    # Slice assignment casts to the buffer's dtype and copies to its device.
    self.state[self.ptr]  = torch.as_tensor(state)
    self.action[self.ptr] = torch.as_tensor(action)
    if self.n_samples < self.buffer_size:
      self.n_samples += 1
    self.ptr = (self.ptr + 1) % self.buffer_size

  def sample(self, batch_size):
    """Return `batch_size` stored pairs drawn uniformly at random, with replacement.

    Returns:
      (state, action) tensors of shape (batch_size, state_dim) and (batch_size, act_dim).

    Raises:
      ValueError: if nothing has been added to the buffer yet.
    """
    if self.n_samples == 0:
      raise ValueError("Cannot sample from an empty ReplayBuffer")
    # Only the filled part of the buffer is eligible.
    idx = np.random.choice(self.n_samples, batch_size)
    return self.state[idx], self.action[idx]


class BCAgent(nn.Module):
  """MLP policy (two hidden ReLU layers) trained by regressing expert actions with an MSE loss."""

  def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
    """Build the network together with its Adam optimizer.

    Args:
      num_inputs: size of a state vector.
      num_actions: size of an action vector.
      hidden_size: width of both hidden layers.
      learning_rate: Adam step size.
    """
    super(BCAgent, self).__init__()
    self.num_actions = num_actions
    self.linear1     = nn.Linear(num_inputs, hidden_size)
    self.linear2     = nn.Linear(hidden_size, hidden_size)
    self.linear3     = nn.Linear(hidden_size, num_actions)
    self.optimizer   = optim.Adam(self.parameters(), lr=learning_rate)
    self.criterion   = nn.MSELoss()

  def forward(self, state):
    """Map a batch of states (..., num_inputs) to predicted actions (..., num_actions)."""
    hidden = F.relu(self.linear1(state))
    hidden = F.relu(self.linear2(hidden))
    return self.linear3(hidden)

  def get_action(self, state):
    """Predict the action for a single state.

    Args:
      state: array-like state vector of length num_inputs.

    Returns:
      numpy array of shape (1, num_actions).
    """
    # Use the device the weights live on instead of relying on a global.
    model_device = self.linear1.weight.device
    batch = torch.as_tensor(state, dtype=torch.float32, device=model_device).unsqueeze(0)
    # Inference only: no graph is needed, and .numpy() requires a grad-free tensor.
    with torch.no_grad():
      action = self.forward(batch)
    return action.cpu().numpy()

  def update(self, state, action):
    """Take one gradient step on the MSE between predicted and expert actions.

    Args:
      state: tensor of shape (batch, num_inputs).
      action: tensor holding batch * num_actions expert values.

    Returns:
      The loss value (float) computed before the step.
    """
    output = self.forward(state)
    # Align the target with the prediction explicitly. A bare squeeze() would also
    # drop a size-1 batch or action axis and let MSELoss broadcast to a wrong shape.
    loss = self.criterion(output, action.reshape(output.shape))
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    return loss.item()

def get_state_from_action(env,action,timestep):
  """Apply one action to agent 0, advance the simulation, and report the outcome.

  Per the original notes, env.set_action turns the action into a desired pose
  and env.update drives the humanoid towards it with a PD controller -- that
  happens inside the environment and is not visible here.

  Args:
    env: environment exposing set_action, update, record_state,
      is_episode_end and calc_reward.
    action: action for agent 0.
    timestep: duration passed to env.update.

  Returns:
    (state, reward, done): the state recorded after the update, the value of
    env.calc_reward(0), and True/False for whether the episode has ended.
    The reward is computed the same way whether or not the episode ended.
  """
  agent_id = 0
  env.set_action(agent_id, action)
  env.update(timestep)
  observation = env.record_state(agent_id)
  # The end-of-episode check is queried before the reward.
  episode_over = True if env.is_episode_end() else False
  step_reward = env.calc_reward(agent_id)
  return observation, step_reward, episode_over

### Training function for behavioral cloning

In [None]:
# Defined regardless of train_model; it is only *called* when training is requested.

def _run_evaluation_episode(env, agent, timestep):
  """Roll the agent out for one episode from a fresh reset and return the summed reward."""
  total_reward = 0
  done = False
  env.reset()
  state = env.record_state(0)
  while not done:
    with torch.no_grad():
      action = agent.get_action(state)
    # get_action returns a batch of one; the environment wants the bare action vector.
    state, reward, done = get_state_from_action(env,action[0],timestep)
    total_reward += reward
  return total_reward


def behavioral_cloning(env, agent, buffer, timestep, num_epochs=1, iters_per_epoch=200, batch_size=10,print_epoch=100,model_checkpoints=None,start_epoch=0):
  """Train `agent` on expert pairs from `buffer`, evaluating it in `env` after every epoch.

  Every `print_epoch` epochs (counted from `start_epoch`) the progress is printed
  and a checkpoint is written into the notebook-level `project_path`. As soon as
  the checkpoint list reaches five entries, the oldest file is deleted.

  Args:
    env: simulation environment used for the evaluation rollouts.
    agent: model exposing update(state, action), get_action(state), state_dict()
      and an `optimizer` attribute.
    buffer: object exposing sample(batch_size) -> (states, actions).
    timestep: simulation step passed on to the environment.
    num_epochs: epoch index at which training stops (exclusive).
    iters_per_epoch: gradient steps per epoch.
    batch_size: number of pairs sampled per gradient step.
    print_epoch: interval, in epochs, between log lines/checkpoints.
    model_checkpoints: list of existing checkpoint paths, oldest first. It is
      extended and pruned in place; a new list is used when omitted.
    start_epoch: first epoch index (used when resuming).

  Returns:
    (epoch_losses, epoch_rewards, env): per-epoch mean training loss, per-epoch
    evaluation reward, and the environment (left freshly reset).
  """
  # A None default avoids sharing one list between calls.
  if model_checkpoints is None:
    model_checkpoints = []
  epoch_losses = []
  epoch_rewards = []
  for epochs_done, epoch in enumerate(tqdm(range(start_epoch,num_epochs)), start=1):
    total_loss = 0
    for _ in range(iters_per_epoch):
      # Sample a batch of states and actions from the buffer and update the agent
      sample_state, sample_action = buffer.sample(batch_size)
      total_loss += agent.update(sample_state,sample_action)

    # Log average loss
    epoch_losses.append(total_loss / iters_per_epoch)
    # Evaluate in environment
    total_reward = _run_evaluation_episode(env, agent, timestep)
    epoch_rewards.append(total_reward)

    if print_epoch > 0 and epochs_done % print_epoch == 0:
      print(f'Epoch [{epoch+1}/{num_epochs}], loss: {epoch_losses[-1]}, reward: {epoch_rewards[-1]}')
      filename = os.path.join(project_path,"behavioralCloningSimple_modelparams_epoch{}_reward{}.pt".format(epoch+1,int(total_reward)))
      # 'loss' stores the loss summed over the epoch; the mean is in epoch_losses.
      torch.save({
        'epoch': epoch,
        'model_state_dict': agent.state_dict(),
        'optimizer_state_dict': agent.optimizer.state_dict(),
        'reward': total_reward,
        'loss': total_loss}, filename)

      # Add filename to list and drop the oldest checkpoint once five are tracked
      model_checkpoints.append(filename)
      if len(model_checkpoints)>=5:
        stale_checkpoint = model_checkpoints.pop(0)
        # Pure-Python delete: no shell quoting issues with spaces in the path.
        try:
          os.remove(stale_checkpoint)
        except OSError as err:
          print(f'Could not delete old checkpoint {stale_checkpoint}: {err}')

    # Leave the environment reset after each epoch.
    env.reset()
  return epoch_losses, epoch_rewards, env

### Load expert (state, action) data and add to replay buffer

In [None]:
if train_model:
  # Read the expert demonstrations from the .npz archive.
  # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
  # expert_data must only ever point at a trusted file.
  with np.load(expert_data,allow_pickle=True) as data:
    action, state = data['actions'], data['states']

  # Report the array shapes; index 0 is used as the sample count and index 1 as
  # the per-sample feature count below.
  act_dim, state_dim = action.shape, state.shape
  print("Action dimensions: ", act_dim)
  print("State dimensions: ", state_dim)

  # One buffer slot per expert sample, so filling it never overwrites a pair.
  n_expert_samples = state_dim[0]
  replay_buffer = ReplayBuffer(state_dim[1], act_dim[1], n_expert_samples)
  for row in range(n_expert_samples):
    replay_buffer.add(state[row], action[row])
    

(118, 36)
(118, 197)


## Train humanoid using behavioral cloning

In [None]:
def _checkpoint_epoch(path):
  """Return the epoch number embedded in a checkpoint filename ('...epoch<N>_...'), or None."""
  try:
    return int(os.path.basename(path).split("epoch")[1].split("_")[0])
  except (IndexError, ValueError):
    return None

# If not new training (or when only evaluating), load previous model parameters
previous_checkpoints = []
checkpoint  = None   # stays None when training starts from scratch
model_state = None
start_epoch = 0
if not new_training or not train_model:
  # glob returns files in arbitrary order, so order them by the epoch in their name;
  # files that do not follow the checkpoint naming scheme are ignored.
  candidates = [f for f in glob.glob(os.path.join(project_path,"*.pt")) if _checkpoint_epoch(f) is not None]
  previous_checkpoints = sorted(candidates, key=_checkpoint_epoch)
  if not previous_checkpoints:
    raise FileNotFoundError(
        f"No checkpoint named '*epoch<N>_*.pt' found in {project_path}; "
        "train a model first (train_model = 1, new_training = 1).")
  latest_checkpoint = previous_checkpoints[-1]
  checkpoint = torch.load(latest_checkpoint,map_location=device)
  # Checkpoints written by behavioral_cloning wrap the weights in a dict;
  # a bare state_dict is accepted as well.
  model_state = checkpoint.get('model_state_dict', checkpoint)
  # The filename carries epoch+1, i.e. the index of the next epoch to run.
  start_epoch = _checkpoint_epoch(latest_checkpoint)

# Make learning agent
if train_model:
  num_inputs, num_actions = state_dim[1], act_dim[1]
else:
  # Evaluate-only runs never load the expert data, so read the layer sizes
  # from the saved weights instead.
  num_inputs  = model_state['linear1.weight'].shape[1]
  num_actions = model_state['linear3.weight'].shape[0]
agent = BCAgent(num_inputs, num_actions, 128).to(device)

if model_state is not None:
  agent.load_state_dict(model_state)
  # Resume the optimizer as well so continued training picks up where it stopped.
  if train_model and 'optimizer_state_dict' in checkpoint:
    agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

if train_model:
  # Train behavioral cloning
  losses, rewards, env = behavioral_cloning(env, agent, replay_buffer, timestep, num_epochs=100000, batch_size=10, model_checkpoints=previous_checkpoints,start_epoch=start_epoch)


### Evaluate model using N random starts

In [None]:
# Use already loaded agent
num_iter    = 1000
# How often to print
when_to_print = 10
# Cap on the number of steps per episode (the original note says it is never reached)
ep_steps    = 1000
# Initialize reward and steps arrays
all_reward = []
num_steps  = []

# Resolve the checkpoint to store alongside the results up front, so that a missing
# `checkpoint` variable (e.g. after training from scratch) cannot fail the cell
# after all rollouts have already been run.
try:
    evaluated_checkpoint = checkpoint
except NameError:
    evaluated_checkpoint = None

for ii in range(num_iter):
    if ii and ii % when_to_print == 0:
      print(ii)
    total_reward = 0
    env.reset()
    state = env.record_state(0)
    for s in range(ep_steps):
        with torch.no_grad():
            action = agent.get_action(state)
        next_state, reward, done = get_state_from_action(env,action[0],timestep)
        total_reward += reward
        state = next_state
        if done:
            break
    # Record every episode, including one cut off by the step cap, so both lists
    # always hold num_iter entries. `s` is the index of the last step taken.
    all_reward.append(total_reward)
    num_steps.append(s)

np.savez("behavioralcloning_1000iterations_DeepMimicHumanoid",all_reward=all_reward,checkpoint=evaluated_checkpoint,num_steps=num_steps)

success!
