## DRIVE and INSTALL

In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/RL/Project"
!pip install -r requirements.txt
!pip install --upgrade pillow==6.2.2
!pip install -q dm_control>=1.0.9

## IMPORT

In [1]:
#@title Run to install MuJoCo and `dm_control`
import distutils.util
import subprocess
if subprocess.run('nvidia-smi').returncode:
  raise RuntimeError(
      'Cannot communicate with GPU. '
      'Make sure you are using a GPU Colab runtime. '
      'Go to the Runtime menu and select Choose runtime type.')

print('Installing dm_control...')


# Configure dm_control to use the EGL rendering backend (requires GPU)
%env MUJOCO_GL=egl

!echo Installed dm_control $(pip show dm_control | grep -Po "(?<=Version: ).+")
#@title All `dm_control` imports required for this tutorial

# The basic mujoco wrapper.
from dm_control import mujoco

# Access to enums and MuJoCo library functions.
from dm_control.mujoco.wrapper.mjbindings import enums
from dm_control.mujoco.wrapper.mjbindings import mjlib

# PyMJCF
from dm_control import mjcf

# Composer high level imports
from dm_control import composer
from dm_control.composer.observation import observable
from dm_control.composer import variation

# Imports for Composer tutorial example
from dm_control.composer.variation import distributions
from dm_control.composer.variation import noises
from dm_control.locomotion.arenas import floors

# Control Suite
from dm_control import suite

# Run through corridor example
from dm_control.locomotion.walkers import cmu_humanoid
from dm_control.locomotion.arenas import corridors as corridor_arenas
from dm_control.locomotion.tasks import corridors as corridor_tasks

# Soccer
from dm_control.locomotion import soccer

# Manipulation
from dm_control import manipulation

Sun Dec 18 17:49:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0    35W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## TRAIN

In [2]:
%cd "/content/drive/MyDrive/RL/Project"
import numpy as np
import torch
import argparse
import os
import math
import gym
import sys
import random
import time
import json

import copy
from dm_control import suite
from dm_control.suite.wrappers import pixels
import utils

from logger import Logger
from video import VideoRecorder

from agent import Agent


seed = 1
domain_name = "ball_in_cup"
task_name = "catch"
image_size = 84
action_repeat = 1
frame_stack = 3
work_dir = '/content/drive/MyDrive/RL/Project'
save_video = False

replay_buffer_capacity = 100000
batch_size = 32

s_dim = 128
a_dim = 50

num_train_steps = 1000000

init_steps = 1000

save_model = True
save_buffer = True

num_eval_episodes = 10
eval_frequency = 1000


def evaluate(env, agent, video, num_episodes, L, step):
    for i in range(num_episodes):
        obs = env.reset()
        
        video.init(enabled=(i == 0))
        done = False
        episode_reward = 0
        while not done: 
            with utils.eval_mode(agent):
              
              action = agent.select_action(obs.observation['pixels'])
                
            time_step = env.step(action)
            reward = time_step.reward
            done = time_step.last()
            video.record(env)
            episode_reward += reward

        video.save('%d.mp4' % step)
        L.log('eval/episode_reward', episode_reward, step)
    L.dump(step)

def main():
    utils.set_seed_everywhere(seed)

    env_ = pixels.Wrapper(suite.load(
        domain_name=domain_name,
        task_name=task_name
    ))

    # stack several consecutive frames together

    env = utils.FrameStack(env_, k=frame_stack)
    s = env.reset()
    s = np.delete(s, np.s_[:480], axis=0)

    
    utils.make_dir(work_dir)
    video_dir = utils.make_dir(os.path.join(work_dir, 'video'))
    model_dir = utils.make_dir(os.path.join(work_dir, 'model'))
    buffer_dir = utils.make_dir(os.path.join(work_dir, 'buffer'))

    video = VideoRecorder(video_dir if save_video else None)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  
    replay_buffer = utils.ReplayBuffer(
        obs_shape=s.shape,
        action_shape=env.action_spec().shape,
        capacity=replay_buffer_capacity,
        batch_size=batch_size,
        device=device
    )
    
    agent = Agent(
        obs_shape=np.transpose(s).shape,
        a_shape=env.action_spec().shape,
        s_dim = s_dim,
        a_dim = a_dim,
        device=device
    )

    L = Logger(work_dir, use_tb=False)

    episode, episode_reward, done = 0, 0, True
    start_time = time.time()
    for step in range(num_train_steps):
        
        if done:
            if step > 0:
                L.log('train/duration', time.time() - start_time, step)
                start_time = time.time()
                L.dump(step)

            # evaluate agent periodically
            if step % eval_frequency == 0:
                L.log('eval/episode', episode, step)
                evaluate(env_, agent, video, num_eval_episodes, L, step)
                if save_model:
                    agent.save(model_dir, step)
                if save_buffer:
                    replay_buffer.save(buffer_dir)

            L.log('train/episode_reward', episode_reward, step)
            
            
            obs = env_.reset()
            obs = obs.observation['pixels']
            done = False
            episode_reward = 0
            episode_step = 0
            episode += 1

            L.log('train/episode', episode, step)

        # sample action for data collection
        if step < init_steps:
            spec = env.action_spec()
            action = np.random.uniform(spec.minimum,spec.maximum,spec.shape)
        else:
            with utils.eval_mode(agent): 
                action = agent.sample_action(obs)

        # run training update
        if step >= init_steps:
            num_updates = init_steps if step == init_steps else 1
            for _ in range(num_updates):
                agent.update(replay_buffer, step)

        time_step = env_.step(action)
        reward = time_step.reward
        next_obs = time_step.observation['pixels']
        done = time_step.last()
        # allow infinit bootstrap
        done_bool = 0 if episode_step + 1 == env._max_episode_steps else float(
            done
        )
        episode_reward += reward

        replay_buffer.add(obs, action, reward, next_obs, done_bool)
       
        obs = next_obs
        episode_step += 1

      
if __name__ == '__main__':
    
    main()



/content/drive/.shortcut-targets-by-id/1g-McIVAA4yo7dyxagjsG5olLRqS0vw4T/Project


tcmalloc: large alloc 23040000000 bytes == 0x7efc62b58000 @  0x7f02904541e7 0x7f0282968994 0x7f028296912f 0x7f02829c78f5 0x7f02829c92cb 0x7f0282a673ab 0x5aae14 0x49abe4 0x7f02879e4807 0x55cd91 0x5d8941 0x5da107 0x586de6 0x5d8cdf 0x55ea20 0x7f02879e4807 0x5d8868 0x4990ca 0x7f02879e4807 0x55cd91 0x55d743 0x627376 0x5aaeb9 0x4990ca 0x7f02879e4807 0x4f6097 0x562426 0x7f02879e4807 0x4f6097 0x562426 0x7f02879e4807
tcmalloc: large alloc 23040000000 bytes == 0x7ef7056b0000 @  0x7f02904541e7 0x7f0282968994 0x7f028296912f 0x7f02829c78f5 0x7f02829c92cb 0x7f0282a673ab 0x5aae14 0x49abe4 0x7f02879e4807 0x55cd91 0x5d8941 0x5da107 0x586de6 0x5d8cdf 0x55ea20 0x7f02879e4807 0x5d8868 0x4990ca 0x7f02879e4807 0x55cd91 0x55d743 0x627376 0x5aaeb9 0x4990ca 0x7f02879e4807 0x4f6097 0x562426 0x7f02879e4807 0x4f6097 0x562426 0x7f02879e4807


| [32meval[0m | S: 0 | ER: 0.0000
| [33mtrain[0m | E: 1 | S: 1000 | D: 243.7 s | R: 0.0000 | BR: 0.0000 | ALOSS: 0.0000 | CLOSS: 0.0000 | RLOSS: 0.0000
| [32meval[0m | S: 1000 | ER: 293.0000


RuntimeError: ignored

## IMPORT

In [14]:
from dm_control import suite
from dm_control.suite.wrappers import pixels

env = pixels.Wrapper(suite . load ( "ball_in_cup" , "catch" ))

s = env.reset()
d = s.observation['pixels']
def reshape_(x):
  return x.reshape((x[0] * 3,) + x[1:])

In [None]:
print(d.shape, type(d))

In [10]:
a = torch.Tensor(1,40,40,3)
print(a.transpose(3,1).shape)

torch.Size([1, 3, 40, 40])
