In [1]:
import gymnasium as gym
import numpy as np
import torch
from tqdm.notebook import tqdm
import os
import math

# pip install stable-baselines3==2.0.0
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
from stable_baselines3.common.vec_env import DummyVecEnv

In [4]:
# Not used by the training below (the agent trains on `vec_env`); kept so the
# name `env` exists for any cell that expects it.
env = gym.make("CartPole-v1", render_mode="rgb_array")

# Fixed seed so that training, and therefore the saved policy and everything
# generated from it, can be reproduced after Restart & Run All.
TRAIN_SEED = 0


def make_env():
    """Build a fresh CartPole-v1 environment (no rendering) for the vectorised wrapper."""
    return gym.make("CartPole-v1")


# DummyVecEnv takes a list of environment factories; a single one is enough here.
vec_env = DummyVecEnv([make_env])

model = PPO("MlpPolicy", vec_env, verbose=1, seed=TRAIN_SEED)
model.learn(total_timesteps=50_000)

# Persist the trained policy so later cells can reload it with PPO.load.
model.save("ppo_cartpole")

Using cpu device


In [5]:
# Reload the policy from disk and score it over 100 episodes on the
# vectorised environment.
loaded_model = PPO.load("ppo_cartpole")

mean_reward, std_reward = evaluate_policy(
    model=loaded_model,
    env=vec_env,
    n_eval_episodes=100,
)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

  th_object = th.load(file_content, map_location=device)


Mean reward: 500.00 +/- 0.00


In [25]:
# Roll out one episode of the loaded policy in a visible window.
# (Switch render_mode to "rgb_array" for a headless run.)
env = gym.make("CartPole-v1", render_mode="human")

try:
    obs, _ = env.reset(seed=42)

    # 500 is the step cap for this rollout; the loop stops earlier if the
    # environment reports termination or truncation.
    for _ in range(500):
        action, _ = loaded_model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        if terminated or truncated:
            break
finally:
    # Always release the render window, even if the rollout is interrupted
    # or raises; otherwise it stays open and leaks across re-runs.
    env.close()

In [20]:
# Inspect the values left over from the last environment step.
# Array-like values are shown together with their shape and dtype.
for _label, _array in (("obs:", obs), ("action:", action)):
    print(_label, _array, _array.shape, _array.dtype)

# Scalars / plain objects are printed as-is.
for _label, _value in (
    ("reward:", reward),
    ("terminated:", terminated),
    ("truncated:", truncated),
    ("info:", info),
):
    print(_label, _value)

obs: [-0.01718314  0.02803363 -0.00328756 -0.04503307] (4,) float32
action: 0 () int64
reward: 1.0
terminated: False
truncated: True
info: {}


## Сбор датасета

In [45]:
def compute_returns_to_go(rewards):
    """Return the undiscounted return-to-go for every step of one episode.

    Element ``t`` equals ``sum(rewards[t:])``, i.e. the reward still to be
    collected from step ``t`` (inclusive) until the episode ends. It is
    derived from the rewards actually observed, so it stays correct for
    episodes that end before the step limit.

    Args:
        rewards: Sequence of per-step rewards of a single episode.

    Returns:
        1-D float64 array with the same length as ``rewards``.
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    # Cumulative sum over the reversed sequence, flipped back; copy() gives a
    # contiguous array instead of a negative-stride view.
    return np.cumsum(rewards[::-1])[::-1].copy()


env = gym.make("CartPole-v1", render_mode="rgb_array")

NUMBER_OF_TRAIN_DATA = 1000
MAX_EPISODE_LENGTH = 500
BASE_SEED = 31415926535897  # episode i is reset with seed BASE_SEED + i
SAVE_DIR = "dataset"
os.makedirs(SAVE_DIR, exist_ok=True)

try:
    for i in tqdm(range(NUMBER_OF_TRAIN_DATA)):
        obs_list, act_list, rew_list, done_list = [], [], [], []
        obs, _ = env.reset(seed=BASE_SEED + i)

        for t in range(MAX_EPISODE_LENGTH):
            # Store the observation the action was chosen from, not the next one.
            obs_list.append(obs)
            action, _ = loaded_model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            act_list.append(action)
            rew_list.append(reward)
            done_list.append(done)
            if done:
                break

        data = {
            'obs': np.array(obs_list),
            'action': np.array(act_list),
            'reward': np.array(rew_list),
            'done': np.array(done_list),
            # Computed after the episode so it reflects the return that was
            # really obtained rather than an assumed total of MAX_EPISODE_LENGTH.
            'rtg': compute_returns_to_go(rew_list),
            'timesteps': np.arange(len(rew_list)),
        }

        # One .npz file per episode.
        file_path = f'{SAVE_DIR}/train_data_{i}.npz'
        np.savez(file_path, **data)
finally:
    # Release the environment even if collection is interrupted.
    env.close()

  0%|          | 0/1000 [00:00<?, ?it/s]

In [46]:
# Show the array shape stored under each key of the most recent episode dict.
for key, value in data.items():
    print(f"{key}: {value.shape}")

obs: (500, 4)
action: (500,)
reward: (500,)
done: (500,)
rtg: (500,)
timesteps: (500,)


In [47]:
# Summarise the dataset directory: number of files, total size, mean file size.
# Only regular files are considered, so the count and the size total describe
# the same set of entries (a subdirectory would otherwise inflate the count
# and drag the average down).
files = [
    name
    for name in os.listdir(SAVE_DIR)
    if os.path.isfile(os.path.join(SAVE_DIR, name))
]
file_count = len(files)

total_size_bytes = sum(
    os.path.getsize(os.path.join(SAVE_DIR, name)) for name in files
)

total_size_mb = total_size_bytes / (1024 * 1024)

# Guard against an empty directory to avoid dividing by zero.
avg_file_size_kb = (total_size_bytes / file_count) / 1024 if file_count > 0 else 0

print(f"Количество файлов в директории dataset/: {file_count}")
print(f"Общий размер директории dataset/: {total_size_mb:.2f} МБ")
print(f"Средний размер одного файла: {avg_file_size_kb:.2f} КБ")

Количество файлов в директории dataset/: 1000
Общий размер директории dataset/: 24.77 МБ
Средний размер одного файла: 25.37 КБ
