In [3]:
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_VISIBLE_DEVICES=3


In [4]:
import argparse
import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import os
import json
from functools import partial
import sys
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import time
import seaborn as sn
 
from tianshou.utils import WandbLogger
from tianshou.data import Batch, Collector, ReplayBuffer, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.policy import BasePolicy, PPOPolicy, PGPolicy, A2CPolicy
from tianshou.trainer import OnpolicyTrainer
from tianshou.utils.net.common import ActorCritic, Net
#from tianshou.utils.net.discrete import Actor, Critic
from tianshou.utils.net.continuous import Actor, Critic, ActorProb
from tianshou.trainer.utils import gather_info, test_episode

import gymnasium as gym
from gymnasium import spaces
from gymnasium.wrappers import TimeLimit, RescaleAction, TransformObservation
from stable_baselines3.common.env_checker import check_env

from lib.environments import *
from lib.policy import MarlPPOPolicy, IndpPGPolicy
from lib.distributions import ElementwiseNormal
from lib.models import get_actor_critic
from lib.utils import str2bool, Config, dict_to_wandb_table, restrict_to_num_threads
from lib.trainer import MyOnpolicyTrainer
from lib.models import FcNN, MyFCNNActorProb, MyFCNNCriticProb
from lib.models import *

#temporary solution for xlb imports
sys.path.append(os.path.abspath('/home/pfischer/XLB'))
#from my_flows.kolmogorov_2d import Kolmogorov_flow
from my_flows.helpers import get_kwargs


#from lib.custom_tianshou.my_actors import MyActorProb

import wandb
wandb.require("core")

device = "cuda" if torch.cuda.is_available() else "cpu"

np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7efd6c7cf350>

In [5]:
class test_network(nn.Module):

    def __init__(self, device="cpu", in_channels=9, feature_dim=27, out_channels=1, padding_mode="circular"):
        super().__init__()
        self.device = device
        self.model = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=feature_dim, kernel_size=1, stride=1,
                       padding=0, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=feature_dim, out_channels=feature_dim, kernel_size=1, stride=1,
                       padding=0, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=feature_dim, out_channels=out_channels, kernel_size=1, stride=1,
                       padding=0, bias=True),
            nn.ReLU(inplace=True),
        )

    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float, device=self.device)
        batch = obs.shape[0]

        values = self.model(obs.reshape(batch, -1, 128, 128))
        values = values.reshape(batch,128,128)
        return values
    

    class fcnn(nn.Module):
    def __init__(self):
        super(fcnn, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(9, 27),
            nn.ReLU(),
            nn.Linear(27, 27),
            nn.ReLU(),
            nn.Linear(27,1),
            nn.ReLU(),
        )

    def forward(self, x):
        x = self.model(x)
        return x

IndentationError: expected an indented block after class definition on line 28 (1335503177.py, line 29)

In [None]:
model = test_network()
model2 = fcnn()

test_network(
  (model): Sequential(
    (0): Conv2d(9, 27, kernel_size=(1, 1), stride=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(27, 27, kernel_size=(1, 1), stride=(1, 1))
    (3): ReLU(inplace=True)
    (4): Conv2d(27, 1, kernel_size=(1, 1), stride=(1, 1))
    (5): ReLU(inplace=True)
  )
)

In [None]:
print(model.model[0].weight.shape)
print(model.model[0].bias.shape)
print(model.model[2].weight.shape)
print(model.model[2].bias.shape)
print(model.model[4].weight.shape)
print(model.model[4].bias.shape)

print("**********************")

print(model2.model[0].weight.shape)
print(model2.model[0].bias.shape)
print(model2.model[2].weight.shape)
print(model2.model[2].bias.shape)
print(model2.model[4].weight.shape)
print(model2.model[4].bias.shape)

torch.Size([27, 9, 1, 1])
torch.Size([27])
torch.Size([27, 27, 1, 1])
torch.Size([27])
torch.Size([1, 27, 1, 1])
torch.Size([1])
**********************
torch.Size([27, 9])
torch.Size([27])
torch.Size([27, 27])
torch.Size([27])
torch.Size([1, 27])
torch.Size([1])


In [6]:
#######################################################################################################
####### environments ##################################################################################
#######################################################################################################
seeds = np.array([102, 348, 270, 106, 71, 188, 20, 121, 214, 330, 87, 372,
              99, 359, 151, 130, 149, 308, 257, 343, 413, 293, 385, 191, 276,
              160, 313, 21, 252, 235, 344, 42])

assert seeds.shape[0] == np.unique(seeds).shape[0]
train_seeds = seeds[:29]
val_seeds = seeds[29:]
#test_seeds = np.array([69, 33, 420])
    
train_env = KolmogorovEnvironment9(seeds=train_seeds, max_episode_steps=1535, step_factor=1)
test_env = KolmogorovEnvironment9(seeds=val_seeds, max_episode_steps=1535, step_factor=1)
#######################################################################################################
####### Policy ########################################################################################
#######################################################################################################
assert train_env.observation_space.shape is not None  # for mypy
assert train_env.action_space.shape is not None
#initialize PPO
actor = MyFCNNActorProb2(in_channels=2, device=device).to(device)
critic = MyFCNNCriticProb2(in_channels=2, device=device).to(device)
optim = torch.optim.Adam(actor.parameters(), lr=3e-4, eps=1e-7)
dist = torch.distributions.Normal
#dist = ElementwiseNormal

policy = MarlPPOPolicy(actor=actor,
    critic=critic, 
    optim=optim,
    dist_fn=dist, 
    action_space=train_env.action_space,
    discount_factor=0.9,
    reward_normalization=False, 
    advantage_normalization = False,
    value_clip = False,
    deterministic_eval=True,
    action_scaling= True,
    action_bound_method= "tanh",
    ent_coef = 1e-3,
    vf_coef = 5e-2,
    max_grad_norm = 1.,
    gae_lambda=0.9, 
    recompute_advantage=False,
)

policy2 = PGPolicy(model=actor, optim=optim, dist_fn=dist, action_space=train_env.action_space,
        discount_factor=0.9,reward_normalization=False, deterministic_eval=True,
        observation_space=train_env.observation_space, action_scaling=True, action_bound_method = "tanh",
    )

policy3 = IndpPGPolicy(model=actor, optim=optim, dist_fn=dist, action_space=train_env.action_space,
        discount_factor=0.9,reward_normalization=False, deterministic_eval=True,
        observation_space=train_env.observation_space, action_scaling=True, action_bound_method = "tanh",
    )


#######################################################################################################
####### Collectors ####################################################################################
#######################################################################################################
#train_collector = Collector(policy=policy, env=train_env, buffer=VectorReplayBuffer(args.buffer_size, len(train_env)))
train_collector = Collector(policy=policy3, env=train_env, buffer=VectorReplayBuffer(20000, 1))
test_collector = Collector(policy=policy3, env=test_env)
train_collector.reset()
test_collector.reset()

#######################################################################################################
####### Trainer #######################################################################################
#######################################################################################################
trainer = OnpolicyTrainer(
    policy=policy3,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=2,
    step_per_epoch=100,
    repeat_per_collect=1,
    episode_per_test=1,
    batch_size=16,
    step_per_collect=32,
    #episode_per_collect=args.episode_per_collect,
    show_progress=True,
    #stop_fn=lambda mean_reward: mean_reward >= args.reward_threshold,
)

Re=10000, m_prime=4224, T=22.74231693614892, omega=1.9996472117476842
Re=10000, m_prime=67584, T=22.74231693614892, omega=1.99437028369627
Re=10000, m_prime=4224, T=22.74231693614892, omega=1.9996472117476842
Re=10000, m_prime=67584, T=22.74231693614892, omega=1.99437028369627




In [13]:
train_env.f1.mean()

Array(0.11111111, dtype=float64)

In [4]:
def train_step():
        """Perform one training step."""
        assert trainer.episode_per_test is not None
        assert trainer.train_collector is not None
        stop_fn_flag = False

        #what does train_fn do?
        if trainer.train_fn:
            trainer.train_fn(trainer.epoch, trainer.env_step)

        #collect either step_per_collect or episode_per_collect results
        result = trainer.train_collector.collect(n_step=trainer.step_per_collect, n_episode=trainer.episode_per_collect)

        #transforms rewards
        if result["n/ep"] > 0 and trainer.reward_metric:
            rew = trainer.reward_metric(result["rews"]) #seems to apply the reward_metric ontop of the collected rewards
            result.update(rews=rew, rew=rew.mean(), rew_std=rew.std())

        trainer.env_step += int(result["n/st"]) #n/st measures the total number of steps taken in the environment
        trainer.logger.log_train_data(result, trainer.env_step) #logs training data
        trainer.last_rew = result["rew"] if result["n/ep"] > 0 else trainer.last_rew #updates last reward if a whole episode was played
        trainer.last_len = result["len"] if result["n/ep"] > 0 else trainer.last_len #updates last length if a whole episode was played

        #updated printed statistics of the call to train_step
        data = {
            "env_step": str(trainer.env_step),
            "rew": f"{trainer.last_rew:.2f}",
            "len": str(int(trainer.last_len)),
            "n/ep": str(int(result["n/ep"])),
            "n/st": str(int(result["n/st"])),
        }

        if result["n/ep"] > 0:
            #if stop_fn returns true on training data we perform a test and check if stop_fn also
            #returns true on testing data. If so, we stop everything by setting stop_fn_flag = True
            if trainer.test_in_train and trainer.stop_fn and trainer.stop_fn(result["rew"]):
                assert trainer.test_collector is not None
                test_result = test_episode(
                    trainer.policy, trainer.test_collector, trainer.test_fn, trainer.epoch,
                    trainer.episode_per_test, trainer.logger, trainer.env_step
                )

                if trainer.stop_fn(test_result["rew"]):
                    stop_fn_flag = True
                    trainer.best_reward = test_result["rew"]
                    trainer.best_reward_std = test_result["rew_std"]
                else:
                    trainer.policy.train()

        # returns results of training, not of testing!
        return data, result, stop_fn_flag

In [5]:
# understand policy_update_fn
def policy_update_fn(data, result):
        """Perform one on-policy update."""
        assert trainer.train_collector is not None
        
        losses = trainer.policy.update(
            0,
            trainer.train_collector.buffer,
            batch_size=trainer.batch_size,
            repeat=trainer.repeat_per_collect,
        )
        trainer.train_collector.reset_buffer(keep_statistics=True)
        step = max([1] + [len(v) for v in losses.values() if isinstance(v, list)])
        trainer.gradient_step += step
        trainer.log_update_data(data, losses)

In [6]:
epochs = 2
step_per_epoch = 100
global_iter_num = 1 #counts what? -> total number of iterations with the environment
stop_fn_flag = False #should be set to true if stop_fn otputs true
show_progress = True #flag indicating the visualization of a progress bar
trainer.reset()

for epoch in range(1,epochs+1):
    """Perform one epoch (both train and eval)."""
    global_iter_num += 1
    trainer.epoch+=1
    if global_iter_num > 1:
        # exit flag 1, when stop_fn succeeds in train_step or test_step
        if stop_fn_flag:
            print("stop iteration due to stop_fn_flag")
            break
    
    policy.train() #set policy in train mode
    epoch_stat = dict() #dictionary to save epoch statistics


    # perform n step_per_epoch
    progress = tqdm if show_progress else DummyTqdm
    with progress(total=step_per_epoch, desc=f"Epoch #{epoch}") as t:
        while t.n < t.total and not stop_fn_flag:
            data = dict()
            result = dict()
            #********************************************************************************************************************
            #PERFORM A COLLECT STEP *********************************************************************************************
            #********************************************************************************************************************
            if trainer.train_collector is not None:
                data, result, stop_fn_flag = train_step()
                t.update(result["n/st"]) #updates the counter t with the total number of steps taken in the last train_step update!
                if trainer.stop_fn_flag:
                    t.set_postfix(**data) #sets print output of progress bar
                    break #breaks while loop in case stop_fn is true
            #********************************************************************************************************************
            #********************************************************************************************************************
            #********************************************************************************************************************
            else:
                print("WARNING: o train_collector was found")
                assert trainer.buffer, "No train_collector or buffer specified"
                result["n/ep"] = len(trainer.buffer)
                result["n/st"] = int(trainer.gradient_step)
                t.update()

            #********************************************************************************************************************
            #PERFORM A POLICY UPDATE ********************************************************************************************
            #********************************************************************************************************************
            #updates the policy with the collected data and results 
            print(data)
            print(result)
            print(stop_fn_flag)
            policy_update_fn(data, result)
            t.set_postfix(**data)
            #********************************************************************************************************************
            #********************************************************************************************************************
            #********************************************************************************************************************


    # for offline RL
    if trainer.train_collector is None:
        trainer.env_step = trainer.gradient_step * trainer.batch_size
    
    # loggs training data if stop_fn_flag is not true jet, hence logs testing stats after each epoch
    if not trainer.stop_fn_flag:
        trainer.logger.save_data(trainer.epoch, trainer.env_step, trainer.gradient_step, trainer.save_checkpoint_fn)
        #********************************************************************************************************************
        # PERFORM A TESTING STEP ********************************************************************************************
        #********************************************************************************************************************
        if trainer.test_collector is not None:
            test_stat, trainer.stop_fn_flag = trainer.test_step()
            if not trainer.is_run: #if trainer isn't running 
                epoch_stat.update(test_stat)
        #********************************************************************************************************************
        #********************************************************************************************************************
        #********************************************************************************************************************
        

    # if trainer is not running -> meaning logs training stats after each epoch if trainer has finished 
    if not trainer.is_run:
        epoch_stat.update({k: v.get() for k, v in trainer.stat.items()})
        epoch_stat["gradient_step"] = trainer.gradient_step
        epoch_stat.update(
            {
                "env_step": trainer.env_step,
                "rew": trainer.last_rew,
                "len": int(trainer.last_len),
                "n/ep": int(result["n/ep"]),
                "n/st": int(result["n/st"]),
            }
        )
        info = gather_info(
            trainer.start_time, trainer.train_collector, trainer.test_collector,
            trainer.best_reward, trainer.best_reward_std
        )
        print("return")
        #TODO: do something with the info here -> or just return it

    else:
        print("no return cause trainer is still running")

Epoch #1:  32%|███▏      | 32/100 [00:02<00:05, 12.60it/s, env_step=32, len=0, loss=-0.001, n/ep=0, n/st=32, rew=0.00]

{'env_step': '32', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False


Epoch #1:  96%|█████████▌| 96/100 [00:03<00:00, 42.93it/s, env_step=64, len=0, loss=-0.001, n/ep=0, n/st=32, rew=0.00]

{'env_step': '64', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False
{'env_step': '96', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False


Epoch #1: 128it [00:03, 39.73it/s, env_step=128, len=0, loss=-0.000, n/ep=0, n/st=32, rew=0.00]                       


{'env_step': '128', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False
Epoch #1: test_reward: -14.351458 ± 13.403692, best_reward: -9.832229 ± 9.662032 in #0
return


Epoch #2:  32%|███▏      | 32/100 [00:00<00:00, 219.27it/s, env_step=160, len=0, loss=-0.000, n/ep=0, n/st=32, rew=0.00]

{'env_step': '160', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False


Epoch #2:  64%|██████▍   | 64/100 [00:00<00:00, 193.74it/s, env_step=160, len=0, loss=-0.000, n/ep=0, n/st=32, rew=0.00]

{'env_step': '192', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False


Epoch #2:  96%|█████████▌| 96/100 [00:00<00:00, 187.74it/s, env_step=192, len=0, loss=-0.000, n/ep=0, n/st=32, rew=0.00]

{'env_step': '224', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False


Epoch #2: 128it [00:00, 185.01it/s, env_step=224, len=0, loss=-0.000, n/ep=0, n/st=32, rew=0.00]                        

{'env_step': '256', 'rew': '0.00', 'len': '0', 'n/ep': '0', 'n/st': '32'}
{'n/ep': 0, 'n/st': 32, 'rews': array([], dtype=float64), 'lens': array([], dtype=int64), 'idxs': array([], dtype=int64), 'rew': 0, 'len': 0, 'rew_std': 0, 'len_std': 0}
False


Epoch #2: 128it [00:00, 180.18it/s, env_step=256, len=0, loss=-0.000, n/ep=0, n/st=32, rew=0.00]


Epoch #2: test_reward: -14.213970 ± 13.288224, best_reward: -9.832229 ± 9.662032 in #0
return


In [1]:
train_env

NameError: name 'train_env' is not defined