In [1]:
import numpy as np
import laserhockey.hockey_env as h_env
import gymnasium as gym
from importlib import reload
import time
import torch
import DDPG
import TD3
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

In [2]:
plt.rcParams.update({'font.size': 6})

In [3]:
np.set_printoptions(suppress=True)
reload(h_env)

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")
  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


<module 'laserhockey.hockey_env' from 'C:\\Users\\lenna\\Desktop\\RL\\Project\\RL-Hockey\\laserhockey\\hockey_env.py'>

In [4]:
### moving average to smooth out rewards ###
def moving_average(data, win_size):
    data = np.asarray(data)
    averages = []
    for i in range(len(data)-win_size):
        averages.append(np.sum(data[i:i+win_size])/win_size)
    return averages

### opponent that performs random actions
class Random_opponent():
    def __init__(self, keep_mode=True):
        self.keep_mode = keep_mode
    def act(self, obs):
        if self.keep_mode:
            return np.random.uniform(-1,1,4)
        return np.random.uniform(-1,1,3)

In [5]:
### function for saving train/test statistics
def save_statistics(type, config, rewards, net_losses, wins, losses, winrate):
    train_type = config["test"]*"test" + (1-config["test"])*"train"
    with open(f'./results/{type}_hockey_{config["name"]}_{config["mode"]}_{train_type}_stats.pkl', 'wb') as f:
        pickle.dump({"Experiment setup" : config, "Rewards": rewards, "losses": net_losses, "wins": wins, "losses":losses, "winrate": winrate}, f)

In [6]:
### training and testing (if config["test"] is set to True) ###
def train_hockey(agent_type, agent1, agent2, config):
    save_as1=f'./results/{agent_type}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    save_as2=f'./results/{agent_type}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    if config["mode"]=="normal" or config["mode"]=="weak" or config["mode"]=="selfplay":
        env = h_env.HockeyEnv()
    elif config["mode"]=="defense":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
    elif config["mode"]=="attack":
        env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_SHOOTING)
        
    if config["mode"]=="normal":
        player2 = h_env.BasicOpponent(weak=False)
    elif config["mode"]=="weak":
        player2 = h_env.BasicOpponent()
    elif config["mode"]=="defense" or config["mode"]=="attack":
        player2 = Random_opponent()
    elif config["mode"]=="selfplay":
        player2 = agent2
        
    player1 = agent1
    obs_agent2 = env.obs_agent_two()
    if type(agent1).__name__=="TD3Agent":
        train_losses = np.empty((0,4))
    else:
        train_losses = np.empty((0,2))
    rewards = []
    wins, losses, rewards = 0, 0, []
    eps = 1 # entirely random actions for initial 
    desc = "Training..."
    if config["test"]:
        desc="Testing..."
        eps = 0.0
    for i in tqdm(range(config["episodes"]), desc=desc, unit="episodes", colour="green"):
        obs, info = env.reset()
        d = False
        ep_r = 0
        old_r = 0
        while not d:
            if config["render"]:
                env.render()
            a1 = player1.act(obs, eps=eps)
            a2 = player2.act(obs_agent2)
            obsnew, r, d, _, info = env.step(np.hstack([a1,a2]))
            if info['winner']==1:
                wins += 1
            elif info['winner']==-1:
                losses += 1
            if not config["test"]:
                player1.store_transition((obs, a1, r, obsnew, d))
                if config["prio_replay"] and abs(r)>0:
                   for k in range(5):
                       player1.store_transition((obs, a1, r, obsnew, d))
                if config["mode"]=="selfplay":
                    player2.store_transition((obs, a2, r, obsnew, d))
            obs_agent2 = env.obs_agent_two()
            #print(abs(obs-obsnew), abs(old_r-r))
            obs=obsnew
            ep_r +=r
            #print(r)
            old_r = r
        if not config["test"] and i>100:
            eps = config["eps"]
            loss = player1.train(config["iter_fit"])
            train_losses = np.concatenate((train_losses, np.asarray(loss)))
            #if config["mode"]=="selfplay":
                #_ = player2.train(config["iter_fit"])
        '''if (i+1)%500==0:
            config["learning_rate_critic"] = config["learning_rate_critic"]*0.5
            config["learning_rate_actor"] = config["learning_rate_actor"]*0.5'''
        rewards.append(ep_r)
    print(f'Wins: {wins}')
    print(f'Losses: {losses}')
    winrate = wins/max(1,losses)
    print(f'W/L: {winrate}')
    env.close()
    save_statistics(agent_type, config, rewards, train_losses, wins, losses, winrate)
    if not config["test"]:
        torch.save(player1.state(), save_as1)
        #if config["mode"]=="selfplay":
            #torch.save(player2.state(), save_as2)       
    return train_losses, rewards

In [7]:
def train_gym(agent1, config):
    save_as1=f'./results/{config["agent_type"]}_{config["env_type"]}_{config["name"]}_{config["mode"]}_agent.pth'
    player1 = agent1
    train_losses = np.empty((0,4))
    if config["agent_type"] == "DDPG":
        train_losses = np.empty((0,2))
    rewards = []
    if config["env_type"] == "walker":
        env = gym.make("BipedalWalker-v3", hardcore=False)
    if config["env_type"] == "pendulum":
        env = gym.make("Pendulum-v1")
    #eps = 1 # entirely random actions for initial 
    desc = "Training..."
    eps = config["eps"]
    if config["test"]:
        desc="Testing..."
        eps = 0.0
    for i in tqdm(range(config["episodes"]), desc=desc, unit="episodes", colour="green"):
        obs, info = env.reset()
        d = False
        steps = 0
        ep_r = 0
        while not d:
            steps += 1
            if config["render"]:
                env.render()
            a1 = player1.act(obs, eps=eps)
            obsnew, r, d, _, info = env.step(a1)
            if not config["test"]:
                player1.store_transition((obs, a1, r, obsnew, d))
                if config["prio_replay"] and abs(r)>0:
                   for k in range(5):
                       player1.store_transition((obs, a1, r, obsnew, d))
            obs=obsnew
            ep_r +=r
            #rewards.append(r)
            if steps>1000:
                break
        if not config["test"]:
            eps = config["eps"]
            loss = player1.train(config["iter_fit"])
            train_losses = np.concatenate((train_losses, np.asarray(loss)))
            #if config["mode"]=="selfplay":
                #_ = player2.train(config["iter_fit"])
        '''if (i+1)%500==0:
            config["learning_rate_critic"] = config["learning_rate_critic"]*0.5
            config["learning_rate_actor"] = config["learning_rate_actor"]*0.5'''
        rewards.append(ep_r)
    env.close()
    save_statistics(config["agent_type"], config, rewards, train_losses, wins=None, losses=None, winrate=None)
    if not config["test"]:
        torch.save(player1.state(), save_as1)
        #if config["mode"]=="selfplay":
            #torch.save(player2.state(), save_as2)       
    return train_losses, rewards

In [8]:
### initializes agents and executes training procedure ###
def init_train(config):
    agent_type = config["agent_type"]
    if config["env_type"] == "hockey":
        env = h_env.HockeyEnv()
    else:
        if config["env_type"] == "walker":
            env = gym.make("BipedalWalker-v3", hardcore=False)
        if config["env_type"] == "pendulum":
            env = gym.make("Pendulum-v1")
    # turn off the respective parts of TD3 to analyze separately
    if agent_type == "CDQ":
        config["smoothing_clip"] = 0
        config["update_policy_every"] = 1
    if agent_type == "TPS":
        config["cdq"] = False
        config["update_policy_every"] = 1
    if agent_type == "DPU":
        config["cdq"] = False
        config["smoothing_clip"] = 0

    if agent_type =="DDPG":
        agent1 = DDPG.DDPGAgent(env.observation_space, env.action_space, discount=config["discount"], eps=config["eps"],
                              update_target_every=config["update_target_every"], update_policy_every=config["update_policy_every"], 
                              hidden_sizes_actor=config["hidden_sizes_actor"],hidden_sizes_critic=config["hidden_sizes_critic"],
                              smoothing_std=config["smoothing_std"], smoothing_clip=config["smoothing_clip"],
                              learning_rate_actor=config["learning_rate_actor"], learning_rate_critic=config["learning_rate_critic"])
    else:
        agent1 = TD3.TD3Agent(env.observation_space, env.action_space, discount=config["discount"], eps=config["eps"],
                              update_target_every=config["update_target_every"], update_policy_every=config["update_policy_every"], 
                              hidden_sizes_actor=config["hidden_sizes_actor"],hidden_sizes_critic=config["hidden_sizes_critic"],
                              smoothing_std=config["smoothing_std"], smoothing_clip=config["smoothing_clip"],
                              learning_rate_actor=config["learning_rate_actor"], learning_rate_critic=config["learning_rate_critic"])
    agent2 = None
    if config["mode"] == "selfplay":
        agent2 = TD3.TD3Agent(agent_type, env.observation_space, env.action_space, discount=config["discount"], eps=config["eps"],
                          update_target_every=config["update_target_every"], update_policy_every=config["update_policy_every"], 
                          hidden_sizes_actor=config["hidden_sizes_actor"],hidden_sizes_critic=config["hidden_sizes_critic"],
                          smoothing_std=config["smoothing_std"], smoothing_clip=config["smoothing_clip"],
                          learning_rate_actor=config["learning_rate_actor"], learning_rate_critic=config["learning_rate_critic"])
    if config["checkpoint1"]:
        agent1.restore_state(torch.load(config["checkpoint1"]))
    if config["checkpoint2"]:
        agent2.restore_state(torch.load(config["checkpoint2"]))
    env.close() 
    if config["env_type"]=="hockey":
        print("train hock")
        losses_wea, rewards_wea = train_hockey(agent_type, agent1, agent2, config)
    else:
        losses_wea, rewards_wea = train_gym(agent1,config)
    rewards_wea_avg = moving_average(rewards_wea, 100)
    if not config["test"]:
        plt.figure(figsize=(3,2))
        plt.plot(rewards_wea_avg)
        plt.title(f'{type(agent1).__name__}_wea_{config["mode"]}')
        plt.show()
        plt.figure(figsize=(3,2))
        plt.plot(moving_average(losses_wea[:,0],100))
        plt.title(f'{type(agent1).__name__}_wea_{config["mode"]}')
        plt.show()
        plt.figure(figsize=(3,2))
        plt.plot(moving_average(losses_wea[:,1],100))
        plt.title(f'{type(agent1).__name__}_wea_{config["mode"]}')
        plt.show()
        if not agent_type =="DDPG":
            plt.figure(figsize=(3,2))
            plt.plot(moving_average(losses_wea[:,2][losses_wea[:,2] != np.array(None)],100))
            plt.title(f'{type(agent1).__name__}_wea_{config["mode"]}')
            plt.show()
            plt.figure(figsize=(3,2))
            plt.plot(moving_average(losses_wea[:,3][losses_wea[:,3] != np.array(None)],100))
            plt.title(f'{type(agent1).__name__}_wea_{config["mode"]}')
            plt.show()
    
    '''checkpoint = f'./results/{type(agent).__name__}_hockey_{episodes}-eps{eps}-{mode}.pth' '''

        
    '''agent2 = DDPG.DDPGAgent(env.observation_space, env.action_space, 
                           discount=1, eps=eps, update_target_every=update_target_every, hidden_sizes_actor=hidden_sizes_actor,hidden_sizes_critic=hidden_sizes_critic)
    agent2.restore_state(torch.load(checkpoint2))'''

In [9]:
start_config = {
    "name" : "name",
    "agent_type" : "TD3",
    "env_type" : "hockey",
    "test" : False,
    "render" : False,
    "episodes" : 10000,
    "mode" : "normal",
    "eps" : 0.1,
    "discount":0.99,
    "update_target_every":100,
    "update_policy_every":2,
    "hidden_sizes_actor" : [128,128],
    "hidden_sizes_critic" : [128,128,64],
    "iter_fit" : 20,
    "smoothing_std"  : 0.0005,
    "smoothing_clip" : 0.00025,
    "checkpoint1" : None,
    "checkpoint2" : None,
    "learning_rate_critic": 0.001,
    "learning_rate_actor": 0.001,
    "buffer_size" : 250000,
    "theta" : 0.01,
    "prio_replay" : False
}
# lr of 0.0001 for both seems to work best
# small buffer size seems to be better
# eps 0.1 seems to be best
# discount 1 shows best results (winning later isn't worse than earlier?)
# 20 iterations with policy delay 2 best

In [11]:
for agent_type in ["CDQ", "TPS", "DPU", "TD3", "DDPG"]:
    print("AGENT", agent_type)
    config = start_config.copy()
    config["episodes"] = 1000
    config["agent_type"] = agent_type
    config["env_type"] = "pendulum"
    config["name"] = f'env_{config["env_type"]}_agent_{agent_type}'
    init_train(config)
    config["checkpoint1"] = f'./results/{config["agent_type"]}_hockey_{config["name"]}_{config["mode"]}_agent.pth'

AGENT CDQ


Training...:  28%|[32m████████████████▋                                           [0m| 279/1000 [02:57<07:39,  1.57episodes/s][0m


KeyboardInterrupt: 

In [None]:
config = start_config.copy()
config["episodes"] = 1000
config["env_type"] = "walker"
config["name"] = f"walker"
config["mode"] = "normal"
agent_type="TD3"
init_train(config)
'''config["checkpoint1"] = f'./results/{agent_type}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
# test agent
config["mode"] = "normal"
config["test"] = True
#config["episodes"] = 10
#config["render"] = True
init_train(agent_type, env_type, config)'''

In [None]:
config = start_config.copy()
config["episodes"] = 1000
config["agent_type"] = "DDPG"
config["env_type"] = "walker"
config["name"] = f"walker"
config["mode"] = "normal"
init_train(config)

In [None]:

config = start_config.copy()
config["episodes"] = 5000
config["name"] = f"noslide"
#config["theta"] = theta
config["mode"] = "normal"
agent_type="TD3"
env_type="hockey"
init_train(config)
config["checkpoint1"] = f'./results/{config["agent_type"]}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
# test agent
config["mode"] = "normal"
config["test"] = True
#config["episodes"] = 10
#config["render"] = True
init_train(agent_type, env_type, config)

In [None]:
config = start_config.copy()
config["episodes"] = 5000
config["name"] = f"lrdecay"
config["mode"] = "normal"
agent_type="TD3"
env_type="hockey"
init_train(agent_type, env_type, config)
config["checkpoint1"] = f'./results/{agent_type}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
# test agent
config["mode"] = "normal"
config["test"] = True
#config["episodes"] = 10
#config["render"] = True
init_train(agent_type, env_type, config)

In [None]:

for pri in [True, False]:
    config = start_config.copy()
    config["episodes"] = 5000
    config["prio_replay"] = pri
    config["name"] = f"prio_{pri}"
    config["mode"] = "normal"
    agent_type="TD3"
    env_type="hockey"
    init_train(agent_type, env_type, config)
    config["checkpoint1"] = f'./results/{agent_type}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    # test agent
    config["mode"] = "normal"
    config["test"] = True
    #config["episodes"] = 10
    #config["render"] = True
    init_train(agent_type, env_type, config)

In [None]:

pri = True
config["name"] = f"prio_{pri}"
config["mode"] = "normal"
agent_type="TD3"
env_type="hockey"
config["checkpoint1"] = f'./results/{agent_type}_hockey_{config["name"]}_{config["mode"]}_agent.pth'
env = h_env.HockeyEnv()
agent1 = TD3.TD3Agent(env.observation_space, env.action_space, discount=config["discount"], eps=config["eps"],
                              update_target_every=config["update_target_every"], update_policy_every=config["update_policy_every"], 
                              hidden_sizes_actor=config["hidden_sizes_actor"],hidden_sizes_critic=config["hidden_sizes_critic"],
                              smoothing_std=config["smoothing_std"], smoothing_clip=config["smoothing_clip"],
                              learning_rate_actor=config["learning_rate_actor"], learning_rate_critic=config["learning_rate_critic"])

agent1.restore_state(torch.load(config["checkpoint1"]))
print(next(agent1.Q1.parameters())[0])
print(next(agent1.Q_target1.parameters())[0])
#for param in agent1.Q.parameters()

In [None]:
agent = "TD3Agent"
env = "hockey"
name = "iter20_up_ev2"
mode = "normal"
with (open(f"results/{agent}_{env}_{name}_{mode}_train_stats.pkl", "rb")) as openfile:
    loaded_stats = pickle.load(openfile)
print(loaded_stats["Experiment setup"])
rewards = loaded_stats["Rewards"]
plt.plot(moving_average(rewards,100))

In [None]:
### training without defense/attack ###

for gamma in [0.99, 0.97, 0.95]:
    config = start_config.copy()
    config["name"] = "discount_new"
    config["discount"] = gamma
    config["episodes"] = 5000
    config["mode"] = "normal"
    init_train(config)
    config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    # test agent
    config["test"] = True
    config["episodes"] = 5000
    init_train(config)


config = start_config.copy()
for lr in [0.001, 0.0005, 0.0001, 0.00005, 0.00001]:
    config = start_config.copy()
    config["episodes"] = 5000
    config["learning_rate_critic"] = lr
    config["learning_rate_actor"] = lr
    #config["use_target_net"] = False
    config["name"] = f"lr_slidewin"
    config["mode"] = "normal"
    init_train(config)
    config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    # test agent
    config["mode"] = "normal"
    config["test"] = True
    #config["episodes"] = 10
    #config["render"] = True
    init_train(config)
    config["test"] = False


'''for lr_crit in [0.001, 0.0001, 0.00001]:
    config = start_config.copy()
    config["name"] = "paramtest4"
    config["mode"] = "normal"
    print("LR CRITIC:", lr_crit)
    config["learning_rate_critic"] = lr_crit
    init_train(config)
    config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    # test agent
    config["mode"] = "normal"
    config["test"] = True
    init_train(config)
    config["test"] = False'''

In [None]:
env = h_env.HockeyEnv()
agent1 = TD3.TD3Agent(env.observation_space, env.action_space, discount=config["discount"], eps=config["eps"],
                          update_target_every=config["update_target_every"], update_policy_every=config["update_policy_every"], 
                          hidden_sizes_actor=config["hidden_sizes_actor"],hidden_sizes_critic=config["hidden_sizes_critic"],
                          smoothing_std=config["smoothing_std"], smoothing_clip=config["smoothing_clip"],
                          learning_rate_actor=config["learning_rate_actor"], learning_rate_critic=config["learning_rate_critic"])
env.close()
#print(agent1.Q1.state_dict)
Q2 = agent1.Q_target1.parameters()
for ii, param in enumerate(agent1.Q1.parameters()):
    print(ii)
    print("q1",param)
    with torch.no_grad():
        param +=1
    #print(next(Q2))
for ii, param in enumerate(agent1.Q1.parameters()):
    print(ii)
    print("q1",param)

In [None]:
# selfplay
config = start_config.copy()
config["episodes"] = 1000
config["name"] = "20ktest"
for i in range(1):
    if i>0:
        config["mode"] = "selfplay"
    config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    config["checkpoint2"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
    config["name"] = f"selfplay{i}"
    config["mode"] = "selfplay"
    config["test"] = False
    init_train(config)
    config["test"] = True
    config["mode"] = "normal"
    config["checkpoint2"] = None
    init_train(config)

In [None]:

config["test"] = True
config["mode"] = "normal"
config["checkpoint2"] = None
init_train(config)

In [None]:
### TRAINING CAMP ###
config = start_config.copy()
# defense training
config["name"] = "traincamp_new"
config["mode"] = "defense"
config["episodes"] = 1000
init_train(config)
config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
# test agent trained on defense
config["mode"] = "normal"
config["test"] = True
init_train(config)

# shoot training
config["mode"] = "attack"
config["test"] = False
init_train(config)
config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
# test agent trained on defense AND shooting
config["mode"] = "normal"
config["test"] = True
init_train(config)

# regular training
config["mode"] = "normal"
config["test"] = False
init_train(config)
config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
# test agent trained on defense AND shooting
config["mode"] = "normal"
config["test"] = True
init_train(config)

In [None]:
### test 0 eps vs 0.1 ###
# test agent
config = start_config.copy()
config["name"] = "param_explore3"
config["mode"] = "normal"
#config["episodes"] = 3000
init_train(config)
config["test"] = True
config["checkpoint1"] = f'./results/TD3Agent_hockey_{config["name"]}_{config["mode"]}_agent.pth'
init_train(config)

In [None]:
test = True
render = True
episodes=10000
mode = "selfplay"
eps = 0.1
update_target_every=100
update_policy_every=20
hidden_sizes_actor = [128,128]
hidden_sizes_critic = [128,128,64]
iter_fit = 32
std  = 0.0005
c = std/2


env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
agent = TD3.TD3Agent(env.observation_space, env.action_space, discount=1, eps=eps, 
                     update_target_every=update_target_every, update_policy_every=update_policy_every, 
                     hidden_sizes_actor=hidden_sizes_actor,hidden_sizes_critic=hidden_sizes_critic,
                     smoothing_std=std, smoothing_clip=c)
checkpoint = None
if test:
    checkpoint = f'./results/{type(agent).__name__}_hockey_{episodes}-eps{eps}-{mode}.pth'
    episodes=1000
    mode='normal'
env.close()
if checkpoint is not None:
    agent.restore_state(torch.load(checkpoint))
    
#mode = "normal"
losses_wea, rewards_wea = train(agent, mode=mode, episodes=episodes, eps=eps, test=test, iter_fit=iter_fit, render=render)
rewards_wea_avg = moving_average(rewards_wea, 20)
plt.plot(rewards_wea_avg)
plt.title(f'{type(agent).__name__}_wea_{eps}')
plt.show()


In [None]:


mode='normal'
episodes=10000
env = h_env.HockeyEnv(mode=h_env.HockeyEnv.TRAIN_DEFENSE)
agent = DDPG.DDPGAgent(env.observation_space, env.action_space, 
                       discount=1, eps=eps, update_target_every=update_target_every, hidden_sizes_actor=hidden_sizes_actor,hidden_sizes_critic=hidden_sizes_critic)
checkpoint = None
if test:
    checkpoint = f'./results/{type(agent).__name__}_hockey_{episodes}-eps{eps}-{mode}.pth'
    episodes=1000
    mode='normal'
env.close()
if checkpoint is not None:
losses_wea, rewards_wea = train(agent, mode=mode, episodes=episodes, eps=eps, test=test, iter_fit=iter_fit, render=render)
rewards_wea_avg = moving_average(rewards_wea, 20)
plt.plot(rewards_wea_avg)
plt.title(f'{type(agent).__name__}_wea_{eps}')
plt.show()


In [None]:
A = torch.randn(5)
b = torch.clamp(A, 0.2,0.5)
print(b)

In [None]:
episodes = 1000
eps = 0
checkpoint = f'./results/DDPG_hockey_{episodes}-eps{eps}-weak.pth'
agent = DDPG.DDPGAgent(env.observation_space, env.action_space, discount=1, eps=eps)
agent.restore_state(torch.load(checkpoint))
losses_wea,  rewards_wea = train(agent, mode="weak", episodes=1000, eps=0)

In [None]:
rewards_weak_avg = moving_average(rewards_wea, 20)

plt.plot(rewards_weak_avg)

In [None]:


rewards_att_avg = moving_average(rewards_att, 20)
plt.plot(rewards_def_avg.)
plt.title(f'def_{eps}')
plt.show()
plt.plot(rewards_att_avg)
plt.title(f'att_{eps}')
plt.show()

In [None]:
rewards_def_avg = moving_average(rewards_def, 20)

In [None]:

plt.plot(rewards_def_avg)
plt.title(f'schwanzus{eps}')