## Experience Replay

In [1]:
class ExpirienceReplay:
    def __init__(self, size=10000):
        self.data = deque(maxlen=size)
    
    def add(self, transition):
        self.data.append(transition)
        
    def sample(self, size):
        batch = random.sample(self.data, size)
        return list(zip(*batch))

## Agent

In [2]:
class Agent:
    def __init__(self, action_size):
        pass 

    def act(self, state):
        pass


class EpsilonGreedyAgent(Agent):
    def __init__(self, action_size, agent, eps=0.1):
        super().__init__(action_size)
        self.agent = agent
        self.eps = eps
        self.action_size = action_size

    def act(self, state):
        if random.random() < self.eps:
            return random.randint(self.action_size)
        else:
            return self.agent.act(state)

## Trainer

In [3]:
class Trainer:
    def __init__(self, agent, experience_replay_size=1000000, batch_size=64):
        self.agent = agent
        self.buffer = ExpirienceReplay(experience_replay_size)
        self.batch_size = batch_size

    def consume_transitions(self, transitions):
        for t in transitions:
            self.buffer.add(t)
    
    def update(self):
        batch = self.buffer.sample(self.batch_size)

        state, action, next_state, reward, done = batch
        state = torch.tensor(np.array(state, dtype=np.float32))
        action = torch.tensor(np.array(action, dtype=np.int))
        next_state = torch.tensor(np.array(next_state, dtype=np.float32))
        reward = torch.tensor(np.array(reward, dtype=np.float32))
        done = torch.tensor(np.array(done, dtype=np.float32))

        metrics = self._update_agent((state, action, next_state, reward, done))
        return metrics


    def _update_agent(self, batch):
        pass

## Environment Worker

In [4]:
class EnvWorker:
    def __init__(self, env_name, agent):
        self.env = gym.make(env_name)
        self.agent = agent
        self.current_state = self.env.reset()

    def collect_transitions(self, num_transitions):
        transititons = []
        for _ in range(num_transitions):
            action = self.agent.act(self.current_state)
            next_state, reward, done, _ = self.env.step(action)
            transitions.append([
                    self.current_state, 
                    action, 
                    next_state, 
                    reward, 
                    done
            ])
            self.current_state = self.env.reset() if done else next_state
        return transitions
        
    def collect_trajectories(self, num_trajectories):
        trajectories = []
        for _ in range(num_trajectories):
            transitions = []
            state = self.env.reset()
            done = False
            while not done:
                action = self.agent.act(state)
                next_state, reward, done, _ = self.env.step(action)
                transitions.append([
                        state, 
                        action, 
                        next_state, 
                        reward, 
                        done
                ])
                state = self.env.reset() if done else next_state
            trajectories.append(transitions)

## Train

In [5]:
def train(agent: Agent, exploration_agent: Agent, trainer: Trainer, 
          env_name, experience_per_update, updates_total,
          updates_per_evalutaion, episodes_per_evaluation):
    
    exploration_env_worker = EnvWorker(env_name, exploration_agent)
    exploitation_env_worker = EnvWorker(env_name, agent)

    for step in range(updates_total):
        transitions = exploration_env_worker.collect_transitions(
            experience_per_update)
        trainer.consume_transitions(transitions)
        metrics = trainer.update()

        if step % updates_per_evalutaion == 0:
            trajectories = exploitation_env_worker.collect_trajectories(
                episodes_per_evaluation)
            total_rewards = [sum([t[3] for t in transitions])
                             for transitions in trajectories]
            print(f"Step {step} | Mean reward: {np.mean(total_rewards)}")

In [6]:
agent = Agent() # Здесь будет агент конкретного алгоритма
trainer = Trainer(agent, 500000, 64) # Опять же Trainer для конкретного алгоритма
exploration_agent = EpsilonGreedyAgent(agent)

train(agent, exploration_agent, trainer, "LunarLander-v2", 4, 100000, 1000, 10)

TypeError: __init__() missing 1 required positional argument: 'action_size'