In [4]:
import numpy as np


class HealthBarEnvironment:
    """Toy episodic environment whose only state is a bounded health value.

    Each step applies a random health change that depends on the chosen
    action, clamps the result to ``[min_health, max_health]`` and ends the
    episode once health has dropped to ``min_health``.

    Attributes:
        initial_health: Health restored by :meth:`reset`.
        max_health: Upper bound enforced after every step.
        min_health: Lower bound enforced after every step; reaching it ends
            the episode.
        current_health: Health after the most recent ``reset``/``step``.
        action_space: List of valid action names.
        state_space: Dimensionality of the observation (always 1).
    """

    # Signed multiplier applied to a random draw from 1..9 for each action:
    # 'up' heals by the draw, the others damage by 2x, 3x and 4x the draw.
    _ACTION_EFFECTS = {'up': 1, 'down': -2, 'left': -3, 'right': -4}

    def __init__(self, initial_health=100, max_health=100, min_health=0):
        self.initial_health = initial_health
        self.max_health = max_health
        self.min_health = min_health
        self.current_health = initial_health
        self.action_space = ['up', 'down', 'left', 'right']
        self.state_space = 1  # We'll keep it simple with a 1-dimensional state space for health

    def reset(self):
        """Restore health to ``initial_health`` and return it as the first state."""
        self.current_health = self.initial_health
        return self.current_health

    def step(self, action):
        """Apply ``action`` and advance the environment by one step.

        Args:
            action: One of the names in ``action_space``.

        Returns:
            A ``(state, reward, done, info)`` tuple where ``state`` is the
            clamped health, ``reward`` is the value of
            :meth:`calculate_reward`, ``done`` is a builtin ``bool`` that is
            True once health has reached ``min_health``, and ``info`` is an
            empty dict.

        Raises:
            AssertionError: If ``action`` is not in ``action_space``.
        """
        assert action in self.action_space, "Invalid action"

        # Exactly one random draw per recognised action, as a dispatch table
        # instead of an if/elif chain.
        multiplier = self._ACTION_EFFECTS.get(action)
        if multiplier is not None:
            self.current_health += multiplier * np.random.randint(1, 10)

        # Clip the health value to stay within the min and max health bounds.
        # np.clip returns a NumPy scalar; .item() unwraps it to the matching
        # native Python number so callers never see np.int64 / np.float64.
        self.current_health = np.clip(
            self.current_health, self.min_health, self.max_health).item()

        # Calculate the reward
        reward = self.calculate_reward()

        # Check if the episode is done (health reaches minimum); bool() keeps
        # identity checks such as `done is True` working.
        done = bool(self.current_health <= self.min_health)

        # Additional info can be returned if needed
        info = {}

        return self.current_health, reward, done, info

    def calculate_reward(self):
        """Return the reward for the current state, which is the current health."""
        # In this simple example, the reward is simply the current health value
        return self.current_health


# Smoke-test the environment: play one episode with a uniformly random policy
# and accumulate the rewards until health runs out.
env = HealthBarEnvironment()
state = env.reset()
total_reward = 0

while True:
    action = np.random.choice(env.action_space)
    next_state, reward, done, _ = env.step(action)
    total_reward += reward
    print(
        f"Action: {action}, Next State: {next_state}, Reward: {reward}, Done: {done} Current Health: {env.current_health}")
    # The episode ends as soon as the environment reports done
    if done:
        break

print(f"Total Reward: {total_reward}")

Action: up, Next State: 100, Reward: 100, Done: False Current Health: 100
Action: left, Next State: 94, Reward: 94, Done: False Current Health: 94
Action: right, Next State: 66, Reward: 66, Done: False Current Health: 66
Action: left, Next State: 48, Reward: 48, Done: False Current Health: 48
Action: left, Next State: 27, Reward: 27, Done: False Current Health: 27
Action: right, Next State: 0, Reward: 0, Done: True Current Health: 0
Total Reward: 335


In [5]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


class DQNAgent:
    """Epsilon-greedy Deep Q-Network agent with a simple replay memory.

    Attributes:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions (width of the Q output).
        memory: List of ``(state, action, reward, next_state, done)`` tuples.
        gamma: Discount rate applied to the bootstrapped future value.
        epsilon: Current probability of taking a random action.
        epsilon_min: Floor below which ``epsilon`` is no longer decayed.
        epsilon_decay: Multiplicative decay applied once per ``replay`` call.
        model: Keras model mapping a batch of states to per-action Q-values.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = []
        self.gamma = 0.95  # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output)."""
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam())
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action index for ``state`` using an epsilon-greedy policy.

        Args:
            state: Observation in the batch shape the model expects (the
                training loop in this notebook passes ``(1, state_size)``).

        Returns:
            A builtin ``int`` in ``range(action_size)``.
        """
        # NumPy's RNG is used throughout so this class depends only on the
        # names imported in its own cell.
        if np.random.rand() <= self.epsilon:
            return int(np.random.randint(self.action_size))
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))  # returns action

    def replay(self, batch_size):
        """Train the model on a random minibatch drawn from the replay memory.

        All sampled transitions are stacked so that the Q-targets need two
        ``predict`` calls and a single ``fit`` call in total, rather than
        several Keras calls per transition.

        Args:
            batch_size: Number of transitions to sample without replacement.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions.
        """
        picks = np.random.choice(
            len(self.memory), size=batch_size, replace=False)
        batch = [self.memory[i] for i in picks]

        # Stored states are row vectors, so vstack yields (batch, state_size).
        states = np.vstack([item[0] for item in batch])
        next_states = np.vstack([item[3] for item in batch])
        actions = np.array([item[1] for item in batch], dtype=int)
        rewards = np.array([item[2] for item in batch], dtype=float)
        dones = np.array([item[4] for item in batch], dtype=bool)

        # Start from the current predictions so only the taken action's
        # Q-value contributes to the loss; np.array() makes a writable copy.
        targets = np.array(self.model.predict(states, verbose=0))
        best_next = np.asarray(
            self.model.predict(next_states, verbose=0)).max(axis=1)

        # Terminal transitions get the raw reward; the rest bootstrap from
        # the discounted best next-state value.
        targets[np.arange(len(batch)), actions] = np.where(
            dones, rewards, rewards + self.gamma * best_next)

        self.model.fit(states, targets, batch_size=len(batch),
                       epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [6]:
import random

# Training hyper-parameters: how many episodes to play and how many stored
# transitions the agent samples per learning step.
episodes = 50
batch_size = 32

# Size the DQN agent from the environment's observation and action spaces
state_size = env.state_space
action_size = len(env.action_space)
agent = DQNAgent(state_size, action_size)

for e in range(episodes):
    # Every episode starts from a fresh environment; the model expects a
    # (1, state_size) row vector rather than a bare scalar.
    state = np.reshape(env.reset(), [1, state_size])
    total_reward = 0
    done = False

    while not done:
        # Let the agent pick an action index and apply the named action
        action = agent.act(state)
        next_state, reward, done, _ = env.step(env.action_space[action])

        # The terminal step is penalised instead of rewarded
        if done:
            reward = -10
        next_state = np.reshape(next_state, [1, state_size])

        # Store the transition, then advance to the next state
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        # Report the episode score once the game is over
        if done:
            print(
                f"episode: {e}/{episodes}, score: {total_reward}, e: {agent.epsilon:.2}")

        # Learn from a random minibatch once enough experience is stored
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

print(f"Total Reward: {total_reward}")

episode: 0/50, score: 154, e: 1.0
episode: 1/50, score: 528, e: 1.0
episode: 2/50, score: 380, e: 1.0
episode: 3/50, score: 411, e: 0.98
episode: 4/50, score: 754, e: 0.92
episode: 5/50, score: 1071, e: 0.84
episode: 6/50, score: 344, e: 0.81
episode: 7/50, score: 794, e: 0.73
episode: 8/50, score: 639, e: 0.68
