# In [None]:
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay

class REINFORCE:
    """Policy-gradient (REINFORCE) agent with epsilon-random exploration.

    The policy is a small dense softmax network.  Transitions are buffered
    with :meth:`remember` and turned into a supervised batch by
    :meth:`train_policy_network`: the training target for each step is the
    recorded action distribution nudged by ``alpha * (one_hot - prob) * G``,
    where ``G`` is the normalised discounted return of that step.

    The environment passed in must provide ``possible_actions``, ``reset()``,
    ``step(action)`` (returning at least ``(state, reward, done)``) and
    ``render(...)``, as used by :meth:`train` and :meth:`test`.
    """

    def __init__(self, env, input_shape, alpha=1e-4, gamma=0.99, learning_rate=0.001, batch_size=64, epsilon=0.99, epsilon_decay=0.993, epsilon_min=0.01):
        """Store hyper-parameters, build the policy network and empty buffers.

        Args:
            env: Environment to interact with.
            input_shape: Shape of a single state, fed to the Keras ``Input``.
            alpha: Step size applied to the return-weighted gradients when
                building the training targets.
            gamma: Discount factor for future rewards.
            learning_rate: Learning rate of the Adam optimizer.
            batch_size: Stored on the instance; not used by this class.
            epsilon: Initial probability of taking a uniformly random action.
            epsilon_decay: Multiplicative decay applied to ``epsilon`` after
                every training episode.
            epsilon_min: Lower bound for ``epsilon`` during training.
        """
        self.env = env
        self.state_shape = input_shape
        self.action_shape = len(env.possible_actions)
        self.gamma = gamma
        self.alpha = alpha
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model = self.build_policy_network()
        # Per-step trajectory buffers, cleared after every network update.
        self.states = []
        self.gradients = []
        self.rewards = []
        self.probs = []
        # Buffer indices at which an episode ended, so returns are never
        # discounted across an episode boundary.
        self._episode_ends = []
        self.total_rewards = []
        self.batch_rewards = []

    def build_policy_network(self):
        """Build and compile the softmax policy network.

        Returns:
            A compiled Keras ``Sequential`` model mapping a state to a
            probability distribution over ``self.action_shape`` actions.
        """
        model = Sequential()
        model.add(Input(shape=self.state_shape))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_shape, activation='softmax'))

        # Gradient clipping is configured on the optimizer itself; overriding
        # ``optimizer.get_gradients`` after compilation is not part of the
        # eager ``train_on_batch`` path.
        optimizer = Adam(learning_rate=self.learning_rate, clipvalue=1.0)

        model.compile(loss='categorical_crossentropy', optimizer=optimizer)
        return model

    def hot_encode_action(self, action):
        """Return a one-hot vector of length ``action_shape`` for ``action``."""
        action_encoded = np.zeros(self.action_shape)
        action_encoded[action] = 1
        return action_encoded

    def remember(self, state, action, action_prob, reward):
        """Append one transition to the trajectory buffers.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            action_prob: Action distribution the action was drawn from.
            reward: Reward received for the action.
        """
        encoded_action = self.hot_encode_action(action)
        self.gradients.append(encoded_action - action_prob)
        self.states.append(state)
        self.rewards.append(reward)
        self.probs.append(action_prob)

    def _end_episode(self):
        """Record the current buffer length as an episode boundary."""
        end = len(self.rewards)
        if end and (not self._episode_ends or self._episode_ends[-1] != end):
            self._episode_ends.append(end)

    def _reset_memory(self):
        """Clear all trajectory buffers."""
        self.states, self.probs, self.gradients, self.rewards = [], [], [], []
        self._episode_ends = []

    def _predict_probs(self, state):
        """Return the policy's action distribution for a single state."""
        batch = state.reshape([1, state.shape[0]])
        # verbose=0: avoid a progress bar for every environment step.
        return self.model.predict(batch, verbose=0).flatten()

    def _step(self, action):
        """Step the environment and return ``(next_state, reward, done)``.

        Only the first three values are used, so an environment returning
        an extra ``info`` element works as well.
        """
        next_state, reward, done = self.env.step(action)[:3]
        return next_state, reward, done

    def compute_action(self, state):
        """Pick an action for training.

        With probability ``epsilon`` a uniformly random action is taken,
        otherwise the action is sampled from the policy network's output.

        Returns:
            Tuple ``(action, distribution)`` where ``distribution`` is the
            probability vector the action was drawn from.
        """
        if np.random.rand() < self.epsilon:
            # Exploration: choose a random action
            action = np.random.choice(self.action_shape)
            distribution = np.ones(self.action_shape) / self.action_shape
        else:
            # Exploitation: sample from the policy.  Renormalise in float64
            # so float32 rounding in the softmax output cannot make
            # ``np.random.choice`` reject the probabilities.
            distribution = self._predict_probs(state).astype(np.float64)
            distribution /= np.sum(distribution)
            action = np.random.choice(self.action_shape, 1, p=distribution)[0]

        return action, distribution

    def _discounted_returns(self, rewards):
        """Return the un-normalised discounted return for every step.

        Works along the first axis, so both ``(N,)`` and ``(N, 1)`` inputs
        keep their shape.
        """
        rewards = np.asarray(rewards, dtype=float)
        returns = np.zeros_like(rewards)
        running = np.zeros(rewards.shape[1:])
        for step in range(len(rewards) - 1, -1, -1):
            running = running * self.gamma + rewards[step]
            returns[step] = running
        return returns

    @staticmethod
    def _normalize(values):
        """Shift to zero mean and scale to (approximately) unit variance."""
        return (values - np.mean(values)) / (np.std(values) + 1e-7)

    def get_discounted_rewards(self, rewards):
        """Return normalised discounted returns for one reward sequence.

        Args:
            rewards: Rewards in chronological order.

        Returns:
            Array with the same shape as ``rewards``; empty input yields an
            empty array.
        """
        returns = self._discounted_returns(rewards)
        if returns.size == 0:
            return returns
        return self._normalize(returns)

    def train_policy_network(self):
        """Fit the policy on the buffered transitions and clear the buffers.

        Discounted returns are computed separately for each buffered episode
        and normalised over the whole batch.

        Returns:
            The value returned by ``model.train_on_batch``, or ``None`` when
            the buffers are empty.
        """
        if not self.states:
            return None

        states = np.vstack(self.states)
        gradients = np.vstack(self.gradients)
        probs = np.vstack(self.probs)
        rewards = np.vstack(self.rewards).astype(float)

        # Discount each episode on its own; transitions after the last
        # recorded boundary are treated as one final episode.
        returns = np.zeros_like(rewards)
        start = 0
        for end in list(self._episode_ends) + [len(rewards)]:
            if end <= start:
                continue
            returns[start:end] = self._discounted_returns(rewards[start:end])
            start = end

        gradients = gradients * self._normalize(returns)
        y_train = self.alpha * gradients + probs

        history = self.model.train_on_batch(states, y_train)
        self._reset_memory()
        return history

    @staticmethod
    def _rewards_have_plateaued(reward_history, window=10, tolerance=200):
        """Return True when the last ``window`` rewards are positive and stable.

        Stable means every consecutive pair differs by less than
        ``tolerance``.
        """
        if len(reward_history) < window:
            return False
        recent = reward_history[-window:]
        if not all(reward > 0 for reward in recent):
            return False
        return all(abs(b - a) < tolerance for a, b in zip(recent, recent[1:]))

    def train(self, episodes, update_every=2):
        """Run training episodes, updating the policy every few episodes.

        Args:
            episodes: Maximum number of episodes to run.
            update_every: Number of completed episodes collected before each
                network update.

        Returns:
            List with the total reward of every episode that was run.

        Raises:
            ValueError: If ``update_every`` is smaller than 1.
        """
        if update_every < 1:
            raise ValueError("update_every must be at least 1")

        env = self.env
        total_rewards = np.zeros(episodes)
        reward_history = []

        for episode in range(episodes):
            state = env.reset()
            done = False
            episode_reward = 0

            while not done:
                action, prob = self.compute_action(state)
                next_state, reward, done = self._step(action)
                self.remember(state, action, prob, reward)
                state = next_state
                episode_reward += reward
                env.render(delay=0, framerate=360)

            self._end_episode()

            # Batch update only once whole episodes have been collected;
            # updating on partial episodes gives meaningless returns.
            if (episode + 1) % update_every == 0:
                self.train_policy_network()

            total_rewards[episode] = episode_reward
            reward_history.append(episode_reward)

            # Epsilon decay
            self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)

            print(f'Episode: {episode}, Reward: {episode_reward}, Epsilon: {self.epsilon}')

            # Early stopping based on recent reward history
            if self._rewards_have_plateaued(reward_history):
                print('The difference between each of the last 10 positive rewards is less than 200, stopping training')
                break

        # Use up episodes left over from an incomplete batch or early stop
        # so stale transitions do not leak into a later training run.
        if self.states:
            self.train_policy_network()

        # Entries for episodes skipped by early stopping remain 0.
        self.total_rewards = total_rewards
        return reward_history

    def test(self, episodes, render=False):
        """Run greedy evaluation episodes without training.

        Args:
            episodes: Number of episodes to run.
            render: When True, call ``env.render()`` after every step.

        Returns:
            NumPy array with the total reward of each episode.
        """
        env = self.env
        total_rewards = np.zeros(episodes)

        for episode in range(episodes):
            state = env.reset()
            done = False
            episode_reward = 0

            while not done:
                action = self.compute_action_for_testing(state)
                next_state, reward, done = self._step(action)
                state = next_state
                episode_reward += reward

                if render:
                    env.render()

            total_rewards[episode] = episode_reward
            print(f'Episode: {episode}, Reward: {episode_reward}')

        average_reward = np.mean(total_rewards)
        print(f'Average Reward: {average_reward}')
        return total_rewards

    def compute_action_for_testing(self, state):
        """Return the action with the highest predicted probability."""
        return np.argmax(self._predict_probs(state))

# In [None]:
# Train a REINFORCE agent on the RC maze environment.
env = RCMazeEnv()

# Hyper-parameters passed to the agent.
alpha = 0.0001
gamma = 0.99
learning_rate = 0.001
EPSILON = 0.99
epsilon_decay = 0.93

N_EPISODES = 20

env.init_pygame()

try:
    state = env.reset()  # the initial state is only needed for its shape
    Agent = REINFORCE(
        env,
        input_shape=state.shape,
        epsilon=EPSILON,
        epsilon_decay=epsilon_decay,
        alpha=alpha,
        gamma=gamma,
        learning_rate=learning_rate,
    )

    rewards = Agent.train(N_EPISODES)
finally:
    # Always shut down the pygame window and the environment, even when
    # training is interrupted or raises.
    env.close_pygame()
    env.close()

# In [None]:
# Run one rendered episode with a previously saved policy.
env = RCMazeEnv()
state = env.reset()

env.init_pygame()

from keras.models import load_model
# epsilon=0.0: REINFORCE defaults to epsilon=0.99, which would make
# compute_action pick a uniformly random action almost every step instead
# of sampling from the loaded policy.
Agent = REINFORCE(env, input_shape=state.shape, epsilon=0.0)
Agent.model = load_model('./models_old/REINFORCE_RCmaze.h5')

done = False

rewards = []

try:
    while not done:
        env.render(delay=100, framerate=360)

        action, prob = Agent.compute_action(state)
        state, reward, done = env.step(action)
        rewards.append(reward)

        env.render()

    print('done in ', len(rewards), 'steps')
    print(sum(rewards))
finally:
    # Release the environment and the pygame window even if the episode
    # is interrupted.
    env.close()
    env.close_pygame()
