# Genetic algorithm for Pacman

In [None]:
from IPython.display import clear_output

In [None]:
# Install the Atari emulator bindings and Gym into the notebook kernel's
# environment (IPython shell escapes; this cell only runs inside a notebook).
!pip install atari_py
!pip install gym
# Hand the ROM files located under /work/ROMS/ to atari_py's ROM importer.
!python -m atari_py.import_roms /work/ROMS/
# Drop the installer output from the cell's output area.
clear_output()

In [None]:
# dependencies
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
%matplotlib inline

import numpy as np
import time

from tensorflow.keras.layers import Dense, Concatenate
from tensorflow.keras import Input, Model

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model

from tensorflow.keras.layers import Dense, Concatenate
from tensorflow.keras import Input, Model

# Import openai Gym 
import gym
import atari_py

In [None]:
# Smoke-test the environment: create it, report its spaces and save one frame.
env = gym.make('MsPacman-ram-v0')

print('Observation-space:', env.observation_space)
print('Available actions', env.unwrapped.get_action_meanings())


def render():
    """Show the environment's current RGB frame with matplotlib."""
    return plt.imshow(env.render(mode='rgb_array'))


# Default figure size for this and any later figure that does not set its own.
plt.rcParams["figure.figsize"] = (7, 10)
render()
plt.savefig('ms-pacman.pdf', bbox_inches='tight')

In [None]:
# Load the saved Keras model from disk.  predict_rnn() below calls it with a
# three-element input list and unpacks three outputs from its prediction.

rnn = load_model('rnn_predictor_9_actions.hdf5')

def predict_rnn(state, action, hidden, cell_values):
    """Run the module-level ``rnn`` for one step and return its new state.

    Args:
        state: 1-D observation vector for the current step.
        action: Scalar action id; appended to ``state`` as one extra feature.
        hidden: Hidden-state vector fed to the model.
        cell_values: Cell-state vector fed to the model.

    Returns:
        Tuple with the first batch row of the model's second and third
        outputs, i.e. the updated hidden and cell vectors.
    """
    features = np.concatenate([state, [action]])

    # Each input gets a leading batch axis of size one; the feature vector
    # additionally gets a sequence axis of length one.
    sequence_batch = np.array([[features]])
    hidden_batch = np.array([hidden])
    cell_batch = np.array([cell_values])

    # The first model output is not needed here; only the state is returned.
    _, next_hidden, next_cell = rnn.predict(
        [sequence_batch, hidden_batch, cell_batch])

    return next_hidden[0], next_cell[0]

# Call the predictor

def predict_controller(controller, hidden, ram):
    """Return the index of the controller's highest-scoring output.

    Args:
        controller: Object exposing ``predict(batch)``; called with a batch
            holding a single row.
        hidden: Vector placed first in the controller input.
        ram: Vector placed after ``hidden`` in the controller input.

    Returns:
        Arg-max over the last axis of the prediction, taken for the first
        (only) batch row.
    """
    # Join both vectors and add a leading batch axis of size one.
    batch = np.concatenate([hidden, ram])[np.newaxis, ...]
    scores = controller.predict(batch)
    return np.argmax(scores, axis=-1)[0]

### Genetic Algorithm

In [None]:
# genetic algorithm

def choose_action(weights, combined):

    '''
    Choose an action:
        input - weights and the input vector they are applied to
        output - action id between 1 and 8

    The score sum(weights*combined)/100 is sorted into eight consecutive
    bands of width 2 that cover (42, 58]:
        (42, 44] -> 1, (44, 46] -> 2, ..., (54, 56] -> 7, (56, 58] -> 8
    Every band excludes its lower edge and includes its upper edge.
    Scores outside (42, 58] fall back to action 1.
    '''

    first_edge = 42       # lower edge of the first band
    band_width = 2
    n_bands = 8           # one band per selectable action
    fallback_action = 1   # used when the score lies outside all bands

    # action depends on sum of weights*inputs
    results = sum(weights*combined)/100

    # walk the bands from low to high and return the first one that matches
    lower = first_edge
    for action in range(1, n_bands + 1):
        upper = lower + band_width
        if lower < results <= upper:
            return action
        lower = upper

    return fallback_action

def evaluate_agent(weights):

    '''
    Evaluate the current weight configuration (the agent):
        input - weights
        output - average reward per episode, average chosen action,
                 list of all chosen actions

    Plays a fixed number of episodes on the module-level `env`.  In every
    step the taken action and the new observation are passed through
    predict_rnn(); the observation concatenated with the resulting hidden
    vector is scored by choose_action() to pick the next action.

    Every episode starts from a reset game, the same first action and a
    zeroed hidden / cell state, so no episode is influenced by the one
    played before it.
    '''

    runs = 5            # episodes played per evaluation
    first_action = 1    # action of the priming step and of the first loop step
    state_size = 256    # length of the hidden and cell vectors

    # collector
    total_reward = 0
    action_collector = []

    for _ in range(runs):

        # set the environment; the reward of this priming step is not counted
        env.reset()
        state, reward, done, info = env.step(first_action)

        action = first_action
        hidden = np.zeros(state_size)
        cell_values = np.zeros(state_size)

        # stop game when done-flag is raised
        while not done:

            state, reward, done, info = env.step(action)   # take action

            # advance the recurrent state with the action just taken
            hidden, cell_values = predict_rnn(state, action, hidden, cell_values)

            combined = np.concatenate([state, hidden])
            action = choose_action(weights, combined)      # choose next action

            total_reward += reward                         # accumulate rewards
            action_collector.append(action)                # track chosen actions

    # compute the average action and the average reward
    avg_reward = total_reward/runs
    if action_collector:
        avg_action = sum(action_collector)/len(action_collector)
    else:
        # every episode ended on its priming step, so nothing was chosen
        avg_action = 0.0

    # commmand line outputs
    print('Total reward', avg_reward)
    print('Avg. action', avg_action)

    return avg_reward, avg_action, action_collector


def genetic_agent(runs, n_weights=384, init_scale=0.76, sigma=0.8,
                  save_path='pacman-1000-weights-sigma0.8-sparse-RNN-action4-evaldepth5.npz'):

    '''
    Genetic Agent (main loop):
        input - amount of runs (mutation rounds); optionally the length of
                the weight vector, the scale of the random initial weights,
                the mutation scale and the file the final weights go to
        output - monitoring variables + save new weights

    Starts from random weights.  Every round evaluates one sparsely mutated
    copy of the current best weights with evaluate_agent(); the copy becomes
    the new best ("a new generation") only if its reward is strictly higher.

    Returns, in this order:
        highest_rewards        - best reward of every generation, initial
                                 weights first
        generation_distance    - rounds between consecutive generation changes
                                 (the first one is counted from round 0)
        weights_old            - best weights found
        action_collector       - average action of every round
        reward_collector       - reward of every round
        episode_of_change      - round index of every generation change
        best_generation_action - chosen actions of every generation, parallel
                                 to highest_rewards
        elapse_time_col        - duration of every round in seconds
    '''

    print('Total number of runs:', runs)

    # generation 1: random initial weights
    #data = np.load('img-GA/pacman-1000-weights-sigma1.2-sparse-action4-evaldepth5.npz')
    #weights = data['arr_0']
    weights_old = init_scale*np.random.random(n_weights)
    reward_old, _, initial_actions = evaluate_agent(weights_old)

    highest_rewards = [reward_old]
    best_generation_action = [initial_actions]
    episode_of_change = []
    generation_distance = []
    action_collector = []
    reward_collector = []
    elapse_time_col = []

    generation = 1
    old_generation = 0   # round index of the latest generation change
    print('Generation:', generation)

    for i in range(runs):

        t = time.time()

        print('')
        print('Round', i)

        # mutation: the sum of a positive and a negative uniform draw lies in
        # (-sigma, sigma)
        w_pos = sigma*np.random.random(n_weights)
        w_neg = -sigma*np.random.random(n_weights)
        w_update = w_pos + w_neg

        # sparse input: a random 0/1 mask leaves part of the weights untouched
        w_update = w_update*np.random.randint(2, size=n_weights)

        weights_new = weights_old + w_update

        reward_new, avg_action, actions = evaluate_agent(weights_new)

        action_collector.append(avg_action)
        reward_collector.append(reward_new)

        elapsed = time.time() - t
        print('Elapse time', round(elapsed, 3))
        elapse_time_col.append(elapsed)

        if i%100 == 0:
            print('weights old', weights_old)
            print('weight update', w_update)

        # check whether new weight configuration is better than the old one
        if reward_new > reward_old:

            weights_old = weights_new   # update weights
            reward_old = reward_new     # update best reward
            generation += 1             # increase generation

            # collect results for plotting
            highest_rewards.append(reward_new)
            generation_distance.append(i - old_generation)
            episode_of_change.append(i)
            best_generation_action.append(actions)

            # commmand line outputs
            print('')
            print('########## WEIGHT CHANGE ############')
            print('Generation:', generation)
            print('Rounds betw. generations:', i - old_generation)

            old_generation = i

    # save final weights
    np.savez(save_path, weights_old)

    return highest_rewards, generation_distance, weights_old, action_collector, reward_collector, episode_of_change, best_generation_action, elapse_time_col

In [None]:
# scores gathered from random weight draws
collect = []

# compute the average result value to calibrate the action score
# Each sample resets the game, takes one step with action 1 and scores the
# returned observation with freshly drawn random weights, using the same
# sum(...)/100 form that choose_action() applies.
for i in range(1000):
    env.reset()
    state, reward, done, info = env.step(1)
    w = 0.74*np.random.random(len(state))
    collect.append(sum(state*w)/100)
    # print(sum(state*weights)/100)

# mean score over all samples
# NOTE(review): only `state` is scored here, while evaluate_agent() passes
# choose_action() the state concatenated with the RNN hidden vector - confirm
# that this calibration still matches the bands used there.
print(sum(collect)/len(collect))

### Running the Agent

In [None]:
# initialize the environment
# (rebinds the module-level `env` that evaluate_agent() plays on, resets it
# and takes a single step with action 0)
env = gym.make('MsPacman-ram-v0')
env.reset()
state, reward, done, info = env.step(0)

# run the genetic algorithm
# The eight returned collections feed the statistics and plotting cells below.
# NOTE(review): runs=100 here, while the file names written by this notebook
# contain "1000" - confirm which round count is intended.
highest_rewards, gen_dis, weights, actions, all_rewards, change_episode, all_actions, elapse_time  = genetic_agent(runs=100)

In [None]:
# Summary statistics of the per-round rewards.  The variance divides by the
# number of rewards (population form), not by N - 1.
n_rewards = len(all_rewards)
mean_reward = sum(all_rewards)/n_rewards

# sum of squared deviations from the mean, accumulated in list order
counter = 0
for r in all_rewards:
    counter += (r - mean_reward) ** 2

variance = counter / n_rewards
std = variance ** 0.5

print(all_rewards)

for label, value in (('mean', mean_reward), ('variance', variance), ('std', std)):
    print(label, value)

### Plotting Results

In [None]:
# Violin plot of the chosen actions, one violin per entry of `all_actions`.

import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 15})

# One sequence of chosen actions per violin
data_to_plot = list(all_actions)

# Create the figure once, with the explicitly requested size.  Creating a
# second, default-sized figure afterwards would leave this one empty and draw
# the plot at a different size.
fig = plt.figure(figsize=(12, 8))

# Create an axes instance spanning the whole figure
ax = fig.add_axes([0,0,1,1])

# Create the violin plot
bp = ax.violinplot(data_to_plot, showmeans = True, showextrema = True, showmedians = True)

# Colour the bar / min / max / mean / median lines
for partname in ('cbars','cmins','cmaxes','cmeans','cmedians'):
    vp = bp[partname]
    vp.set_edgecolor('slategray')
    vp.set_linewidth(1)

# Fill the violin bodies, fully opaque
for pc in bp['bodies']:
    pc.set_facecolor('slategray')
    pc.set_edgecolor('slategray')
    pc.set_alpha(1)

plt.grid()
plt.xlabel('Best Generation')
plt.ylabel('Probability per action')
ax.xaxis.set_ticks(np.arange(0, len(data_to_plot) +1, 1))
plt.title("Action distribution")
plt.tight_layout()
plt.savefig('img-GA/pacman-action-distr-1000-sigma0.8-RNN-sparse-action4-evaldepth5.pdf', bbox_inches='tight')
plt.show()

In [None]:
# plotting

# Figure: best reward reached by each generation.
import matplotlib.pyplot as plt

# larger font for this figure and every later one that does not change it
plt.rcParams.update({'font.size': 22})

plt.figure(figsize=(12, 8))
plt.plot(highest_rewards, color='slategray', marker='o', markersize=12)
plt.title("Best reward obtained by each Generation")
plt.xlabel('Generation')
plt.ylabel('Obtained reward')
# integer ticks from 0 up to len(gen_dis)
plt.xticks(range(len(gen_dis) + 1))
plt.grid()
plt.savefig('img-GA/pacman-reward-1000-sigma0.8-RNN-sparse-action4-evaldepth5.pdf')
plt.show()

# Figure: number of mutation rounds between consecutive generations.
generation_ids = range(1, len(gen_dis) + 1)   # bars are numbered from 1

plt.figure(figsize=(12, 8))
plt.bar(generation_ids, gen_dis, color='slategray')
plt.title("Distance between Generations")
plt.xlabel('Generation')
plt.ylabel('Mutations between Generations')
plt.xticks(generation_ids)
plt.grid()
# NOTE(review): the file name spells "simga" where the other outputs use
# "sigma"; left unchanged in case something refers to this exact path.
plt.savefig('img-GA/pacman-distance-1000-simga0.8-RNN-sparse-action4-evaldepth5.pdf')
plt.show()


# Plot all rewards: rolling mean with a +/- one rolling-std band, plus a
# vertical line at every round in `change_episode`.

# change red and gray colour?
window=20                   # rolling-window length
xcoords = change_episode    # x positions of the vertical marker lines

plt.figure(figsize=(12,8))
rewards_series = pd.Series(all_rewards)
rolling_mean = rewards_series.rolling(window).mean()
std = rewards_series.rolling(window).std()
plt.plot(rolling_mean)
plt.fill_between(range(len(all_rewards)),rolling_mean-std, rolling_mean+std, color='orange', alpha=0.4)
# plt.axvline(x=30, color='slategray', linestyle='-')

# iterate the positions directly so that axvline receives a scalar x
for xc in xcoords:
    plt.axvline(x=xc, label='line at x = {}'.format(xc), c='slategray')

plt.title('Reward Moving Average ({}-episode window)'.format(window))
# a bare plt.grid() toggles the grid, so it is called exactly once
plt.grid()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.savefig('img-GA/pacman-reward-over-1000-sigma0.8-RNN-sparse-action4-evaldepth5.pdf')
plt.show()

# Plot all action: rolling mean of the per-round average action with a
# +/- one rolling-std band, plus a vertical line at every position in
# `xcoords`.

plt.figure(figsize=(12,8))
actions_series = pd.Series(actions)
rolling_mean = actions_series.rolling(window).mean()
std = actions_series.rolling(window).std()
plt.plot(rolling_mean)
plt.fill_between(range(len(actions)),rolling_mean-std, rolling_mean+std, color='orange', alpha=0.4)

# iterate the positions directly so that axvline receives a scalar x
for xc in xcoords:
    plt.axvline(x=xc, label='line at x = {}'.format(xc), c='slategray')

plt.title('Actions Moving Average ({}-episode window)'.format(window))
plt.grid()
plt.xlabel('Episode')
plt.ylabel('Action')
# save only after the axis labels are set so the file contains them too
plt.savefig('img-GA/pacman-actions-over-1000-sigma0.8-RNN-sparse-action4-evaldepth5.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Figure: duration of every round, with the summed duration in the title.
total = sum(elapse_time)
title_text = "Elapse time (total " + str(round(total, 2)) + " sec.)"

plt.figure(figsize=(12, 8))
plt.plot(elapse_time, color='slategray')
plt.title(title_text)
plt.xlabel('Episode')
plt.ylabel('Time')
plt.grid()
plt.savefig('img-GA/pacman-elapse-time-1000-sigma0.8-RNN-sparse-action4-evaldepth5.pdf', bbox_inches='tight')
plt.show()