# Setting up Environment

In [None]:
!rm -rf ML_main_project

In [None]:
#Getting the task files and gym from git
!git clone https://github.com/ganeshalamuru/ML_main_project.git ML_main_project

In [None]:
import gym
from gym import spaces, logger
from gym.utils import seeding
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
from random import randint
import glob
import io
import base64
import keras
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as opt
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from collections import Counter
from collections import defaultdict
from collections import deque
from statistics import median, mean

In [None]:
# Create a CartPole-v1 environment, reset it, and print its observation and
# action spaces. Nothing is rendered here; the environment is closed right away.
env = gym.make("CartPole-v1")
env.reset()
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
env.close()

# Naive Q Learning

In [None]:
# Episode-length cap and number of test episodes.
# NOTE(review): the CartPole class in the next cell takes its own
# `num_test_episodes` constructor argument and does not read these globals —
# confirm whether they are still needed.
max_episode_length = 500
num_test_episodes = 100

In [None]:
# Naive Q Learning
# Q-learning requires a discrete observation space.
# The observation space of CartPole is continuous and must first be discretized.
class CartPole():
    """Tabular Q-learning agent for gym's ``CartPole-v1``.

    The continuous observation is mapped onto a small grid (``buckets``) so the
    action values can be stored in a NumPy table indexed by
    ``(bucket_0, bucket_1, bucket_2, bucket_3, action)``.

    Args:
        buckets: Number of bins per observation component.
        n_episodes: Number of training episodes run by ``train``.
        num_test_episodes: Number of greedy episodes run by ``evaluate``.
        min_alpha: Lower bound of the decaying learning rate.
        min_epsilon: Lower bound of the decaying exploration rate.
        gamma: Discount factor.
        adaptive_rate_divisor: Divisor inside the logarithmic decay schedule
            shared by the learning and exploration rates.
        monitor: When true, wrap the environment in ``gym.wrappers.Monitor``
            writing to ``tmp/cartpole-1``.
    """

    def __init__(self, buckets=(1, 1, 6, 12,), n_episodes=100, num_test_episodes = 100, min_alpha=0.1, min_epsilon=0.1, gamma=0.9, adaptive_rate_divisor=25, monitor=False):
        self.buckets = buckets # down-scaling feature space to discrete range
        self.n_episodes = n_episodes # training episodes
        self.num_test_episodes = num_test_episodes # test episodes
        self.min_alpha = min_alpha # lower bound of the learning rate
        self.min_epsilon = min_epsilon # lower bound of the exploration rate
        self.gamma = gamma # discount factor
        self.adaptive_rate_divisor = adaptive_rate_divisor # divisor used by the adaptive learning/exploration rates

        self.env = gym.make('CartPole-v1')
        if monitor: self.env = gym.wrappers.Monitor(self.env, 'tmp/cartpole-1', force=True) # record results for upload

        # Q-table: one axis per discretized observation component plus one for the action
        self.Q = np.zeros(self.buckets + (self.env.action_space.n,))

    def discretize(self, obs):
        """Map a continuous observation to a tuple of bucket indices.

        Components 0 and 2 use the bounds of the observation space; components
        1 and 3 use the hand-picked bounds +/-0.5 and +/-50 degrees instead.
        Values outside the bounds are clipped to the first/last bucket.
        """
        space = self.env.observation_space
        upper_bounds = [space.high[0], 0.5, space.high[2], math.radians(50)]
        lower_bounds = [space.low[0], -0.5, space.low[2], -math.radians(50)]
        new_obs = []
        for value, low, high, n_buckets in zip(obs, lower_bounds, upper_bounds, self.buckets):
            ratio = (value + abs(low)) / (high - low)
            index = int(round((n_buckets - 1) * ratio))
            new_obs.append(min(n_buckets - 1, max(0, index)))
        return tuple(new_obs)

    def choose_action(self, state, epsilon):
        """Epsilon-greedy choice: random action with probability ``epsilon``, else the best known one."""
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.Q[state])

    def update_q(self, state_old, action, reward, state_new, alpha, done=False):
        """Apply one Q-learning update to ``Q[state_old][action]``.

        Args:
            state_old: Discretized state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            state_new: Discretized successor state.
            alpha: Learning rate for this update.
            done: True when ``state_new`` is terminal. A terminal state has no
                future return, so its value is not bootstrapped. Defaults to
                False, which reproduces the plain update.
        """
        future = 0.0 if done else self.gamma * np.max(self.Q[state_new])
        self.Q[state_old][action] += alpha * (reward + future - self.Q[state_old][action])

    def _decayed_rate(self, t, floor):
        """Logarithmic decay shared by alpha and epsilon, clamped to ``[floor, 1.0]``."""
        return max(floor, min(1.0, 1.0 - math.log10((t + 1) / self.adaptive_rate_divisor)))

    def get_epsilon(self, t):
        """Exploration rate for episode number ``t``."""
        return self._decayed_rate(t, self.min_epsilon)

    def get_alpha(self, t):
        """Learning rate for episode number ``t``."""
        return self._decayed_rate(t, self.min_alpha)

    def _run_episode(self, epsilon, alpha=None):
        """Play one episode and return its length in steps.

        The Q-table is updated after every step unless ``alpha`` is None.
        """
        # As states are continuous, discretize them into buckets
        state = self.discretize(self.env.reset())
        done = False
        steps = 0
        while not done:
            action = self.choose_action(state, epsilon)
            obs, reward, done, info = self.env.step(action)
            next_state = self.discretize(obs)
            if alpha is not None:
                # Hitting the time limit ends the episode but is not a failure,
                # so only a genuine termination stops bootstrapping.
                terminal = done and not info.get('TimeLimit.truncated', False)
                self.update_q(state, action, reward, next_state, alpha, terminal)
            state = next_state
            steps += 1
        return steps

    @staticmethod
    def _report_scores(scores, xlabel):
        """Print the mean episode length and plot the per-episode scores."""
        if not scores:
            print('No episodes were run.')
            return
        print('Average Score:',sum(scores)/len(scores))
        plt.plot(scores)
        plt.ylabel('Scores')
        plt.xlabel(xlabel)
        plt.show()

    def train(self):
        """Run ``n_episodes`` epsilon-greedy episodes, learning after every step."""
        scores = []
        for e in range(self.n_episodes):
            # Learning and exploration rates decay with the episode number
            scores.append(self._run_episode(self.get_epsilon(e), self.get_alpha(e)))
        self._report_scores(scores, 'Training Episodes')

    def evaluate(self):
        """Run ``num_test_episodes`` greedy episodes on a fresh environment without learning."""
        # Release the previous environment (and any Monitor wrapper) before replacing it
        self.env.close()
        self.env = gym.make('CartPole-v1')
        scores = [self._run_episode(0) for _ in range(self.num_test_episodes)]
        self._report_scores(scores, 'Test Episodes')


In [None]:
# Build the tabular Q-learning agent and train it for 100 episodes.
Q_Learning_Model = CartPole(n_episodes=100)
Q_Learning_Model.train()

In [None]:
# Run the trained agent greedily (epsilon = 0) and plot the episode lengths.
Q_Learning_Model.evaluate()

In [None]:
# Persist the whole agent object with the highest pickle protocol.
# NOTE(review): the object also holds its gym environment in `env`;
# confirm that the environment pickles cleanly.
with open('Q-Learning-Model-Best.pkl', 'wb') as output:
    pickle.dump(Q_Learning_Model, output, pickle.HIGHEST_PROTOCOL)


# Deep Q Learning

In [None]:
# Deep Q Learning
# Hyperparameters
# Number of transitions drawn from the replay buffer per gradient step.
BATCH_SIZE = 64
# Learning rate passed to the Adam optimizer.
LR = 0.01
# Exploration rate every training episode starts from.
EPSILON = 0.9
# Multiplicative decay applied to the exploration rate after every action.
DECAY = 0.995
# Discount factor used when forming the Q-learning target.
GAMMA = 0.95
# Number of parameter updates between target-network synchronisations.
TARGET_UPDATE_INTERVAL = 100
# Maximum number of stored transitions; learning also only starts once more
# than this many transitions have been collected.
REPLAY_BUFFER_CAPACITY = 2000
# Module-level environment used by the DQN agent's training loop.
env = gym.make("CartPole-v1")
# Length of an observation vector and number of discrete actions.
STATE_DIM = env.observation_space.shape[0]
ACTION_DIM = env.action_space.n
# Number of episodes played during training and during evaluation.
N_TRAIN_EPISODES = 1000
N_TEST_EPISODES = 100

def init_weights(m):
    """Re-draw the weights of an ``nn.Linear`` from a normal distribution.

    Weights are sampled with mean 0.0 and standard deviation 0.1. The bias is
    left as it is, and any module that is not an ``nn.Linear`` is ignored.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.normal_(m.weight, mean=0.0, std=0.1)

# Store the experiences in the replay buffer
class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Once ``size`` items are stored, pushing another one silently discards the
    oldest. Each stored item is expected to be a 5-tuple
    ``(state, action, reward, next_state, done)``.
    """

    def __init__(self, size):
        self.size = size
        self.memory = deque([], maxlen=size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def push(self, x):
        """Append one transition, evicting the oldest when the buffer is full."""
        self.memory.append(x)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            Five NumPy arrays ``(state, action, reward, next_state, done)``,
            each stacking the corresponding field of the sampled transitions.

        Raises:
            ValueError: If fewer than ``batch_size`` transitions are stored.
        """
        if batch_size > len(self.memory):
            raise ValueError(
                "cannot sample {} transitions from a buffer holding {}".format(
                    batch_size, len(self.memory)
                )
            )
        batch = random.sample(self.memory, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def get_len(self):
        """Return the number of stored transitions (same as ``len(buffer)``)."""
        return len(self)


# Neural Network Definition
class DQN(nn.Module):
    """Fully connected Q-network: STATE_DIM inputs, 50 hidden ReLU units, ACTION_DIM outputs."""

    def __init__(self):
        super().__init__()

        self.fc1 = nn.Linear(STATE_DIM, 50)
        self.fc2 = nn.Linear(50, ACTION_DIM)

        # Run init_weights over this module and every sub-module.
        self.apply(init_weights)

    def forward(self, x):
        """Forward-propagate ``x`` and return the output of the last linear layer."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# Agent who plays the game
class Agent(object):
    """DQN agent for CartPole with experience replay and a target network.

    Training uses the module-level ``env`` and the module-level hyperparameter
    constants. ``evaluate`` creates and closes its own environment.
    """

    def __init__(self):
        self.dqn, self.target_dqn = DQN(), DQN()

        self.learn_step_counter = 0
        self.memory_counter = 0
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_CAPACITY)
        self.optimizer = opt.Adam(self.dqn.parameters(), lr=LR)
        self.loss_fn = nn.MSELoss()

    def choose_action(self, s,epsilon):
        """Epsilon-greedy action for state ``s``.

        A random action (sampled from the module-level ``env``) is taken with
        probability ``epsilon``; otherwise the online network's best action.
        """
        if np.random.uniform() > epsilon:
            s = torch.unsqueeze(torch.FloatTensor(s), 0)
            # Acting needs no gradients, so skip building the autograd graph.
            with torch.no_grad():
                qs = self.dqn(s)
            return torch.max(qs, 1)[1].item()
        return env.action_space.sample()

    def update_params(self):
        """Sample one batch from the replay buffer and take one gradient step."""
        # Synchronise the target network every TARGET_UPDATE_INTERVAL updates
        if self.learn_step_counter % TARGET_UPDATE_INTERVAL == 0:
            self.target_dqn.load_state_dict(self.dqn.state_dict())
        self.learn_step_counter += 1

        # sample batch of transitions
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(
            BATCH_SIZE
        )

        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions.astype(int).reshape((-1, 1)))
        rewards = torch.FloatTensor(rewards).unsqueeze(1)
        next_states = torch.FloatTensor(next_states)
        dones = torch.FloatTensor(np.float32(dones)).unsqueeze(1)

        # Q(s, a) of the online network for the actions actually taken
        q_current = self.dqn(states).gather(1, actions)
        with torch.no_grad():
            q_next = self.target_dqn(next_states).max(1)[0].unsqueeze(1)
        # A terminal transition has no future return: mask out the bootstrap term
        q_target = rewards + GAMMA * (1.0 - dones) * q_next
        q_loss = self.loss_fn(q_current, q_target)

        # backpropagate
        self.optimizer.zero_grad()
        q_loss.backward()
        self.optimizer.step()

    @staticmethod
    def _report_scores(scores, xlabel):
        """Print the mean episode length and plot the per-episode scores."""
        print('Average Score:',sum(scores)/len(scores))
        plt.plot(scores)
        plt.ylabel('Scores')
        plt.xlabel(xlabel)
        plt.show()

    def train(self):
        """Play N_TRAIN_EPISODES episodes, learning from a shaped reward.

        Learning starts once more than REPLAY_BUFFER_CAPACITY transitions have
        been collected. The environment's own reward is only used for logging.
        """
        scores = []
        for i in range(N_TRAIN_EPISODES):
            state = env.reset()
            episode_reward = 0
            step = 0
            # NOTE(review): epsilon restarts from EPSILON in every episode and
            # decays per step within it — confirm this is intended rather than
            # a decay that carries over across episodes.
            self.epsilon = EPSILON
            while True:
                # env.render()
                action = self.choose_action(state,self.epsilon)
                self.epsilon = max(0.1, self.epsilon * DECAY)

                # take action
                next_state, reward_orig, done, info = env.step(action)
                step += 1

                # Shaped reward: higher the closer the cart is to the centre
                # and the pole is to upright
                x, x_dot, theta, theta_dot = next_state
                r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
                r2 = (
                    env.theta_threshold_radians - abs(theta)
                ) / env.theta_threshold_radians - 0.5
                reward = r1 + r2

                # Hitting the time limit ends the episode but is not a failure,
                # so it is not stored as a terminal transition.
                terminal = done and not info.get('TimeLimit.truncated', False)
                self.replay_buffer.push((state, action, reward, next_state, terminal))
                self.memory_counter += 1

                episode_reward += reward_orig

                if self.memory_counter > REPLAY_BUFFER_CAPACITY:
                    self.update_params()

                if done:
                    print(
                        "Episode: {}, Reward: {}, step: {}".format(
                            i, round(episode_reward, 2), step
                        )
                    )
                    break

                state = next_state
            scores.append(step)
        self._report_scores(scores, 'Train Episodes')

    def evaluate(self):
        """Play N_TEST_EPISODES greedy episodes on a fresh environment and plot their lengths."""
        env = gym.make('CartPole-v1')
        scores = []
        try:
            for i in range(N_TEST_EPISODES):
                state = env.reset()
                step = 0

                while True:
                    # env.render()
                    action = self.choose_action(state,0)
                    state, reward_orig, done, _ = env.step(action)
                    step += 1
                    if done:
                        break

                scores.append(step)
        finally:
            # Release the evaluation environment even if an episode fails
            env.close()
        self._report_scores(scores, 'Test Episodes')




In [None]:
# Build the DQN agent and train it for N_TRAIN_EPISODES episodes.
Deep_Q_Learning_Model = Agent()
Deep_Q_Learning_Model.train()

In [None]:
# Run the trained DQN agent greedily (epsilon = 0) and plot the episode lengths.
Deep_Q_Learning_Model.evaluate()

In [None]:
# Persist the whole agent object (networks, optimizer and replay buffer are
# attributes of it) with the highest pickle protocol.
with open('Deep-Q-Learning-Model-Best-Extra.pkl', 'wb') as output:
    pickle.dump(Deep_Q_Learning_Model, output, pickle.HIGHEST_PROTOCOL)

# PPO

In [None]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Memory:
    """Rollout storage: five parallel lists that grow by one entry per step."""

    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        """Empty every list in place, keeping the list objects themselves."""
        for buffer in (
            self.actions,
            self.states,
            self.logprobs,
            self.rewards,
            self.is_terminals,
        ):
            buffer.clear()

class ActorCritic(nn.Module):
    """Actor and critic as two separate tanh MLPs with two hidden layers each.

    ``action_layer`` ends in a softmax over ``action_dim`` outputs;
    ``value_layer`` ends in a single linear output.
    """

    def __init__(self, state_dim, action_dim, n_latent_var):
        super().__init__()

        def hidden_stack():
            # Two tanh hidden layers of width n_latent_var, built fresh per call
            return [
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
            ]

        # actor
        self.action_layer = nn.Sequential(
            *hidden_stack(),
            nn.Linear(n_latent_var, action_dim),
            nn.Softmax(dim=-1),
        )

        # critic
        self.value_layer = nn.Sequential(
            *hidden_stack(),
            nn.Linear(n_latent_var, 1),
        )

    def forward(self):
        """Not supported; use ``act`` or ``evaluate`` instead."""
        raise NotImplementedError

    def act(self, state, memory):
        """Sample an action for a NumPy ``state`` and record the step in ``memory``.

        The state tensor, the sampled action and its log-probability are
        appended to ``memory``; the action is returned as a Python number.
        """
        state = torch.from_numpy(state).float().to(device)
        dist = Categorical(self.action_layer(state))
        action = dist.sample()

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))

        return action.item()

    def evaluate(self, state, action):
        """Return ``(log-prob of action, squeezed state value, policy entropy)`` for the given tensors."""
        dist = Categorical(self.action_layer(state))
        state_value = self.value_layer(state)
        return dist.log_prob(action), torch.squeeze(state_value), dist.entropy()
        
class PPO:
    """Clipped-surrogate PPO trainer built around two ``ActorCritic`` copies.

    ``policy`` is optimised; ``policy_old`` is the copy used to collect
    rollouts and is re-synchronised at the end of every ``update``.

    Args:
        state_dim: Length of an observation vector.
        action_dim: Number of discrete actions.
        n_latent_var: Width of the hidden layers.
        lr: Adam learning rate.
        betas: Adam beta coefficients.
        gamma: Discount factor.
        K_epochs: Number of optimisation passes per update.
        eps_clip: Clipping range of the probability ratio.
    """

    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        """Optimise ``policy`` on the rollout in ``memory`` for ``K_epochs`` passes.

        Does nothing when the rollout is empty.
        """
        if not memory.rewards:
            return

        # Monte Carlo estimate of the discounted return of every step, built
        # back-to-front and reversed once at the end (linear time).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                # Do not let returns leak across episode boundaries
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the returns; float32 keeps them compatible with the
        # critic's output whatever numeric type the rewards arrived in.
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values under the current policy
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

    def evaluate(self,num_episodes=100):
        """Play ``num_episodes`` episodes with ``policy_old`` and plot the episode rewards.

        Actions are sampled from the policy, as during training.

        Returns:
            The mean episode reward.
        """
        env_name = "CartPole-v1"
        env = gym.make(env_name)
        memory = Memory()
        all_episode_rewards = []
        try:
            for _ in range(num_episodes):
                ep_reward = 0
                done = False
                state = env.reset()
                while not done:
                    # No learning happens here, so no autograd graph is needed
                    with torch.no_grad():
                        action = self.policy_old.act(state, memory)
                    state, reward, done, _ = env.step(action)
                    ep_reward+=reward
                # act() records every step; drop the records so they cannot pile up
                memory.clear_memory()
                all_episode_rewards.append(ep_reward)
        finally:
            # Release the evaluation environment even if an episode fails
            env.close()
        mean_episode_reward = np.mean(all_episode_rewards)
        print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
        plt.plot(all_episode_rewards)
        plt.ylabel('Scores')
        plt.xlabel('Test Episodes')
        plt.show()
        return mean_episode_reward

In [None]:
def main():
    """Train PPO on CartPole-v1, pickle the agent, and plot the averaged reward.

    The policy is updated every ``update_timestep`` environment steps. Every
    ``log_interval`` episodes the average episode length and reward over that
    interval are printed and recorded for the final plot.
    """
    ############## Hyperparameters ##############
    env_name = "CartPole-v1"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    render = False
    log_interval = 100          # print avg reward in the interval
    max_episodes = 3500        # max training episodes
    max_timesteps = 500         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 700      # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99               # discount factor
    K_epochs = 8                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    #############################################

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr,betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    x_axis = []
    y_axis = []
    try:
        # training loop
        for i_episode in range(1, max_episodes+1):
            state = env.reset()
            episode_steps = 0
            for _ in range(max_timesteps):
                timestep += 1
                episode_steps += 1

                # Running policy_old:
                action = ppo.policy_old.act(state, memory)
                state, reward, done, info = env.step(action)

                # Saving reward and is_terminal:
                memory.rewards.append(reward)
                memory.is_terminals.append(done)

                # update if its time
                if timestep % update_timestep == 0:
                    ppo.update(memory)
                    memory.clear_memory()
                    timestep = 0

                running_reward += reward
                if render:
                    env.render()
                if done:
                    break

            # Count the steps actually taken (the last loop index is one short)
            avg_length += episode_steps
            # logging
            if i_episode % log_interval == 0:
                avg_length = int(avg_length/log_interval)
                running_reward = int((running_reward/log_interval))
                x_axis.append(i_episode/log_interval)
                y_axis.append(running_reward)
                print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
                running_reward = 0
                avg_length = 0
    finally:
        # Release the environment even if training is interrupted
        env.close()
    with open('./PPO_CartPole-v1.pkl', 'wb') as output:
        pickle.dump(ppo, output, pickle.HIGHEST_PROTOCOL)
    # Print some stats
    plt.plot(x_axis,y_axis)
    plt.ylabel('Scores(averaged over 100)')
    plt.xlabel('Train Episodes(per 100)')
    plt.show()
if __name__ == '__main__':
    main()

# Genetic Algorithm

In [None]:
import subprocess

In [None]:
import os

# Directory of the installed gym package, i.e. the folder that holds its
# __init__ file. dirname() works for any path separator and file suffix,
# unlike stripping the literal '/__init__.py'.
gym_path = os.path.dirname(gym.__file__)
print(gym_path)

In [None]:
import os
import shutil

# The model obtained from task2.py seems to be the best (for all three tasks)
# Overwrite the cartpole.py inside the installed gym package with the project's
# task2.py. Unlike a shell `cp`, copyfile raises if the copy fails.
# NOTE(review): earlier cells have already created CartPole environments, so
# the replacement presumably only takes effect after the runtime is restarted
# or the module is reloaded — confirm.
shutil.copyfile(
    './ML_main_project/task2.py',
    os.path.join(gym_path, 'envs', 'classic_control', 'cartpole.py'),
)

In [None]:
# Environment used to read the problem dimensions for the genetic algorithm.
env = gym.make('CartPole-v1')

# Length of an observation vector and number of discrete actions.
ind = env.observation_space.shape[0]
act_dim = env.action_space.n 

In [None]:
# Training Hyperparameters
# Number of randomly initialised individuals in the first population.
test_runs = 10
# Number of evolve steps performed during training.
num_generations = 1500

# For evaluation
# Maximum number of steps played per episode; training also treats a
# generation whose average score equals this value as solved.
episode_length = 500
# NOTE(review): not read by the genetic-algorithm code visible in this
# notebook — confirm whether it is still needed.
num_test_episodes = 100

In [None]:
# Custom Neural Network
class NN():
    """Tiny feed-forward policy evaluated with plain NumPy.

    Constructing an instance immediately computes the action for ``obs`` and
    stores it in ``self.action``.

    Args:
        obs: Observation vector.
        in_w: Weights of the first layer.
        in_b: Bias of the first layer.
        hid_w: Weights of the second layer.
        out_w: Weights of the output layer.
    """
    act_space_n = act_dim

    def __init__(self,obs,in_w,in_b,hid_w,out_w):
        super().__init__()
        self.action = self.calculate_act(obs,in_w,in_b,hid_w,out_w)

    def calculate_act(self,obs,in_w,in_b,hid_w,out_w):
        """Run the forward pass and return the chosen discrete action."""
        # Scale the observation down to unit length when its norm exceeds 1
        obs = obs/max(np.linalg.norm(obs),1)
        in_layer = self.reLu(np.dot(obs,in_w)+in_b.T)
        hid_layer_1 = self.reLu(np.dot(in_layer,hid_w))
        hid_layer_2 = np.dot(hid_layer_1,out_w)
        output = self.reLu(hid_layer_2)
        output = self.softmax(output)
        # NOTE(review): argsort is ascending, so element [0][0] is the index of
        # the SMALLEST softmax output. Kept as is because evolved weights
        # depend on this convention — confirm it is intended.
        output = output.argsort().reshape(1,NN.act_space_n)
        return output[0][0] # discrete action

    def reLu(self,x):
        """Element-wise max(0, x)."""
        return np.maximum(0,x)

    def softmax(self,x):
        """Softmax over all elements of ``x``.

        The maximum is subtracted first: the result is mathematically the same,
        but large inputs no longer overflow ``exp`` and turn the output into NaN.
        """
        shifted = np.exp(x - np.max(x))
        return shifted/np.sum(shifted)

In [None]:
# HYPERPARAMETERS
# Width of the first layer (columns of in_w, length of in_b).
in_node_num = 4
# Width of the second layer (columns of hid_w).
hid_node_num = 2
# Exclusive upper bound on how many genes a single mutation may perturb.
mutation_power = 10
# Number of offspring produced per crossover call (not a generation count,
# despite the name).
crossover_generations = 10

class Agent():
    """Genetic-algorithm trainer for the ``NN`` policy on CartPole-v1.

    Every individual is four weight arrays ``(in_w, in_b, hid_w, out_w)``. A
    generation is stored as four parallel lists, one per array kind. Each
    evolve step keeps the two best individuals and adds
    ``crossover_generations`` mutated offspring of them.
    """
    act_space_n = act_dim

    def __init__(self):
        super().__init__()
        self.env = gym.make("CartPole-v1")
        # Weights picked by train(); stays None until a generation was scored
        self.best_gen = None

    def mutate(self,new_dna):
        """Perturb ``new_dna`` in place and return it.

        A random count ``j`` is drawn; only when ``0 < j < mutation_power`` are
        ``j`` randomly chosen genes increased by a uniform value from [0, 1).
        """
        j = np.random.randint(0,len(new_dna))
        if 0 < j < mutation_power: # controlling rate of mutation
            for _ in range(j):
                n = np.random.randint(0,len(new_dna)) # random position for mutation
                new_dna[n] = new_dna[n] + np.random.rand()
        return new_dna

    def crossover(self,Dna_list):
        """Return both parents followed by ``crossover_generations`` mutated children.

        Each child takes the first parent's genes up to a random cut point and
        the second parent's genes after it.
        """
        newDNA_list = [Dna_list[0], Dna_list[1]]
        for _ in range(crossover_generations):
            j = np.random.randint(0,len(Dna_list[0]))
            child = np.append(Dna_list[0][:j], Dna_list[1][j:])
            newDNA_list.append(self.mutate(child))
        return newDNA_list

    def intial_gen(self):
        """Create ``self.test_runs`` individuals with uniform random weights in [0, 1)."""
        in_w = []
        in_b = []
        hid_w = []
        out_w = []
        for _ in range(self.test_runs):
            in_w.append(np.random.rand(ind,in_node_num))
            in_b.append(np.random.rand((in_node_num)))
            hid_w.append(np.random.rand(in_node_num,hid_node_num))
            out_w.append(np.random.rand(hid_node_num, self.act_space_n))
        return [in_w, in_b, hid_w, out_w] # This is the generation

    def run_env(self,in_w,in_b,hid_w,out_w):
        """Play one episode of at most ``episode_length`` steps and return the summed reward."""
        obs = self.env.reset()
        award = 0
        for _ in range(episode_length):
            action = NN(obs,in_w,in_b,hid_w,out_w).action
            obs, reward, done, _info = self.env.step(action)
            award += reward
            if done:
                break
        return award

    def rand_run(self):
        """Score a freshly initialised population.

        Returns:
            ``[generation, award_set]`` where ``award_set`` is a NumPy array
            with one score per individual.
        """
        generations = self.intial_gen()
        awards = [
            self.run_env(generations[0][k], generations[1][k], generations[2][k], generations[3][k])
            for k in range(self.test_runs)
        ]
        return [generations, np.array(awards, dtype=float)]

    def evolve(self,award_set, generations):
        """Breed and score the next generation from the two best individuals.

        Returns:
            ``(new_generation, new_award_set)`` in the same layout ``rand_run``
            uses. The two parents come first, followed by the offspring.
        """
        good_award_idx = award_set.argsort()[-2:][::-1] # best 2 are selected

        # Flatten all weights of each parent into a single gene sequence
        DNA_list = [
            np.concatenate([np.ravel(generations[k][index]) for k in range(4)])
            for index in good_award_idx
        ]

        newDNA_list = self.crossover(DNA_list)

        # Gene offsets and column counts needed to rebuild the weight arrays
        in_w_size = generations[0][0].size
        in_b_size = generations[1][0].size
        hid_w_size = generations[2][0].size
        in_w_cols = generations[0][0].shape[1]
        hid_w_cols = generations[2][0].shape[1]
        out_w_cols = generations[3][0].shape[1]
        b_start = in_w_size
        hid_start = b_start + in_b_size
        out_start = hid_start + hid_w_size

        new_input_weight = []
        new_input_bias = []
        new_hidden_weight = []
        new_output_weight = []
        awards = []

        for newdna in newDNA_list: # collection of weights from dna info
            new_in_w = np.array(newdna[:b_start]).reshape(-1, in_w_cols)
            new_in_b = np.array(newdna[b_start:hid_start]).reshape(-1, 1) # bias as a column vector
            new_hid_w = np.array(newdna[hid_start:out_start]).reshape(-1, hid_w_cols)
            new_out_w = np.array(newdna[out_start:]).reshape(-1, out_w_cols)

            new_input_weight.append(new_in_w)
            new_input_bias.append(new_in_b)
            new_hidden_weight.append(new_hid_w)
            new_output_weight.append(new_out_w)

            awards.append(self.run_env(new_in_w, new_in_b, new_hid_w, new_out_w))

        new_generation = [new_input_weight,new_input_bias,new_hidden_weight,new_output_weight]
        return new_generation, np.array(awards, dtype=float)

    def train(self,test_runs,num_generations):
        """Evolve the population for ``num_generations`` steps and plot the average scores.

        ``self.best_gen`` is set to the first individual of the latest
        generation whose average score equals ``episode_length``. If no
        generation reaches that, it falls back to the highest-scoring
        individual seen, so ``evaluate`` always has weights to use once at
        least one generation has been scored.

        Args:
            test_runs: Size of the initial random population (at least 2).
            num_generations: Number of evolve steps.

        Raises:
            ValueError: If ``test_runs`` is smaller than 2.
        """
        if test_runs < 2:
            raise ValueError("test_runs must be at least 2 to select two parents")
        self.test_runs = test_runs
        self.num_generations = num_generations
        current_gens, current_award_set = self.rand_run()
        averages = []
        solved = False
        best_award = float('-inf')
        for n in range(num_generations):
            current_gens, current_award_set = self.evolve(current_award_set, current_gens)
            avg = np.average(current_award_set)
            if avg == episode_length:
                solved = True
                # Kept as a plain list: the four arrays have different shapes
                self.best_gen = [current_gens[k][0] for k in range(4)]
            elif not solved:
                top = int(np.argmax(current_award_set))
                if current_award_set[top] >= best_award:
                    best_award = current_award_set[top]
                    self.best_gen = [current_gens[k][top] for k in range(4)]

            print("generation: {}, average: {}".format(n+1, avg))
            averages.append(avg)

        plt.plot(averages)
        plt.xlabel('Generations')
        plt.ylabel('Avg Score')
        plt.grid()

        if averages:
            print('Average score:',mean(averages))
            print('Median score:',median(averages))
        return plt.show()

    def test_run_env(self,in_w,in_b,hid_w,out_w):
        """Play one episode like ``run_env``, then close the environment."""
        award = self.run_env(in_w,in_b,hid_w,out_w)
        self.env.close()
        return award

    def test_run_n_times(self,in_w, in_b, hid_w,out_w,n_times):
        """Play ``n_times`` episodes, plot the scores, and return their mean."""
        average = [self.test_run_env(in_w,in_b,hid_w,out_w) for _ in range(n_times)]

        plt.plot(average)
        plt.xlabel('Episodes')
        plt.ylabel('Score')
        plt.grid()
        return mean(average)

    def evaluate(self,n_times):
        """Score the weights chosen by ``train`` over ``n_times`` episodes on a fresh environment.

        Raises:
            AttributeError: If no weights are available yet.
        """
        param = getattr(self, 'best_gen', None)
        if param is None:
            raise AttributeError("best_gen is not set; call train() before evaluate()")
        # Release the previous environment before replacing it
        self.env.close()
        self.env = gym.make('CartPole-v1')
        return self.test_run_n_times(param[0], param[1], param[2], param[3], n_times)



In [None]:
# Training
# Note: depending on the random seed, the model can fail after a few
# generations. If that happens, rerun this cell.
GA_model = Agent()
GA_model.train(test_runs,num_generations)
# Persist the trained agent object with the highest pickle protocol.
with open('GA-Model-Best.pkl', 'wb') as output:
    pickle.dump(GA_model, output, pickle.HIGHEST_PROTOCOL)
