## **Grid World**

In [None]:
import numpy as np
import gym 
from gym import Env, spaces


"""
W = Win State 
L = Lose State
A = Agent
"""


# Largest configuration: board dimensions, then the two terminal cells,
# each given as a (row, column) index into the board.
Grid_Rows, Grid_Col = 50, 50
WIN_STATE, LOSE_STATE = (20, 20), (19, 19)
"""
# These grid settings are high-dimensional and difficult to render.
# Required Grid Settings
Grid_Rows= 16
Grid_Col = 16
WIN_STATE = (12, 13)
LOSE_STATE = (13, 14)


# Lowest grid settings
# Use this setting if you want to render the grid
Grid_Rows= 10
Grid_Col = 10
WIN_STATE = (8, 6)
LOSE_STATE = (9, 7)
"""


class grid_word:
    """Deterministic grid environment with one winning and one losing cell.

    States are ``(row, col)`` tuples on a ``Grid_Rows`` x ``Grid_Col`` board.
    Actions: 0 decrements the row, 1 increments the row, 2 decrements the
    column and any other value increments the column.  Entering
    ``WIN_STATE`` gives +100, entering ``LOSE_STATE`` gives -100, entering
    row 0 or column 0 gives -10 and every other move gives 0.  A move that
    would leave the board keeps the agent in place with reward 0.
    """

    def __init__(self):
        """Build the board, pick a random start cell and define the spaces."""
        # Numeric picture of the board: 0 = empty, 9 = win cell, 2 = lose cell.
        self.print_grid = np.zeros([Grid_Rows, Grid_Col])
        self.print_grid[WIN_STATE] = 9
        self.print_grid[LOSE_STATE] = 2
        self.state = (np.random.randint(0, Grid_Rows), np.random.randint(0, Grid_Col))
        self.done = False
        self.all_actions = [0, 1, 2, 3]

        self.action_space = spaces.Discrete(4)
        # Box bounds are inclusive, so the largest valid index is size - 1.
        self.observation_space = spaces.Box(
            low=np.array([0, 0]),
            high=np.array([Grid_Rows - 1, Grid_Col - 1]),
            dtype=np.int16,
        )
        self.reward = 0
        # Uniform action probabilities for every cell.
        self.P = {
            (i, j): [0.25, 0.25, 0.25, 0.25]
            for i in range(Grid_Rows)
            for j in range(Grid_Col)
        }

    def move(self, state, action):
        """Return ``(next_state, reward)`` for taking ``action`` in ``state``.

        Updates ``self.reward`` and ``self.done``; ``self.state`` is left
        untouched, so this can be used for planning sweeps.
        """
        row, col = state[0], state[1]
        if action == 0:
            row -= 1
        elif action == 1:
            row += 1
        elif action == 2:
            col -= 1
        else:
            col += 1

        if 0 <= row <= Grid_Rows - 1 and 0 <= col <= Grid_Col - 1:
            next_position = (row, col)
            return next_position, self.get_reward(next_position)

        # Blocked by the board edge: the agent stays put.  Clear the flags
        # explicitly, otherwise ``done``/``reward`` from an earlier call
        # would be reported for this transition.
        self.reward = 0
        self.done = False
        return state, 0

    def get_reward(self, state):
        """Return the reward for entering ``state`` and update ``self.done``."""
        self.done = False
        if state == WIN_STATE:
            self.reward = 100
            self.done = True
        elif state == LOSE_STATE:
            self.reward = -100
            self.done = True
        elif (state[0] == 0) and (state[1] <= (Grid_Col - 1)):
            # Top row penalty.
            self.reward = -10
        elif (state[0] <= (Grid_Rows - 1)) and (state[1] == 0):
            # Left column penalty; the row index is bounded by the row count.
            self.reward = -10
        else:
            self.reward = 0
        return self.reward

    def step(self, state, action):
        """Apply ``action`` from ``state`` and make the result the current state.

        Returns:
            ``(next_state, reward, done)``.
        """
        next_state, reward_ = self.move(state, action)
        self.state = next_state
        return next_state, reward_, self.done

    def render(self, next_state):
        """Print ``next_state`` together with the most recent reward."""
        print("---------------------------------")
        print("Current State is ", np.asarray(next_state))
        print("The reward is   ", self.reward)
        print("---------------------------------")

    def reset(self):
        """Place the agent on a uniformly random cell and return that cell.

        Also clears ``done`` and ``reward`` so the outcome of a previous
        episode cannot leak into the next one.
        """
        self.state = (np.random.randint(0, Grid_Rows), np.random.randint(0, Grid_Col))
        self.done = False
        self.reward = 0
        return self.state

    def render_grid(self):
        """Print the board as text: A = agent, W = win cell, L = lose cell."""
        print(f"Grid Size = {Grid_Rows} X {Grid_Col} :: Total States = {Grid_Col*Grid_Rows}")
        print("Agent = A : Win State = W : Lose State = L")

        tokens = {0: ' ', 2: 'L', 9: 'W'}
        # Each cell prints as "X | " after a leading "|", so the rule is sized
        # from the column count instead of being a fixed-length string.
        separator = "-" * (4 * Grid_Col + 1)
        agent = tuple(self.state)

        for i in range(Grid_Rows):
            print(separator)
            out = '| '
            for j in range(Grid_Col):
                if (i, j) == agent:
                    # The agent is drawn on top of whatever the cell holds.
                    token = 'A'
                else:
                    # Unknown cell codes render as blanks.
                    token = tokens.get(int(self.print_grid[i, j]), ' ')
                out += token + ' | '
            print(out)
        print(separator)
        

In [None]:

# Test Section

# Drive a fresh environment with 100 sampled actions and log each transition.
env = grid_word()
print(env.action_space.n)

state = env.reset()
for i in range(100):
    action = env.action_space.sample()
    n_state, reward, done = env.step(state, action)
    print(
        f"State = {state}",
        f"Action = {action}",
        f"Next State = {n_state}",
        f"Reward = {reward}",
        f"Done = {done} ",
        sep="; ",
    )
    state = n_state


## **Value Iteration**

In [None]:
import matplotlib.pyplot as plt
# Q-value store: exactly one slot per action (0-3) for every (row, col) cell.
# A surplus slot would stay at 0 and take part in max/argmax over the row.
Q_table = {
    (i, j): [0, 0, 0, 0]
    for i in range(Grid_Rows)
    for j in range(Grid_Col)
}

# Every cell of the board in row-major order.
all_states = [(i, j) for i in range(Grid_Rows) for j in range(Grid_Col)]

env = grid_word()

def get_value_table(env):
    """Run value iteration on ``env`` and return the resulting state values.

    Each sweep backs up every state in ``all_states`` from a snapshot of the
    previous sweep's values and stores the per-action results in ``Q_table``.

    Returns:
        ``(value_table, con_value, c_act)`` where ``con_value`` is the index
        of the sweep at which the total absolute change dropped to the
        threshold (0 if that never happened) and ``c_act`` is the last action
        evaluated in that sweep (0 if it never happened).
    """
    # Upper bound on the number of sweeps.
    num_iterations = 2000

    # Total absolute change at or below which the values count as converged.
    threshold = 1e-40

    # Discount factor.
    gamma = 0.1
    con_value = 0
    c_act = 0

    value_table = np.zeros((Grid_Rows, Grid_Col))

    for i in range(num_iterations):
        # Values from the previous sweep.
        updated_value_table = np.copy(value_table)

        for s in all_states:
            q_values = []
            for a in env.all_actions:
                # ``move`` looks up the transition without relocating the
                # agent, so planning does not overwrite ``env.state``.
                s_, reward = env.move(s, a)
                q = reward + gamma * updated_value_table[s_]
                Q_table[s][a] = q
                q_values.append(q)
            # Maximise over the actions just evaluated only, so any surplus
            # slot in ``Q_table[s]`` cannot act as a phantom action.
            value_table[s] = max(q_values)

        if np.sum(np.fabs(updated_value_table - value_table)) <= threshold:
            c_act = a
            con_value = i
            break

    return value_table, con_value, c_act

def get_policy(value_table, gamma=0.1):
    """Return the greedy policy for ``value_table``.

    Args:
        value_table: State values indexed by ``(row, col)``.
        gamma: Discount used in the one-step look-ahead.  It defaults to the
            discount ``get_value_table`` uses, because a greedy policy is
            only consistent with the values when both use the same discount.

    Returns:
        Array of shape ``(Grid_Rows, Grid_Col)`` holding the chosen action
        index for every cell.
    """
    policy = np.zeros((Grid_Rows, Grid_Col))
    for s in all_states:
        q_values = []
        for a in env.all_actions:
            # ``move`` looks up the transition without relocating the agent.
            s_, reward = env.move(s, a)
            q = reward + gamma * value_table[s_]
            Q_table[s][a] = q
            q_values.append(q)
        # Choose among the evaluated actions only, so the result is always a
        # valid action index even if ``Q_table[s]`` has surplus slots.
        policy[s] = np.argmax(q_values)

    return policy


# Repeat value iteration 20 times; for each run log the sweep index at which
# it stopped and the action value reported alongside it.
convergance_value, calculate_conv, con_answer = [], [], []
for rep in range(20):
    value_table, con_value, c_act = get_value_table(env)  # Based on value iteration
    optimal_policy = get_policy(value_table)  # Based on value table

    calculate_conv += [con_value]
    if rep % 1 != 0:
        continue

    # Averaging window of a single run.
    val = sum(calculate_conv) / 1
    convergance_value += [val]
    con_answer += [c_act]
    calculate_conv = []

print("Computed Value Table based on Value Iteration", value_table, sep="\n")
print("Optimal Policy Table based on Value Iteration", optimal_policy, sep="\n")

print(
    "Average value for convergance ",
    round(sum(convergance_value) / len(convergance_value)),
)

# One figure per logged series: (data, legend entry, title, y-axis label).
for series, legend_label, title, y_label in (
    (
        convergance_value,
        "Convergance Value",
        "Iteration taken to converge [Value Iteration]",
        "Convergance Point",
    ),
    (
        con_answer,
        "Convergance Answer",
        "Action Value at convergance [Value Iteration]",
        "Action Value",
    ),
):
    plt.figure(figsize=(10, 5))
    plt.plot(series, "g--", markersize=20, linewidth=5)
    plt.legend([legend_label], fontsize=20)
    plt.title(title, fontsize=20)
    plt.xlabel("Repeatations", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(0, 20)
    plt.grid()
    plt.show()

#np.save("Convergance_Action_Value_Itetration_0_7g.npy",con_answer , allow_pickle=True)
#np.save("Convergance_Value_Value_Itetration_10_10g.npy", convergance_value, allow_pickle=True)



#--------------------------------------------------------------------------------  
# Value Iteration Play


# Roll out the greedy policy for 100 games of at most 100 steps each.
episode_count = []
win_count = []
step_count = []
lose_count = []
count_w = 0
count_l = 0
for j in range(100):
    state = env.reset()
    steps_taken = 0
    for i in range(100):
        # The policy table stores action indices as floats.
        action = int(optimal_policy[state])
        n_state, reward, done = env.step(state, action)
        steps_taken = i + 1

        state = n_state
        if done:
            if reward == 100:
                count_w += 1
            if reward == -100:
                count_l += 1
            break

    # Log after the rollout so entry j describes game j itself.  Logging
    # before it shifted every series by one game, dropped the final game and
    # stored the loop index instead of the number of steps taken.
    episode_count.append(j)
    win_count.append(count_w)
    lose_count.append(count_l)
    step_count.append(steps_taken)


print("Game Results Optimal Policy")
print(f"Total Games played 100: [Win Games {count_w}]: [Lose Games {count_l}]")
avg_step = sum(step_count)/100
print(f"Average Steps to win 100 games = {round(avg_step)}")

print(f"Optimal Policy Accuracy  [{(count_w/100)*100}%]")
#Visualization
def _label_and_show(title, y_label):
    """Add the title, axis labels, x-range and grid, then display the figure."""
    plt.title(title, fontsize=20)
    plt.xlabel("Game Number", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(0, 100)
    plt.grid()
    plt.show()


# Logged step value per game as a bar chart.
plt.figure(figsize=(15, 10))
plt.bar(episode_count, step_count)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Value Iteration]",
    "Win Step Number",
)

# The same series as a line plot.
plt.figure(figsize=(15, 10))
plt.plot(step_count, "g-", markersize=20, linewidth=5)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Value Iteration]",
    "Win Step Number",
)

# Logged win and loss counters.
plt.figure(figsize=(15, 10))
plt.plot(win_count, "g-", markersize=20, linewidth=5)
plt.plot(lose_count, "r-", markersize=20, linewidth=5)
plt.legend(["WIN", "LOSE"], fontsize=20)
_label_and_show(
    "Games Win by [Value Iteration] : Total Games = 100",
    "Win Games",
)

#--------------------------------------------------------------------------------
# Random Policy Play


# Baseline: 100 games of at most 100 steps each with uniformly sampled actions.
episode_count = []
win_count = []
step_count = []
lose_count = []
count_w = 0
count_l = 0
for j in range(100):
    state = env.reset()
    steps_taken = 0
    for i in range(100):
        action = env.action_space.sample()
        n_state, reward, done = env.step(state, action)
        steps_taken = i + 1

        state = n_state
        if done:
            if reward == 100:
                count_w += 1
            if reward == -100:
                count_l += 1
            break

    # Log after the rollout so entry j describes game j itself.  Logging
    # before it shifted every series by one game, dropped the final game and
    # stored the loop index instead of the number of steps taken.
    episode_count.append(j)
    win_count.append(count_w)
    lose_count.append(count_l)
    step_count.append(steps_taken)


env.render_grid()


print("Game Results Random Policy")
print(f"Total Games played 100: [Win Games {count_w}]: [Lose Games {count_l}]")
avg_step = sum(step_count)/100
print(f"Average Steps to win 100 games = {round(avg_step)}")

print(f"Random Policy Accuracy  [{(count_w/100)*100}%]")

#Visualization
def _label_and_show(title, y_label):
    """Add the title, axis labels, x-range and grid, then display the figure."""
    plt.title(title, fontsize=20)
    plt.xlabel("Game Number", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(0, 100)
    plt.grid()
    plt.show()


# Logged step value per game as a bar chart.
plt.figure(figsize=(15, 10))
plt.bar(episode_count, step_count)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Random Policy]",
    "Win Step Number",
)

# The same series as a line plot.
plt.figure(figsize=(15, 10))
plt.plot(step_count, "g-", markersize=20, linewidth=5)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Random Policy]",
    "Win Step Number",
)

# Logged win and loss counters.
plt.figure(figsize=(15, 10))
plt.plot(win_count, "g-", markersize=20, linewidth=5)
plt.plot(lose_count, "r-", markersize=20, linewidth=5)
plt.legend(["WIN", "LOSE"], fontsize=20)
_label_and_show(
    "Games Win by [Random Policy] : Total Games = 100",
    "Win Games",
)

## **Policy Iteration**

In [None]:
# Build the Q-value store (four action slots per cell) and the row-major
# list of all cells in a single pass over the board.
Q_table = {}
all_state = []
for i in range(Grid_Rows):
    for j in range(Grid_Col):
        cell = (i, j)
        Q_table[cell] = [0, 0, 0, 0]
        all_state.append(cell)

state_size = Grid_Rows * Grid_Col

# Fresh environment plus the discount and tolerance constants.
env = grid_word()
state = env.reset()
GAMMA, threshold = 0.1, 1e-20

#num_iterations = 1000
#

#num_steps = 100


def get_value_function(policy):
    """Evaluate ``policy`` and return its state values.

    Performs up to 1000 sweeps over ``all_state``; each sweep backs up every
    cell from a snapshot of the previous sweep, following the action that
    ``policy`` assigns to the cell.  Stops early once the total absolute
    change between two sweeps is at most 1e-40.
    """
    max_sweeps = 1000
    tolerance = 1e-40
    gamma = GAMMA
    values = np.zeros((Grid_Rows, Grid_Col))

    for _ in range(max_sweeps):
        previous = np.copy(values)
        for s in all_state:
            next_state, reward = env.move(s, policy[s])
            values[s] = reward + gamma * previous[next_state]

        total_change = np.sum(np.fabs(previous - values))
        if total_change <= tolerance:
            break

    return values

def extract_policy(value_table):
    """Return the greedy policy for ``value_table`` and the last action tried.

    The one-step look-ahead values are written into ``Q_table`` as a side
    effect.  The second element of the result is the final action of the
    final inner loop.
    """
    gamma = GAMMA
    greedy = np.zeros((Grid_Rows, Grid_Col))
    for s in all_state:
        q_row = Q_table[s]  # same list object, so updates land in Q_table
        for a in env.all_actions:
            s_, reward = env.move(s, a)
            q_row[a] = reward + gamma * value_table[s_]
        greedy[s] = np.argmax(q_row)

    return greedy, a

def policy_iteration(env):
    """Alternate policy evaluation and greedy improvement until stable.

    Starts from the all-zeros policy and runs at most 1000 rounds.

    Returns:
        ``(policy, con_value, a)``: the final policy, the round index at
        which the policy stopped changing (0 if it never did) and the action
        value handed back by the last ``extract_policy`` call.
    """
    max_rounds = 1000
    policy_ = np.zeros((Grid_Rows, Grid_Col))
    for i in range(max_rounds):
        new_policy, a = extract_policy(get_value_function(policy_))
        if np.all(policy_ == new_policy):
            # Stable policy: report the round at which it was reached.
            return policy_, i, a
        policy_ = new_policy

    # Round budget exhausted without reaching a stable policy.
    return policy_, 0, a

# Repeat policy iteration 20 times on a fresh environment; for each run log
# the round index at which it stopped and the action value returned with it.
env = grid_word()
convergance_value, calculate_conv, con_answer = [], [], []
for rep in range(20):
    optimal_policy, con_value, a = policy_iteration(env)

    calculate_conv += [con_value]
    if rep % 1 != 0:
        continue

    # Averaging window of a single run.
    val = sum(calculate_conv) / 1
    convergance_value += [val]
    con_answer += [a]
    calculate_conv = []

print("Optimal Policy Table based on Policy Iteration", optimal_policy, sep="\n")

print(
    "Average value for convergence ",
    round(sum(convergance_value) / len(convergance_value)),
)

# One figure per logged series: (data, legend entry, title, y-axis label).
for series, legend_label, title, y_label in (
    (
        convergance_value,
        "Convergence Value",
        "Iteration taken to converge [Policy Iteration]",
        "Convergence Point",
    ),
    (
        con_answer,
        "Convergence Answer",
        "Action Value at convergence [Policy Iteration]",
        "Action Value",
    ),
):
    plt.figure(figsize=(10, 5))
    plt.plot(series, "g--", markersize=20, linewidth=5)
    plt.legend([legend_label], fontsize=20)
    plt.title(title, fontsize=20)
    plt.xlabel("Repeatations", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(0, 20)
    plt.grid()
    plt.show()

#np.save("Convergance_Action_Policy_Itetration_0_7g.npy",con_answer , allow_pickle=True)
#np.save("Convergance_Value_Policy_Itetration_10_10g.npy", convergance_value, allow_pickle=True)


  
# Policy Iteration Play


# Roll out the greedy policy for 100 games of at most 100 steps each.
episode_count = []
win_count = []
step_count = []
lose_count = []
count_w = 0
count_l = 0
for j in range(100):
    state = env.reset()
    steps_taken = 0
    for i in range(100):
        # The policy table stores action indices as floats.
        action = int(optimal_policy[state])
        n_state, reward, done = env.step(state, action)
        steps_taken = i + 1

        state = n_state
        if done:
            if reward == 100:
                count_w += 1
            if reward == -100:
                count_l += 1
            break

    # Log after the rollout so entry j describes game j itself.  Logging
    # before it shifted every series by one game, dropped the final game and
    # stored the loop index instead of the number of steps taken.
    episode_count.append(j)
    win_count.append(count_w)
    lose_count.append(count_l)
    step_count.append(steps_taken)


print("Game Results Optimal Policy")
print(f"Total Games played 100: [Win Games {count_w}]: [Lose Games {count_l}]")
avg_step = sum(step_count)/100
print(f"Average Steps to win 100 games = {round(avg_step)}")

print(f"Optimal Policy Accuracy  [{(count_w/100)*100}%]")
#Visualization
def _label_and_show(title, y_label):
    """Add the title, axis labels, x-range and grid, then display the figure."""
    plt.title(title, fontsize=20)
    plt.xlabel("Game Number", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(0, 100)
    plt.grid()
    plt.show()


# Logged step value per game as a bar chart.
plt.figure(figsize=(15, 10))
plt.bar(episode_count, step_count)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Policy Iteration]",
    "Win Step Number",
)

# The same series as a line plot.
plt.figure(figsize=(15, 10))
plt.plot(step_count, "g-", markersize=20, linewidth=5)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Policy Iteration]",
    "Win Step Number",
)

# Logged win and loss counters.
plt.figure(figsize=(15, 10))
plt.plot(win_count, "g-", markersize=20, linewidth=5)
plt.plot(lose_count, "r-", markersize=20, linewidth=5)
plt.legend(["WIN", "LOSE"], fontsize=20)
_label_and_show(
    "Games Win by [Policy Iteration] : Total Games = 100",
    "Win Games",
)




# Baseline: 100 games of at most 100 steps each with uniformly sampled actions.
episode_count = []
win_count = []
step_count = []
lose_count = []
count_w = 0
count_l = 0
for j in range(100):
    state = env.reset()
    steps_taken = 0
    for i in range(100):
        action = env.action_space.sample()
        n_state, reward, done = env.step(state, action)
        steps_taken = i + 1

        state = n_state
        if done:
            if reward == 100:
                count_w += 1
            if reward == -100:
                count_l += 1
            break

    # Log after the rollout so entry j describes game j itself.  Logging
    # before it shifted every series by one game, dropped the final game and
    # stored the loop index instead of the number of steps taken.
    episode_count.append(j)
    win_count.append(count_w)
    lose_count.append(count_l)
    step_count.append(steps_taken)


env.render_grid()


print("Game Results Random Policy")
print(f"Total Games played 100: [Win Games {count_w}]: [Lose Games {count_l}]")
avg_step = sum(step_count)/100
print(f"Average Steps to win 100 games = {round(avg_step)}")

print(f"Random Policy Accuracy  [{(count_w/100)*100}%]")

#Visualization
def _label_and_show(title, y_label):
    """Add the title, axis labels, x-range and grid, then display the figure."""
    plt.title(title, fontsize=20)
    plt.xlabel("Game Number", fontsize=20)
    plt.ylabel(y_label, fontsize=20)
    plt.xlim(0, 100)
    plt.grid()
    plt.show()


# Logged step value per game as a bar chart.
plt.figure(figsize=(15, 10))
plt.bar(episode_count, step_count)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Random Policy]",
    "Win Step Number",
)

# The same series as a line plot.
plt.figure(figsize=(15, 10))
plt.plot(step_count, "g-", markersize=20, linewidth=5)
_label_and_show(
    "Steps Taken to Solve the Grid Word by [Random Policy]",
    "Win Step Number",
)

# Logged win and loss counters.
plt.figure(figsize=(15, 10))
plt.plot(win_count, "g-", markersize=20, linewidth=5)
plt.plot(lose_count, "r-", markersize=20, linewidth=5)
plt.legend(["WIN", "LOSE"], fontsize=20)
_label_and_show(
    "Games Win by [Random Policy] : Total Games = 100",
    "Win Games",
)