# Reinforcement Learning
# Ehsan Kouchaki

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

## Problem 1

In [None]:
# Iterative policy evaluation of the equiprobable random policy on the
# 8x8 FrozenLake, followed by two heat-map views of the resulting values.
env = gym.make('FrozenLake8x8-v0')
env.reset()

theta = 1e-8    # stop sweeping once the largest per-state change is below this
gama = 1        # discount factor
pi_a_s = 1/4    # probability the policy assigns to each of the 4 actions
V = np.zeros(64)

converged = False
while not converged:
    delta = 0
    for s in range(64):
        previous = V[s]
        # Expected one-step backup over all actions and all transitions;
        # each entry of env.env.P[s][a] unpacks as
        # (probability, next state, reward, end-of-episode flag).
        V[s] = sum(
            pi_a_s * prob * (reward + gama * V[s_next])
            for a in range(4)
            for prob, s_next, reward, _terminal in env.env.P[s][a]
        )
        change = np.abs(previous - V[s])
        if change > delta:
            delta = change
    converged = delta < theta

V = V.reshape(8, 8)
np.set_printoptions(precision=2)
print(V)

# Left: the raw state values. Right: the same values pushed through a very
# steep sigmoid, so V == 0 maps to 0.5 and any clearly positive V maps to ~1.
steep_sigmoid = 1 / (1 + np.exp(-1000000*V))
plt.figure(figsize=(10,10))
plt.subplot(1, 2, 1)
plt.imshow(V)
plt.subplot(1, 2, 2)
plt.imshow(steep_sigmoid)

## Problem 2

In [None]:
# Policy iteration on the 8x8 FrozenLake: alternate full policy evaluation
# and greedy policy improvement until the policy stops changing, then print
# and plot the resulting value function and policy.
env = gym.make('FrozenLake8x8-v0')
env.reset()
theta = 1e-8    # evaluation stops once the largest per-state change is below this
gama = 0.9      # discount factor

# Initialization
# Integer dtype: the entries are used as action keys into env.env.P[s].
Pi = np.zeros(64, dtype=int)
V = np.zeros(64)

V0 = []     # value of state 0 after every evaluation sweep
V00 = []    # value of state 0 after every completed policy evaluation
while True:

    # Policy Evaluation
    while True:
        delta = 0
        for s in range(64):
            v = V[s]
            tmp = 0
            a = int(Pi[s])
            # Each transition unpacks as
            # (probability, next state, reward, end-of-episode flag).
            for Probability, sprime, r, end_of_episode in env.env.P[s][a]:
                tmp += Probability * (r + gama * V[sprime])
            V[s] = tmp
            delta = max(delta, abs(v - V[s]))
            if s == 0: V0.append(tmp)
        if delta < theta: break
    V00.append(V[0])

    # Policy Improvement
    policy_stable = True
    for s in range(64):
        old_action = Pi[s]
        vi = np.zeros(4)    # one-step look-ahead value of each action in s
        for a in range(4):
            for Probability, sprime, r, end_of_episode in env.env.P[s][a]:
                vi[a] += Probability * (r + gama * V[sprime])
        Pi[s] = np.argmax(vi)
        if old_action != Pi[s]:
            policy_stable = False
    if policy_stable: break
Vstar = np.reshape(V, (8,8))
Pistar = np.reshape(Pi, (8,8)).astype(int)

np.set_printoptions(precision=5)
np.set_printoptions(suppress=True)
print("The state value matrix is: \n", Vstar, "\n")
print("The Optimum Policy matrix is: \n", Pistar, "\n")

# Plot of the optimum policy as a guidance matrix.
# PiMask marks the hole and target cells with 4 so that they are shown as
# "O" instead of an action letter (used only for this printout).
PiMask = np.zeros(64, dtype=int)
PiMask[[19, 29, 35, 41, 42, 46, 49, 52, 54, 59]] = 4    # hole cells
PiMask[63] = 4                                          # target cell
string = "LDRUO"    # letter for action indices 0..3, then "O" for masked cells
# Overlay explicitly instead of adding the mask to the policy, so the result
# is a valid index into `string` whatever action the policy holds there.
Piplot = np.where(np.reshape(PiMask, (8,8)) == 4, 4, Pistar)
for i in range(8):
    print([string[Piplot[i, j]] for j in range(8)])

# Plot of the changes of the state value of the start cell:
# left per policy-iteration round, right per evaluation sweep.
plt.figure(figsize=(12,5))
plt.subplot(1, 2, 1)
plt.plot(V00)
plt.subplot(1, 2, 2)
plt.plot(V0)


## Problem 3

In [None]:
# First-visit Monte Carlo prediction: estimate the state values of the
# policy stored in Pistar by averaging sampled returns over many episodes.
env = gym.make('FrozenLake8x8-v0')
env.reset()
gama = 0.9      # discount factor
policy = Pistar.flatten()
V = dict()                          # state -> running mean of first-visit returns
visit_count = defaultdict(int)      # state -> number of returns averaged into V
V0 = []                             # history of the estimate for state 0

for _ in range(500000):

    # Generating episode using Pistar; each entry is (S_t, R_t, done, A_t)
    episode = []
    St, Rt, done = env.reset(), 0, False
    while True:
        At = policy[St]
        episode.append((St, Rt, done, At))
        if done: break
        St, Rt, done, _info = env.step(At)

    # Time step at which each state first occurs in this episode, computed
    # once so the first-visit test below is a dictionary lookup instead of a
    # rescan of the episode prefix.
    first_visit = {}
    for t, step in enumerate(episode):
        first_visit.setdefault(step[0], t)

    # Walk the episode backwards, accumulating the discounted return G_t.
    G = 0
    for t in range(len(episode)-2,-1,-1):
        St = episode[t][0]
        Rt_p1 = episode[t+1][1]
        G = gama * G + Rt_p1

        if first_visit[St] == t:
            # Incremental mean; only the count is kept, because the
            # individual returns are never read again.
            visit_count[St] += 1
            previous = V.get(St, 0.0)
            V[St] = previous + (G - previous) / visit_count[St]
            if St == 0: V0.append(V[St])

plt.plot(V0)
Vi = np.zeros((64,1))   # states never valued stay at 0
for i in V: Vi[i] = V[i]
print("The state value matrix is: \n\n", Vi.reshape(8,8))

## Problem 4

In [None]:
# Off-policy Monte Carlo control with weighted importance sampling:
# episodes are generated by a uniformly random behaviour policy b, while the
# target policy Pi is kept greedy with respect to the action values Q.
env = gym.make('FrozenLake8x8-v0')
env.reset()
gama = 0.9      # discount factor

# initialize
Q = np.zeros((64, 4))   # action-value estimates
C = np.zeros((64, 4))   # cumulative importance-sampling weights
Pi = np.zeros((64, 1))  # greedy action per state

for _ in range(1000000):

    # Generating episode using b; each entry is (S_t, R_t, done, A_t)
    episode = []
    state, reward, done = env.reset(), 0, False
    while True:
        action = np.random.randint(4)    # b = np.random.randint(4)
        episode.append((state, reward, done, action))
        if done:
            break
        state, reward, done, _info = env.step(action)

    G = 0   # discounted return from step t onwards
    W = 1   # importance-sampling weight of the tail of the episode

    # Walk the episode backwards from the last non-terminal step.
    for t in reversed(range(len(episode) - 1)):
        state, action = episode[t][0], episode[t][3]
        next_reward = episode[t + 1][1]
        G = gama * G + next_reward
        C[state, action] += W
        Q[state, action] += (W / C[state, action]) * (G - Q[state, action])
        greedy_action = np.argmax(Q[state])
        Pi[state] = greedy_action
        # Stop as soon as the behaviour action differs from the greedy one;
        # no earlier step of this episode is updated.
        if action != greedy_action:
            break
        W = W / 0.25    # b_At_St = 0.25

print(Pi.reshape(8,8))
print(Q)