In [1]:
import numpy as np
import pandas as pd

In [50]:
#gra
#niech zaczyna w dolnym lewnym rogu
start = (6,0)
end = [(6, 6), (0, 0)]
acts = ['UP', 'DOWN', 'LEFT', 'RIGHT']
eps=0.1

#ruchy
def take_action(state, action):
    r,c = state

    if action == 'UP':
        next_state = (max(r - 1, 0), c)
    elif action == 'DOWN':
        next_state = (min(r + 1, 6), c)
    elif action == 'LEFT':
        next_state = (r, max(c - 1, 0))
    elif action == 'RIGHT':
        if c == 5:
            next_state = (r, 6)
        else:
            next_state = (r, min(c + 2, 6))
    reward = -1

    return next_state, reward

# update polityki
def update_policy(state, policy, epsilon, Q, A=acts):
    best_actions = [a for a in A if Q[state][a] == max(Q[state].values())]
    policy[state] = {a: 0 for a in A}
    for a in A:
        if (len(A)-len(best_actions))==0:
            policy[state][a]=1/len(A)
        elif a in best_actions:
            policy[state][a] = (1 - epsilon)/len(best_actions)
        else:
            policy[state][a] = epsilon/(len(A)-len(best_actions))
    return policy


#stosowanie polityki
def choose_action(state, policy):
    acts = list(policy[state].keys())
    probabilities = list(policy[state].values())
    return np.random.choice(acts, p=probabilities)

# Algorytm mc
def first_visit_mc_control(epsilon=eps, episodes=5000, A=acts):
    policy = {}
    Q = {}
    returns = {}
    count= 0 #do zliczania epizodów bez zmiany polityki
    sum_G=0

    #nie mamy informacji zadnych jeszcze, wiec random
    for i in range(7):
        for j in range(7):
            state = (i, j)
            policy[state] = {a: 0.25 for a in A}
            Q[state] = {a: 0 for a in A}
            returns[state] = {a: [] for a in A}

    for ep in range(episodes):
        state = start
        states = [state]
        rewards = [0]
        G=0
        actions=[]

        while state not in end:
            action = choose_action(state,policy)
            next_state, reward = take_action(state, action)
            states.append(next_state)
            actions.append(action)
            rewards.append(reward)
            state = next_state

        for t in reversed(range(len(states) - 1)):
            state = states[t]
            action = actions[t]
            reward = rewards[t + 1]
            G = reward + G
            if (state, action) not in zip(states[:t], actions[:t]):
                old_policy=policy.copy()
                returns[state][action].append(G)
                Q[state][action] = np.mean(returns[state][action])
                new_policy=update_policy(state, policy, epsilon, Q)
                if new_policy!=old_policy:
                    count=0
                policy=new_policy
        count+=1
        if count==100:
            break
        sum_G+=G
    return ep, sum_G
#algorytm td

def sarsa(alpha, episodes=5000, gamma=1.0, epsilon=eps):
    count=0
    Q = {}
    policy = {}
    sum_G=0
    for i in range(7):
        for j in range(7):
            state = (i, j)
            Q[state] = {a:0 if state in end else np.random.rand() for a in acts}#random dla nieterminalnych stanów
            policy[state] = {a: 0.25 for a in acts}
    for ep in range(episodes):
        state = start
        action = choose_action(state,policy)
        while state not in end:
            next_state, reward = take_action(state, action)
            sum_G+=reward
            old_policy=policy.copy()
            new_policy=update_policy(state, policy, epsilon, Q)
            next_action = choose_action(next_state,policy)
            Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action])
            state = next_state
            action=next_action
            if new_policy!=old_policy:
                count=0
            policy=new_policy
        count+=1
        if count==100:
            break
            
    return ep, sum_G

In [51]:
# badanie zbieznosci 
avg_ep_mc=0
avg_ep_sarsa_a1=0
avg_ep_sarsa_a2=0
avg_ep_sarsa_a3=0
avg_ep_sarsa_a4=0
avg_ep_sarsa_a5=0
avg_ep_sarsa_a6=0
avg_ep_sarsa_a7=0
avg_ep_sarsa_a8=0
reward1=0
reward2=0
reward3=0
reward5=0
reward4=0
reward6=0
reward7=0
reward8=0
reward9=0
for i in range(10):
    ep1, Q1=first_visit_mc_control()
    avg_ep_mc+=ep1
    reward1+=Q1
    ep2, Q2=sarsa(alpha=0.05)
    avg_ep_sarsa_a1+=ep2
    reward2+=Q2
    ep3, Q3=sarsa(alpha=0.1)
    avg_ep_sarsa_a2+=ep3
    reward3+=Q3
    ep4, Q4=sarsa(alpha=0.3)
    avg_ep_sarsa_a3+=ep4
    reward4+=Q4
    ep5, Q5=sarsa(alpha=0.5)
    avg_ep_sarsa_a4+=ep5
    reward5+=Q5
    ep6, Q6=sarsa(alpha=0.2)
    avg_ep_sarsa_a5+=ep6
    reward6+=Q6
    ep7, Q7=sarsa(alpha=0.6)
    avg_ep_sarsa_a6+=ep7
    reward7+=Q7
    ep8, Q8=sarsa(alpha=0.23)
    avg_ep_sarsa_a7+=ep8
    reward8+=Q8
    ep9, Q9=sarsa(alpha=0.27)
    avg_ep_sarsa_a8+=ep9
    reward9+=Q9
    
print(f"Average number of episodes for Monte Carlo: {avg_ep_mc/10}, reward: {reward1/(avg_ep_mc)}")
print(f"Average number of episodes for SARSA (alpha=0.05): {avg_ep_sarsa_a1/10}, reward: {reward2/(avg_ep_sarsa_a1)}")
print(f"Average number of episodes for SARSA (alpha=0.1): {avg_ep_sarsa_a2/10}, reward: {reward3/(avg_ep_sarsa_a2)}")
print(f"Average number of episodes for SARSA (alpha=0.3): {avg_ep_sarsa_a3/10}, reward: {reward4/(avg_ep_sarsa_a3)}")
print(f"Average number of episodes for SARSA (alpha=0.5): {avg_ep_sarsa_a4/10}, reward: {reward5/(avg_ep_sarsa_a4)}")
print(f"Average number of episodes for SARSA (alpha=0.2): {avg_ep_sarsa_a5/10}, reward: {reward6/(avg_ep_sarsa_a5)}")
print(f"Average number of episodes for SARSA (alpha=0.6): {avg_ep_sarsa_a6/10}, reward: {reward7/(avg_ep_sarsa_a6)}")
print(f"Average number of episodes for SARSA (alpha=0.23): {avg_ep_sarsa_a7/10}, reward: {reward8/(avg_ep_sarsa_a7)}")
print(f"Average number of episodes for SARSA (alpha=0.27): {avg_ep_sarsa_a8/10}, reward: {reward9/(avg_ep_sarsa_a8)}")

Average number of episodes for Monte Carlo: 209.4, reward: -11.929799426934098
Average number of episodes for SARSA (alpha=0.05): 4096.5, reward: -5.74761381667277
Average number of episodes for SARSA (alpha=0.1): 2530.8, reward: -5.430930930930931
Average number of episodes for SARSA (alpha=0.3): 1157.8, reward: -5.117809638970461
Average number of episodes for SARSA (alpha=0.5): 3475.0, reward: -3.970043165467626
Average number of episodes for SARSA (alpha=0.2): 1243.0, reward: -5.545454545454546
Average number of episodes for SARSA (alpha=0.6): 4999.0, reward: -3.876015203040608
Average number of episodes for SARSA (alpha=0.23): 1501.5, reward: -5.0651348651348655
Average number of episodes for SARSA (alpha=0.27): 1292.2, reward: -5.055022442346386
