In [1]:
# Import necessary libraries
import numpy as np  
import pandas as pd  
import time

# Reproducibility: seed NumPy's global RNG so that a fresh "Restart & Run All"
# yields the same random draws (and therefore the same training trace) every time.
SEED = 42
np.random.seed(SEED)

# Generic hyperparameters.
# NOTE(review): none of these four names is read by the code visible in this
# notebook; the training loop uses the upper-case parameters below instead, and
# some values disagree (e.g. discount_factor = 0.99 vs GAMMA = 0.9). They are
# kept only so that any code relying on them keeps working.
learning_rate = 0.1
discount_factor = 0.99
exploration_rate = 0.1
num_episodes = 1000

# Parameters used by the functions defined further down in the notebook:
N_STATES = 6                 # number of cells in the 1-D world (rows of the Q-table)
ACTIONS = ['left', 'right']  # available moves (columns of the Q-table)
EPSILON = 0.9                # greedy threshold: a random draw above this triggers exploration
ALPHA = 0.1                  # step size applied to the Q-value update
GAMMA = 0.9                  # discount applied to the best next-state Q-value
MAX_EPOCHES = 13             # number of training episodes
FRESH_TIME = 0.3             # pause in seconds after each rendered move

# Initialise the Q-table: one row per state, one column per action, all zeros.
q_table = pd.DataFrame(
    np.zeros((N_STATES, len(ACTIONS))),
    columns=ACTIONS
)

print("Initialized Q-table:")
print(q_table)


Initialized Q-table:
   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0


In [2]:
import pandas as pd

def build_q_table(n_states, actions):
    """
    Create a zero-filled Q-table.

    The table has one row per state (default integer index 0..n_states-1)
    and one column per action, labelled with the entries of `actions`.

    Parameters:
    - n_states (int): Number of states (rows)
    - actions (list): Action labels (columns)

    Returns:
    - pd.DataFrame: Table of float zeros with shape (n_states, len(actions))
    """
    n_actions = len(actions)
    zero_values = np.zeros(shape=(n_states, n_actions))
    return pd.DataFrame(data=zero_values, columns=actions)

# Recreate the Q-table through the helper; this rebinds the module-level
# `q_table` to a fresh all-zero frame.
q_table = build_q_table(n_states=N_STATES, actions=ACTIONS)

# Show the freshly initialised table.
print(q_table)


   left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0


In [3]:
def choose_action(state, q_table, epsilon=None):
    """
    Pick an action for `state` with an epsilon-greedy rule.

    A uniformly random action from ACTIONS is returned when either
    - a random draw from [0, 1) is greater than `epsilon`, or
    - every Q-value stored for `state` is exactly zero (nothing learned yet).
    Otherwise the column label holding the largest Q-value of that row is
    returned.

    Parameters:
    - state: Row label of `q_table` identifying the current state
    - q_table (pd.DataFrame): Q-values, one row per state, one column per action
    - epsilon (float, optional): Greedy threshold. When omitted, the
      module-level EPSILON is used.

    Returns:
    - The chosen action label
    """
    if epsilon is None:
        epsilon = EPSILON

    # Draw first so the order of RNG consumption matches the original code.
    random_value = np.random.rand()
    state_actions = q_table.loc[state]

    # A state counts as unexplored only when *all* of its Q-values are zero.
    # Checking `max() == 0` instead would also trigger for a row like [-1, 0]
    # and throw away the knowledge that one of the actions is worse.
    unexplored = bool((state_actions == 0).all())

    if random_value > epsilon or unexplored:
        action = np.random.choice(ACTIONS)
    else:
        action = state_actions.idxmax()

    return action


# Try the policy once on state 0 using the current module-level Q-table.
sample_action = choose_action(state=0, q_table=q_table)
print(sample_action)


right


In [4]:
def get_env_feedback(S_current, A, n_states=None):
    """
    Apply action `A` in state `S_current` and report the outcome.

    Moving 'right' from state n_states - 2 ends the episode with reward 1;
    any other 'right' move advances one state with reward 0. Moving 'left'
    steps back one state with reward 0, except that state 0 stays at 0.

    Parameters:
    - S_current (int): Current state index
    - A (str): Action to take, 'left' or 'right'
    - n_states (int, optional): Number of states in the world. When omitted,
      the module-level N_STATES is used.

    Returns:
    - tuple: (S_next, R) where S_next is the next state index or the string
      'terminal', and R is the reward (0 or 1)

    Raises:
    - ValueError: If `A` is neither 'left' nor 'right'. Without this check the
      function would fail with an UnboundLocalError at the return statement.
    """
    if n_states is None:
        n_states = N_STATES

    if A == 'right':
        if S_current == n_states - 2:
            # Stepping right from the second-to-last state finishes the episode.
            S_next = 'terminal'
            R = 1
        else:
            S_next = S_current + 1
            R = 0
    elif A == 'left':
        # State 0 is the left wall: the agent stays where it is.
        S_next = 0 if S_current == 0 else S_current - 1
        R = 0
    else:
        raise ValueError(
            "Unknown action {!r}; expected 'left' or 'right'".format(A)
        )

    return S_next, R


In [5]:
def update_env(S, episode, step_counter):
    """
    Print the current view of the 1-D world to stdout and pause briefly.

    For the 'terminal' state, a summary line with the 1-based episode number
    and the step count is printed, followed by a 2 second pause. For any other
    state, the track is drawn as N_STATES - 1 dashes plus a trailing 'T', with
    an 'o' at index `S`; the line is prefixed with a carriage return and the
    function then sleeps for FRESH_TIME seconds.

    Parameters:
    - S: Current state index, or the string 'terminal'
    - episode (int): Zero-based episode number
    - step_counter (int): Steps taken so far in this episode
    """
    if S == 'terminal':
        summary = 'Episode {}: total_steps = {}'.format(episode + 1, step_counter)
        print('{}\n'.format(summary), end='')
        time.sleep(2)
        return

    # Draw the track with the agent marker placed at its current position.
    track = ['-' for _ in range(N_STATES - 1)]
    track.append('T')
    track[S] = 'o'
    print('\r{}'.format(''.join(track)), end='')
    time.sleep(FRESH_TIME)


# Take a single environment step by hand: move left from state 4.
S_current = 4
sample_action = 'left'
sample_feedback = get_env_feedback(S_current, sample_action)
print(sample_feedback)


(3, 0)


In [6]:
def reinforce_learning():
    """
    Run the Q-learning training loop and return the learned Q-table.

    A fresh zero table is built, then MAX_EPOCHES episodes are played. Each
    episode starts in state 0 and continues until the environment reports
    'terminal'. After every move the Q-value of the (state, action) pair just
    taken is nudged towards the target by a factor of ALPHA, and the
    environment is re-rendered.

    Returns:
    - pd.DataFrame: Q-table after training
    """
    q_table = build_q_table(N_STATES, ACTIONS)

    for episode in range(MAX_EPOCHES):
        state, steps = 0, 0
        update_env(state, episode, steps)

        done = False
        while not done:
            action = choose_action(state, q_table)
            next_state, reward = get_env_feedback(state, action)
            done = next_state == 'terminal'

            # Current estimate for the pair that was just played.
            current_estimate = q_table.loc[state, action]

            # Target: the reward alone at the end of an episode, otherwise the
            # reward plus the discounted best value of the next state.
            if done:
                target = reward
            else:
                target = reward + GAMMA * q_table.loc[next_state].max()

            # Move the estimate a fraction ALPHA of the way towards the target.
            q_table.loc[state, action] = current_estimate + ALPHA * (target - current_estimate)

            state = next_state
            steps += 1
            update_env(state, episode, steps)

    return q_table


if __name__ == "__main__":
    # Train the agent, then print a heading followed by the learned Q-table.
    q_table = reinforce_learning()
    print('\r\nQ-table:\n', q_table, sep='\n')


----oTEpisode 1: total_steps = 24
----oTEpisode 2: total_steps = 5
----oTEpisode 3: total_steps = 6
----oTEpisode 4: total_steps = 7
----oTEpisode 5: total_steps = 5
----oTEpisode 6: total_steps = 5
----oTEpisode 7: total_steps = 5
----oTEpisode 8: total_steps = 5
----oTEpisode 9: total_steps = 5
----oTEpisode 10: total_steps = 5
----oTEpisode 11: total_steps = 5
----oTEpisode 12: total_steps = 5
----oTEpisode 13: total_steps = 5

Q-table:

   left     right
0   0.0  0.004239
1   0.0  0.024903
2   0.0  0.108445
3   0.0  0.340790
4   0.0  0.745813
5   0.0  0.000000
