In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
import random
import pendulum

In [2]:
# we can get the integration step used in the simulation
print(f'dt is {pendulum.DELTA_T}')

# we can get the size of its state and control vector
print(f'number of states {pendulum.NUMBER_STATES} and number of controls {pendulum.NUMBER_CONTROLS}')
print('the states are indexed as follows: theta, omega')

# we can get the maximum velocity of the pendulum (omega)
print(f'the max velocity is {pendulum.MAX_VELOCITY} rad/seconds')


dt is 0.1
number of states 2 and number of controls 1
the states are indexed as follows: theta, omega
the max velocity is 6.0 rad/seconds


In [3]:
# the next_state function allows to compute the next state given a current state and action
# This is going to be very helpful to run an episode!

# assume we set x = [theta, omega] = [0,0] and u = 5, we can get the next state using
x = np.array([0.,0.])
u = 5
x_next = pendulum.get_next_state(x, u)

print(f'the next state is {x_next}')


the next state is [0.02227801 0.48969119]


In [4]:
# we can also simulate the robot but we need to provide a controller of the following form
def dummy_controller(x):
    """
        the prototype of a controller is as follows
        x is a column vector containing the state of the robot
        
        this controller needs to return a scalar
        you may want to modify this controller to use the policy table to compute control output
    """
    # here we do nothing and just return a 0 control
    return 0.


# we can now simulate for a given number of time steps - here we do 10 seconds
T = 10.
x0 = np.array([1.4,0.])
t, x, u = pendulum.simulate(x0, dummy_controller, T)


     

In [5]:
# we can plot the results
plt.figure()

plt.subplot(2,1,1)
plt.plot(t, x[0,:])
plt.legend(['theta'])

plt.subplot(2,1,2)
plt.plot(t, x[1,:])
plt.legend(['omega'])

# we can also plot the control
plt.figure()
plt.plot(t[:-1], u.T)
plt.legend(['u1'])
plt.xlabel('Time [s]')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Text(0.5, 0, 'Time [s]')

In [6]:

# now we can also create an animation
pendulum.animate_robot(x)



In [7]:
# we don't want 2pi to be in the set because it's the same as 0
# we generate 50 equally spaced points for theta
discretized_theta = np.linspace(0, 2*np.pi, 50, endpoint=False)

# we generate 50 equally spaced points for omega
discretized_omega = np.linspace(-6, 6, 50)

# now given an arbitrary continuous state theta
theta_arbitrary = 0.6234
omega_arbitrary = 1.234

# we can find the index of the closest element in the set of discretized states
index_in_discretized_theta = np.argmin(np.abs(discretized_theta - theta_arbitrary))
index_in_discretized_omega = np.argmin(np.abs(discretized_omega - omega_arbitrary))

# and find the closed discretized state
closest_theta_state = discretized_theta[index_in_discretized_theta]
closest_omega_state = discretized_omega[index_in_discretized_omega]

print(f'the discretized theta closest to {theta_arbitrary} is {closest_theta_state} with index {index_in_discretized_theta}')
print(f'the discretized omega closest to {omega_arbitrary} is {closest_omega_state} with index {index_in_discretized_omega}')


the discretized theta closest to 0.6234 is 0.6283185307179586 with index 5
the discretized omega closest to 1.234 is 1.3469387755102042 with index 30


In [8]:
# here is some code to plot results, assuming a policy and a value function are given
# this can be used to answer questions in both Part 1 and 2

value_function = np.zeros([50,50])
policy = np.zeros([50,50])

# we plot the value function
plt.figure(figsize=[6,6])
plt.imshow(value_function, extent=[0., 2*np.pi, -6, 6], aspect='auto')
plt.xlabel('Pendulum Angle')
plt.ylabel('Velocity')
plt.title('Value Function')

# we plot the policy
plt.figure(figsize=[6,6])
plt.imshow(policy, extent=[0., 2*np.pi, -6, 6], aspect='auto')
plt.xlabel('Pendulum Angle')
plt.ylabel('Velocity')
plt.title('Policy')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Policy')

In [9]:
def get_cost(x,u):
    theta = x[0]
    theta_dot = x[1]
    
    cost1 = theta - np.pi
    cost2 = 0.01 * (theta_dot**2)
    cost3 = 0.0001 * (u**2)
#     print(cost1)
    
#     print(cost2)

#     print(cost3)
    
    cost = (cost1**2) + cost2 + cost3
    
    return cost
    

In [10]:
check = get_cost(x0,u[0])

In [11]:
check

3.0331449710379372

In [12]:
def get_policy_and_value_function(q_table):
    
    dim1 = q_table.shape[0]
    dim2 = q_table.shape[1]
    dim3 = q_table.shape[2]
    
    optimal_policy = np.zeros(shape=(dim1, dim2))
    optimal_value = np.zeros(shape=(dim1, dim2))
    
    
    
    for i in range(dim1):
        for j in range(dim2):
            minQ = 1000000
            optU = -1
            for k in range(dim3):
                if q_table[i][j][k] < minQ:
                    minQ = q_table[i][j][k]
                    optU = k
            optimal_policy[i][j] = optU
            optimal_value[i][j] = minQ
            
            
    return optimal_policy, optimal_value
            
            
            
    
    

In [15]:
def q_learning(q_table):
    numSteps = 100
    numEps = 6000
    epsilon = 0.4
    alpha = 0.99
    learningRate = 0.2
    randomStep = 0
    nonRandom = 0
    learningCosts = np.zeros(numEps)
    
    
    posActions = np.array([-5, 0, 5])
    
    x0 = np.array([0,0.])
    

    # we don't want 2pi to be in the set because it's the same as 0
    # we generate 50 equally spaced points for theta
    discretized_theta = np.linspace(0, 2*np.pi, 50, endpoint=False)

    # we generate 50 equally spaced points for omega
    discretized_omega = np.linspace(-6, 6, 50)

    
    
    for i in range(numEps):
#         print(x0)
        x = x0
        
        for ii in range(numSteps):
        
#             optimal_policy, optimal_value = get_policy_and_value_function(q_table)        
            
            
            
            dice = random.random()
            
            # we can find the index of the closest element in the set of discretized states
            index_in_discretized_theta_n = np.argmin(np.abs(discretized_theta - x[0]))
            index_in_discretized_omega_n = np.argmin(np.abs(discretized_omega - x[1]))           

            options = q_table[index_in_discretized_theta_n][index_in_discretized_omega_n]
            
            
            if dice <= epsilon:
                idU = random.randrange(3)
                minQ = options[idU]
                randomStep += 1
            else:
                minQ = 100000
                idU = -1
                nonRandom += 1
                for j in range(3):
                    if options[j] < minQ:
                        minQ = options[j]
                        idU = j

            u = posActions[idU]
            x_n1 = pendulum.get_next_state(x,u)
            cost = get_cost(x,u)
            

            
            value =  minQ           


            # we can find the index of the closest element in the set of discretized states
            index_in_discretized_theta_n1 = np.argmin(np.abs(discretized_theta - x_n1[0]))
            index_in_discretized_omega_n1 = np.argmin(np.abs(discretized_omega - x_n1[1]))


            
            
            value_n1 = q_table[index_in_discretized_theta_n1][index_in_discretized_omega_n1][idU]

            dt = cost + (alpha*value_n1) - value
            
            q_table[index_in_discretized_theta_n][index_in_discretized_omega_n][idU] = q_table[index_in_discretized_theta_n][index_in_discretized_omega_n][idU] + (learningRate*dt)
            
            x = x_n1
            
            learningCosts[i] += (alpha**ii)*cost
            
#         print(i)
#         print(learningCosts[i])
    
    
    print(randomStep)
    print(nonRandom)
    
    return q_table, learningCosts
        
        
    
    

In [16]:
q_table = np.zeros([50,50,3])
discretizedState = np.zeros([2])
posActions = np.array([-5, 0, 5])


for i in range(50):
    for j in range(50):
        discretizedState[0] = discretized_theta[i]
        discretizedState[1] = discretized_omega[j]
        
        for k in range(3):
            q_table[i][j][k] = 0
#             q_table[i][j][k] = get_cost(discretizedState,posActions[k])
q_table, learningCosts = q_learning(q_table)

240256
359744


In [17]:
policy,value_function = get_policy_and_value_function(q_table)

# we plot the value function
plt.figure(figsize=[6,6])
plt.imshow(value_function, extent=[0., 2*np.pi, -6, 6], aspect='auto')
plt.xlabel('Pendulum Angle')
plt.ylabel('Velocity')
plt.title('Value Function')

# we plot the policy
plt.figure(figsize=[6,6])
plt.imshow(policy, extent=[0., 2*np.pi, -6, 6], aspect='auto')
plt.xlabel('Pendulum Angle')
plt.ylabel('Velocity')
plt.title('Policy')

# # we plot the policy
# plt.figure()
# plt.imshow(learningCosts,  aspect='auto')
# plt.xlabel('Episode')
# plt.ylabel('Cost')
# plt.title('Costs')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'Policy')

In [18]:
# we can also simulate the robot but we need to provide a controller of the following form
def learned_controller(x):
    
    
    index_in_discretized_theta_n = np.argmin(np.abs(discretized_theta - x[0]))
    index_in_discretized_omega_n = np.argmin(np.abs(discretized_omega - x[1]))  
    
    idU = policy[index_in_discretized_theta_n][index_in_discretized_omega_n]
#     print(policy[index_in_discretized_theta_n][index_in_discretized_omega_n])
    idU = int(idU)

#     print(idU)
    u = posActions[idU]
    # here we do nothing and just return a 0 control
    return u


# we can now simulate for a given number of time steps - here we do 10 seconds
T = 10.
x0 = np.array([0,0.])
t, x, u = pendulum.simulate(x0, learned_controller, T)

In [19]:
# now we can also create an animation
pendulum.animate_robot(x)

In [20]:
eps = np.arange(0,(learningCosts.shape[0]))

In [21]:
fig,ax = plt.subplots()
ax.plot(eps,learningCosts)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x121325970>]

In [22]:
# we can plot the results
plt.figure()

plt.subplot(2,1,1)
plt.plot(t, x[0,:])
plt.legend(['theta'])

plt.subplot(2,1,2)
plt.plot(t, x[1,:])
plt.legend(['omega'])

# we can also plot the control
plt.figure()
plt.plot(t[:-1], u.T)
plt.legend(['u1'])
plt.xlabel('Time [s]')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Text(0.5, 0, 'Time [s]')