In [1]:
# First step is to get the environment gym. 
import gym 

In [2]:
# Second step is to use the "make" function to set up the environment.
# `env` is the simulation object that the later cells reset, step, and render.
env = gym.make("LunarLander-v2")

# Let's run a simulation where we randomly sample among the four possible controls to see if we can get the lander to last longer in the environment and land on the landing pad:

In [3]:
import numpy as np

In [4]:
# Third step is to "reset" the environment to its initial state:
env.reset()

# Upper bound on the number of simulation steps; the loop stops early if the
# episode ends first.
num_steps = 1_000
# Drive the lander with controls drawn uniformly at random.
for k in range(num_steps):
    # Fourth step is to take an action (or a control, as we call it).
    # Use the environment's own action count (four for LunarLander-v2)
    # instead of hard-coding it.
    random_control = np.random.randint(env.action_space.n)
    # NOTE(review): four-value unpacking matches the classic gym step API;
    # confirm against the installed gym version.
    next_state, reward, isDone, _ = env.step(random_control)
    env.render()
    print("The next state is ", next_state)
    print("The reward is ", reward)

    # Check to see if the episode is over:
    if isDone:
        print("The episode is over!")
        break

The next state is  [ 0.01345749  1.4247155   0.68060154  0.2936702  -0.01541825 -0.15257731
  0.          0.        ]
The reward is  -0.2894893061386483
The next state is  [ 0.02010317  1.4307169   0.6701571   0.26666176 -0.02094522 -0.11054999
  0.          0.        ]
The reward is  0.8087955717098521
The next state is  [ 0.02668104  1.436129    0.66162837  0.24047543 -0.02475553 -0.07621299
  0.          0.        ]
The reward is  0.7658321124987151
The next state is  [ 0.0331851   1.4409504   0.6523883   0.21424322 -0.02670675 -0.0390278
  0.          0.        ]
The reward is  1.0101640455477832
The next state is  [ 0.03976078  1.4451715   0.6613456   0.1875381  -0.03045288 -0.07492983
  0.          0.        ]
The reward is  -0.9187437814952364
The next state is  [ 0.04633646  1.4487928   0.6613578   0.16086872 -0.03419725 -0.07489439
  0.          0.        ]
The reward is  -0.07795126205394354
The next state is  [ 0.05284185  1.4518211   0.6525318   0.1345431  -0.0361662  -0.03

The next state is  [ 0.38654557  1.0408556   0.64063567 -0.7212583   0.0474457  -0.01269482
  0.          0.        ]
The reward is  0.6005123072588503
The next state is  [ 0.39283496  1.0253203   0.6299042  -0.6904286   0.04648921 -0.01912962
  0.          0.        ]
The reward is  4.03659128997806
The next state is  [ 0.39912423  1.0091851   0.6299042  -0.7170954   0.04553274 -0.01912959
  0.          0.        ]
The reward is  -0.6158182201228328
The next state is  [ 0.4053256   0.99244696  0.618868   -0.74395794  0.04678744  0.02509384
  0.          0.        ]
The reward is  -0.15851347107510946
The next state is  [ 0.41152698  0.97510874  0.61886805 -0.7706247   0.04804213  0.02509393
  0.          0.        ]
The reward is  -0.8267597873098111
The next state is  [ 0.41772842  0.95717067  0.61886805 -0.79729146  0.04929682  0.0250939
  0.          0.        ]
The reward is  -0.8146677198988357
The next state is  [ 0.42388964  0.93998736  0.61483    -0.7637455   0.05057152  0.025

# Well, we can see from the rendering that different thrusters were fired at different stages, but the lander still crashed into the surface of the planet. Let's see if we can do better by randomly sampling only between the do-nothing and main-thruster controls. Using the other actions randomly seemed to cause the lander to tip over faster and spiral out of control.

# Four discrete actions are available: (0) do nothing, (1) fire left orientation engine, (2) fire main engine, (3) fire right orientation engine.

In [None]:
# Third step is to "reset" the environment to its initial state:
env.reset()

# Restrict the agent to two of the four controls:
# 0 = do nothing, 2 = fire main engine.
controls = [0, 2]

# Upper bound on the number of simulation steps; the loop stops early if the
# episode ends first.
num_steps = 1_000
# Drive the lander by picking one of the two allowed controls with equal
# probability at every step.
for k in range(num_steps):
    # Fourth step is to take an action (or a control, as we call it).
    random_control = np.random.choice(controls)
    # NOTE(review): four-value unpacking matches the classic gym step API;
    # confirm against the installed gym version.
    next_state, reward, isDone, _ = env.step(random_control)
    env.render()
    print("The next state is ", next_state)
    print("The reward is ", reward)

    # Check to see if the episode is over:
    if isDone:
        print("The episode is over!")
        break

# This is a bit better as the lander does not crash, but it is using the main thruster control a bit too much. Let's reduce the probability that the main thruster control is used so the lander can descend more and not fly off the screen. Let's use p=[2/3, 1/3] for the probabilities of each control (do nothing, main thruster).

In [4]:
# Third step is to "reset" the environment to its initial state:
env.reset()

# Trying a subset of the 4 possible controls:
# 0 = do nothing, 2 = fire main engine.
controls = [0, 2]
# Probabilities of taking each of the two controls from above; p[i] is the
# probability of choosing controls[i]. The starting point was [2/3, 1/3];
# the values below are the result of experimenting with p a bit.
p = [0.7, 0.3]

# Upper bound on the number of simulation steps; the loop stops early if the
# episode ends first.
num_steps = 1_000
# Drive the lander by sampling one of the two allowed controls according to p
# at every step.
for k in range(num_steps):
    # Fourth step is to take an action (or a control, as we call it).
    random_control = np.random.choice(controls, p=p)
    # NOTE(review): four-value unpacking matches the classic gym step API;
    # confirm against the installed gym version.
    next_state, reward, isDone, _ = env.step(random_control)
    env.render()
    print("The next state is ", next_state)
    print("The reward is ", reward)

    # Check to see if the episode is over:
    if isDone:
        print("The episode is over!")
        break

The next state is  [ 2.5577546e-04  1.4109697e+00  1.2922491e-02 -1.1738715e-02
 -2.8626935e-04 -2.8964651e-03  0.0000000e+00  0.0000000e+00]
The reward is  0.1678003797517249
The next state is  [ 3.8347245e-04  1.4101058e+00  1.2922948e-02 -3.8396426e-02
 -4.3101882e-04 -2.8952877e-03  0.0000000e+00  0.0000000e+00]
The reward is  -2.233552274408595
The next state is  [6.4258574e-04 1.4101164e+00 2.5417533e-02 4.6820517e-04 5.1904146e-05
 9.6593127e-03 0.0000000e+00 0.0000000e+00]
The reward is  1.245940360365904
The next state is  [1.0480881e-03 1.4109312e+00 3.9354503e-02 3.6211357e-02 1.2359214e-03
 2.3682747e-02 0.0000000e+00 0.0000000e+00]
The reward is  -3.305656821695538
The next state is  [0.00145359 1.4111458  0.0393509  0.00953747 0.00241947 0.02367324
 0.         0.        ]
The reward is  1.1590642618513982
The next state is  [ 0.00185909  1.4107604   0.03934749 -0.01712517  0.00360308  0.02367475
  0.          0.        ]
The reward is  -0.3221166564892428
The next state i

The next state is  [ 0.02401495  1.027367   -0.00638584 -0.53633803  0.12717319  0.05905986
  0.          0.        ]
The reward is  4.700361284581516
The next state is  [ 0.02398071  1.0147052  -0.00638571 -0.5630054   0.1301262   0.05905984
  0.          0.        ]
The reward is  -1.6959351625261547
The next state is  [ 0.02394648  1.0014433  -0.00638557 -0.58967274  0.1330792   0.05905982
  0.          0.        ]
The reward is  -1.6359798573135436
The next state is  [ 0.023876    0.9889089  -0.01039667 -0.5573871   0.13640878  0.06659146
  0.          0.        ]
The reward is  3.8426175591311678
The next state is  [ 0.02380543  0.97577465 -0.01039649 -0.58405465  0.13973835  0.0665914
  0.          0.        ]
The reward is  -1.6860590058251432
The next state is  [ 0.02373486  0.9620406  -0.0103963  -0.6107223   0.14306791  0.06659136
  0.          0.        ]
The reward is  -1.6261521740135265
The next state is  [ 0.02366428  0.9477066  -0.0103961  -0.63738984  0.14639747  0.066

# This stochastic policy approach with the reduced control set got very close to landing on the pad. One of your HW problems will be to experiment with these probability values until you successfully land on the pad; screenshot your result and comment on what probabilities for each control worked the best for your autonomous agent! I eventually did get it to land on the pad, but its velocity was still too high:

![Screen%20Shot%202022-05-05%20at%202.57.08%20PM.png](attachment:Screen%20Shot%202022-05-05%20at%202.57.08%20PM.png)

# Applying Q-learning to this example will help determine the best policy, that is, what the optimal control is for every state.