If you're running on a headless server (no display attached), run the following first to start a virtual display:

```
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ./xvfb start
    %env DISPLAY=:1
```

# Crossentropy method

This notebook will teach you to solve reinforcement learning problems with the cross-entropy method.
We will train a neural network policy for a game with a continuous state space.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

In [None]:
import gym

# Create the CartPole environment and reset it to an initial state.
env = gym.make("CartPole-v0")
env.reset()
# Size of the environment's action space; later cells use range(n_actions) as the action ids.
n_actions = env.action_space.n

# Show the current frame, rendered as an RGB image array.
plt.imshow(env.render("rgb_array"))

In [None]:
# Reset the environment and keep the initial observation for the cells below.
state = env.reset()
print("first:",state)
# list(...) so the individual action ids are printed rather than the lazy `range(0, n)` repr
print("actions:",list(range(n_actions)))

In [None]:
# Take action 0 once; step() returns the next state, the reward, a game-over flag
# and a fourth value that is ignored here.
new_state, reward, done,_ = env.step(0) #action 0
print("new state:",new_state)
print("reward:",reward)
print("is game over?:",done)


### Neural network agent

We'll use scikit-learn's built-in neural networks for this one. Technically, you could use any method that supports partial fit, even gradient boosting.

In [None]:

from sklearn.neural_network import MLPClassifier
# The policy: a small MLP classifier whose predict_proba gives action probabilities for a state.
agent = MLPClassifier(hidden_layer_sizes=(20,20),
                      activation='tanh',
                      warm_start=True, #keep progress between .fit(...) calls
                      max_iter=1 #make only 1 iteration on each .fit(...)
                     )
#initialize the agent with the state dimension and the number of actions:
#one dummy sample per action id, so every class is known to the classifier up front
agent.partial_fit([env.reset()]*n_actions,range(n_actions), classes = range(n_actions));


### Play the game

In [None]:
#example: predict action probabilities
#predict_proba works on a batch, so wrap the single state in a list and take row 0 of the result
print("predictions:",agent.predict_proba([state])[0])

In [None]:
#Play for one step
#(replace each <...> placeholder with your own code)
#probabilities of every action in the current state (see the prediction example above)
probas = <predict action probabilities>

#pick one action id
a = <choose action>

#apply the action to the environment and unpack what it returns
state, r, done, _ = <perform step>

#draw the frame after the step
plt.imshow(env.render('rgb_array'))
plt.show()

Okay, now let's repeat this until the game ends.

In [None]:
def generate_session(t_max=1000):
    # Play one game in the global `env`, for at most t_max steps.
    # Returns (states, actions, total_reward): the states visited, the action
    # taken in each of them, and the sum of the rewards received.
    # Replace each <...> placeholder with your own code.
    
    states,actions = [],[]
    total_reward = 0
    
    s = env.reset()
    
    for t in range(t_max):
        
        probs = <predict array of action probabilities>
        
        # probs must be a 1-D vector with one entry per action (not a batch of rows)
        assert probs.shape == (n_actions,)
        
        a = <sample action with such probabilities>
        
        new_s,r,done,info = env.step(a)
        
        #record sessions like you did before
        # (the state is stored together with the action that was taken in it)
        states.append(s)
        actions.append(a)
        total_reward+=r
        
        # move on to the next state; stop as soon as the environment reports the game is over
        s = new_s
        if done: break
            
    return states,actions,total_reward
        

In [None]:
#quick check: play a single short game of at most 10 steps
states,actions,reward = generate_session(t_max=10)
print("Total reward:",reward)

## Learning with CrossEntropy Method

In [None]:
n_samples = 100  #play this many games
percentile = 70  #delete 70% worst, train on the rest

In [None]:
#Play n_samples games
sessions = <generate a list of games with generate_session>

In [None]:
#transpose the list of (states, actions, total_reward) tuples into three arrays with one entry per game
#NOTE(review): games can differ in length, so the state/action arrays may be ragged - confirm the installed NumPy accepts that
batch_states,batch_actions,batch_rewards = map(np.array,zip(*sessions))
print("All states:",batch_states)
print("All actions:",batch_actions)
print("Rewards:",batch_rewards)

In [None]:
#reward value that separates the games we keep from the ones we discard
threshold = <select percentile of your samples>
print("Taking games with R > ",threshold)

In [None]:
#selector:
#boolean mask with one entry per game, True where the game's reward exceeds the threshold
batch_rewards>threshold

In [None]:
#keep only the games selected by the mask
elite_states = batch_states[batch_rewards>threshold]
elite_actions = batch_actions[batch_rewards>threshold]

#join the per-game sequences into one flat sequence of states and one of actions
#NOTE(review): if no game is strictly above the threshold (e.g. all rewards are equal) the selection is empty and np.concatenate will likely fail - confirm
elite_states, elite_actions = map(np.concatenate,[elite_states,elite_actions])

print(elite_states[:3])
print(elite_actions[:3])

In [None]:
<fit agent to take elite_actions from elite_states>

### Full algorithm

In [None]:
n_samples = 100  #play this many games
percentile = 70  #delete 70% worst, train on the rest

#repeat 100 times: play games, keep the best ones, fit the agent on them
#(replace each <...> placeholder with your own code)
for i in range(100):
    #generate new sessions
    sessions = <generate new sessions>

    #transpose the list of (states, actions, total_reward) tuples into three arrays
    batch_states,batch_actions,batch_rewards = map(np.array,zip(*sessions))
    #batch_states: a list of lists of states in each session
    #batch_actions: a list of lists of actions in each session
    #batch_rewards: a list of floats - total rewards at each session

    threshold = <select percentile of your samples>
    
    elite_states = <select states from sessions where rewards are above threshold>
    elite_actions = <select actions from sessions where rewards are above threshold>
    
    #flatten the per-session sequences into one sequence each
    elite_states, elite_actions = map(np.concatenate,[elite_states,elite_actions])
    #elite_states: a list of states from top games
    #elite_actions: a list of actions from top games
    
    <fit agent to predict elite_actions(y) from elite_states(X)>


    #report progress: average reward of this batch and the threshold that was used
    print("mean reward = %.5f\tthreshold = %.1f"%(np.mean(batch_rewards),threshold))

# Results

In [None]:
#record sessions
import gym.wrappers
#generate_session() plays in the global `env`, so rebind it to a monitored environment
env = gym.wrappers.Monitor(gym.make("CartPole-v0"),directory="videos",force=True)
try:
    sessions = [generate_session() for _ in range(100)]
finally:
    #always close the monitor, even if a session raises, so it is shut down cleanly
    env.close()

#upload to gym
#gym.upload("./videos/",api_key="<your_api_key>") #you'll need me later

In [None]:
#show video
from IPython.display import HTML
import os

#os.listdir returns names in arbitrary order; sort them so that indexing is deterministic
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) #last name in sorted order. Try other indices for other videos