## CartPole Gym

### Imports

In [1]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout

import numpy as np
import gym

Using TensorFlow backend.


### Creating the environment

In [2]:
# Build the CartPole-v0 environment; the cells below all share this single `env` instance.
env = gym.make('CartPole-v0')

In [3]:
# Number of discrete actions the environment offers; later used as the width
# of the model's softmax output and of the one-hot training labels.
action_space = env.action_space.n
print(action_space)

2


In [4]:
def getStateSize():
    """Return the number of components in a single observation of ``env``.

    The size is read from the environment's declared observation space, so
    no episode has to be reset or stepped just to measure an observation.

    Returns:
        int: length of the observation vector.
    """
    # Assumes a flat (1-D) observation space, which is what the rest of the
    # notebook relies on when reshaping observations to (-1, 1, state_space).
    return env.observation_space.shape[0]

In [5]:
# Length of one observation vector, as reported by getStateSize().
state_space = getStateSize()
print(state_space)

4


### Random games test

In [6]:
def some_random_games_first():
    """Play five rendered episodes using uniformly random actions.

    Each episode runs until the environment reports ``done`` or 500 steps
    have been taken, whichever comes first. The environment is closed once,
    after the last episode, instead of in the middle of the run.
    """
    try:
        for episode in range(5):
            env.reset()
            for t in range(500):
                env.render()
                action = env.action_space.sample()
                # Only the termination flag is needed for this smoke test.
                _, _, done, _ = env.step(action)
                if done:
                    break
    finally:
        # Close exactly once, including when an episode hits the step cap
        # without finishing or when rendering/stepping raises.
        env.close()
            

### Creating the model

In [6]:
# Fully connected classifier: three ReLU hidden layers narrowing 128 -> 64 -> 32,
# followed by a softmax with one unit per available action.
model = Sequential([
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(action_space, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

Instructions for updating:
Colocations handled automatically by placer.


### Generating data and training

In [7]:
def initial_data(number_of_games, game_turns, acceptable_score):
    """Collect (observation, action) training pairs from random play.

    Plays ``number_of_games`` episodes with uniformly random actions, each
    for at most ``game_turns`` steps. Only episodes whose summed reward is at
    least ``acceptable_score`` contribute their pairs to the result.

    Every action is paired with the observation it was taken in, including
    the first action of each episode (paired with the observation returned
    by ``env.reset()``).

    Args:
        number_of_games: how many random episodes to play.
        game_turns: maximum number of steps per episode.
        acceptable_score: minimum total reward for an episode to be kept.

    Returns:
        tuple: ``(X, y)`` where ``X`` has shape ``(n, 1, obs_size)`` and
        ``y`` holds the one-hot encoded actions with shape
        ``(n, 1, action_space)``. ``n`` is 0 when no episode qualifies.
    """
    # Taken from the environment rather than from a collected sample so the
    # final reshape is still well defined when nothing was collected.
    obs_size = env.observation_space.shape[0]

    X = []
    y = []

    for _ in range(number_of_games):
        # Keep the reset observation: it is the state the first action acts on.
        prev_obs = env.reset()
        game_memory = []
        score = 0

        for _ in range(game_turns):
            action = env.action_space.sample()
            new_obs, reward, done, _ = env.step(action)
            # summing the final score
            score += int(reward)

            game_memory.append((prev_obs, int(action)))
            prev_obs = new_obs

            if done:
                break

        if score >= acceptable_score:
            for obs, taken_action in game_memory:
                X.append(obs)
                one_hot = [0] * action_space
                one_hot[taken_action] = 1
                y.append(one_hot)

    print('{} examples were made.'.format(len(X)))
    return (
        np.array(X).reshape(-1, 1, obs_size),
        np.array(y).reshape(-1, 1, action_space),
    )

In [9]:
# Keyword arguments spell out what each of the three numbers controls.
X, y = initial_data(number_of_games=3000, game_turns=200, acceptable_score=60)

2986 examples were made.


In [10]:
# One epoch over the collected examples, with 20% of them held out for validation.
model.fit(
    x=X,
    y=y,
    epochs=1,
    verbose=2,
    validation_split=0.2,
)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 2388 samples, validate on 598 samples
Epoch 1/1
 - 0s - loss: 0.6736 - acc: 0.5942 - val_loss: 0.6679 - val_acc: 0.6154


<keras.callbacks.History at 0x13a6c44a8>

### Playing

In [8]:
def play_game(n_games, model=None):
    """Play ``n_games`` rendered episodes and print the score of each one.

    Args:
        n_games: number of episodes to play.
        model: optional object exposing ``predict``. When given, every
            action (including the first of each episode) is the argmax of
            its prediction for the current observation. When ``None``,
            actions are sampled uniformly at random.
    """
    try:
        for _ in range(n_games):
            # Keep the reset observation so the model also chooses the very
            # first action instead of falling back to a random one.
            prev_obs = env.reset()
            score = 0
            done = False
            while not done:
                env.render()
                if model is None:
                    action = env.action_space.sample()
                else:
                    # Choose the action the model rates highest for the
                    # current observation (state).
                    model_input = np.asarray(prev_obs).reshape(-1, 1, state_space)
                    action = int(np.argmax(model.predict(model_input)))
                prev_obs, reward, done, _ = env.step(action)
                score += reward

            print('Final score: {}'.format(score))
    finally:
        # Close once after all games, and also when a game is interrupted.
        env.close()

In [12]:
# Let the trained model play ten rendered games.
play_game(n_games=10, model=model)

Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
Final score: 200.0
