In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gym
import gym_classification

In [44]:
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.layers import Dense, Embedding
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras import backend as K

## Import data and pad sequences to a fixed input length

In [50]:
from keras.datasets import imdb

# Dataset parameters: `max_features` is passed as `num_words` to the loader,
# `maxlen` is the fixed sequence length used when padding.
max_features = 10000
maxlen = 100

# The loader hands back a (train, test) pair, each a (samples, labels) pair.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

In [51]:
# Give every sample the same length (`maxlen`), train split first, then test.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
)

In [46]:
# Training data handed to the RL classification environment.
X = x_train
y = y_train

# Seed NumPy's global RNG so the epsilon-greedy draws made during training
# (np.random.rand / np.random.randint) are repeatable from run to run.
# NOTE(review): this does not seed Keras weight initialisation or any RNG the
# environment may use internally -- seed those separately if full
# reproducibility is required.
SEED = 42
np.random.seed(SEED)

batch_size = 10

# Initialization of the environment
env = gym.make('gym_classification:RLClassification-v0')

# Fill values: hand the samples, labels and batch size to the environment
env.init_dataset(X, y, batch_size=batch_size)

# RL parameters
valid_actions = env.action_space
num_actions = valid_actions.n
print("Actions: {}".format(valid_actions))
epsilon = .1  # base exploration probability
num_episodes = 30
iterations_episode = 100

decay_rate = 0.99  # base of the exponential decay applied to epsilon
gamma = 0.001  # discount factor multiplying the bootstrapped Q-value in the target


# Network architecture: token embedding -> bidirectional LSTM (full sequence
# output) -> max-pool over the time axis -> small dense head with one output
# per available action.
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(Bidirectional(LSTM(128, recurrent_dropout=0.2, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.05))
model.add(Dense(20, activation="relu"))
model.add(Dense(num_actions, activation='softmax'))

# NOTE(review): the training loop fits this model on reward-based targets.
# With a softmax output and categorical cross-entropy those targets are
# treated as class probabilities rather than unbounded Q-values -- confirm
# this is intended (the usual DQN choice is a linear output with MSE loss).
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" after the table).
model.summary()


# History of per-episode totals; one entry is appended per recorded episode.
reward_chain = []
loss_chain = []


# Main loop: one pass per episode, `iterations_episode` batched steps each.
for epoch in range(num_episodes):
    loss = 0.
    total_reward_by_episode = 0
    # Reset the environment and fetch the first batch of states
    states = env.reset()

    # `done` is reported by env.step() below but is not used to end the
    # episode; every episode runs for a fixed number of iterations.
    done = False

    # Exploration probability. Starts above the 0.001 floor so that the first
    # iteration replaces it with the decayed epsilon.
    exploration = 1

    # Step counter is 1-based because it feeds the decay exponent.
    for i_iteration in range(1, iterations_episode + 1):

        # Decay exploration until it reaches the floor, then freeze it for
        # the remainder of this episode.
        if exploration > 0.001:
            exploration = epsilon * decay_rate ** (epoch * i_iteration)

        # Always evaluate the current states: these predictions are both the
        # greedy policy and the base of the training target, so they must
        # belong to the batch that is trained on (not to an earlier one).
        q = model.predict(states)

        # Epsilon-greedy action selection (one decision for the whole batch)
        if np.random.rand() <= exploration:
            # One random action per sample actually present in the batch
            actions = np.random.randint(0, num_actions, len(states))
        else:
            actions = np.argmax(q, axis=1)

        # Apply actions, get rewards and the next batch of states
        next_states, reward, done, _ = env.step(actions)
        q_prime = model.predict(next_states)

        # Q-learning target: reward + gamma * max_a' Q(next_state, a').
        # The bootstrap value comes from the NEXT-state predictions; only the
        # entry of the action actually taken is overwritten.
        sx = np.arange(len(actions))
        targets = reward + gamma * np.max(q_prime, axis=1)
        q[sx, actions] = targets

        # Train network, update loss. The model is compiled with a metric, so
        # train_on_batch returns [loss, accuracy]; keep the loss only.
        loss += model.train_on_batch(states, q)[0]

        # Update the state
        states = next_states

        total_reward_by_episode += int(np.sum(reward))

    # A final batch smaller than batch_size presumably means the environment
    # ran out of data: stop training. This episode's totals are not recorded.
    if next_states.shape[0] != batch_size:
        break
    reward_chain.append(total_reward_by_episode)
    loss_chain.append(loss)

    print("\rEpoch {:03d}/{:03d} | Loss {:4.4f} | Tot reward x episode {:03d} ".format(epoch,
          num_episodes ,loss, total_reward_by_episode))


Actions: Discrete(2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 256)         263168    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 256)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 20)                5140      
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 42        
Total params: 1,548,350
Trainable params: 1,548,350
Non-trainable params: 0
_____________________________________________

In [None]:
# Score the held-out test set with the trained network
from sklearn.metrics import classification_report

# One row of per-action outputs for every test sample ...
q_prime = model.predict(x_test)
# ... and the predicted label is the index of the largest output in each row.
predictions = q_prime.argmax(axis=1)

In [None]:
# Compare the test labels with the predicted labels and print the report
print(classification_report(y_true=y_test, y_pred=predictions))