In [1]:
# -*- coding: utf-8 -*-
import re
import time
import datetime 
import operator
import numpy as np
import pandas as pd 
import collections
import unicodedata
import collections
import seaborn as sns
import collections
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt

from tqdm import tqdm
from collections import Counter
from datetime import datetime, date, timedelta
from IPython.display import Image


%matplotlib inline

pylab.rcParams['figure.figsize'] = 10,8

plt.style.use('bmh')
colors = ['#348ABD', '#A60628', '#7A68A6', '#467821', '#D55E00', 
          '#CC79A7', '#56B4E9', '#009E73', '#F0E442', '#0072B2']

In [2]:
reward_table = [
    [ -250 , -15000, -405 , -15000],
    [-309 , -15000, -400 , 405  ],
    [-262 , -15000, -255 , 400  ],
    [-231 , -15000, -77  , 255  ],
    [-61  , -15000, 0    , 77   ],
    [0    , -15000, 0    , 0    ],
    [0    , -15000, -15000, 0    ],
    [-325 , 250  , -452 , -15000],
    [-270 , 309  , -325 , 452  ],
    [-200 , 262  , -190 , 325  ],
    [-125 , 231  , -10  , 190  ],
    [-2   , 61   , 0    , 10   ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , -15000, 0    ],
    [-192 , 325  , -390 , -15000],
    [-169 , 270  , -285 , -390 ],
    [-105 , 200  , -132 , 285  ],
    [-10  , 125  , -5   , 132  ],
    [0    , 2    , 0    , 5    ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , -15000, 0    ],
    [-117 , 192  , -350 , -15000],
    [-67  , 169  , -235 , 350  ],
    [-8   , 105  , -26  , 235  ],
    [0    , 10   , 0    , 26   ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , -15000, 0    ],
    [-38  , 117  , -250 , -15000],
    [0    , 67   , -148 , 250  ],
    [0    , 8    , -3   , 148  ],
    [0    , 0    , 0    , 3    ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , -15000, 0    ],
    [0    , 38   , -195 , -15000],
    [0    , 0    , -193 , 195  ],
    [0    , 0    , -5   , 193  ],
    [0    , 0    , 0    , 5    ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , 0    , 0    ],
    [0    , 0    , -15000, 0    ],
    [-15000, 0    , -255 , -15000],
    [-15000, 0    , -190 , 255  ],
    [-15000, 0    , -8   , 190  ],
    [-15000, 0    , 0    , 8   ],
    [-15000, 0    , 0    , 0    ],
    [-15000, 0    , 0    , 0    ],
    [-15000, 0    , -15000, 0    ]
]; 

In [3]:
def softmax(f):
    f = f - np.max(f)
    p = np.exp(f) / np.sum(np.exp(f))
    return p

In [4]:
softmax_reward_table = np.apply_along_axis(softmax, 1, reward_table)

In [5]:
def action_is_allowed(learner, state, action):

    if (action == 0 and not(state > learner.num_states - learner.servo_num_states - 1)):
        return True
    elif (action == 1 and not(state < learner.servo_num_states)):
        return True
    elif (action == 2 and not((state%learner.servo_num_states) == (learner.servo_num_states-1))):
        return True
    elif (action == 3 and not(state%learner.servo_num_states==0)):
        return True
    else:
        return False

In [6]:
class QLearner(object):
    def __init__(self, 
                 servo_num_states, 
                 num_actions, 
                 alpha, 
                 gamma, 
                 random_action_rate,
                 random_action_decay_rate, 
                 warm_up_period, 
                 action_penalty,
                 negative_reward_coef,
                 interest_zone_reward_coef,
                 initial_state,
                 scaling_point, 
                 scaling_factor):
        
        self.servo_num_states = servo_num_states
        self.num_states = servo_num_states**2
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.random_action_rate = random_action_rate
        self.random_action_decay_rate = random_action_decay_rate
        self.warm_up_period = warm_up_period
        self.state = initial_state
        self.action = 0
        self.action_penalty = action_penalty
        self.qtable = np.zeros((self.num_states, self.num_actions))
        self.num_iteration = 0
        self.last_reward = 0
        self.negative_reward_coef = negative_reward_coef
        self.interest_zone_reward_coef = interest_zone_reward_coef
        self.scaling_point = scaling_point
        self.scaling_factor = scaling_factor
        
    def set_initial_state(self, action):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = int(self.num_states/2)
        self.action = action #self.qtable[state].argsort()[-1]
        
    def get_next_state(self):
        
        next_state = None
        
        if (self.action == 0 and action_is_allowed(self, self.state, self.action)): 
            next_state = self.state + self.servo_num_states
        elif (self.action == 1 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state - self.servo_num_states
        elif (self.action == 2 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state + 1
        elif (self.action == 3 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state - 1 
        else:
            next_state = self.state;
            
        return next_state
        
    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with given reward and returns action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        state = self.state
        action = self.action
        qtable = self.qtable
        action_prime = -1

        greedy_action = self.qtable[state_prime].argsort()[-1]

        
        choose_random_action = (1 - self.random_action_rate) <= np.random.uniform(0, 1)

        if choose_random_action:
            action_prime = np.random.randint(0, self.num_actions)
        else:
            action_prime = greedy_action

        if self.num_iteration > self.warm_up_period: # warm up period is over
            self.random_action_rate *= self.random_action_decay_rate

        qtable[state, action] = qtable[state, action] + alpha * (reward + gamma * qtable[state_prime, greedy_action] - qtable[state, action])

        self.state = state_prime
        self.action = action_prime    
        self.qtable = qtable

In [7]:
class naive_QLearner(object):
    def __init__(self, 
                 servo_num_states, 
                 num_actions, 
                 alpha, 
                 gamma, 
                 random_action_rate,
                 random_action_decay_rate, 
                 warm_up_period, 
                 action_penalty,
                 negative_reward_coef,
                 interest_zone_reward_coef,
                 initial_state,
                 scaling_point, 
                 scaling_factor,
                 eligibility_lambda):
        
        self.servo_num_states = servo_num_states
        self.num_states = servo_num_states**2
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.random_action_rate = random_action_rate
        self.random_action_decay_rate = random_action_decay_rate
        self.warm_up_period = warm_up_period
        self.state = initial_state
        self.action = 0
        self.action_penalty = action_penalty
#         self.qtable = np.random.uniform(low=-1, high=1, size=(self.num_states, self.num_actions))
        self.qtable = np.zeros((self.num_states, self.num_actions))
#         self.qtable = np.ones((self.num_states, self.num_actions)) * 0
        self.num_iteration = 0
        self.last_reward = 0
        self.negative_reward_coef = negative_reward_coef
        self.interest_zone_reward_coef = interest_zone_reward_coef
        self.scaling_point = scaling_point
        self.scaling_factor = scaling_factor
        self.traces_table = np.zeros((self.num_states, self.num_actions))
        self.eligibility_lambda = eligibility_lambda
    
    def set_initial_state(self, action):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = int(self.num_states/2)
        self.action = action #self.qtable[state].argsort()[-1]
        
    def get_next_state(self):
        
        next_state = None
        
        if (self.action == 0 and action_is_allowed(self, self.state, self.action)): 
            next_state = self.state + self.servo_num_states
        elif (self.action == 1 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state - self.servo_num_states
        elif (self.action == 2 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state + 1
        elif (self.action == 3 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state - 1 
        else:
            next_state = self.state;
            
        return next_state
        
    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with given reward and returns action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        eligibility_lambda = self.eligibility_lambda
        state = self.state
        action = self.action
        qtable = self.qtable
        traces_table = self.traces_table
        action_prime = -1
        
            
        greedy_action = self.qtable[state_prime].argsort()[-1]
        
        choose_random_action = (1 - self.random_action_rate) <= np.random.uniform(0, 1)

        if choose_random_action:
            action_prime = np.random.randint(0, self.num_actions)
        else:
            action_prime = greedy_action

        if self.num_iteration > self.warm_up_period: # warm up period is over
            self.random_action_rate *= self.random_action_decay_rate

        theta = reward + gamma * qtable[state_prime, greedy_action] - qtable[state, action]
        traces_table[state, action] += 1 
        
        for s in xrange(self.num_states):
            for a in xrange(self.num_actions):
                qtable[s, a] = qtable[state, action] + alpha * theta * traces_table[s, a]
                traces_table[s, a] = gamma * eligibility_lambda * traces_table[s, a]

        self.state = state_prime
        self.action = action_prime    
        self.qtable = qtable

In [8]:
def get_final_reward(learner, reward_table, training_iteration):
    
    reward_list = []
    
#     learned_reward_table = np.zeros((learner.num_states, learner.num_actions))
    
    num_exploration_step = 200000
    
    verbose = True
    
    for iteration_step in xrange(training_iteration):

#         reward = reward_table[learner.state][learner.action] - learner.action_penalty

#         if learner.state % learner.servo_num_states > 2 and learner.state < 30:
#             reward = reward * learner.interest_zone_reward_coef if reward > 0 else reward
            
        if iteration_step < num_exploration_step: 
            
            reward = reward_table[learner.state][learner.action] 
#             random_noise = np.random.normal(0, np.abs(ideal_reward)*.25) if ideal_reward != 0 else 0 
#             noisy_reward = ideal_reward + random_noise - learner.action_penalty

#             if learner.state % learner.servo_num_states > 2 and learner.state < 30:
#                 reward = noisy_reward * learner.interest_zone_reward_coef if noisy_reward > 0 else noisy_reward * learner.negative_reward_coef
#             else:
#                 reward = noisy_reward

#             if learned_reward_table[learner.state][learner.action] != 0:
#                 learned_reward_table[learner.state][learner.action] = .5*(learned_reward_table[learner.state][learner.action] + reward)
#             else:
#                 learned_reward_table[learner.state][learner.action] = reward
            

            next_state = learner.get_next_state()

            learner.move(next_state, reward)

            learner.num_iteration += 1 

            if learner.num_iteration % learner.scaling_point == 0: learner.qtable *= learner.scaling_factor

            reward_list.append(reward)
        
        else:
            
#             if verbose:
#                 print "Begin simulation"
                
#             verbose = False
            
            reward = learned_reward_table[learner.state][learner.action]
            reward_list.append(reward)


            next_state = learner.get_next_state()
            learner.move(next_state, reward)
            learner.num_iteration += 1 

            if learner.num_iteration % learner.scaling_point == 0: 
                learner.qtable *= learner.scaling_factor
        
    return np.mean(reward_list)
    
    

In [9]:
def compute_equivalent_reward_table(params, num_simulation):
    
    reward_table_list = []
    
    for simulation_step in xrange(num_simulation):
    
        learner = QLearner(servo_num_states = 7,
                           num_actions=4,
                           alpha=float(params['alpha']),
                           gamma=float(params['gamma']),
                           random_action_rate= 1, #float(params['random_action_rate']),
                           random_action_decay_rate=float(params['random_action_decay_rate']),
                           warm_up_period= 1000, #int(params['warm_up_period']),
                           action_penalty=50,
                           negative_reward_coef=1, #float(params['negative_reward_coef']),
                           interest_zone_reward_coef = 1, #float(params['interest_zone_reward_coef']),
                           initial_state=10,
                           scaling_point = int(params['scaling_point']),
                           scaling_factor = float(params['scaling_factor']))
    
        learned_reward_table = np.zeros((learner.num_states, learner.num_actions))

        for iteration_step in xrange(learner.warm_up_period): 

            ideal_reward = reward_table[learner.state][learner.action] 
            random_noise = np.random.normal(0, np.abs(ideal_reward)*.25) if ideal_reward != 0 else 0 
            noisy_reward = ideal_reward + random_noise #- learner.action_penalty

    #         if learner.state % learner.servo_num_states > 2 and learner.state < 30:
    #             reward = noisy_reward * learner.interest_zone_reward_coef if noisy_reward > 0 else noisy_reward * learner.negative_reward_coef
    #         else:
    #             reward = noisy_reward

            reward = noisy_reward

            if learned_reward_table[learner.state][learner.action] != 0:
                learned_reward_table[learner.state][learner.action] = .5*(learned_reward_table[learner.state][learner.action] + reward)
            else:
                learned_reward_table[learner.state][learner.action] = reward

            next_state = learner.get_next_state()

            learner.move(next_state, learned_reward_table[learner.state][learner.action])

            learner.num_iteration += 1 

            if learner.num_iteration % learner.scaling_point == 0: learner.qtable *= learner.scaling_factor
            
        reward_table_list.append(learned_reward_table)
        

        return reward_table_list

In [10]:
best_params =  {'warm_up_period': 50, 
                'random_action_decay_rate': 0.2255776270452332, 
                'scaling_factor': 4.216451025227167, 
                'alpha': 0.939846274031598, 
                'scaling_point': 220, 
                'gamma': 0.2168574129380133, 
                'random_action_rate': 0.8730281965950317}

reward_table_list = compute_equivalent_reward_table(best_params, num_simulation=1)

In [11]:
from hyperopt import hp, fmin, tpe, hp, STATUS_OK, Trials
import datetime

best_score = np.inf

def score(params):
    
    global best_score    
    
    num_iteration = 2
    
    loss_vector = []
    
    for i in xrange(num_iteration):
    
        learner = naive_QLearner(servo_num_states = 7,
                               num_actions=4,
                               alpha=float(params['alpha']),
                               gamma=float(params['gamma']),
                               random_action_rate= float(params['random_action_rate']),
                               random_action_decay_rate=float(params['random_action_decay_rate']),
                               warm_up_period=int(params['warm_up_period']),
                               action_penalty=50,
                               negative_reward_coef=1, #float(params['negative_reward_coef']),
                               interest_zone_reward_coef = 1, #float(params['interest_zone_reward_coef']),
                               initial_state=24,
                               scaling_point = int(params['scaling_point']),
                               scaling_factor = float(params['scaling_factor']),
                               eligibility_lambda = float(params['eligibility_lambda']))
        #learner.set_initial_state(action=0)

        temp_loss = - get_final_reward(learner, reward_table, training_iteration = 25000) # negative because we want to minimize loss
        loss_vector.append(temp_loss)
        
    loss = np.mean(loss_vector)
    
    if loss < best_score: 
        print "Searching...", 
        print "New best score: {0:.6f}".format(-loss), 
        print params 
        print datetime.datetime.now().time()
        print
        best_score = loss

    return {'loss': loss, 'status': STATUS_OK}
    
def optimize(trials):
        
    space = {
             'alpha': hp.uniform('alpha', 0.1, 1),
             'gamma': hp.uniform('gamma', 0.1, 1),
             'random_action_rate': hp.uniform('random_action_rate', 0.8,1), 
             'random_action_decay_rate': hp.uniform('random_action_decay_rate', 0.1, 1),
             'warm_up_period': hp.choice('warm_up_period', np.arange(50,300,25)),
             'scaling_point': hp.choice('scaling_point', np.arange(5,300, 5)),
             'scaling_factor': hp.uniform('scaling_factor', 1., 5.),
             'eligibility_lambda': hp.uniform('eligibility_lambda', 0, 1)
#              'initial_state': hp.quniform('initial_state', 0, 47, 1)
#              'negative_reward_coef': hp.uniform('negative_reward_coef', 1, 1.5),
#              'interest_zone_reward_coef': hp.uniform('interest_zone_reward_coef', 1, 1.5)
            }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=150)

    print '------------------------' 
    print "Done."
    print "Best parameter setting:", best
    return best

In [None]:
%%time

#Trials object where the history of search will be stored
trials = Trials()

best_params = optimize(trials)

Searching... New best score: -7470.700180 {'warm_up_period': 50, 'scaling_factor': 3.346332373509635, 'eligibility_lambda': 0.32421081007985597, 'random_action_decay_rate': 0.31441926634329626, 'random_action_rate': 0.9233598188262387, 'alpha': 0.45407786670202144, 'scaling_point': 65, 'gamma': 0.9335975674063369}
21:48:32.988520

Searching... New best score: -4551.937600 {'warm_up_period': 175, 'scaling_factor': 2.580454309437858, 'eligibility_lambda': 0.044483116839559855, 'random_action_decay_rate': 0.9534692600560306, 'random_action_rate': 0.9203631301409738, 'alpha': 0.9163655519375941, 'scaling_point': 210, 'gamma': 0.9134186918860369}
21:48:53.107709

Searching... New best score: -3460.350780 {'warm_up_period': 200, 'scaling_factor': 3.8158163450920193, 'eligibility_lambda': 0.196200306662471, 'random_action_decay_rate': 0.6041113279034883, 'random_action_rate': 0.9826860881511794, 'alpha': 0.7476330592887798, 'scaling_point': 40, 'gamma': 0.5858346688513003}
21:49:35.622095

Se

In [None]:
def test_crawling(params, reward_table, training_iteration = 4000):
    
    state_vector = np.zeros(49)
    action_vector = np.zeros(4)
    reward_vector = []
    
    temp_reward = []

    learner = QLearner(servo_num_states = 7,
               num_actions=4,
               alpha=float(params['alpha']),
               gamma=float(params['gamma']),
               random_action_rate=float(params['random_action_rate']),
               random_action_decay_rate=float(params['random_action_decay_rate']),
               warm_up_period=int(params['warm_up_period']),
               action_penalty=50,
               negative_reward_coef=1.,#float(params['negative_reward_coef']),
               interest_zone_reward_coef = 1, #1.32, #float(params['interest_zone_reward_coef']),
               initial_state=24,
               scaling_point = int(params['scaling_point']),
               scaling_factor = float(params['scaling_factor']))
    
    for iteration_step in xrange(training_iteration):
    
        state_vector[learner.state] += 1
        action_vector[learner.action] += 1 
    
        reward = reward_table[learner.state][learner.action] - learner.action_penalty
        temp_reward.append(reward)

        next_state = learner.get_next_state()
        learner.move(next_state, reward)
        learner.num_iteration += 1 

        if learner.num_iteration % learner.scaling_point == 0: 
            learner.qtable *= learner.scaling_factor

        if iteration_step > training_iteration-30:
            print learner.state 
 
    
    print "\nReward per action: ",np.mean(temp_reward)

    
    normalize_state_vector = (state_vector/np.sum(state_vector)) * 100.
    state_map = pd.DataFrame(normalize_state_vector.reshape(learner.servo_num_states, -1))
    sns.heatmap(state_map, linewidths=1, annot=True)
    plt.title('State map')
    
    normalize_action_vector = (action_vector/np.sum(action_vector)) * 100.

best_params =  {'warm_up_period': 12.0,
 'random_action_decay_rate': 0.15894806640577777, 
'scaling_factor': 1.1080371257490829, 
'alpha': 0.999473040776306, 
'scaling_point': 75, 
'gamma': 0.9669250552091386, 
'random_action_rate': 0.8673007680007774} 

test_crawling(best_params, reward_table)

In [None]:
def test_crawling_naive_Q(params, reward_table, training_iteration = 500):
    
    state_vector = np.zeros(49)
    action_vector = np.zeros(4)
    reward_vector = []
    
    temp_reward = []

    learner = naive_QLearner(servo_num_states = 7,
                               num_actions=4,
                               alpha=float(params['alpha']),
                               gamma=float(params['gamma']),
                               random_action_rate=float(params['random_action_rate']),
                               random_action_decay_rate=float(params['random_action_decay_rate']),
                               warm_up_period=int(params['warm_up_period']),
                               action_penalty=50,
                               negative_reward_coef=1.,#float(params['negative_reward_coef']),
                               interest_zone_reward_coef = 1, #1.32, #float(params['interest_zone_reward_coef']),
                               initial_state=24,
                               scaling_point = int(params['scaling_point']),
                               scaling_factor = float(params['scaling_factor']),
                               eligibility_lambda = .5)
    
#     learner = QLearner(servo_num_states = 7,
#                num_actions=4,
#                alpha=float(params['alpha']),
#                gamma=float(params['gamma']),
#                random_action_rate=float(params['random_action_rate']),
#                random_action_decay_rate=float(params['random_action_decay_rate']),
#                warm_up_period=int(params['warm_up_period']),
#                action_penalty=50,
#                negative_reward_coef=1.,#float(params['negative_reward_coef']),
#                interest_zone_reward_coef = 1, #1.32, #float(params['interest_zone_reward_coef']),
#                initial_state=24,
#                scaling_point = int(params['scaling_point']),
#                scaling_factor = float(params['scaling_factor']))
        
    for iteration_step in xrange(training_iteration):
    
        state_vector[learner.state] += 1
        action_vector[learner.action] += 1 
    
        reward = reward_table[learner.state][learner.action] - learner.action_penalty
        temp_reward.append(reward)

        next_state = learner.get_next_state()
        learner.move(next_state, reward)
        learner.num_iteration += 1 

#         if learner.num_iteration % learner.scaling_point == 0: 
#             learner.qtable *= learner.scaling_factor

        if iteration_step > training_iteration-30:
            print learner.state 
 
    
    print "\nReward per action: ",np.mean(temp_reward)

    
    normalize_state_vector = (state_vector/np.sum(state_vector)) * 100.
    state_map = pd.DataFrame(normalize_state_vector.reshape(learner.servo_num_states, -1))
    sns.heatmap(state_map, linewidths=1, annot=True)
    plt.title('State map')
    
    normalize_action_vector = (action_vector/np.sum(action_vector)) * 100.

best_params =  {'warm_up_period': 12.0,
 'random_action_decay_rate': 0.15894806640577777, 
'scaling_factor': 1.1080371257490829, 
'alpha': 0.999473040776306, 
'scaling_point': 75, 
'gamma': 0.9669250552091386, 
'random_action_rate': 0.8673007680007774} 

test_crawling_naive_Q(best_params, reward_table)

In [None]:
def crawling_simulation(params, reward_table, training_iteration = 50000, num_simulation = 1):
    
    state_vector = np.zeros(49)
    action_vector = np.zeros(4)
    reward_vector = []
    
    
    for simulation in tqdm(xrange(num_simulation)):
        
        temp_reward = []

        learner = QLearner(servo_num_states = 7,
                   num_actions=4,
                   alpha=float(params['alpha']),
                   gamma=float(params['gamma']),
                   random_action_rate=float(params['random_action_rate']),
                   random_action_decay_rate=float(params['random_action_decay_rate']),
                   warm_up_period=int(params['warm_up_period']),
                   action_penalty=50,
                   negative_reward_coef=1.,#float(params['negative_reward_coef']),
                   interest_zone_reward_coef = 1, #1.32, #float(params['interest_zone_reward_coef']),
                   initial_state=24,
                   scaling_point = int(params['scaling_point']),
                   scaling_factor = float(params['scaling_factor']))#int(params['initial_state'])) #float(params['action_penalty']))

        exploration_learner = QLearner(servo_num_states = 7,
                                       num_actions=4,
                                       alpha=float(params['alpha']),
                                       gamma=float(params['gamma']),
                                       random_action_rate= 1, #float(params['random_action_rate']),
                                       random_action_decay_rate=float(params['random_action_decay_rate']),
                                       warm_up_period= 1500, #int(params['warm_up_period']),
                                       action_penalty=50,
                                       negative_reward_coef=1, #float(params['negative_reward_coef']),
                                       interest_zone_reward_coef = 1, #float(params['interest_zone_reward_coef']),
                                       initial_state=10,
                                       scaling_point = int(params['scaling_point']),
                                       scaling_factor = float(params['scaling_factor']))
        
        
        num_exploration_step = exploration_learner.warm_up_period
        
        learned_reward_table = np.zeros((learner.num_states, learner.num_actions))
        
        previous_reward = None
    
        for iteration_step in xrange(training_iteration):
            
    
            if iteration_step > training_iteration-30 and simulation == num_simulation-1:
                print learner.state 
            
            if iteration_step < num_exploration_step:
            
                ideal_reward = reward_table[exploration_learner.state][exploration_learner.action]                 
                random_noise = np.random.normal(0, np.abs(ideal_reward)*.25) if ideal_reward != 0 else 0 
                noisy_reward = ideal_reward + random_noise - exploration_learner.action_penalty
                    
                reward = noisy_reward
            
                if learned_reward_table[exploration_learner.state][exploration_learner.action] != 0:
                    learned_reward_table[exploration_learner.state][exploration_learner.action] = .5*(learned_reward_table[exploration_learner.state][exploration_learner.action] + reward)
                else:
                    learned_reward_table[exploration_learner.state][exploration_learner.action] = reward
                
                
                #temp_reward.append(learned_reward_table[learner.state][learner.action])

                next_state = exploration_learner.get_next_state()
            
                exploration_learner.move(next_state, learned_reward_table[exploration_learner.state][exploration_learner.action])

                exploration_learner.num_iteration += 1 

                if exploration_learner.num_iteration % exploration_learner.scaling_point == 0: 
                    exploration_learner.qtable *= exploration_learner.scaling_factor

            else:
                
                state_vector[learner.state] += 1
                action_vector[learner.action] += 1     
                    
                reward = learned_reward_table[learner.state][learner.action] - learner.action_penalty
                temp_reward.append(reward)
                
                next_state = learner.get_next_state()
                learner.move(next_state, reward)
                learner.num_iteration += 1 

                if learner.num_iteration % learner.scaling_point == 0: 
                    learner.qtable *= learner.scaling_factor
                
                    
        reward_vector.append(np.mean(temp_reward))
            
        
    normalize_state_vector = (state_vector/np.sum(state_vector)) * 100.
    state_map = pd.DataFrame(normalize_state_vector.reshape(learner.servo_num_states, -1))
    sns.heatmap(state_map, linewidths=1, annot=True)
    plt.title('State map after {} simulations in %'.format(num_simulation))
    
    normalize_action_vector = (action_vector/np.sum(action_vector)) * 100.
    
    #print len(reward_vector)
    print "\nReward per action: ",np.mean(reward_vector)
    #print normalize_action_vector
    
    return learner, learned_reward_table



best_params = {'warm_up_period': 96.0, 
               'scaling_factor': 1.6944560688012607, 
               'random_action_decay_rate': 0.545267375797949, 
               'random_action_rate': 0.10056735562002306, 
               'alpha': 0.6227095466939823, 
               'scaling_point': 190, 
               'gamma': 0.932459803334474}

best_params =   {'warm_up_period': 90.0, 
                 'scaling_factor': 3.6044006257743333, 
                 'random_action_decay_rate': 0.824709435216204, 
                 'random_action_rate': 0.834614308862227, 
                 'alpha': 0.3092353715331193, 
                 'scaling_point': 275, 
                 'gamma': 0.37419656297319936}

# best_params =  {'warm_up_period': 150, 
#                 'random_action_decay_rate': 0.3887743156769585, 
#                 'scaling_factor': 1.1566975970917208, 
#                 'alpha': 0.6187041752108351, 
#                 'scaling_point': 295, 
#                 'gamma': 0.9745242012202543, 
#                 'random_action_rate': 0.9449033859491366}

# best_params =  {'warm_up_period': 50, 
#                 'random_action_decay_rate': 0.2255776270452332, 
#                 'scaling_factor': 4.216451025227167, 
#                 'alpha': 0.939846274031598, 
#                 'scaling_point': 220, 
#                 'gamma': 0.2168574129380133, 
#                 'random_action_rate': 0.8730281965950317}

best_params = {'warm_up_period': 50, 'random_action_decay_rate': 0.9694900649014736, 'scaling_factor': 4.583933783369448, 'alpha': 0.1011166769702368, 'scaling_point': 75, 'gamma': 0.6864430566866447, 'random_action_rate': 0.8397212427297416}
learner, learned_reward_table = crawling_simulation(best_params, reward_table)

In [None]:
learned_reward_table[14]

In [None]:
reward_table[14]

In [None]:
class naive_QLearner(object):
    def __init__(self, 
                 servo_num_states, 
                 num_actions, 
                 alpha, 
                 gamma, 
                 random_action_rate,
                 random_action_decay_rate, 
                 warm_up_period, 
                 action_penalty,
                 negative_reward_coef,
                 interest_zone_reward_coef,
                 initial_state,
                 scaling_point, 
                 scaling_factor,
                 eligibility_lambda):
        
        self.servo_num_states = servo_num_states
        self.num_states = servo_num_states**2
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.random_action_rate = random_action_rate
        self.random_action_decay_rate = random_action_decay_rate
        self.warm_up_period = warm_up_period
        self.state = initial_state
        self.action = 0
        self.action_penalty = action_penalty
#         self.qtable = np.random.uniform(low=-1, high=1, size=(self.num_states, self.num_actions))
        self.qtable = np.zeros((self.num_states, self.num_actions))
#         self.qtable = np.ones((self.num_states, self.num_actions)) * 0
        self.num_iteration = 0
        self.last_reward = 0
        self.negative_reward_coef = negative_reward_coef
        self.interest_zone_reward_coef = interest_zone_reward_coef
        self.scaling_point = scaling_point
        self.scaling_factor = scaling_factor
        self.traces_table = np.zeros((self.num_states, self.num_actions))
        self.eligibility_lambda = eligibility_lambda
    
    def set_initial_state(self, action):
        """
        @summary: Sets the initial state and returns an action
        @param state: The initial state
        @returns: The selected action
        """
        self.state = int(self.num_states/2)
        self.action = action #self.qtable[state].argsort()[-1]
        
    def get_next_state(self):
        
        next_state = None
        
        if (self.action == 0 and action_is_allowed(self, self.state, self.action)): 
            next_state = self.state + self.servo_num_states
        elif (self.action == 1 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state - self.servo_num_states
        elif (self.action == 2 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state + 1
        elif (self.action == 3 and action_is_allowed(self, self.state, self.action)):
            next_state = self.state - 1 
        else:
            next_state = self.state;
            
        return next_state
        
    def move(self, state_prime, reward):
        """
        @summary: Moves to the given state with given reward and returns action
        @param state_prime: The new state
        @param reward: The reward
        @returns: The selected action
        """
        alpha = self.alpha
        gamma = self.gamma
        eligibility_lambda = self.eligibility_lambda
        state = self.state
        action = self.action
        qtable = self.qtable
        traces_table = self.traces_table
        action_prime = -1
        
            
        greedy_action = self.qtable[state_prime].argsort()[-1]
        
        choose_random_action = (1 - self.random_action_rate) <= np.random.uniform(0, 1)

        if choose_random_action:
            action_prime = np.random.randint(0, self.num_actions)
        else:
            action_prime = greedy_action

        if self.num_iteration > self.warm_up_period: # warm up period is over
            self.random_action_rate *= self.random_action_decay_rate

        theta = reward + gamma * qtable[state_prime, greedy_action] - qtable[state, action]
        traces_table[state, action] += 1 
        
        for s in xrange(self.num_states):
            for a in xrange(self.num_actions):
                qtable[s, a] = qtable[state, action] + alpha * theta * traces_table[s, a]
                traces_table[s, a] = gamma * eligibility_lambda * traces_table[s, a]

        self.state = state_prime
        self.action = action_prime    
        self.qtable = qtable

In [None]:
def test_crawling_naive_Q(params, reward_table, training_iteration = 500):
    
    state_vector = np.zeros(49)
    action_vector = np.zeros(4)
    reward_vector = []
    
    temp_reward = []

    learner = naive_QLearner(servo_num_states = 7,
                               num_actions=4,
                               alpha=float(params['alpha']),
                               gamma=float(params['gamma']),
                               random_action_rate=float(params['random_action_rate']),
                               random_action_decay_rate=float(params['random_action_decay_rate']),
                               warm_up_period=int(params['warm_up_period']),
                               action_penalty=50,
                               negative_reward_coef=1.,#float(params['negative_reward_coef']),
                               interest_zone_reward_coef = 1, #1.32, #float(params['interest_zone_reward_coef']),
                               initial_state=24,
                               scaling_point = int(params['scaling_point']),
                               scaling_factor = float(params['scaling_factor']),
                               eligibility_lambda = .5)
    
#     learner = QLearner(servo_num_states = 7,
#                num_actions=4,
#                alpha=float(params['alpha']),
#                gamma=float(params['gamma']),
#                random_action_rate=float(params['random_action_rate']),
#                random_action_decay_rate=float(params['random_action_decay_rate']),
#                warm_up_period=int(params['warm_up_period']),
#                action_penalty=50,
#                negative_reward_coef=1.,#float(params['negative_reward_coef']),
#                interest_zone_reward_coef = 1, #1.32, #float(params['interest_zone_reward_coef']),
#                initial_state=24,
#                scaling_point = int(params['scaling_point']),
#                scaling_factor = float(params['scaling_factor']))
        
    for iteration_step in xrange(training_iteration):
    
        state_vector[learner.state] += 1
        action_vector[learner.action] += 1 
    
        reward = reward_table[learner.state][learner.action] - learner.action_penalty
        temp_reward.append(reward)

        next_state = learner.get_next_state()
        learner.move(next_state, reward)
        learner.num_iteration += 1 

#         if learner.num_iteration % learner.scaling_point == 0: 
#             learner.qtable *= learner.scaling_factor

        if iteration_step > training_iteration-30:
            print learner.state 
 
    
    print "\nReward per action: ",np.mean(temp_reward)

    
    normalize_state_vector = (state_vector/np.sum(state_vector)) * 100.
    state_map = pd.DataFrame(normalize_state_vector.reshape(learner.servo_num_states, -1))
    sns.heatmap(state_map, linewidths=1, annot=True)
    plt.title('State map')
    
    normalize_action_vector = (action_vector/np.sum(action_vector)) * 100.

best_params =  {'warm_up_period': 12.0,
 'random_action_decay_rate': 0.15894806640577777, 
'scaling_factor': 1.1080371257490829, 
'alpha': 0.999473040776306, 
'scaling_point': 75, 
'gamma': 0.9669250552091386, 
'random_action_rate': 0.8673007680007774} 

test_crawling_naive_Q(best_params, reward_table)