In [1]:
class Q_learner_opt:
    """Tabular Q-learning agent over a discretised 4-dimensional observation.

    The agent keeps no learned state between calls: every call to ``train``
    builds a fresh, randomly initialised Q-table.

    Attributes:
        env: environment exposing ``reset()``, ``step(action)`` returning a
            4-tuple ``(observation, reward, done, info)`` and
            ``action_space.sample()``.
        state_space: number of observation dimensions (one table axis each).
        action_space: number of discrete actions (last table axis).
        alpha: learning rate used in the Q-value update.
        gamma: discount factor applied to the best next-state value.
        epsilon: initial exploration probability.
        decay_type: ``'epsilon'`` enables linear epsilon decay; any other
            value leaves epsilon constant.
        decay_steps: number of episodes over which epsilon moves from its
            initial value to ``decay_end``.
        decay_end: final epsilon value once decay has finished.
        random_policy: when truthy, every action is sampled at random.
    """

    def __init__(self, env, state_space, action_space, alpha, gamma, epsilon, decay_type, decay_steps, decay_end, random_policy):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.state_space = state_space
        self.action_space = action_space
        self.decay_type = decay_type
        self.decay_steps = decay_steps
        self.decay_end = decay_end
        self.random_policy = random_policy

    def Qtable(self, state_space, action_space, bin_size):
        """Create a randomly initialised Q-table and the bin edges used to index it.

        Args:
            state_space: number of observation dimensions.
            action_space: number of discrete actions.
            bin_size: number of bin edges (and table cells) per dimension.

        Returns:
            ``(q_table, bins)`` where ``q_table`` has shape
            ``(bin_size,) * state_space + (action_space,)`` with values drawn
            uniformly from [-1, 1), and ``bins`` is a list of four edge arrays.
        """
        # Edge ranges are hard-coded for exactly four dimensions
        # (presumably CartPole's position, velocity, angle, angular velocity -- confirm).
        bins = [np.linspace(-4.8, 4.8, bin_size),
                np.linspace(-4, 4, bin_size),
                np.linspace(-0.418, 0.418, bin_size),
                np.linspace(-4, 4, bin_size)]

        q_table = np.random.uniform(low=-1, high=1, size=([bin_size] * state_space + [action_space]))
        return q_table, bins

    def Discrete(self, state, bins):
        """Map a continuous observation to a tuple of Q-table indices.

        Values outside the edge range are clamped to the first / last cell.
        Without the clamp a value below the lowest edge yields index -1, which
        silently wraps around to the *last* cell of that axis.
        """
        index = []
        for value, edges in zip(state, bins):
            cell = int(np.digitize(value, edges)) - 1
            index.append(min(max(cell, 0), len(edges) - 1))
        return tuple(index)

    def train(self, episodes, bin_size=30, max_steps=500):
        """Run epsilon-greedy Q-learning and return the latest window of scores.

        Args:
            episodes: number of episodes to run.
            bin_size: bin edges per observation dimension for the Q-table.
            max_steps: value each episode's total reward is divided by
                (500 is the step cap for CartPole-v1, 200 for CartPole-v0).

        Returns:
            List of normalised episode scores for the most recent *complete*
            block of 100 episodes. Empty when ``episodes`` < 100; scores after
            the last multiple of 100 are not included.
        """
        scores_list, master_list = [], []
        q_table, bins = self.Qtable(self.state_space, self.action_space, bin_size)

        alpha = self.alpha
        gamma = self.gamma
        epsilon = self.epsilon
        decay_end = self.decay_end
        decay_epsilon = self.decay_type == 'epsilon'

        # Per-episode linear decay step. A non-positive decay_steps would
        # divide by zero, so treat it as "already fully decayed".
        decay_step = 0.0
        if decay_epsilon:
            if self.decay_steps > 0:
                decay_step = (epsilon - decay_end) / self.decay_steps
            else:
                epsilon = decay_end

        for episode in range(1, episodes + 1):
            score = 0
            current_state = self.Discrete(self.env.reset(), bins)
            done = False

            while not done:
                # Short-circuit keeps RNG usage identical: no uniform draw
                # is made when the policy is fully random.
                if self.random_policy or random.uniform(0, 1) < epsilon:
                    action = self.env.action_space.sample()  # explore
                else:
                    action = np.argmax(q_table[current_state])  # exploit

                observation, reward, done, _ = self.env.step(action)
                next_state = self.Discrete(observation, bins)
                score += reward

                # NOTE(review): the terminal transition is deliberately left
                # out of the table (as in the original); only non-terminal
                # steps are bootstrapped from the next state's best value.
                if not done:
                    max_future_q = np.max(q_table[next_state])
                    current_q = q_table[current_state + (action,)]
                    q_table[current_state + (action,)] = (
                        (1 - alpha) * current_q + alpha * (reward + gamma * max_future_q)
                    )

                current_state = next_state

            scores_list.append(score / max_steps)

            # Move epsilon one step towards decay_end and stop there, so it
            # can no longer overshoot the target (or go negative) when
            # episodes > decay_steps.
            if decay_epsilon:
                if decay_step >= 0:
                    epsilon = max(decay_end, epsilon - decay_step)
                else:
                    epsilon = min(decay_end, epsilon - decay_step)

            # Keep only the most recent complete block of 100 episodes.
            if episode % 100 == 0:
                master_list = scores_list
                scores_list = []

        return master_list

In [2]:
import random
from pathlib import Path

import gym
import numpy as np
import optuna
from IPython.display import clear_output

def objective(trial):
    """Optuna objective: train a Q-learner with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample ``alpha``, ``gamma``, ``epsilon``,
            ``decay_steps`` and ``decay_end``.

    Returns:
        Median of the normalised episode scores returned by
        ``Q_learner_opt.train`` (its most recent block of 100 episodes).
    """
    env = gym.make('CartPole-v1')
    try:
        # Fixed problem dimensions.
        state_space = 4
        action_space = 2

        # Hyperparameters and their search ranges.
        alpha = trial.suggest_float('alpha', 0.01, 0.3, log=True)
        gamma = trial.suggest_float('gamma', 0.85, 0.999)
        epsilon = trial.suggest_float('epsilon', 0.01, 0.3, log=True)
        decay_type = 'epsilon'
        # Lower bound is 1 rather than 0: the learner divides by decay_steps,
        # and a schedule shorter than one episode is degenerate.
        decay_steps = trial.suggest_float('decay_steps', 1, 20000)
        # Final epsilon can never exceed the sampled starting epsilon.
        decay_end = trial.suggest_float('decay_end', 0.0001, epsilon, log=False)
        random_policy = False

        model = Q_learner_opt(env=env, state_space=state_space, action_space=action_space,
                              alpha=alpha, gamma=gamma, epsilon=epsilon,
                              decay_type=decay_type, decay_steps=decay_steps,
                              decay_end=decay_end, random_policy=random_policy)

        # Train for the full budget; no intermediate values are reported to
        # the trial, so pruners have nothing to act on.
        value = model.train(20000)
    finally:
        # Release the environment even if training fails; one env is created per trial.
        env.close()

    return np.median(value)

  from .autonotebook import tqdm as notebook_tqdm


## Optimize hyperparameters

In [3]:
# Create (or resume) a persistent study stored in a local SQLite file and run 50 trials.
# `objective` never calls trial.report()/trial.should_prune(), so no trial is pruned
# regardless of the study's pruner; every trial trains to completion.
# For a throwaway in-memory study, drop `study_name`, `storage` and `load_if_exists`.
study = optuna.create_study(
    study_name="cartpole_opt_4",
    storage="sqlite:///cartpole_opt.db",
    direction='maximize',
    load_if_exists=True,  # re-running this cell resumes the study instead of raising DuplicatedStudyError
)
study.optimize(objective, n_trials=50)

[32m[I 2022-08-09 11:30:09,399][0m A new study created in RDB with name: cartpole_opt_4[0m
[32m[I 2022-08-09 11:31:05,292][0m Trial 0 finished with value: 0.147 and parameters: {'alpha': 0.015887784939561177, 'gamma': 0.9229460225549834, 'epsilon': 0.012461881350975547, 'decay_steps': 6116.695994087402, 'decay_end': 0.005281461133483823}. Best is trial 0 with value: 0.147.[0m
[32m[I 2022-08-09 11:31:41,327][0m Trial 1 finished with value: 0.125 and parameters: {'alpha': 0.014245282127494083, 'gamma': 0.962122700132542, 'epsilon': 0.044038252396981246, 'decay_steps': 11395.284011015043, 'decay_end': 0.03386912508795687}. Best is trial 0 with value: 0.147.[0m
[32m[I 2022-08-09 11:32:37,662][0m Trial 2 finished with value: 0.209 and parameters: {'alpha': 0.02251145536983653, 'gamma': 0.8759386045597057, 'epsilon': 0.16259405727928136, 'decay_steps': 18134.937101731888, 'decay_end': 0.11688950316213835}. Best is trial 2 with value: 0.209.[0m
[32m[I 2022-08-09 11:33:11,801][0m

[32m[I 2022-08-09 13:19:52,209][0m Trial 27 finished with value: 0.449 and parameters: {'alpha': 0.2155954190186733, 'gamma': 0.9693595090168338, 'epsilon': 0.2065231231364847, 'decay_steps': 10039.670625643232, 'decay_end': 0.19027906641863584}. Best is trial 6 with value: 1.0.[0m
  coefficient = 1 / z / p_accept
[32m[I 2022-08-09 13:21:19,933][0m Trial 28 finished with value: 0.228 and parameters: {'alpha': 0.13087717123960577, 'gamma': 0.9491959366529279, 'epsilon': 0.14036400003302052, 'decay_steps': 11645.418836376704, 'decay_end': 0.13489051630236626}. Best is trial 6 with value: 1.0.[0m
  coefficient = 1 / z / p_accept
[32m[I 2022-08-09 13:22:21,100][0m Trial 29 finished with value: 0.144 and parameters: {'alpha': 0.2256767162435639, 'gamma': 0.9177840422031593, 'epsilon': 0.10673400977698348, 'decay_steps': 7270.604913939907, 'decay_end': 0.011391638552495087}. Best is trial 6 with value: 1.0.[0m
[32m[I 2022-08-09 13:25:20,986][0m Trial 30 finished with value: 0.776 

  coefficient = 1 / z / p_accept
[32m[I 2022-08-09 14:14:10,974][0m Trial 46 finished with value: 0.639 and parameters: {'alpha': 0.20730588955540533, 'gamma': 0.9797762619367143, 'epsilon': 0.17776200814126786, 'decay_steps': 11713.430895369966, 'decay_end': 0.17039370012320673}. Best is trial 6 with value: 1.0.[0m
  coefficient = 1 / z / p_accept
  score = log_l - log_g
[32m[I 2022-08-09 14:14:49,896][0m Trial 47 finished with value: 0.073 and parameters: {'alpha': 0.0168290062170959, 'gamma': 0.9881772496923245, 'epsilon': 0.017007724362736544, 'decay_steps': 8597.00869705726, 'decay_end': 0.005792605331248911}. Best is trial 6 with value: 1.0.[0m
  coefficient = 1 / z / p_accept
[32m[I 2022-08-09 14:17:43,750][0m Trial 48 finished with value: 0.616 and parameters: {'alpha': 0.14984400518840874, 'gamma': 0.9611877150245821, 'epsilon': 0.10721009862142183, 'decay_steps': 6532.765203794343, 'decay_end': 0.08611913901110606}. Best is trial 6 with value: 1.0.[0m
  coefficient =

In [10]:
# Summarise the best trial: its objective value, then each tuned hyperparameter.
trial = study.best_trial
print("Best trial:", f"  Value:  {trial.value}", "  Params: ", sep="\n")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  Value:  1.0
  Params: 
    alpha: 0.23694065523262592
    decay_end: 0.017600229344677983
    decay_steps: 6278.965774396572
    epsilon: 0.09473808641392292
    gamma: 0.9613235873080813


In [11]:
# Save the completed trials to a CSV file, sorted by objective value (ascending).
results_path = Path('./optuna_results/opt_QL3.csv')
# to_csv does not create missing directories, so make sure the target folder exists.
results_path.parent.mkdir(parents=True, exist_ok=True)

df = (
    study.trials_dataframe()
    .drop(['datetime_start', 'datetime_complete', 'duration'], axis=1)  # timing columns are not needed
    .loc[lambda frame: frame['state'] == 'COMPLETE']                    # keep only trials that finished
    .drop('state', axis=1)                                              # constant after the filter
    .sort_values('value')                                               # 'value' is the objective (median normalised score)
)
df.to_csv(results_path, index=False)

## Visualization

In [12]:
# Slice plot of the study over the five tuned hyperparameters.
slice_params = ["alpha", "gamma", "epsilon", "decay_steps", "decay_end"]
fig = optuna.visualization.plot_slice(study, params=slice_params)
fig.show()

In [13]:
# Parallel-coordinate plot of the study over the five tuned hyperparameters.
parallel_params = ["alpha", "gamma", "epsilon", "decay_steps", "decay_end"]
fig = optuna.visualization.plot_parallel_coordinate(study, params=parallel_params)
fig.show()

In [14]:
# Hyperparameter-importance plot for the study.
importance_plot = optuna.visualization.plot_param_importances
fig = importance_plot(study)
fig.show()

In [15]:
# Contour plot of the study restricted to the alpha / gamma pair.
contour_params = ["alpha", "gamma"]
fig = optuna.visualization.plot_contour(study, params=contour_params)
fig.show()