### Setting The Environment

In [1]:
import os
import gym
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [2]:
environment_name = 'CartPole-v1'
env = gym.make(environment_name)

Test the environment by random sampling.

In [3]:
episodes = 5
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print(f'Episode:{episode} Score:{score}')
env.close()

Episode:1 Score:22.0
Episode:2 Score:14.0
Episode:3 Score:27.0
Episode:4 Score:17.0
Episode:5 Score:42.0


In [4]:
log_path = os.path.join('Training', 'Logs') #deifne directories

In [5]:
log_path

'Training\\Logs'

### Set DQN model (Default)
#### 2 layers with 64 units
Parameters are updated by stochastic gradient descent (or a variant)
by minimizing the square loss.

In [6]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda:env])
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log= log_path) #MLP--2 layers of 64...no fancy stuff like lstm
#verbose: log our observations

Using cpu device


In [7]:
DQN??

[1;31mInit signature:[0m
[0mDQN[0m[1;33m([0m[1;33m
[0m    [0mpolicy[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mstr[0m[1;33m,[0m [0mType[0m[1;33m[[0m[0mstable_baselines3[0m[1;33m.[0m[0mdqn[0m[1;33m.[0m[0mpolicies[0m[1;33m.[0m[0mDQNPolicy[0m[1;33m][0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0menv[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mgym[0m[1;33m.[0m[0mcore[0m[1;33m.[0m[0mEnv[0m[1;33m,[0m [0mstable_baselines3[0m[1;33m.[0m[0mcommon[0m[1;33m.[0m[0mvec_env[0m[1;33m.[0m[0mbase_vec_env[0m[1;33m.[0m[0mVecEnv[0m[1;33m,[0m [0mstr[0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0mlearning_rate[0m[1;33m:[0m [0mUnion[0m[1;33m[[0m[0mfloat[0m[1;33m,[0m [0mCallable[0m[1;33m[[0m[1;33m[[0m[0mfloat[0m[1;33m][0m[1;33m,[0m [0mfloat[0m[1;33m][0m[1;33m][0m [1;33m=[0m [1;36m0.0001[0m[1;33m,[0m[1;33m
[0m    [0mbuffer_size[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m1000000[0m[1;33m,[0m[1;33m
[0

In [9]:
model.learn(total_timesteps=150000)

Logging to Training\Logs\DQN_10
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.993    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7412     |
|    time_elapsed     | 0        |
|    total_timesteps  | 106      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.988    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8214     |
|    time_elapsed     | 0        |
|    total_timesteps  | 191      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.984    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 7910     |
|    time_elapsed     | 0        |
|    total_timesteps  | 260      |
----------------------------------
-----------------------

<stable_baselines3.dqn.dqn.DQN at 0x17f08cb5fa0>

Save the Model(DQN)

In [10]:
DQN_Path = os.path.join('Training','Saved Models', 'DQN_Model_Cartpole')

In [11]:
model.save(DQN_Path)

In [12]:
evaluate_policy(model, env, n_eval_episodes=10, render = True)



(260.0, 61.27968668327213)

In [13]:
env.close()

Use the model to predict the next move for given state

In [30]:
reward_ = []
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset() #Reset the environment
    done = False
    score = 0

    while not done:
        env.render()
        action, _ = model.predict(obs) #use the prediction from trained model
        obs, reward, done, info = env.step(action)
        reward_.append(reward)
        score+=reward
    print(f'Episode:{episode} Score:{score}')
env.close()

Episode:1 Score:[222.]
Episode:2 Score:[206.]
Episode:3 Score:[256.]
Episode:4 Score:[212.]
Episode:5 Score:[256.]


In [28]:
net_arch = [dict(net_arch=dict(pi=[128, 128], vf=[128,128]))]

In [29]:
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of SymInts size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
