In [4]:
# collection, defaultdict and lambda
import numpy as np
from collections import defaultdict

# A defaultdict built from a factory: the first access to a missing key stores
# a fresh np.ones(2) under that key, i.e. {key: np.array([1., 1.])}.
visited_pairs = defaultdict(lambda: np.ones(2))
print(visited_pairs[1][1])  # key 1 does not exist yet -> entry is created here
visited_pairs[1][1] = visited_pairs[1][1] + 1  # update one slot of the stored array
print(visited_pairs[1][1])

1.0
2.0


In [5]:
# With a defaultdict, looking up a missing key does not raise KeyError:
# the factory (here int, which returns 0) is called and its result is stored.
d = defaultdict(int)  # the argument is the factory used for missing keys
for letter in "abraccadabra":
    d[letter] = d[letter] + 1  # missing letters start from 0, no existence check needed
d.items()

dict_items([('a', 5), ('b', 2), ('r', 2), ('c', 2), ('d', 1)])

In [7]:
# replace verbose - debug log
# https://docs.python.org/3/howto/logging.html
import logging
import sys
# Configure the root logger: records of level DEBUG and above are written to sys.stderr.
logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)  # level is specified
# (sys.stderr is also what basicConfig uses when no stream/filename is given)
logger_1 = logging.getLogger(__name__)

logger_1.debug('A debug message!')
# Turn propagation off: records are no longer handed to the root logger's handler.
logger_1.propagate = False
# No handler is attached to logger_1 in this cell, so the next record only reaches
# logging's last-resort handler, whose threshold is WARNING -> this INFO is dropped.
logger_1.info('1/2 - We processed {} records'.format(len([1, 2])))
# Turn propagation back on: records reach the handlers of ancestor loggers again.
logger_1.propagate = True
logger_1.info('2/2 - We processed %d records', len([1, 2]))  # %-style arguments, formatted by logging itself

# Message 1/2 is missing from the output: it was emitted while propagation was off,
# and the fallback (last-resort) handler only emits WARNING and above.

DEBUG:__main__:A debug message!
INFO:__main__:2/2 - We processed 2 records


In [8]:
# Advanced logging
import logging
# NOTE: getLogger(__name__) returns the same logger object as logger_1 above,
# so the level set here applies to that shared logger.
logger_2 = logging.getLogger(__name__)
logger_2.setLevel(logging.INFO)  # lowest-severity log message this logger will handle

# Layout of each log record in the final output.
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Handlers send the log records to a destination (here: a file).
# Reuse an already attached FileHandler when this cell is re-run; otherwise every
# run would add one more handler and each record would be written several times.
file_handler = next(
    (h for h in logger_2.handlers if isinstance(h, logging.FileHandler)),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler("logfile.log")
    logger_2.addHandler(file_handler)
file_handler.setFormatter(formatter)

logger_2.info('2 - We processed %d records', len([1, 2]))

INFO:__main__:2 - We processed 2 records


In [9]:
# Tuple unpacking: one name per element on the left-hand side.
sar = ((20, 6, True), 0, 1.0)
state, done, reward = sar  # parentheses around the targets are optional
print(state)
print(reward)
print(*sar)  # the *-operator spreads the tuple into separate positional arguments
(state, done, reward) = tuple(sar)  # build a tuple from sar, then unpack it again
print(state)
print(reward)

(20, 6, True)
1.0
(20, 6, True) 0 1.0
(20, 6, True)
1.0


In [10]:
# product of elements in a list
import gym
import gym.spaces
env = gym.make('Blackjack-v0')
# Collect the size (.n) of every sub-space of the observation space, then multiply
# them together (presumably the number of distinct observations -- confirm).
sub_space_sizes = [sub_space.n for sub_space in env.observation_space.spaces]
nb_possible_states = np.prod(sub_space_sizes)

In [11]:
# One-line if/else (conditional expression): `A if condition else B`
reward = 0.5
print('You won :)\n' if reward > 0 else 'You lost :(\n')  # pick the message, print once
state = [20]
probs = [0.8, 0.2] if state[0] > 18 else [0.2, 0.8]
candidate_actions = np.arange(2)
action = np.random.choice(candidate_actions, p=probs)  # sample an action with the chosen probabilities

You won :)



In [None]:
# zip(*rows) transposes a list of 3-tuples into three parallel tuples
episode = [
    ((12, 10, False), 1, 0),
    ((13, 10, False), 1, 0),
    ((20, 10, False), 0, 1.0),
]
states, actions, rewards = zip(*episode)
print(actions)
gamma = 2
# one more power than there are rewards: gamma**0 .. gamma**len(rewards)
discounts = [gamma ** power for power in range(len(rewards) + 1)]
print(discounts)

In [None]:
# enumerate yields (index, item) pairs
for i, state in enumerate(states):
    # states[i] is the same as state; actions[i] is the action at the same position
    print(i, state, states[i], actions[i], sep="\n")

my_list = ['apple', 'banana', 'grapes', 'pear']
# the second argument (`start`) is the value the counter begins at
counter_list = list(enumerate(my_list, start=1))
print(counter_list)
# Output: [(1, 'apple'), (2, 'banana'), (3, 'grapes'), (4, 'pear')]

In [None]:
# Multiplicative decay with a floor: epsilon shrinks each step but never drops below eps_min
epsilon = 1
eps_decay = 0.999
eps_min = 0.05
for step in range(10):
    decayed = epsilon * eps_decay
    epsilon = decayed if decayed > eps_min else eps_min  # clip at the floor

In [None]:
# argmax per key: map every key of Q to the index of its largest value
Q = defaultdict(lambda: np.zeros(2))
Q[1][0], Q[1][1] = 0, 1
Q[2][0] = 2
policy = {
    state_key: np.argmax(action_values)
    for state_key, action_values in Q.items()
}
print(policy)

In [None]:
# Progress monitor: rewrite the same console line on every 10th episode
i_episode = 100
num_episodes = 1000
if not i_episode % 10:
    # "\r" moves back to the start of the line; end="" suppresses the newline;
    # flush=True writes the text out immediately instead of leaving it buffered.
    print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="", flush=True)

In [None]:
# Element-wise product of two array slices, then summed
rewards = np.array([1, 2, 3, 4])
discounts = np.array([1, 1, 1, 1, 1])
i = 1
rewards_from_i = rewards[i:]  # rewards at position i and later
discounts_trimmed = discounts[:-(1 + i)]  # drop the last (1 + i) entries so both slices have equal length
print(rewards_from_i)
print(discounts_trimmed)
print(sum(rewards_from_i * discounts_trimmed))

In [15]:
# str.rstrip() with no argument removes trailing whitespace (including "\n").
# NOTE(review): "/r" below is a plain slash followed by r, not the "\r" escape,
# so rstrip() leaves this string unchanged -- confirm that this is intended.
print("simon /r 1".rstrip())
'test string\n'.rstrip()

simon /r 1


'test string'

In [None]:
# For more than one dimension, np.zeros takes the shape as a single tuple
# (np.zeros(4, 12) would treat 12 as the dtype argument and fail).
V_opt = np.zeros(shape=(4, 12))
print(V_opt)

In [None]:
# reshape is an ndarray method (np.reshape is the function form); a bare
# `reshape(4, 12)` refers to an undefined name and raises NameError.
np.arange(48).reshape(4, 12)  # 48 values laid out as 4 rows x 12 columns

In [15]:
# np.linspace(start, stop, num): `num` evenly spaced values from start to stop
num_episodes = 100
avg_scores = [1, 2, 3, 4, 5]
np.linspace(0, num_episodes, num=len(avg_scores))  # one x-position per score

array([  0.,  25.,  50.,  75., 100.])

In [None]:
import check_test

In [None]:
# np.arange()
import random  # stdlib RNG; no visible cell imports it, so import it where it is used

# Pick one element of [0, 1, ..., env.action_space.n - 1] at random.
random.choice(np.arange(env.action_space.n))

In [7]:
# deque with maxlen: a bounded buffer that keeps only the most recent entries
from collections import deque
# idea: each entry of `avg_scores` is the mean of one batch of scores; once the
#       deque is full, appending a new mean silently discards the oldest one.
import numpy as np
num_episodes = 3
avg_scores = deque(maxlen=num_episodes)   # holds at most num_episodes averages
for tmp_scores in ([10, 12], [8, 10], [6, 8], [4, 6]):
    avg_scores.append(np.mean(tmp_scores))  # the 4th append pushes the 1st mean out
print(avg_scores)
print(np.asarray(avg_scores))  # note asarray (not array)

deque([9.0, 7.0, 5.0], maxlen=3)
[9. 7. 5.]


In [13]:
# Dot product of two 1-D arrays: 1*10 + 2*20
a = np.array([1, 2])
b = np.array([10, 20])
a.dot(b)  # method form of np.dot(a, b)

50

In [None]:
# initialize best average reward
import math  # provides math.inf; no visible cell imports it, so import it where it is used

# Negative infinity compares below every finite number.
best_avg_reward = -math.inf

In [22]:
import numpy as np
nA = 3
action_ids = np.arange(nA)  # candidate values 0 .. nA-1
print(action_ids)
prob = np.full(nA, 1 / nA)  # uniform distribution: every entry equals 1/nA
np.random.choice(action_ids, p=prob)

[0 1 2]


2

In [25]:
nA = 5
# Without p=..., np.random.choice samples uniformly from the given array (here 0 .. nA-1).
np.random.choice(np.arange(nA))

4

In [2]:
# https://gym.openai.com/docs/#spaces
import gym
import gym.spaces
# Create the CartPole environment and print its action and observation spaces.
env = gym.make('CartPole-v0')
print(env.action_space)
#> Discrete(2)
print(env.observation_space)
#> Box(4,)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Discrete(2)
Box(4,)


