# Reinforcement Learning

In [1]:
import numpy as np 
import random

In [27]:
# Step 1: Define the states and the actions
# Actions available to the agent at every step.
actions = ['ON', 'OFF']
# States are the integer temperatures 16..30 (the arange upper bound 31 is exclusive).
states = np.arange(16, 31)

In [28]:
# Show the state space and the action set, one per line.
for shown in (states, actions):
    print(shown)

[16 17 18 19 20 21 22 23 24 25 26 27 28 29 30]
['ON', 'OFF']


In [4]:
# Step 2: Create the Q-table (alpha, gamma, epsilon and episodes are set separately)
# Q[s, a] -- s: state index (row), a: action index (column); every entry starts at 0.
q_shape = (len(states), len(actions))
Q = np.zeros(q_shape)

In [20]:
# Hyperparameters used by the training loop
alpha = 0.1     # (0-1) step size: fraction of the update error applied to a Q entry
gamma = 0.9     # (0-1) discount: weight given to the best value of the next state
epsilon = 0.2   # probability of picking a random action instead of the current best
episodes = 300  # number of training episodes to run

In [7]:
# Step 3: Reward function
def get_reward(temp, action):
    """Return the reward for being at `temp` after choosing `action`.

    A temperature inside the 20-24 band (both ends included) is worth +10,
    anything else -5. Choosing 'ON' subtracts a further 2 from that score.
    """
    band_score = 10 if 20 <= temp <= 24 else -5
    on_cost = 2 if action == 'ON' else 0
    return band_score - on_cost


In [16]:
# Step 4: Environment dynamics (how the temperature changes)
def next_temp(temp, action):
    """Sample the temperature that follows `temp` once `action` is taken.

    'ON' lowers the temperature by 1 or 2 (picked at random); any other action
    raises it by 0, 1 or 2. The result is returned as a plain int within 16..30.
    """
    if action == 'ON':
        delta = -random.choice([1, 2])
    else:
        delta = random.choice([0, 1, 2])
    # np.clip keeps the value inside the range: above 30 -> 30, below 16 -> 16.
    bounded = np.clip(temp + delta, 16, 30)
    return int(bounded)

In [21]:
# Step 5: Training loop (tabular Q-learning)
# Every episode starts at a random temperature and follows the environment for
# up to 20 steps, updating one Q entry per step.
for ep in range(episodes):
    # int() converts the numpy scalar picked from `states` to a plain Python int.
    temp = int(random.choice(states))
    for _ in range(20):  # limit steps per episode
        s = temp - 16  # Q row for the current temperature (states start at 16)

        # Choose action (epsilon-greedy): explore with probability epsilon,
        # otherwise take the action with the highest Q value in this state.
        if random.uniform(0, 1) < epsilon:
            action = random.choice(actions)
        else:
            action = actions[np.argmax(Q[s])]

        next_state = next_temp(temp, action)
        # The reward is evaluated on the temperature reached, not the one left.
        reward = get_reward(next_state, action)
        a = actions.index(action)
        best_next = np.max(Q[next_state - 16])
        # Move Q[s, a] toward (reward + discounted best next value) by a fraction alpha.
        Q[s, a] += alpha * (reward + gamma * best_next - Q[s, a])

        # Advance the environment. Without this every step of the episode would
        # restart from the same starting temperature instead of following a trajectory.
        temp = next_state

print ("Training Done")

Training Done


In [29]:
# Test the learned policy: follow the highest-valued action for 10 steps from 18
temp = 18
for step in range(10):
    q_row = Q[temp - 16]  # Q values of both actions at the current temperature
    best_idx = np.argmax(q_row)
    action = actions[best_idx]
    print (f"Step {step+1}: Temp={temp}C -> Action={action}")
    temp = next_temp(temp, action)

Step 1: Temp=18C -> Action=OFF
Step 2: Temp=18C -> Action=OFF
Step 3: Temp=20C -> Action=OFF
Step 4: Temp=20C -> Action=OFF
Step 5: Temp=22C -> Action=ON
Step 6: Temp=21C -> Action=OFF
Step 7: Temp=21C -> Action=OFF
Step 8: Temp=21C -> Action=OFF
Step 9: Temp=21C -> Action=OFF
Step 10: Temp=21C -> Action=OFF


In [31]:
# Use user input: read a starting temperature, then follow the learned policy.
try:
    temp = int(input('Enter starting room temp(16-30)'))
    if not 16 <= temp <= 30:
        raise ValueError('Temp out of range')
except ValueError as ve:
    # Reached for non-numeric input (raised by int()) and for out-of-range values;
    # both fall back to the default so the Q-table lookup below stays in bounds.
    print (ve)
    temp = 25
    print ('Set to default 25C')
print(f'Starting temp: {temp} C')

# Follow the highest-valued action for 10 steps from the chosen temperature.
for step in range(10):
    action = actions[np.argmax(Q[temp - 16])]
    print(f"Step {step+1}: Temp={temp}C --> Action={action}")
    temp = next_temp(temp, action)
print ("\n Done")


Enter starting toom temp(16-30) 10


Temp out of range
Set to default 25C
Starting temp: 25 C
Step 1: Temp=25C --> ActionON
Step 2: Temp=24C --> ActionON
Step 3: Temp=23C --> ActionON
Step 4: Temp=21C --> ActionOFF
Step 5: Temp=21C --> ActionOFF
Step 6: Temp=23C --> ActionON
Step 7: Temp=22C --> ActionON
Step 8: Temp=21C --> ActionOFF
Step 9: Temp=23C --> ActionON
Step 10: Temp=22C --> ActionON

 Done
