## Python Notebook to interact with gym-battery and battery-agent

This python notebook is a working document to interact with and test the environment and the agent.

Note: For this to work, gym-battery needs to be installed as a package by running `pip install -e gym-battery` from the directory that contains gym-battery.

The IPython notebook should exist in battery dispatch by default and should be able to access those resources, so it does not necessarily need to be built/installed using pip.

In [1]:
import gym
import gym_battery 
import numpy as np
import pandas as pd

In [2]:
# Build the battery environment with N_actions=5, then load the package's
# standard system configuration.
env = gym.make('gym_battery:battery-v0', N_actions=5)
env.set_standard_system()

setting the standard system, A10S Med busines large usage with a 2,000kW/10,000kWh battery


In [3]:
# Show the possible action mapping the agent can take
# (discrete action id -> the value that action represents).
env.action_mapping

{0: -1000.0, 1: -500.0, 2: 0.0, 3: 500.0, 4: 1000.0}

In [4]:
# Lower and upper bounds of the observation space, one array per line.
print(env.observation_space.low, "to", env.observation_space.high, sep="\n")

[  0.   0. 300. 300.]
to
[   24. 10000. 16200. 17200.]


In [5]:
# Set how to structure the environment. 'count_days' generates a single day as an episode. The number of days
# given indicates how many different days to use.
# This needs to be changed so that it generates LONGER episodes, not DIFFERENT episodes, but this hasn't been done yet.
env.episode_type = 'count_days'
env.run_N_episodes = 1
# Restrict the load data to the slice 76:76*4 (= 76:304).
# NOTE(review): env.load.DF is overwritten with a slice of itself, so re-running
# this cell slices the already-sliced frame again.
env.load.DF = env.load.DF[76:76*4]
env.fit_load_to_space()

In [6]:
# Overwrite the load value in the row labelled 76.
# A single .loc call replaces the chained indexing DF['value'][76] = ..., which
# assigns through an intermediate object and is not guaranteed to modify DF.
env.load.DF.loc[76, 'value'] = 4500
# Show only the first rows rather than dumping the whole frame.
env.load.DF.head(10)

Unnamed: 0,duration,start,value,duration_hrs
76,900,2011-03-07 00:00:00,4500.0,0.25
77,900,2011-03-07 00:15:00,4500.0,0.25
78,900,2011-03-07 00:30:00,4800.0,0.25
79,900,2011-03-07 00:45:00,4500.0,0.25
80,900,2011-03-07 01:00:00,4500.0,0.25
81,900,2011-03-07 01:15:00,4800.0,0.25
82,900,2011-03-07 01:30:00,4500.0,0.25
83,900,2011-03-07 01:45:00,4200.0,0.25
84,900,2011-03-07 02:00:00,4200.0,0.25
85,900,2011-03-07 02:15:00,4200.0,0.25


In [7]:
# Get the do-nothing value for taking no action
def dict_key_by_val(d, val):
    """Return the first key of ``d`` (in iteration order) whose value equals ``val``.

    Raises:
        ValueError: if no value in ``d`` equals ``val``.
    """
    for key, candidate in d.items():
        if candidate == val:
            return key
    raise ValueError("value not found in dictionary")
    
# Look up the action id whose mapped value is 0; it is used as the do-nothing action.
act0 = dict_key_by_val(env.action_mapping, 0)
act0

2

In [8]:
# Set up the agent and the discretizer.
from batterydispatch.agent.agents import DynaQAgent
from batterydispatch.agent.discretizers import Box_Discretizer
from batterydispatch.agent.policies import do_nothing

agent = DynaQAgent()
agent.set_policy(do_nothing, {'do_nothing_action': act0})

# Note, you can change the size of the state space by changing the number of buckets, below
agent.set_discretizer(Box_Discretizer(env.observation_space, N=[6, 4, 12, 12]))
agent.actions = env.action_space
agent.learning_rate = 0.05  # used for the updates of the Q estimates
agent.subtype = 'on-policy'  # the agent is configured with the 'on-policy' subtype

# Notebook-level episode counter (a `global` statement is not needed at cell level).
eps = 0

remember to set self.actions = env.action_space!


In [46]:
# Seed both the agent and the environment with the same value.
seed = 42
agent.set_seed(seed)
env.set_seed(seed)


[42]

In [47]:

# Print two consecutive random draws from the observation space, one per line.
print(
    env.observation_space.sample(),
    env.observation_space.sample(),
    sep="\n",
)


[   8.979441 4843.8696   5344.3896   3851.1821  ]
[  13.34626 6842.9487  1834.3031  3286.6504 ]


In [None]:
# The original line, "from gym.spaces import" with nothing after it, was an
# unfinished statement and a SyntaxError. Importing the submodule keeps the
# cell runnable; replace with the specific space class once it is known.
from gym import spaces

In [39]:
# Print two more consecutive random draws from the observation space.
print(
    env.observation_space.sample(),
    env.observation_space.sample(),
    sep="\n",
)


[  14.647108 4577.328    1931.4066   5129.007   ]
[   9.063035  894.4754   5263.6787   5895.132   ]


In [40]:
# NOTE(review): the positional argument is presumably `eta`, since other cells
# call set_greedy_policy(eta=...) — confirm against the agent implementation.
agent.set_greedy_policy(1)

In [41]:
# Reset the environment and ask the agent for one action from the initial state.
# The third argument (0.25) is the same constant passed to every get_action call
# in this notebook; presumably the step length in hours — TODO confirm.
state = env.reset()
agent.get_action(state,  list(env.action_mapping.keys()), 0.25)

4

In [19]:
# Draw one random sample from the observation space and display it.
env.observation_space.sample()

array([   8.450609, 3704.6824  , 5414.954   , 2356.2095  ], dtype=float32)

## Plot the day of data that we will be trying to learn from

In [None]:

from matplotlib import pyplot as plt

# Run one full episode taking only the do-nothing action (act0). The reward of
# the final step is kept as `default_reward`, the baseline later runs are
# compared against.
done = False
state = env.reset()
i = 0
while not done:
    i+=1
    _,reward,done, details = env.step(act0)

plt.plot(env.grid_flow.net_flow)
plt.title("Net grid flow with the do-nothing action")
try:
    print(list(env.grid_flow.start_date)[0])
except Exception:
    # Best effort only: the start date is informational. Catching Exception
    # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
    pass
print(i)       # number of steps in the episode
print(reward)  # reward returned by the final step
default_reward = reward
plt.show()


In [None]:
# Initialize the agent's state-action estimates from the baseline reward of the
# do-nothing episode. The do-nothing action gets a small bonus of 100 so the
# agent does not take action arbitrarily.
agent.initialize_state_actions(
    new_default=default_reward,
    do_nothing_action=act0,
    do_nothing_bonus=100,
)

## Set up the function to run the episodes, and run episodes until convergence.

In [None]:
# This function runs the actual episodes, repeatedly, until policy converges.

from IPython.display import clear_output
# initial state
from batterydispatch.agent.agents import PolicyConvergedError

# Notebook-level bookkeeping shared with run_episodes.
eps=0        # episode counter; read (never modified) by run_episodes
history=[]   # one (eps, reward, new_demand, default_reward, orig_demand) tuple per episode

def run_episodes(random_charge = True, run_type='to_convergence'):
    """Run training episodes against the global ``env`` using the global ``agent``.

    Each episode resets the environment, lets the agent pick actions until the
    environment reports ``done``, passes every transition to
    ``agent.observe_sars`` and then calls ``agent.end_episode`` with the last
    reward. After each episode a summary line is printed and a record is
    appended to the global ``history`` list.

    Parameters
    ----------
    random_charge : bool
        Forwarded to ``env.reset(random_charge=...)``.
    run_type : {'to_convergence', 'once'}
        ``'to_convergence'`` repeats episodes until ``agent.end_episode``
        raises ``PolicyConvergedError``; ``'once'`` runs exactly one episode.

    Returns
    -------
    The current value of the global ``eps`` counter (not modified here).

    Raises
    ------
    ValueError
        If ``run_type`` is not a supported value (previously this looped forever).
    """
    if run_type not in ("to_convergence", "once"):
        raise ValueError(f"unknown run_type: {run_type!r}")

    possible_actions = list(env.action_mapping.keys())

    # Must start as False: it was previously only assigned when the agent
    # converged, so the first non-converged episode raised UnboundLocalError.
    converged = False
    episodes_since_clear = 0

    while True:
        state = env.reset(random_charge=random_charge)

        # Keep the cell output readable: wipe it after every 30 episodes.
        episodes_since_clear += 1
        if episodes_since_clear > 30:
            episodes_since_clear = 0
            clear_output()

        done = False
        while not done:
            # NOTE(review): 0.25 is presumably the step length in hours — confirm.
            action = agent.get_action(state, possible_actions, 0.25)
            old_state = state.copy()
            state, reward, done, details = env.step(action)
            agent.observe_sars(old_state, action, reward, state)

        try:
            agent.end_episode(reward)
        except PolicyConvergedError:
            converged = True
            print("Converged!")

        try:
            new_demand = max(env.grid_flow.net_flow)
            orig_demand = max(env.grid_flow.load)
        except AttributeError:
            new_demand = "???"
            orig_demand = "???"

        print(f"Current reward of {int(reward)} / {int(default_reward)}, {new_demand} / {orig_demand}, patience={agent.patience_counter}")
        history.append((eps, reward, new_demand, default_reward, orig_demand))

        if run_type == "once" or converged:
            break

    return eps

In [None]:
# We then set the final parameters guiding the episodes: the agent's proclivity for random actions, and
# the number of episodes without a policy change before we can say we've converged.
agent.set_greedy_policy(eta=0.125)
agent.patience = 100
# Bare attribute reads: only the last expression of a cell (agent.name) is displayed.
agent.planning_steps
agent.name

In [None]:
# NOTE: the training loop recomputes agent.learning_rate from its own
# starting_learning_rate after every episode, so this value only affects the
# loop condition and the first episode.
agent.learning_rate = 0.075



In [None]:
%%time
agent.set_greedy_policy(eta=0.05)

starting_learning_rate = 0.05
agent.planning_steps = 35
agent.patience_counter = 0

i=0


while agent.learning_rate > 0.005:
    i+=1
    eps+=1
        
    if i==30:
        i=0
        clear_output()
    run_episodes(random_charge = False, run_type="once")
    agent.learning_rate = starting_learning_rate * np.exp(-0.0002*eps)

In [None]:
# Number of per-episode records appended to `history` so far.
len(history)

In [None]:
# Sanity check: value of the training loop's decay expression,
# 0.05 * exp(-0.0002 * eps), at eps = 10000.
np.exp(-0.0002*10000)*0.05

In [None]:
# Display the agent's own `history` attribute (distinct from the notebook-level `history` list).
agent.history

In [None]:
# Tabulate agent.S_A_values (one row per dictionary key) and copy the table to the system clipboard.
Qs = pd.DataFrame.from_dict(agent.S_A_values, orient='index')
Qs.to_clipboard()

In [None]:
# Tabulate agent.S_A_frequency (one row per dictionary key) and copy it to the clipboard.
counts = pd.DataFrame.from_dict(agent.S_A_frequency, orient='index')
# The method must be called; the bare attribute reference copied nothing.
counts.to_clipboard()


In [None]:
# NOTE(review): `eps` counts the episodes run by the training loop, which stops once the
# learning rate has decayed to 0.005 or below — not necessarily when the policy converged.
print(f"The agent converged after {eps} episodes")

Agent has taken between 10 and 30 minutes, and between 700 and 2262 episodes, to converge on day 1. Optimal policy:
Current reward of -397414.125 / -406791.825, 5600.0 / 6000.0, patience=21

For 2 days, agent took 5 hours 8 minutes, and converged after 21200 episodes.

## Then we allow the agent to take entirely greedy actions and run the algorithm to see how much the agent learned. 

In [None]:
# Evaluate what the agent learned: run one episode with eta=0 from a
# non-random initial charge.
agent.set_greedy_policy(eta=0)

state = env.reset(random_charge=False)
done = False
while not done:
    action = agent.get_action(state, list(env.action_mapping.keys()), 0.25)
    state, reward, done, details = env.step(action)

try:
    new_demand = max(env.grid_flow.net_flow)
    orig_demand = max(env.grid_flow.load)
except AttributeError:
    new_demand = "???"
    orig_demand = "???"

# Record this run's reward and the do-nothing baseline on the grid flow.
# These two assignments used to sit inside the `except` handler above, so they
# never ran when the demand values were read successfully.
env.grid_flow['final_reward'] = reward
env.grid_flow['original_reward'] = default_reward

print(f"Current reward of {reward} / {default_reward}, {new_demand} / {orig_demand}, patience={agent.patience_counter}")




In [None]:
import sqlite3

def _table_exists(conn, table_name):
    """Return True if ``table_name`` is a table in the SQLite database behind ``conn``."""
    found = conn.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?;", (table_name,)
    ).fetchone()
    return found is not None


def _next_scenario_id(conn):
    """Return one more than the largest scenario_id saved so far (1 on the first save)."""
    if not _table_exists(conn, 'grid_flow_output'):
        return 1
    # fetchone() returns a row tuple; the aggregate is its first element and is
    # NULL (None) when the table is empty.
    latest = conn.execute('SELECT MAX(scenario_id) FROM grid_flow_output;').fetchone()[0]
    return 1 if latest is None else int(latest) + 1


def _merged_hash_table(conn, table_name, hash_table, saved_time):
    """Build a hash -> state lookup frame and merge in the rows already stored in ``table_name``."""
    merged = pd.DataFrame.from_dict(hash_table, orient='index',
                                    columns=['hour', 'charge', 'load', 'demand'])
    merged['saved_timestamp'] = saved_time
    merged['state'] = merged.index  # the 'state' column holds the hash itself
    if _table_exists(conn, table_name):
        stored = pd.read_sql('SELECT * FROM {};'.format(table_name), conn,
                             parse_dates=['saved_timestamp'])
        # Stored rows come first so drop_duplicates keeps the earliest record.
        merged = pd.concat([stored, merged])
    merged = merged.drop_duplicates(subset='state').reset_index(drop=True)
    merged['saved_timestamp'] = pd.to_datetime(merged['saved_timestamp'])
    return merged


def save_results(scenario=None, agent_name=None, notes=None):
    """Persist the latest run and the agent's learned values to 'gym_battery_database.db'.

    Reads the notebook globals ``env`` (its ``grid_flow`` frame), ``agent``
    (``discretize_space``, ``S_A_values``, ``name``), ``reward`` and ``history``.
    Writes five tables: ``grid_flow_output``, ``state_action_values`` and
    ``history`` are appended to; ``agent_states_hash`` and ``states_hash`` are
    rebuilt from their stored rows plus the new ones.

    Parameters
    ----------
    scenario : str, optional
        Scenario name; prompted for when None.
    agent_name : str, optional
        Agent name; prompted for when None (answering 'y' accepts ``agent.name``).
    notes : str, optional
        Free-text notes; prompted for when None.

    Returns
    -------
    None on success. If writing ``agent_states_hash`` fails, the frame that
    could not be written is returned instead so it can be inspected.
    """
    if scenario is None:
        scenario = input("Enter the scenario name (i.e. the load used): ")
    if agent_name is None:
        agent_name = input("Enter the agent name, or y to accept {}: ".format(agent.name))
        if agent_name.lower() == 'y':
            agent_name = agent.name
    if notes is None:
        notes = input("Consider adding any notes: ")

    saved_time = pd.Timestamp.now()

    conn = sqlite3.connect('gym_battery_database.db')
    try:
        scenario_id = _next_scenario_id(conn)

        # Save the final grid_flow. States are stored as hashes; the hash -> state
        # lookups go to separate tables below.
        DF = env.grid_flow.copy()
        agent_states = [tuple(agent.discretize_space(np.array(s))) for s in DF['state']]
        agent_state_hash_table = {hash(s): s for s in agent_states}
        state_hash_table = {hash(s): s for s in DF['state']}
        DF['agent_state'] = [hash(s) for s in agent_states]
        DF['state'] = [hash(s) for s in DF['state']]

        DF['reward'] = reward
        DF['agent'] = agent_name
        DF['scenario'] = scenario
        DF['episodes'] = len(history)
        DF['notes'] = notes
        DF['scenario_id'] = scenario_id
        DF['saved_timestamp'] = saved_time
        DF.to_sql('grid_flow_output', conn, if_exists='append')

        # Save the state-action value estimates, indexed by the hash of the agent state.
        val = pd.DataFrame.from_dict(agent.S_A_values, orient='index')
        value_states = [tuple(ix) for ix in val.index]
        for s in value_states:
            agent_state_hash_table.setdefault(hash(s), s)
        val.index = [hash(s) for s in value_states]
        val['agent'] = agent_name
        val['scenario'] = scenario
        val['scenario_id'] = scenario_id
        val['saved_timestamp'] = saved_time
        val.to_sql('state_action_values', conn, if_exists='append')

        agent_state_hash_DF = _merged_hash_table(conn, 'agent_states_hash',
                                                 agent_state_hash_table, saved_time)
        try:
            agent_state_hash_DF.to_sql('agent_states_hash', conn, if_exists='replace', index=False)
        except Exception:
            # Hand the frame back for inspection rather than losing it.
            print("returnign DF")
            return agent_state_hash_DF

        state_hash_DF = _merged_hash_table(conn, 'states_hash', state_hash_table, saved_time)
        state_hash_DF.to_sql('states_hash', conn, if_exists='replace', index=False)

        # Save the history of performance by episode
        df_history = pd.DataFrame(history, columns=['episode_cnt', 'reward', 'new_demand', 'orig_reward', 'orig_demand'])
        df_history['saved_timestamp'] = saved_time
        df_history['agent'] = agent_name
        df_history['scenario'] = scenario
        df_history['scenario_id'] = scenario_id
        df_history.to_sql('history', conn, if_exists='append')
    finally:
        # Always release the database handle, including on the early return above.
        conn.close()

In [None]:
# save_results returns a DataFrame only when writing the agent_states_hash table fails;
# when it runs to the end it returns None.
DF = save_results(scenario='Day1_load', agent_name='DynaQ', notes="ran the DynaQ agent again on the Day1 data, for a second (same agent)")

In [None]:
# Debugging aid: only meaningful when DF is a DataFrame (i.e. save_results returned one).
pd.to_datetime(DF.saved_timestamp)

In [None]:
# Debugging aid: count duplicated index labels in DF.
sum(DF.index.duplicated())

In [2]:
import sqlite3
# Open a notebook-level connection to the results database for ad-hoc maintenance queries.
conn = sqlite3.connect('gym_battery_database.db')
    

In [5]:
# Add the `epsilon` column to the history table once. PRAGMA table_info returns
# one row per existing column with the column name at position 1; checking it
# first keeps the cell re-runnable instead of failing on a duplicate column.
history_columns = [row[1] for row in conn.execute('PRAGMA table_info(history);')]
if 'epsilon' not in history_columns:
    conn.execute('ALTER TABLE history ADD COLUMN epsilon FLOAT NULL;')
    conn.commit()

In [6]:
# conn.execute returns a Cursor, so this displays the cursor object rather than rows;
# iterate it or call .fetchall() to see the data.
conn.execute('select * from history;')

<sqlite3.Cursor at 0x4f47180>