In [None]:
# Tip:
# b) To register your custom env, do `from ray import tune;
#   tune.register_env('[name]', lambda cfg: [return env obj from here using cfg])`.
#   Then in your config, do `config['env'] = [name]`.

# Import the RL algorithm (Trainer) we would like to use.
from ray.rllib.agents.ppo import PPOTrainer
import src.jss_graph_env.disjunctive_graph_jss_env as jss_env
import src.jsp_instance_parser 
from ray import tune

# Configure the algorithm.
config = dict(
    # Environment name (RLlib understands OpenAI gym registered strings);
    # "jss_env" is the name the custom environment is registered under below.
    env="jss_env",
    # Number of environment workers (aka "rollout workers") that collect
    # samples from their own environment clone(s) in parallel.
    num_workers=1,
    # Change this to "torch" if you are using PyTorch,
    # or to "tf2" for tf2.x eager execution.
    framework="tf",
    # "horizon": 1,
    # NOTE(review): presumably the number of episodes per evaluation run;
    # confirm against the RLlib config docs for the installed version.
    evaluation_duration=10,
    # Tweak the default model provided automatically by RLlib,
    # given the environment's observation- and action spaces.
    model=dict(
        fcnet_hiddens=[64, 64],
        fcnet_activation="relu",
    ),
    # Set up a separate evaluation worker set for the
    # `trainer.evaluate()` call after training.
    evaluation_num_workers=1,
    # Only for evaluation runs, render the env.
    evaluation_config=dict(
        render_env=True,
    ),
)

# Factory handed to `tune.register_env`; builds one job-shop environment.
def env_creator(env_config):
    """Create a `DisjunctiveGraphJssEnv` for a job-shop instance file.

    Args:
        env_config: Mapping with optional environment settings (or ``None``).
            Recognised key:
              * ``"instance_path"``: path of the instance file to load.
                Defaults to ``'resources/jsp_instances/standard/la01.txt'``,
                the instance that was previously hard-coded here.

    Returns:
        A new `DisjunctiveGraphJssEnv` using the 'gantt_console'
        visualisation and the 'utilisation' reward mode.
    """
    default_path = 'resources/jsp_instances/standard/la01.txt'
    # Tolerate a missing/empty config so existing callers keep working.
    path = (env_config or {}).get('instance_path', default_path)

    # The parser returns a pair; only the first element is passed to the env.
    res, _ = src.jsp_instance_parser.parse_jps_standard_specification(path)
    return jss_env.DisjunctiveGraphJssEnv(
        res,
        default_visualisations='gantt_console',
        reward_mode='utilisation',
    )

# Register the environment factory with tune under the name 'jss_env'
# (the value stored in `config["env"]`) for use by the PPO trainer.
tune.register_env(name='jss_env', env_creator=env_creator)

# Instantiate the RLlib PPO trainer from the configuration dict.
trainer = PPOTrainer(config=config)





In [None]:
# Run it for n training iterations. A training iteration includes
# parallel sample collection by the environment workers as well as
# loss calculation on the collected batch and a model update.
#import time
#for _ in range(10):
#   tmp=time.time()
#   print(f"training nr. {_}")
#   trainer.train()
#   print(f"training for iteration {_} took {time.time()-tmp} seconds")

In [None]:
import numpy as np
from matplotlib import pyplot as plt
# Build a standalone environment instance that is used to roll out the
# current policy once after every training iteration.
path = 'resources/jsp_instances/standard/la01.txt'
#path='resources/jsp_instances/standard/abz8.txt'
curr_instance = src.jsp_instance_parser.parse_jps_standard_specification(path)
res, std_matrix = curr_instance
env = jss_env.DisjunctiveGraphJssEnv(res, default_visualisations='gantt_console')
#env.render()

# One index per action; filtered with the env's valid-action mask below.
action_list = np.arange(0, env.n_jobs * env.n_machines)

# Upper bound on env steps per evaluation episode, so that a policy which
# never completes the schedule cannot hang the notebook.
max_iterations = 5000

# Per-evaluation statistics; one entry is appended per training iteration.
iteration_list = []
invalid_action_list = []
finished_list = []
epoch_nr = 0

for _ in range(10):
    epoch_nr += 1
    trainer.train()
    print(f"trainer trained: {epoch_nr} Epochs")

    # Roll out a single episode with the freshly trained policy.
    episode_reward = 0
    done = False
    obs = env.reset()
    iteration = 0
    invalid_action = 0

    # Stop when the episode is done OR the step budget is exhausted.
    # (The previous condition `not done or iteration>5000` never enforced
    # the budget and could spin forever on a finished episode.)
    while not done and iteration < max_iterations:
        iteration += 1
        action = trainer.compute_action(obs)
        # assumes valid_action_mask() is a boolean mask aligned with
        # action_list - TODO confirm against the env implementation
        if action not in action_list[env.valid_action_mask()]:
            invalid_action += 1
        obs, reward, done, info = env.step(action)
        episode_reward += reward

    if done:
        print(f"scheduling finished in {iteration} iterations with {invalid_action} invalid actions")
    else:
        print(f"scheduling aborted after {iteration} iterations with {invalid_action} invalid actions")
    env.render()

    data_df = env.network_as_dataframe()

    finished_list.append(max(data_df["Finish"]))
    iteration_list.append(iteration)
    invalid_action_list.append(invalid_action)

# One figure per collected statistic.
plt.plot(finished_list)
plt.title("finish time")
plt.show()

plt.plot(iteration_list)
plt.title("count of iterations")
plt.show()

plt.plot(invalid_action_list)
plt.title("count of invalid actions")
plt.show()

In [None]:
# Upper bound on env steps per evaluation episode, so that a policy which
# never completes the schedule cannot hang the notebook.
max_iterations = 5000

# Ten more training iterations, each followed by one evaluation episode on
# `env`; statistics are appended to the lists created in an earlier cell.
for _ in range(10):
    epoch_nr += 1
    trainer.train()
    print(f"trainer trained: {epoch_nr} Epochs")

    episode_reward = 0
    done = False
    obs = env.reset()
    iteration = 0
    invalid_action = 0

    # Stop when the episode is done OR the step budget is exhausted.
    # (The previous condition `not done or iteration<5000` kept stepping an
    # already finished environment until 5000 steps had been taken.)
    while not done and iteration < max_iterations:
        iteration += 1
        action = trainer.compute_action(obs)
        # assumes valid_action_mask() is a boolean mask aligned with
        # action_list - TODO confirm against the env implementation
        if action not in action_list[env.valid_action_mask()]:
            invalid_action += 1
        obs, reward, done, info = env.step(action)
        episode_reward += reward

    if done:
        print(f"scheduling finished in {iteration} iterations with {invalid_action} invalid actions")
    else:
        print(f"scheduling aborted after {iteration} iterations with {invalid_action} invalid actions")
    env.render()

    data_df = env.network_as_dataframe()

    finished_list.append(max(data_df["Finish"]))
    iteration_list.append(iteration)
    invalid_action_list.append(invalid_action)

# One figure per collected statistic.
plt.plot(finished_list)
plt.title("finish time")
plt.show()

plt.plot(iteration_list)
plt.title("count of iterations")
plt.show()

plt.plot(invalid_action_list)
plt.title("count of invalid actions")
plt.show()

In [None]:
# Ten more training iterations, each followed by one evaluation episode on
# `env`; statistics are appended to the lists created in an earlier cell.
for _ in range(10):
    epoch_nr += 1
    trainer.train()
    print(f"trainer trained: {epoch_nr} Epochs")

    # Per-episode bookkeeping.
    episode_reward = 0
    done = False
    obs = env.reset()
    iteration = 0
    invalid_action = 0

    # Step the environment until it reports the episode as done.
    while True:
        iteration += 1
        action = trainer.compute_action(obs)
        valid_actions = action_list[env.valid_action_mask()]
        if action not in valid_actions:
            invalid_action += 1
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done:
            break

    print(f"scheduling finished in {iteration} iterations with {invalid_action} invalid actions")
    env.render()

    # Record the largest "Finish" value of the schedule and the counters.
    data_df = env.network_as_dataframe()
    finished_list.append(max(data_df["Finish"]))
    iteration_list.append(iteration)
    invalid_action_list.append(invalid_action)

# One figure per collected statistic, in the same order as before.
for plot_values, plot_title in (
    (finished_list, "finish time"),
    (iteration_list, "count of iterations"),
    (invalid_action_list, "count of invalid actions"),
):
    plt.plot(plot_values)
    plt.title(plot_title)
    plt.show()

In [None]:
# Ten more training iterations, each followed by one evaluation episode on
# `env`; statistics are appended to the lists created in an earlier cell.
for _ in range(10):
    epoch_nr += 1
    trainer.train()
    print(f"trainer trained: {epoch_nr} Epochs")

    # Per-episode bookkeeping.
    episode_reward = 0
    done = False
    obs = env.reset()
    iteration = 0
    invalid_action = 0

    # Step the environment until it reports the episode as done.
    while True:
        iteration += 1
        action = trainer.compute_action(obs)
        valid_actions = action_list[env.valid_action_mask()]
        if action not in valid_actions:
            invalid_action += 1
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done:
            break

    print(f"scheduling finished in {iteration} iterations with {invalid_action} invalid actions")
    env.render()

    # Record the largest "Finish" value of the schedule and the counters.
    data_df = env.network_as_dataframe()
    finished_list.append(max(data_df["Finish"]))
    iteration_list.append(iteration)
    invalid_action_list.append(invalid_action)

# One figure per collected statistic, in the same order as before.
for plot_values, plot_title in (
    (finished_list, "finish time"),
    (iteration_list, "count of iterations"),
    (invalid_action_list, "count of invalid actions"),
):
    plt.plot(plot_values)
    plt.title(plot_title)
    plt.show()

In [None]:
# Ten more training iterations, each followed by one evaluation episode on
# `env`; statistics are appended to the lists created in an earlier cell.
for _ in range(10):
    epoch_nr += 1
    trainer.train()
    print(f"trainer trained: {epoch_nr} Epochs")

    # Per-episode bookkeeping.
    episode_reward = 0
    done = False
    obs = env.reset()
    iteration = 0
    invalid_action = 0

    # Step the environment until it reports the episode as done.
    while True:
        iteration += 1
        action = trainer.compute_action(obs)
        valid_actions = action_list[env.valid_action_mask()]
        if action not in valid_actions:
            invalid_action += 1
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done:
            break

    print(f"scheduling finished in {iteration} iterations with {invalid_action} invalid actions")
    env.render()

    # Record the largest "Finish" value of the schedule and the counters.
    data_df = env.network_as_dataframe()
    finished_list.append(max(data_df["Finish"]))
    iteration_list.append(iteration)
    invalid_action_list.append(invalid_action)

# One figure per collected statistic, in the same order as before.
for plot_values, plot_title in (
    (finished_list, "finish time"),
    (iteration_list, "count of iterations"),
    (invalid_action_list, "count of invalid actions"),
):
    plt.plot(plot_values)
    plt.title(plot_title)
    plt.show()

In [None]:
# Ten more training iterations, each followed by one evaluation episode on
# `env`; statistics are appended to the lists created in an earlier cell.
for _ in range(10):
    epoch_nr += 1
    trainer.train()
    print(f"trainer trained: {epoch_nr} Epochs")

    # Per-episode bookkeeping.
    episode_reward = 0
    done = False
    obs = env.reset()
    iteration = 0
    invalid_action = 0

    # Step the environment until it reports the episode as done.
    while True:
        iteration += 1
        action = trainer.compute_action(obs)
        valid_actions = action_list[env.valid_action_mask()]
        if action not in valid_actions:
            invalid_action += 1
        obs, reward, done, info = env.step(action)
        episode_reward += reward
        if done:
            break

    print(f"scheduling finished in {iteration} iterations with {invalid_action} invalid actions")
    env.render()

    # Record the largest "Finish" value of the schedule and the counters.
    data_df = env.network_as_dataframe()
    finished_list.append(max(data_df["Finish"]))
    iteration_list.append(iteration)
    invalid_action_list.append(invalid_action)

# One figure per collected statistic, in the same order as before.
for plot_values, plot_title in (
    (finished_list, "finish time"),
    (iteration_list, "count of iterations"),
    (invalid_action_list, "count of invalid actions"),
):
    plt.plot(plot_values)
    plt.title(plot_title)
    plt.show()

In [None]:
# Print the largest "Finish" value of the environment's current schedule.
data_df = env.network_as_dataframe()
print(max(data_df["Finish"]))

# Plot every statistic collected over all training/evaluation rounds.
for plot_values, plot_title in (
    (finished_list, "finished after iterations"),
    (iteration_list, "iterations"),
    (invalid_action_list, "number of illegal actions"),
):
    plt.plot(plot_values)
    plt.title(plot_title)
    plt.show()

In [None]:
# NOTE: it is not yet clear what `render` does for this environment.
# Evaluate the trained Trainer (and render each timestep to the shell's
# output).
#evaluate(
    #    self,
   #     episodes_left_fn=None,  # deprecated
  #      duration_fn: Optional[Callable[[int], int]] = None,
 #   ) 
#print('start evaluate')
#trainer.evaluate()
#   self.get_policy(policy_id) and call compute_actions()
#trainer.get_policy