In [1]:
import numpy as np
from random import Random
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from seeds import known_seeds
import evaluation
import utils
from action import ActionSpace

In [2]:
space = spaces.Box(low=0, high=1000000, shape=(6, 7), dtype=np.int32)
space.sample()

array([[ 72378,  52501, 358724,  81765, 359851, 524812, 431996],
       [225769, 440589, 458311, 233503, 767823, 424308, 450050],
       [356973, 767163, 855865, 316818,  74908, 866268, 470491],
       [917144, 247712, 970525,  94974, 467444, 338918, 974429],
       [192324, 316135, 271789, 731083,  36282, 396037, 585370],
       [300485, 204380, 983470, 819423, 641339, 327806, 154680]],
      dtype=int32)

In [3]:
    #old fleet function
    def init_fleet(self):
        fleet = {"DC1":{},"DC2":{},"DC3":{},"DC4":{}}
        for i in fleet.keys():
            for j in self.server_generations:
                fleet[i].update(j, {})
                fleet[i][j].update("servers", [])
                fleet[i][j].update("timestep_bought", [])
                fleet[i][j].update("total_owned", 0)
        return fleet

In [4]:

action_space = spaces.Box(low=0, high=3, shape=(4,7), dtype=np.int32)
action_space.sample()

array([[1, 1, 0, 0, 3, 3, 0],
       [2, 0, 1, 0, 3, 2, 0],
       [3, 3, 3, 0, 3, 1, 0],
       [2, 3, 2, 2, 3, 0, 2]], dtype=int32)

In [5]:
from gymnasium.wrappers import FlattenObservation # type: ignore
class CustomEnv(gym.Env):
    def __init__(self):
        #super(self).__init__()

        """
        define self.actionspace and self.observation_space below using 
        variables available in "gym.spaces"
        """

        #agent action space (actions it can make)
        #datacenter, servergen, action, where the index corresponds to an action and chosen number is the fraction of datacenter to fill with that servergen
        self.action_space = spaces.Box(low=0, high=1, shape=(4,7,4), dtype=np.float32)
        
        self.default_demand, self.datacenters, self.servers, self.selling_prices = utils.load_problem_data()

        self.fleet_columns = ['datacenter_id', 'server_generation', 'server_id', 'action',
       'server_type', 'release_time', 'purchase_price', 'slots_size', 'energy_consumption', 
       'capacity', 'life_expectancy', 'cost_of_moving','average_maintenance_fee', 'cost_of_energy',
       'latency_sensitivity', 'slots_capacity', 'selling_price', 'lifespan', 'moved']

        #agent observation space (what the agent can "see"/information that is fed to agent)
        #a 3d array of latency, server_gen, demand, concatenated with 
        #a 3d array of latency, server_gen, supply
        self.observation_space = spaces.Box(low=0, high=1000000, shape=(6, 7), dtype=np.int32)

        self.seeds_array = known_seeds("training")
        self.seed_counter = 0

        self.server_generations = ['CPU.S1', 'CPU.S2', 'CPU.S3', 'CPU.S4', 'GPU.S1', 'GPU.S2', 'GPU.S3']
        self.latencies = ['low', 'medium', 'high']
        self.data_centers = ['DC1', 'DC2', 'DC3', 'DC4']

    #might need func below to convert agent action into a relevant action
    #def conv_agent_action_to_move(self, action):
    
    #returns mask for the action space based on possible plays
    #def valid_action_mask(self):
    
    #initiallise/reset all of the base variables at the end of the "game"
    #has to return a base/initial observation

    def convert_demand_to_observation(self, demand):
        demand_observation = np.zeros((3,7), np.int32)
        for i in range(len(self.server_generations)):
            servergen_demand = demand[demand["server_generation"] == self.server_generations[i]]
            for j in range(len(self.latencies)):
                latency_demand = servergen_demand[self.latencies[j]]
                demand_observation[j][i] = latency_demand.sum()
        return demand_observation

    def convert_fleet_to_observation(self, fleet):
        observed_fleet = np.zeros((3,7), np.int32)
        for i in range(len(self.data_centers[0:3])):
            for j in range(len(self.server_generations)):
                #filter for the datacenter
                filtered_dc = fleet[fleet["datacenter_id"] == self.data_centers[i]]
                #get sum of the server generation
                gen_total = filtered_dc[filtered_dc["server_generation"] == self.server_generations[j]].shape[0]
                if(self.server_generations[i] == "DC3"):
                    #get dc4 and add onto dc3 total
                    filtered_dc = fleet[fleet["datacenter_id"] == self.datacenters[i+1]]
                    gen_total += filtered_dc[filtered_dc["server_generation"] == self.server_generations[j]].shape[0]
                observed_fleet[i][j] = gen_total
        return observed_fleet

    def init_fleet(self):
        self.fleet = pd.DataFrame(columns=self.fleet_columns)

    def reset(self, seed=None, options=None):
        self.timestep = 1
        self.seed_counter += 1
        self.seed_counter %= 10
        np.random.seed(self.seeds_array[self.seed_counter])

        self.actionspace = ActionSpace()
        self.OBJECTIVE = 0

        self.selling_prices2 = evaluation.change_selling_prices_format(self.selling_prices)
        self.init_fleet()
        self.demand = evaluation.get_actual_demand(self.default_demand)
        self.timestep_demand = self.demand[self.demand["time_step"] == self.timestep]
        observation_demand = self.convert_demand_to_observation(self.timestep_demand)
        observation_fleet = self.convert_fleet_to_observation(self.fleet)
        observation = np.concatenate((observation_demand, observation_fleet))
        self.done = False
        return observation, {}
    
    #caps the buy number to datacenter capacity
    def cap_buy_num(self, datacenter, server_gen, number):
        slots = self.fleet.groupby(by=['datacenter_id']).agg({'slots_size': 'sum',
                                                        'slots_capacity': 'mean'})
        if(slots.empty):
            return number
        server_slotsize = (self.servers[self.servers["server_generation"] == server_gen]["slots_size"]).iloc[0]
        if(server_slotsize * number + slots["slots_size"].mean() > slots["slots_capacity"].mean()):
            number = slots["slots_capacity"].mean() - slots["slots_size"].mean()
        return int(number)

    #buys "number" amount of servers at datacenter
    def buy(self, datacenter, server_gen, number=10):
        ts_fleet = pd.DataFrame(columns=self.fleet_columns[0:3])
        number = self.cap_buy_num(datacenter, server_gen, number)

        fleet_array = []
        datacenter_array =[]
        server_gen_array=[]
        buy_array=[]
        server_id_array=[]
        for i in range(number):
            server_id = self.actionspace.generate_server_id(server_gen)
            datacenter_array.append(datacenter)
            server_gen_array.append(server_gen)
            server_id_array.append(server_id)
            buy_array.append("buy")
            fleet_array.append([datacenter, server_gen, server_id, "buy"])
        temp = pd.DataFrame({"datacenter_id":datacenter_array, "server_generation":server_gen_array,
         "server_id": server_id_array, "action": buy_array})


        ts_fleet = pd.concat([ts_fleet, temp])
        ts_fleet = ts_fleet.merge(self.servers, on='server_generation', how='left')
        ts_fleet = ts_fleet.merge(self.datacenters, on='datacenter_id', how='left')
        ts_fleet = ts_fleet.merge(self.selling_prices, 
                            on=['server_generation', 'latency_sensitivity'], 
                            how='left')
        ts_fleet = ts_fleet.fillna(0)
        self.fleet = pd.concat([self.fleet, ts_fleet])

    #called in a loop where each time it is called the agent chooses an action and change
    #state of game appropriately according to agent action
    def step(self, action):
        """
        after agent move, do it, calc the new observation state
        and the reward from that move it made, if "end" of game set self.done to True

        """
        reward = 0
        actions = self.actionspace.convert_actionspace_to_actionV2(action)

        for i in actions:
            if(i[0] == "buy"):
                self.buy(datacenter = i[3], server_gen = i[1], number = i[2])

        self.fleet = evaluation.update_check_lifespan(self.fleet)
        Zf = evaluation.get_capacity_by_server_generation_latency_sensitivity(self.fleet)
        D = evaluation.get_time_step_demand(self.demand, self.timestep)
        U = evaluation.get_utilization(D, Zf)
        #check if fleet is empty
        if self.fleet.shape[0] > 0:
            # get the server capacity at timestep
            Zf = evaluation.get_capacity_by_server_generation_latency_sensitivity(self.fleet)

            # evaluate objective function at current timestep
            U = evaluation.get_utilization(D, Zf)

            L = evaluation.get_normalized_lifespan(self.fleet)
    
            P = evaluation.get_profit(D, 
                           Zf, 
                           self.selling_prices2,
                           self.fleet)
                           
            o = U * L * P

            self.OBJECTIVE += o
            reward += U+L
            if(L <= 0):
                reward -=2
        else:
            reward -= 10
        #reached final timestep
        if(self.timestep >= 10):
            self.done = True
        self.timestep += 1

        #extra info on the game if wanted for yourself
        info = {}
        truncated = False
        
        self.timestep_demand = self.demand[self.demand["time_step"] == self.timestep]
        
        observation_demand = self.convert_demand_to_observation(self.timestep_demand)
        observation_fleet = self.convert_fleet_to_observation(self.fleet)
        observation = np.concatenate((observation_demand, observation_fleet))

        return (observation, reward, self.done, truncated, info)


In [6]:
from stable_baselines3.common.env_checker import check_env
env = CustomEnv()
check_env(env)

  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)


In [7]:
import os
import time
from stable_baselines3 import PPO
def train():
    #directory model and log saved to
    model_dir = f"models/V1/{int(time.time())}/"
    log_dir = f"logs/V1/{int(time.time())}/"

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    env = CustomEnv()
    #wrap for action masking
    #env = ActionMasker(env, mask_fn) 
    env.reset()

    model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_dir)
    #model = PPO(MaskableActorCriticPolicy, env, verbose=1, tensorboard_log=log_dir)

    TIMESTEPS = 1
    #adjust the range below to adjust timesteps it runs for (calc stepcount as max range val * timesteps)
    #saves model every TIMESTEPS number of steps
    for i in range(1,50):
        model.learn(total_timesteps=TIMESTEPS, reset_num_timesteps=False, tb_log_name=f"PPO")
        print("1 cycle done")
        model.save(f"{model_dir}/{TIMESTEPS*i}")

In [8]:
envv = CustomEnv()
#check environmento
check_env(envv)

#extra checks, unocomment to run
episodes = 4

for episode in range(episodes):
    done = False
    obs = envv.reset()
    while not done:
        random_action = envv.action_space.sample()
        obs, reward, done, trunc, info = env.step(random_action)
        #print("obs", obs)
        print('reward',reward)

  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)


reward 0.026746438597667455
reward 0.05699253629385741
reward 0.07902547006963849
reward 0.10423982254731898
reward 0.13834027452093656
reward 0.19015873594520355
reward 0.19317803760292934
reward 0.21944169665155272
reward 0.25924387362604884
reward 0.28901704441704207
reward 0.3276231104271028
reward 0.35593710466266515
reward 0.3854166666666917


In [9]:
train()
print("done")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs/V1/1725045645/PPO_0


  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fleet = pd.concat([self.fleet, ts_fleet])
  fleet['lifespan'] = fleet['lifespan'].fillna(0)
  self.fle