In [1]:
from DeepLearning.PPO import MaskablePPO
from DeepLearning.Thesis.Environments.RewardTesting import FinalvpRewardEnv
from DeepLearning.GetActionMask import getActionMask
from DeepLearning.Thesis.Observations.get_observation import getObservation, lowerBound, upperBound
import os

# --- Run identity -----------------------------------------------------------
# The run name is reused as the TensorBoard log name and as the last path
# component of the location handed to the model as `savePath`.
saveName = "Reward_finalvp"
savePath = f"DeepLearning/Thesis/Rewards/Models/{saveName}"

# --- Hyperparameters --------------------------------------------------------
# Three hidden layers of 128 units under each of the `pi` and `vf` keys
# (presumably the policy / value networks, as in Stable-Baselines3 `net_arch`).
netArchDict = {"pi": [128, 128, 128], "vf": [128, 128, 128]}
gamma = 0.99
n_steps = 4096

# --- Environment ------------------------------------------------------------
# The reward-testing environment is wired to the shared observation builder
# (with its bounds) and to the action-mask function.
env = FinalvpRewardEnv(
    lowerBounds=lowerBound,
    upperBounds=upperBound,
    getObservationFunction=getObservation,
    getActionMaskFunction=getActionMask,
)

# The same two callbacks are also passed to the model itself.
actionMask = getActionMask
observation = getObservation

# --- Model + training -------------------------------------------------------
model = MaskablePPO(
    "MlpPolicy",
    env,
    verbose=1,
    policy_kwargs={"net_arch": netArchDict},
    gamma=gamma,
    n_steps=n_steps,
    getActionMask=actionMask,
    getObservation=observation,
    savePath=savePath,
    tensorboard_log="./tensorboard_logs_thesis/",
)
model.learn(total_timesteps=2_000_000, tb_log_name=saveName)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tensorboard_logs_thesis/Reward_finalvp_1


  logger.warn(


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 96.8     |
|    ep_rew_mean     | 5.64     |
| time/              |          |
|    fps             | 890      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 4096     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 97.1        |
|    ep_rew_mean          | 6.11        |
| time/                   |             |
|    fps                  | 654         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.020831073 |
|    clip_fraction        | 0.24        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.87       |
|    explained_variance   | -0.0904     |
|    learning_rate        | 0.



-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 104         |
|    ep_rew_mean          | 6.42        |
| time/                   |             |
|    fps                  | 617         |
|    iterations           | 3           |
|    time_elapsed         | 19          |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.029532395 |
|    clip_fraction        | 0.294       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.82       |
|    explained_variance   | 0.624       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.0379     |
|    n_updates            | 20          |
|    policy_gradient_loss | -0.0461     |
|    value_loss           | 0.161       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 97.5  

In [None]:
"""
Running Agent simulations
"""
from Agents.AgentRandom2 import AgentRandom2
from Agents.AgentMCTS import AgentMCTS
from Agents.AgentUCT import AgentUCT
from Agents.AgentModel import AgentMultiModel, AgentModel
from Game.CatanGame import *
from CatanSimulator import CreateGame
from DeepLearning.PPO import MaskablePPO
from Game.CatanPlayer import PlayerStatsTracker
from tabulate import tabulate
from DeepLearning.Stats import headers
import dill as pickle
from CatanData.GameStateViewer import SaveGameStateImage, DisplayImage
import math
from DeepLearning.Environments.CatanEnv import CatanEnv, SelfPlayDistribution, CatanTradingEnv, SelfPlayDistTradingEnv

from DeepLearning.Thesis.Observations.nodes_v_hexes import getNodeInHexObs, nodeInHexLowerBound, nodeInHexUpperBound

# Win tally, one slot per player; the slot at the winner's index is bumped
# after every finished game.
winner = [0] * 4

# Stat accumulators: every game of player 0, only the games player 0 lost,
# and every game of player 1.
player0Stats, Player0LosingStats, player1Stats = (PlayerStatsTracker() for _ in range(3))


# Model under evaluation. Alternative checkpoints that can be swapped in:
#   testModel2 = MaskablePPO.load("DeepLearning/Thesis/Hyperparameters/Models/NetworkArch_128_128/Final.zip")
#   testModel3 = MaskablePPO.load("DeepLearning/Thesis/Hyperparameters/Models/NetworkArch_256_256/Final.zip")
#   testModel4 = MaskablePPO.load("DeepLearning/Thesis/Hyperparameters/Models/NetworkArch_512_512/Final.zip")
testModel = MaskablePPO.load("DeepLearning/Thesis/Hyperparameters/Models/NumSteps_8192/Final.zip")

# Seat 0 is driven by the loaded model; seats 1-3 are random agents.
# To pit two models against each other, replace the P1 entry with:
#   AgentModel("P1", 1, recordStats=True, playerTrading=False, model=testModel2)
players = [AgentModel("P0", 0, recordStats=True, playerTrading=False, model=testModel)]
players += [
    AgentRandom2(agentName, seat, recordStats=True, playerTrading=False)
    for agentName, seat in (("P1", 1), ("P2", 2), ("P3", 3))
]


COLLECT_STATS = True
for episode in range(2000):
    # Serialise and deserialise the freshly created game so each episode works
    # on its own copy (presumably to keep the shared `players` untouched).
    game = pickle.loads(pickle.dumps(CreateGame(players), -1))
    numTurns = 0

    # Play until the game reports it is over; the check happens after each
    # applied action, so at least one move is always made.
    gameOver = False
    while not gameOver:
        currPlayer = game.gameState.players[game.gameState.currPlayer]

        agentAction = currPlayer.DoMove(game)
        agentAction.ApplyAction(game.gameState)

        # One "turn" is counted each time seat 0 ends its turn.
        # (Debug aid: DisplayImage(game.gameState, agentAction) can be called here.)
        if currPlayer.seatNumber == 0:
            if agentAction.type == 'EndTurn':
                numTurns += 1

        gameOver = game.gameState.currState == "OVER"

    # Record the result; any winner other than index 0 is a loss for player 0.
    winningIndex = game.gameState.winner
    winner[winningIndex] += 1
    lost = winningIndex != 0

    # Stats
    if not COLLECT_STATS:
        continue

    trackedP0 = game.gameState.players[0]
    trackedP1 = game.gameState.players[1]
    trackedP0.generatePlayerStats()
    trackedP1.generatePlayerStats()

    player0Stats += trackedP0.stats
    player1Stats += trackedP1.stats
    if lost:
        Player0LosingStats += trackedP0.stats

# Collect stats
if COLLECT_STATS:
    def _marginOfErrorPct(p_hat, sampleSize):
        """Half-width of the confidence interval for a win rate, in percent.

        Uses the normal approximation with z = 1.96 (the usual 95% level),
        rounded to two decimals.
        """
        return round(100*(1.96 * math.sqrt((p_hat * (1 - p_hat)) / sampleSize)), 2)

    # Finalise every tracker first, then flatten each one into a table row.
    for tracker in (player0Stats, Player0LosingStats, player1Stats):
        tracker.getAverages()
    player0Data = player0Stats.getList()
    player0LosingData = Player0LosingStats.getList()
    player1Data = player1Stats.getList()

    # Win rates (as fractions) and their margins of error (in percent).
    totalGames = sum(winner)
    p_hat0 = winner[0] / totalGames
    p_hat1 = winner[1] / totalGames
    margin_error0 = _marginOfErrorPct(p_hat0, totalGames)
    margin_error1 = _marginOfErrorPct(p_hat1, totalGames)

    # Prepend [label, win rate, margin of error] to each row in place.
    # The losses-only row has no win rate of its own, so -1 is the placeholder.
    player0Data[:0] = ["Player0", p_hat0, margin_error0]
    player0LosingData[:0] = ["Player0LossesStats", -1, -1]
    player1Data[:0] = ["Player1", p_hat1, margin_error1]

    table = tabulate(
        [player0Data, player0LosingData, player1Data],
        headers=headers,
        tablefmt='simple',
    )
    print(table)

# `numTurns` is reset at the start of every episode, so this is the count for
# the last game played only.
print(f"\nNum turns: {numTurns}")

print("\n\nWinnings: ", winner)


# Brick, ore, wool, wheat, wood

In [None]:
import pandas as pd

# Save to csv
# Writes the three summary rows built by the evaluation cell (player 0,
# player 0's losses only, player 1) to a CSV file, one column per header.
from pathlib import Path

outputDir = Path('DeepLearning/Thesis/Hyperparameters/Data')
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists before writing.
outputDir.mkdir(parents=True, exist_ok=True)

# Plain string: the previous f-string had no placeholders.
fileName = 'NumSteps_8192_v_3Random.csv'
df = pd.DataFrame([player0Data, player0LosingData, player1Data], columns=headers)
df.to_csv(outputDir / fileName, index=False)