# Marelle Board import
* Getting players info dict => `MarelleBoard.players`
* 3 phases : place, move and end => `MarelleBoard.phase`
* Reset the board => `MarelleBoard.initialize_game()`
* Printing the board => `MarelleBoard.print_board()`
* Id:Action dict => `MarelleBoard.id_to_action`
* Action:Id dict => `MarelleBoard.action_to_id`
* Get the board state => `MarelleBoard.get_state()`
* Play an action => `MarelleBoard.play_action(action_id, player)`
* Get legal action ids => `MarelleBoard.get_legal_action_ids(player)`
* Check if the game has ended (returns 0 if not ended, otherwise the winning player's id) => `MarelleBoard.check_if_end(player)`

In [4]:
import marl_env
import marl_agents
import marl_models
# importlib.reload(marl_env)
# importlib.reload(marl_agents)
# importlib.reload(marl_models)

from marl_env import MarelleBoard, MarelleGymEnv, MarelleGame
from marl_models import FCModel
from marl_agents import RandomAgent, BetterRandomAgent, Reinforce

import progressbar as pb
import numpy as np 
import importlib
import os

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import wandb

import torch
import sys

# Put ./utils at the front of the module search path so that the
# `cliffwalk` module imported on the next line can be resolved from there.
sys.path.insert(0, './utils')
from cliffwalk import CliffWalk
# Default environment for the CliffWalk cells; later cells rebind `env`.
# proba_succ is presumably the probability that an action succeeds -- TODO confirm in cliffwalk.
env = CliffWalk(proba_succ=0.98)

# Training agents

In [2]:
def train_agent(
    env,
    n_epochs,
    n_trajectories,
    trained_agent,
    opponent_agent,
    log_training=False,
    save_model_freq=50):
    """Train ``trained_agent`` against ``opponent_agent``, optionally logging to wandb.

    Args:
        env: Environment the agents were built with. Not used by this function;
            kept so existing call sites keep working.
        n_epochs: Number of epochs, forwarded to ``trained_agent.train``.
        n_trajectories: Number of trajectories, forwarded to ``trained_agent.train``.
        trained_agent: Agent to train. Must expose ``train``; when
            ``log_training`` is true it must also expose ``incentivize_captures``,
            ``punish_opponent_captures``, ``lr``, ``optimizer`` and ``model``.
        opponent_agent: Agent forwarded to ``trained_agent.train`` as the opponent.
        log_training: When true, start a wandb run in project "marl" and record
            the hyper-parameters in its config before training.
        save_model_freq: Forwarded to ``trained_agent.train``.

    Returns:
        None. Whatever ``trained_agent.train`` returns is discarded.
    """
    if log_training:
        wandb.init(project="marl")
        wandb.config.n_trajectories = n_trajectories
        # Must live on wandb.config: assigning to the wandb module itself
        # would not record the value in the run configuration.
        wandb.config.n_epochs = n_epochs

        # Trained agent
        wandb.config.trained_agent = trained_agent.__class__.__name__
        wandb.config.incentivize_captures = trained_agent.incentivize_captures
        wandb.config.punish_opponent_captures = trained_agent.punish_opponent_captures
        wandb.config.lr = trained_agent.lr
        wandb.config.optimizer_name = trained_agent.optimizer.__class__.__name__

        # Opponent agent
        wandb.config.opponent_agent = opponent_agent.__class__.__name__

        # Model to watch metrics from
        if trained_agent.model is not None:
            wandb.watch(trained_agent.model)

    # Arguments stay positional, in the order the agent's train() is called
    # everywhere else in this file.
    trained_agent.train(n_trajectories, n_epochs, opponent_agent, log_training, save_model_freq)

def load_model(model_name, run_id, model):
    """Restore a checkpoint from a wandb run and load it into ``model``.

    The file ``model_name`` is restored from the run
    ``clement-guillo/marl/<run_id>`` into ``models/temp/``, loaded into
    ``model`` with ``load_state_dict``, and the temporary file is then deleted.

    Args:
        model_name: File name of the checkpoint stored in the wandb run.
        run_id: Id of the wandb run holding the checkpoint.
        model: Object exposing ``load_state_dict``; it is updated in place.

    Returns:
        The same ``model`` object, with the restored weights loaded.
    """
    wandb.restore(model_name, run_path=f'clement-guillo/marl/{run_id}',root= "models/temp/")
    checkpoint_path = f'models/temp/{model_name}'
    try:
        # NOTE(security): torch.load unpickles the file; only restore
        # checkpoints from runs you trust.
        model.load_state_dict(torch.load(checkpoint_path))
    finally:
        # Remove the temporary copy even when loading fails, so a broken
        # checkpoint is not left behind in models/temp/.
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)
    return model

# Test CliffWalk

In [None]:
# Reload the project modules so that edits made on disk since the first
# import are picked up, then re-import the names so they point at the
# reloaded definitions.
importlib.reload(marl_env)
importlib.reload(marl_agents)
importlib.reload(marl_models)
from marl_env import MarelleBoard, MarelleGymEnv, MarelleGame
from marl_models import FCModel
from marl_agents import RandomAgent, BetterRandomAgent, Reinforce, Reinforce_place, Reinforce_cliff

# Fresh CliffWalk environment and a model for the Reinforce_cliff agent.
# NOTE(review): input size 2 presumably matches the (prev_state, state) pair
# fed to the model in the inspection cell, and env.Na is presumably the number
# of actions -- confirm against FCModel / CliffWalk.
env = CliffWalk(proba_succ=0.98)
model = FCModel(2, env.Na)
print("go")
intel=Reinforce_cliff(env, model, lr=0.001)
# NOTE(review): if Reinforce_cliff.train has the same signature as the call made
# in train_agent, these are (n_trajectories=100, n_epochs=10, opponent_agent=None,
# log_training=None, save_model_freq=1) -- confirm in marl_agents.
intel.train(100, 10, None, None, 1)


In [None]:
# Inspect the trained model on the current (prev_state, state) pair.
# Requires `intel`, `prev_state` and `state` to be bound by other cells
# (the CliffWalk training cell and the rollout cell) before running this one.
import torch.nn as nn
o=intel.model(torch.tensor([prev_state,state],dtype = torch.float))
# Softmax over dim 0 of the model output; as the last expression of the cell,
# the result is what the notebook displays.
nn.Softmax(dim=0)(o)

In [3]:

# Apply action 3 to whichever environment `env` is currently bound to, print
# what step() returns, then render the environment.
# (The recorded output below labels this move "(up)".)
print(env.step(3))
env.render()

(24, 0.6, False, {})
+-----------------------+
| : : : : : : : : : : : |
| : : : : : : : : : : : |
|[43m_[0m: : : : : : : : : : : |
|S:x:x:x:x:x:x:x:x:x:x:G|
+-----------------------+
  (up)


In [None]:

# Roll the trained agent forward for 10 steps, printing the state pair and
# rendering the environment after every step.
# Both entries of the pair start as the environment's current state.
prev_state=intel.env.state
state=intel.env.state
for i in range(10):
    print(prev_state,state)
    action = intel.learned_act(prev_state,state)
    # Shift the window: the current state becomes the previous one.
    prev_state=state
    # NOTE(review): `done` is ignored, so the loop keeps stepping even after
    # the environment reports the episode as finished -- confirm this is intended.
    state, reward, done, _ = intel.env.step(action)
    intel.env.render()

# Test Marelle

In [None]:
# Reload the project modules so that on-disk edits are picked up, then
# re-import the names so they refer to the reloaded definitions.
importlib.reload(marl_env)
importlib.reload(marl_agents)
importlib.reload(marl_models)
from marl_env import MarelleBoard, MarelleGymEnv, MarelleGame
from marl_models import FCModel
from marl_agents import RandomAgent, BetterRandomAgent, Reinforce, Reinforce_place

env = MarelleGymEnv()

# Two models sharing the same input size (length of the board state vector).
# The first outputs one value per entry of id_to_action.
# NOTE(review): 566 is a hard-coded output size; presumably the size of the
# action space used by Reinforce_place -- TODO confirm and derive it from env.
einstein_model = FCModel(len(env.board.get_state()), len(env.board.id_to_action))
einstein_place_model = FCModel(len(env.board.get_state()), 566)

# NOTE(review): `einstein` is bound twice in a row, so the Reinforce agent built
# on the first line is discarded and only the Reinforce_place agent is trained.
# Comment out whichever agent is not wanted.
einstein=Reinforce(env, 1, einstein_model, lr=0.005, incentivize_captures=False, punish_opponent_captures=False)
einstein=Reinforce_place(env, 1, einstein_place_model, lr=0.005, incentivize_captures=True, punish_opponent_captures=True)


# Two candidate opponents, both constructed with -1 as second argument
# (the trained agent is constructed with 1).
piccolo=BetterRandomAgent(env, -1)
piccolo_dumb=RandomAgent(env,-1)

print("training")
# Train against the plain random opponent with wandb logging enabled;
# save_model_freq is forwarded to the agent's train() method.
train_agent(
    env=env,
    n_epochs=1000,
    n_trajectories=500,
    trained_agent=einstein,
    #opponent_agent=piccolo,
    opponent_agent=piccolo_dumb,
    log_training=True,
    save_model_freq=5
)


In [None]:
# Lay the first 24 entries of the board state vector out on a 7x7 grid.
vec_s=env.board.get_state()

# Entries equal to 0 are shown as -3, the same filler value used for grid
# cells that are not board positions.
vec_s_2 = [-3 if value == 0 else value for value in vec_s]
s = vec_s_2

# (row, col) of each state index on the 7x7 grid. Every ring is listed in the
# same order: right-middle, top-right, top-middle, top-left, left-middle,
# bottom-left, bottom-middle, bottom-right.
# The previous version wrote s[13] to (3,3) -- the centre of the grid, which is
# not a board position -- and s[14] to (3,5), overwriting s[8]. The pattern of
# the other two rings gives (5,1) and (5,3) instead.
# NOTE(review): confirm this ordering against MarelleBoard's own position indexing.
ring_cells = [
    # inner ring: s[0]..s[7]
    (3, 4), (2, 4), (2, 3), (2, 2), (3, 2), (4, 2), (4, 3), (4, 4),
    # middle ring: s[8]..s[15]
    (3, 5), (1, 5), (1, 3), (1, 1), (3, 1), (5, 1), (5, 3), (5, 5),
    # outer ring: s[16]..s[23]
    (3, 6), (0, 6), (0, 3), (0, 0), (3, 0), (6, 0), (6, 3), (6, 6),
]

mat_state = np.full((7, 7), -3.0)
# Index s explicitly so that a state vector shorter than 24 entries still
# raises IndexError, as it did before.
for idx, (row, col) in enumerate(ring_cells):
    mat_state[row, col] = s[idx]

print(mat_state)

In [None]:
#wandb.init()

In [None]:
#TO DO
# strong agent for the placement phase
# with a conv net on the 7x7 board + conv inside the agent
# only filter the placement actions? -> no, the legal moves already take care of that
# reward = difference in token count at the end of the phase; change the step and its associated reward?
# my "place_phase" reward requires captures; maybe have it play against the random agent?
# create a dedicated env for the placement agent to avoid tampering with `done` (cleaner); discuss with Tim.

### Model restoration

# Playing games

In [None]:
# Reload the project modules so that on-disk edits are picked up, then
# re-import the names so they refer to the reloaded definitions.
importlib.reload(marl_env)
importlib.reload(marl_agents)
importlib.reload(marl_models)
from marl_env import MarelleBoard, MarelleGymEnv, MarelleGame
from marl_models import FCModel
from marl_agents import RandomAgent, BetterRandomAgent, Reinforce, Reinforce_place

env = MarelleGymEnv()
# Restore the checkpoint 'model_200_1000.pt' of wandb run '3jdqnjih' into a
# freshly built FCModel (same 566 output size as in the training cell) and
# wrap it in a Reinforce_place agent.
einstein_place_model = load_model('model_200_1000.pt','3jdqnjih',FCModel(len(env.board.get_state()),566))
einstein = Reinforce_place(env, 1, einstein_place_model, lr=0.01, incentivize_captures=True, punish_opponent_captures=True)


gabrielle = RandomAgent(env, -1)
# NOTE(review): `piccolo` is not passed to the game below, and it is built with
# the same second argument (1) as `einstein` -- confirm whether it is still needed.
piccolo = BetterRandomAgent(env, 1)
# Game between the restored agent (player1) and the random agent (player2).
game = MarelleGame(env=env, player1 = einstein, player2=gabrielle, clear_output=False)

In [None]:
# Play the game configured in the previous cell and keep what play() returns.
history = game.play()

In [None]:
#TO DO
# evaluate the agent: have a trained agent play games
# colour in the display, or an animated video (click/button interface?)
# make the save frequency configurable
# retrieve the saved files

# Time test

In [None]:
# Profile a short training run (10 epochs x 100 trajectories, no wandb logging)
# to see where time is spent.
import cProfile
env = MarelleGymEnv()

einstein_model = FCModel(len(env.board.get_state()), len(env.board.id_to_action))

einstein=Reinforce(env, 1, einstein_model, lr=0.005, incentivize_captures=False, punish_opponent_captures=False)

piccolo=BetterRandomAgent(env, -1)


# cProfile.run executes the statement string in the __main__ namespace, so
# train_agent, env, einstein and piccolo must be defined at notebook top level.
cProfile.run("""train_agent(
    env=env,
    n_epochs=10,
    n_trajectories=100,
    trained_agent=einstein,
    opponent_agent=piccolo,
    log_training=False
)""")