# Dialogue Manager in Plato trained with REINFORCE

✨**experimental**✨

* a convolutional neural network encodes the dialogue states; the network architecture is printed in the training output
* reaches a ~95% success rate after training on 200 dialogues (about 40 seconds)

In [2]:
%%bash
# Clone the Plato repository on first use, then bring the local
# 'actorcritic' branch up to date with its remote counterpart.
REPOSRC=https://gitlab.tubit.tu-berlin.de/OKS/plato.git
REPODIR=plato
REPOGIT="$REPODIR/.git"

# Clone only when no checkout exists yet.
[ -d "$REPOGIT" ] || git clone "$REPOSRC" "$REPODIR"
# Chain with && (not ;) so that the git commands never run outside the
# repository directory when the cd fails.
(cd "$REPODIR" && git remote update && git checkout actorcritic && git merge origin/actorcritic)

Fetching origin
Your branch is behind 'origin/actorcritic' by 13 commits, and can be fast-forwarded.
  (use "git pull" to update your local branch)
Updating 409a14f..ded2412
Fast-forward
 .gitignore                                         |  2 +-
 ConversationalAgent/ConversationalSingleAgent.py   | 15 ++++
 Dialogue/Action.py                                 |  9 +++
 DialogueManagement/DialogueManager.py              | 81 ++++++++++++----------
 .../ReinforcementLearning/QPolicy.py               | 13 +++-
 .../ReinforcementLearning/pytorch_a2c_policy.py    | 41 +++++------
 .../AgendaBasedUserSimulator/AgendaBasedUS.py      | 76 +++++++++++---------
 log_to_csv.py                                      | 36 +++++-----
 8 files changed, 156 insertions(+), 117 deletions(-)


From https://gitlab.tubit.tu-berlin.de/OKS/plato
   409a14f..ded2412  actorcritic -> origin/actorcritic
   1e45c55..af3c9ac  multi_intent_user_simulation -> origin/multi_intent_user_simulation
 * [new branch]      test_influence_handcrafted -> origin/test_influence_handcrafted
Already on 'actorcritic'


In [3]:
%%bash
# Install the project's requirements into the 'plato' environment.
# The steps are chained with && so a failed activation or cd aborts the
# cell instead of installing into whichever environment is currently active.
source activate plato &&
cd plato &&
pip install -r requirements.txt



In [4]:
#%load_ext autoreload
#%autoreload 2
import os
import sys
sys.path.append('plato')

import shutil
from os import chdir
import torch
import torch.nn as nn
import torch.nn.functional as F

import DialogueManagement.DialoguePolicy.ReinforcementLearning.run_rl_training
from DialogueManagement.DialoguePolicy.ReinforcementLearning.pytorch_reinforce_policy import PyTorchReinforcePolicy
from ConversationalAgent.ConversationalSingleAgent import ConversationalSingleAgent
from DialogueManagement.DialoguePolicy.ReinforcementLearning.run_rl_training import \
    build_config, run_it
from torch.distributions import Categorical

#import importlib
#importlib.reload(x)

def clean_dir(dir):
    """Reset *dir* to an empty directory, creating it if necessary.

    An existing directory is removed together with all of its contents and
    then recreated. Missing parent directories are created as well, so nested
    paths such as ``"out/logs"`` work even when ``"out"`` does not exist yet.

    Args:
        dir: Path of the directory to (re)create.

    Raises:
        FileExistsError: If *dir* exists but is not a directory.
    """
    if os.path.isdir(dir):
        shutil.rmtree(dir)
    # makedirs (rather than mkdir) also creates missing parent directories.
    os.makedirs(dir, exist_ok=True)

In [None]:


class PolicyAgentModified(nn.Module):
    """Convolutional policy network producing a distribution over actions.

    Token ids are embedded, passed through three 1-D convolutions (each
    followed by ELU), max-pooled over the sequence axis and projected by a
    linear layer; a softmax turns the result into action probabilities.
    """

    def __init__(self, vocab_size, num_actions, hidden_dim=64, embed_dim=32,**kwargs) -> None:
        """Build the embedding, convolution stack, pooling and output layer.

        Args:
            vocab_size: Number of rows in the embedding table.
            num_actions: Size of the output probability vector.
            hidden_dim: Number of channels produced by every convolution.
            embed_dim: Dimensionality of the token embeddings.
            **kwargs: Accepted but not used by this class.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # (input channels, stride) of each kernel-size-3 convolution, in order.
        conv_specs = ((embed_dim, 1), (hidden_dim, 2), (hidden_dim, 1))
        layers = []
        for in_channels, stride in conv_specs:
            layers.append(
                nn.Conv1d(
                    in_channels=in_channels,
                    out_channels=hidden_dim,
                    kernel_size=3,
                    stride=stride,
                )
            )
            layers.append(nn.ELU())
        self.convnet = nn.Sequential(*layers)

        # Collapses the sequence axis to length 1.
        self.pooling = nn.AdaptiveMaxPool1d(1)

        self.affine2 = nn.Linear(hidden_dim, num_actions)

    def forward(self, x):
        """Return action probabilities for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; the embedded result must be 3-D
                because its axes 1 and 2 are swapped before the convolutions.

        Returns:
            Tensor of probabilities, normalised along dimension 1.
        """
        embedded = self.embedding(x)
        # Conv1d wants channels on axis 1, so swap the last two axes.
        channels_first = embedded.transpose(1, 2)
        conv_out = self.convnet(channels_first)
        pooled = self.pooling(conv_out).squeeze(2)
        scores = self.affine2(pooled)
        return F.softmax(scores, dim=1)

    def step(self, state):
        """Sample one action for *state*.

        Returns:
            A tuple ``(action, log_prob)``: the sampled action as a Python
            number (via ``.item()``, so the sample must hold a single element)
            and the log-probability tensor of that sample.
        """
        distribution = Categorical(self.calc_probs(state))
        sampled = distribution.sample()
        return sampled.item(), distribution.log_prob(sampled)

    def calc_probs(self, state):
        """Return the action probabilities for *state* (delegates to ``forward``)."""
        return self.forward(state)

    def log_probs(self, state: torch.Tensor, action: torch.Tensor):
        """Return the log-probability of *action* under the policy at *state*."""
        distribution = Categorical(self.calc_probs(state))
        return distribution.log_prob(action)


In [7]:

# Locations used by the experiment; all are relative to base_path.
base_path = "."
policy_path = f"{base_path}/policies/agent"
domain_path = "shared_data/domain"

# Work from base_path and start with empty log and policy directories.
chdir(base_path)
clean_dir("logs")
clean_dir("policies")


# Experiment configuration handed to run_it.
config = {
    "GENERAL": {
        "print_level": "info",
        "interaction_mode": "simulation",
        "agents": 1,
        "runs": 5,
        "experience_logs": {
            "save": False,
            "load": False,
            "path": "logs/train_reinforce_logs.pkl",
        },
    },
    "DIALOGUE": {
        "num_dialogues": 1000,
        "initiative": "system",
        "domain": "CamRest",
        "ontology_path": f"{domain_path}/alex-rules.json",
        "db_path": f"{domain_path}/alex-dbase.db",
        "db_type": "sql",
        "cache_sql_results": True,
    },
    "AGENT_0": {
        "role": "system",
        "USER_SIMULATOR": {
            "simulator": "agenda",
            "patience": 5,
            "pop_distribution": [1.0],
            "slot_confuse_prob": 0.0,
            "op_confuse_prob": 0.0,
            "value_confuse_prob": 0.0,
        },
        "DM": {
            "policy": {
                "type": "pytorch_reinforce",
                "train": True,
                "learning_rate": 0.01,
                "learning_decay_rate": 0.995,
                "discount_factor": 0.99,
                "exploration_rate": 1.0,
                "exploration_decay_rate": 0.99,
                "min_exploration_rate": 0.01,
                "policy_path": policy_path,
                # Disabled option; presumably swaps in the custom network
                # class defined in this notebook -- confirm against the policy.
                #"PolicyAgentModelClass":PolicyAgentModified
            }
        },
    },
}

# First pass with "train" enabled. The second argument is presumably the
# number of training dialogues -- confirm against run_rl_training.run_it.
run_it(config, 200)

# Second pass with training switched off, using run_it's default second argument.
config["AGENT_0"]["DM"]["policy"]["train"] = False
run_it(config)

0it [00:00, ?it/s[{'dialogue': 0, 'success-rate': 0.0, 'loss': 0.0}]]

PolicyAgent(
  (encoder): StateEncoder(
    (embedding): Embedding(62, 32, padding_idx=1)
    (convnet): Sequential(
      (0): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
      (1): ELU(alpha=1.0)
      (2): Conv1d(64, 64, kernel_size=(3,), stride=(2,))
      (3): ELU(alpha=1.0)
      (4): Conv1d(64, 64, kernel_size=(3,), stride=(2,))
      (5): ELU(alpha=1.0)
      (6): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
      (7): ELU(alpha=1.0)
    )
    (pooling): AdaptiveMaxPool1d(output_size=1)
  )
  (actor): Actor(
    (intent_head): Linear(in_features=64, out_features=15, bias=True)
    (slots_head): Linear(in_features=64, out_features=10, bias=True)
  )
)


200it [00:37,  5.34it/s[{'dialogue': 199, 'success-rate': 0.84, 'loss': 17.335, 'eps': 0.08962861870232469}]]




Dialogue Success Rate: 94.0
Average Cumulative Reward: 18.205749999999995
Average Turns: 11.625


100it [00:14,  7.00it/s[{'dialogue': 99, 'success-rate': 0.84, 'loss': 0.0, 'eps': 1.0}]]




Dialogue Success Rate: 95.0
Average Cumulative Reward: 18.4225
Average Turns: 11.5
