# Dialogue Manager in Plato trained with Reinforcement Learning

✨**experimental**✨

In [29]:
%%bash
REPOSRC=https://gitlab.tubit.tu-berlin.de/OKS/plato.git
REPODIR=plato
REROGIT=$REPODIR/.git

[ -d $REROGIT ] || git clone $REPOSRC $REPODIR
(cd $REPODIR; git remote update && git checkout actorcritic && git merge origin/actorcritic)

Fetching origin
Your branch is behind 'origin/actorcritic' by 1 commit, and can be fast-forwarded.
  (use "git pull" to update your local branch)
Updating 70edd6b..409a14f
Fast-forward
 .../DialoguePolicy/ReinforcementLearning/pytorch_reinforce_policy.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)


From https://gitlab.tubit.tu-berlin.de/OKS/plato
   70edd6b..409a14f  actorcritic -> origin/actorcritic
Already on 'actorcritic'


In [19]:
%%bash
source activate plato
cd plato
pip install -r requirements.txt
pip install torch==1.2.0 torchtext sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Using cached scikit_learn-0.22.2.post1-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
Collecting joblib>=0.11
  Using cached joblib-0.14.1-py2.py3-none-any.whl (294 kB)
Collecting scipy>=0.17.0
  Using cached scipy-1.4.1-cp37-cp37m-manylinux1_x86_64.whl (26.1 MB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py): started
  Building wheel for sklearn (setup.py): finished with status 'done'
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1315 sha256=958134df2edcd039a724b5e78246f988f534d521af78802601d325043076b4c0
  Stored in directory: /home/jupyter-tilo/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: joblib, scipy, scikit-learn, sklearn
Successfully installed joblib-0.14.1 scikit-learn-0.22.2.post1 scipy-1.4.1 sklearn-0.0


In [30]:
#%load_ext autoreload
#%autoreload 2
import os
import sys
sys.path.append('plato')

import shutil
from os import chdir
import torch
import torch.nn as nn
import torch.nn.functional as F

import DialogueManagement.DialoguePolicy.ReinforcementLearning.run_rl_training
from DialogueManagement.DialoguePolicy.ReinforcementLearning.pytorch_reinforce_policy import PyTorchReinforcePolicy
from ConversationalAgent.ConversationalSingleAgent import ConversationalSingleAgent
from DialogueManagement.DialoguePolicy.ReinforcementLearning.run_rl_training import \
    build_config, run_it
from torch.distributions import Categorical

#import importlib
#importlib.reload(x)

def clean_dir(dir):
    if os.path.isdir(dir):
        shutil.rmtree(dir)
    os.mkdir(dir)

class PolicyAgentModified(nn.Module):
    def __init__(self, vocab_size, num_actions, hidden_dim=64, embed_dim=32,**kwargs) -> None:
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels=embed_dim, out_channels=hidden_dim, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(
                in_channels=hidden_dim,
                out_channels=hidden_dim,
                kernel_size=3,
                stride=2,
            ),
            nn.ELU(),
            nn.Conv1d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=3),
            nn.ELU(),
        )
        self.pooling = nn.AdaptiveMaxPool1d(1)

        self.affine2 = nn.Linear(hidden_dim, num_actions)

    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(2, 1)
        features = self.convnet(x)
        features_pooled = self.pooling(features).squeeze(2)
        return F.softmax(self.affine2(features_pooled), dim=1)

    def step(self, state):
        probs = self.calc_probs(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

    def calc_probs(self, state):
        return self.forward(state)

    def log_probs(self, state: torch.Tensor, action: torch.Tensor):
        probs = self.calc_probs(state)
        m = Categorical(probs)
        return m.log_prob(action)


base_path = "."
policy_path = base_path+"/policies/agent"
domain_path = "shared_data/domain"

chdir("%s" % base_path)
clean_dir("logs")
clean_dir("policies")


config = {
        "GENERAL": {
            "print_level": "info",
            "interaction_mode": "simulation",
            "agents": 1,
            "runs": 5,
            "experience_logs": {
                "save": False,
                "load": False,
                "path": "logs/train_reinforce_logs.pkl",
            },
        },
        "DIALOGUE": {
            "num_dialogues": 1000,
            "initiative": "system",
            "domain": "CamRest",
            "ontology_path": domain_path+"/alex-rules.json",
            "db_path": domain_path+"/alex-dbase.db",
            "db_type": "sql",
            "cache_sql_results": True,
        },
        "AGENT_0": {
            "role": "system",
            "USER_SIMULATOR": {
                "simulator": "agenda",
                "patience": 5,
                "pop_distribution": [1.0],
                "slot_confuse_prob": 0.0,
                "op_confuse_prob": 0.0,
                "value_confuse_prob": 0.0,
            },
            "DM": {
                "policy": {
                    "type": "pytorch_reinforce",
                    "train": True,
                    "learning_rate": 0.01,
                    "learning_decay_rate": 0.995,
                    "discount_factor": 0.99,
                    "exploration_rate": 1.0,
                    "exploration_decay_rate": 1.0,
                    "min_exploration_rate": 0.01,
                    "policy_path": policy_path,
                    #"PolicyAgentModelClass":PolicyAgentModified
                }
            },
            "NLU": None,
            "DST": {"dst": "dummy"},
            "NLG": None,
        },
    }

#ca = ConversationalSingleAgent(config)
#ca.initialize()
#print(ca.minibatch_length)
#print(ca.dialogue_manager.it_works)
#ca.dialogue_manager.policy.PolicyAgentModelClass

run_it(config, 100)
config['AGENT_0']['DM']['policy']['train']=False
run_it(config)

  y = column_or_1d(y, warn=True)
0it [00:00, ?it/s[{'dialogue': 0, 'success-rate': 0.0, 'loss': 0.0}]]

PolicyAgent(
  (encoder): StateEncoder(
    (embedding): Embedding(62, 32, padding_idx=1)
    (convnet): Sequential(
      (0): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
      (1): ELU(alpha=1.0)
      (2): Conv1d(64, 64, kernel_size=(3,), stride=(2,))
      (3): ELU(alpha=1.0)
      (4): Conv1d(64, 64, kernel_size=(3,), stride=(2,))
      (5): ELU(alpha=1.0)
      (6): Conv1d(64, 64, kernel_size=(3,), stride=(1,))
      (7): ELU(alpha=1.0)
    )
    (pooling): AdaptiveMaxPool1d(output_size=1)
  )
  (actor): Actor(
    (intent_head): Linear(in_features=64, out_features=15, bias=True)
    (slots_head): Linear(in_features=64, out_features=10, bias=True)
  )
)


100it [00:18,  5.46it/s[{'dialogue': 99, 'success-rate': 0.84, 'loss': 17.782, 'eps': 1.0}]]
0it [00:00, ?it/s[{'dialogue': 0, 'success-rate': 0.0, 'loss': 0.0}]]



Dialogue Success Rate: 100.0
Average Cumulative Reward: 19.5095
Average Turns: 10.81


100it [00:14,  6.84it/s[{'dialogue': 99, 'success-rate': 0.84, 'loss': 0.0, 'eps': 1.0}]]




Dialogue Success Rate: 97.0
Average Cumulative Reward: 18.74450000000001
Average Turns: 13.48
