## PyTorchの環境構築

In [1]:
import argparse
import glob
import json
import math
import os
import re
import time

import jsonlines
import matplotlib.pyplot as plt
import numpy as np
import pyhocon
import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.utils.tensorboard import SummaryWriter

# 応答の対話行為推定モデル

In [2]:
# Glob pattern matching every entry directly under the corpus directory.
data_url_repo = "data/corpus/*"
# Paths currently matching the pattern (an empty list when nothing matches).
data_url_dir = glob.glob(data_url_repo)

### GPUをdeviceに設定

In [3]:
# Report whether a CUDA device is usable; the notebook echoes the result below.
torch.cuda.is_available()

True

In [4]:
# Prefer the first CUDA device, but fall back to the CPU so the notebook
# still runs on machines without a usable GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### データ処理

In [5]:
# Sentence boundary markers.
EOS_token = '<EOS>'
BOS_token = '<BOS>'
# Corpus files are named "sw_<lowercase letters>_<digits>.jsonlines"; the two
# groups capture the letter part and the numeric part.
file_pattern = re.compile(r'^sw\_([a-z]+?)\_([0-9]+?)\.jsonlines$')

# Coarse dialogue-act tag -> fine-grained labels folded into it
# (looked up by `easy_damsl`).
swda_tagu = {
    '<Uninterpretable>': ['abandoned_or_turn-exit/uninterpretable', 'non-verbal'],
    '<Statement>': ['statement-non-opinion', 'statement-opinion', 'other_answers', '3rd-party-talk', 'self-talk', 'offers,_options_commits', 'collaborative_completion'],
    '<Question>': ['q', 'yes-no-question', 'wh-question', 'declarative_yes-no-question', 'backchannel_in_question_form', 'open-question', 'rhetorical-questions', 'signal-non-understanding', 'or-clause', 'tag-question', 'declarative_wh-question'],
    '<Directive>': ['action-directive'],
    '<Greeting>': ['conventional-opening', 'conventional-closing'],
    # 'dispreferred_answers' used to be listed twice; the duplicate had no
    # effect on membership tests and has been dropped.
    '<Apology>': ['apology', 'no_answers', 'reject', 'negative_non-no_answers', 'dispreferred_answers'],
    '<Agreement>': ['agree/accept', 'maybe/accept-part', 'thanking'],
    '<Understanding>': ['acknowledge_(backchannel)', 'summarize/reformulate', 'appreciation', 'response_acknowledgement', 'affirmative_non-yes_answers', 'yes_answers'],
    '<Other>': ['other', 'hedge', 'quotation', 'repeat-phrase', 'hold_before_answer/agreement', 'downplayer']
}

# Integer dialogue-act id -> name.
# NOTE(review): presumably the DailyDialog act inventory -- confirm; it is not
# referenced anywhere in the visible part of the notebook.
daily_tagu = {1: "inform", 2: "question", 3: "directive", 4: "commissive"}

#### 対話行為のID化

In [8]:
class DA_to_ID:
    """Vocabulary mapping dialogue-act labels to integer ids.

    The table is built once, at construction time, from the label sequences
    in ``X_DA`` and ``Y_DA``. Id 0 is reserved for ``'<PAD>'``; the remaining
    ids are handed out in order of decreasing frequency (ties keep first-seen
    order) until the table holds ``config['MAX_VOCAB']`` entries.
    """

    def __init__(self, config, X_DA, Y_DA):
        self.config = config
        self.X_DA = X_DA
        self.Y_DA = Y_DA
        self.word2id = None
        self.id2word = None
        self.construct()

    def construct(self):
        """Count the labels, fill ``word2id``/``id2word`` and return ``word2id``."""
        frequency = {}
        for inputs, targets in zip(self.X_DA, self.Y_DA):
            # Inputs are counted before targets so first-seen order is stable.
            for label in [*inputs, *targets]:
                frequency[label] = frequency.get(label, 0) + 1

        table = {'<PAD>': 0}
        ranked = sorted(frequency.items(), key=lambda item: -item[1])
        for label, _ in ranked:
            table[label] = len(table)
            # The size check runs after the insertion, so at least one label
            # is always admitted when any label was seen.
            if len(table) >= self.config['MAX_VOCAB']:
                break

        self.word2id = table
        self.id2word = {index: label for label, index in table.items()}
        return table

    def tokenize(self, X_tensor, Y_tensor):
        """Replace every label by its id; raises ``KeyError`` for unknown labels."""

        def encode(sentences):
            return [[self.word2id[label] for label in sentence] for sentence in sentences]

        return encode(X_tensor), encode(Y_tensor)

#### 発話のID化

In [9]:
class UTT_to_ID:
    """Vocabulary mapping utterance words to integer ids.

    Ids 0-4 are reserved for ``'<UNK>'``, ``'<EOS>'``, ``'<BOS>'``,
    ``'<UttPAD>'`` and ``'<ConvPAD>'``. Words found in ``X_UTT``/``Y_UTT``
    (dialogues -> utterances -> words) follow in order of decreasing
    frequency until the table holds ``config['UTT_MAX_VOCAB']`` entries.
    """

    def __init__(self, config, X_UTT, Y_UTT):
        self.config = config
        self.X_UTT = X_UTT
        self.Y_UTT = Y_UTT
        self.word2id = None
        self.id2word = None
        self.construct()

    def construct(self):
        """Count the words, fill ``word2id``/``id2word`` and return ``word2id``."""
        frequency = {}
        for inputs, targets in zip(self.X_UTT, self.Y_UTT):
            # Inputs are counted before targets so first-seen order is stable.
            for utterance in [*inputs, *targets]:
                for word in utterance:
                    frequency[word] = frequency.get(word, 0) + 1

        table = {'<UNK>': 0, '<EOS>': 1, '<BOS>': 2, '<UttPAD>': 3, '<ConvPAD>': 4}
        ranked = sorted(frequency.items(), key=lambda item: -item[1])
        for word, _ in ranked:
            table[word] = len(table)
            # Checked after inserting, so one word always gets in.
            if len(table) >= self.config['UTT_MAX_VOCAB']:
                break

        self.word2id = table
        self.id2word = {index: word for word, index in table.items()}
        return table

    def tokenize(self, X_tensor, Y_tensor):
        """Replace every word by its id, mapping unknown words to ``'<UNK>'``."""
        table = self.word2id

        def lookup(word):
            # The '<UNK>' id is fetched only when a word is actually missing.
            if word in table:
                return table[word]
            return table['<UNK>']

        def encode(dialogues):
            return [[[lookup(word) for word in utterance] for utterance in dialogue]
                    for dialogue in dialogues]

        return encode(X_tensor), encode(Y_tensor)

#### トレーニングデータ作成

In [61]:
def create_traindata(config, name):
    """Build (input, next-item) training sequences from the corpus files.

    Every file in ``config[name]['train_path']`` whose name matches
    ``file_pattern`` is read as one conversation, one JSON object per line
    with ``'DA'`` and ``'sentence'`` lists.

    Args:
        config: Mapping holding the top-level ``'state'`` flag and, under
            ``config[name]``, ``'train_path'``, ``'multi_dialogue'``,
            ``'turn'`` (multi-dialogue mode only) and ``'window_size'``
            (only when ``config['state']`` is true).
        name: Key of the model-specific configuration section.

    Returns:
        ``(da_x, da_y, utt_x, utt_y, turn)``. ``da_y``/``utt_y`` are the
        ``da_x``/``utt_x`` sequences shifted by one position; ``turn`` holds
        the matching turn-boundary flags.
    """
    train_path = config[name]['train_path']
    files = [f for f in os.listdir(train_path) if file_pattern.match(f)]
    da_x = []
    da_y = []
    utt_x = []
    utt_y = []
    turn = []
    # One file holds one conversation.
    for filename in files:
        with open(os.path.join(train_path, filename), 'r') as f:
            # Drop blank lines instead of `data.remove('')`, which raised
            # ValueError for files without a trailing newline and removed
            # only the first blank line.
            lines = [line for line in f.read().split('\n') if line.strip()]

        da_seq = []
        utt_seq = []
        turn_seq = []
        # One line holds one turn.
        for line in lines:
            jsondata = json.loads(line)
            if config[name]['multi_dialogue']:
                # Keep every utterance of the turn.
                for da, utt in zip(jsondata['DA'], jsondata['sentence']):
                    da_seq.append(da)
                    utt_seq.append(utt.split(' '))
                    turn_seq.append(0)
                if not config[name]['turn']:
                    # NOTE(review): the '<turn>' marker is added to da_seq and
                    # utt_seq (as a bare string, not a token list) but not to
                    # turn_seq, so the sequences differ in length afterwards.
                    da_seq.append('<turn>')
                    utt_seq.append('<turn>')
                # Flag the last utterance seen so far as a turn boundary; the
                # guard avoids an IndexError when nothing has been read yet.
                if turn_seq:
                    turn_seq[-1] = 1
            else:
                # Keep only the last utterance of the turn.
                # NOTE(review): turn_seq stays empty on this path.
                da_seq.append(jsondata['DA'][-1])
                utt_seq.append(jsondata['sentence'][-1].split(' '))
        # Collapse fine-grained labels into their coarse tags.
        da_seq = [easy_damsl(da) for da in da_seq]

        if config['state']:
            # Sliding windows over the conversation.
            window_size = config[name]['window_size']
            length = len(da_seq)
            for i in range(max(1, length - 1 - window_size)):
                da_x.append(da_seq[i:min(length - 1, i + window_size)])
                da_y.append(da_seq[1 + i:min(length, 1 + i + window_size)])
                utt_x.append(utt_seq[i:min(length - 1, i + window_size)])
                utt_y.append(utt_seq[1 + i:min(length, 1 + i + window_size)])
                turn.append(turn_seq[i:min(length, i + window_size)])
        else:
            # One sample per conversation: everything but the last item
            # predicts everything but the first.
            da_x.append(da_seq[:-1])
            da_y.append(da_seq[1:])
            utt_x.append(utt_seq[:-1])
            utt_y.append(utt_seq[1:])
            turn.append(turn_seq[:-1])
    assert len(da_x) == len(da_y), 'Unexpect length da_posts and da_cmnts'
    assert len(utt_x) == len(utt_y), 'Unexpect length utt_posts and utt_cmnts'

    # The original returned da_posts/da_cmnts/utt_posts/utt_cmnts, names that
    # are never defined, so every call ended in a NameError.
    return da_x, da_y, utt_x, utt_y, turn


#### タグ付

In [12]:
def easy_damsl(tag):
    """Return the coarse tag whose label list contains ``tag``.

    The first matching entry of ``swda_tagu`` wins; a label that is listed
    nowhere is returned unchanged.
    """
    for coarse_tag, fine_labels in swda_tagu.items():
        if tag in fine_labels:
            return coarse_tag
    return tag

#### データの割合(8:1:1)

In [13]:
def separate_data(x, y, turn):
    """Split three parallel sequences into train / validation / test parts.

    One tenth of ``x`` (rounded, but at least one element) goes to the test
    part, the next tenth to validation and the remainder to training.

    Returns:
        ``(X_train, Y_train, X_valid, Y_valid, X_test, Y_test,
        Tturn, Vturn, Testturn)``.
    """
    # `or 1` keeps the folds non-empty for very small inputs.
    fold = round(len(x) / 10) or 1
    test_part = slice(None, fold)
    valid_part = slice(fold, fold * 2)
    train_part = slice(fold * 2, None)

    X_test, Y_test, Testturn = x[test_part], y[test_part], turn[test_part]
    X_valid, Y_valid, Vturn = x[valid_part], y[valid_part], turn[valid_part]
    X_train, Y_train, Tturn = x[train_part], y[train_part], turn[train_part]

    assert len(X_train) == len(Y_train), 'Unexpect to separate train data'
    return X_train, Y_train, X_valid, Y_valid, X_test, Y_test, Tturn, Vturn, Testturn

## 訓練

In [54]:
def initialize_env(name):
    """Load ``./dialogue.conf`` and make sure the model's log directory exists.

    Args:
        name: Key of the model-specific section; its ``'log_dir'`` entry is
            copied to the top-level ``'log_dirs'`` key.

    Returns:
        The parsed configuration, with ``'log_dirs'`` added.
    """
    config = pyhocon.ConfigFactory.parse_file('./dialogue.conf')
    config['log_dirs'] = os.path.join(config[name]['log_dir'])
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory appears between a check and the creation,
    # unlike the previous `exists` + `mkdir` pair.
    os.makedirs(config['log_dirs'], exist_ok=True)

    return config

#### データ取得

In [59]:
def create_DAdata(config, name):
    """Return the dialogue-act sequences split into train / valid / test.

    The result is exactly what ``separate_data`` yields for the dialogue-act
    inputs, targets and turn flags produced by ``create_traindata``.
    """
    da_posts, da_cmnts, _, _, turn = create_traindata(config, name)
    return separate_data(da_posts, da_cmnts, turn)

def create_Uttdata(config, name):
    """Return the utterance sequences split into train / valid / test.

    Only the six data splits are returned; the three turn splits that
    ``separate_data`` also produces are dropped.
    """
    _, _, utt_posts, utt_cmnts, turn = create_traindata(config, name)
    splits = separate_data(utt_posts, utt_cmnts, turn)
    return splits[:6]


In [32]:
def select_model(mode_name, utt_vocab, da_vocab, config, device, lr):
    """Build the model selected by ``mode_name`` together with its optimizer.

    Args:
        mode_name: Model identifier. ``"cmb_attention"`` and the notebook's
            configuration key ``"CmbAttention"`` both select
            ``CmbAttentionModel``.
        utt_vocab: Utterance vocabulary handed to the model.
        da_vocab: Dialogue-act vocabulary handed to the model.
        config: Configuration handed to the model.
        device: Device the model is moved to.
        lr: Learning rate of the Adam optimizer.

    Returns:
        ``(model, optimizer)``, or ``(None, None)`` for an unknown name.
    """
    # The original tested the global `model_name` instead of this parameter
    # and built the optimizer from an undefined `da_predict_model`.
    if mode_name in ("cmb_attention", "CmbAttention"):
        model = CmbAttentionModel(utt_vocab, da_vocab, config, device).to(device)
        opt = optim.Adam(model.parameters(), lr)
    else:
        model = None
        opt = None

    return model, opt

In [63]:
# TensorBoard writer for the run.
write = SummaryWriter("./logs")

model_name = 'CmbAttention'
config = initialize_env(model_name)

# The test splits are not needed for training and are discarded here.
XDA_train, YDA_train, XDA_valid, YDA_valid, _, _, Tturn, Vturn, _ = create_DAdata(config, model_name)
XUtt_train, YUtt_train, XUtt_valid, YUtt_valid, _, _ = create_Uttdata(config, model_name)

# Both vocabularies are built from the training and validation data together.
DA_vocab = DA_to_ID(config, XDA_train+XDA_valid, YDA_train+YDA_valid)
Utt_vocab = UTT_to_ID(config, XUtt_train+XUtt_valid, YUtt_train+YUtt_valid)

# tokenize() is an instance method, so it must be called on the vocabulary
# objects rather than on the classes. Each result is also assigned back to
# its own (X, Y) pair; the training targets used to land in XUtt_valid.
XDA_train, YDA_train = DA_vocab.tokenize(XDA_train, YDA_train)
XDA_valid, YDA_valid = DA_vocab.tokenize(XDA_valid, YDA_valid)
XUtt_train, YUtt_train = Utt_vocab.tokenize(XUtt_train, YUtt_train)
XUtt_valid, YUtt_valid = Utt_vocab.tokenize(XUtt_valid, YUtt_valid)

print('Finish preparing dataset...')


ConfigMissingException: 'No configuration setting found for key turn'

In [36]:
# Hyper-parameters taken from the configuration.
lr = config['lr']
batch_size = config['BATCH_SIZE']
# Per-epoch loss histories for plotting.
plot_train_losses = []
plot_valid_losses = []

# Running totals.
print_total_loss = 0
plot_total_loss = 0
plot_total_acc = 0

# Use the names defined by the dataset-preparation cell; `experiment`,
# `utt_vocab` and `da_vocab` were never defined in this notebook.
models, optims = select_model(model_name, Utt_vocab, DA_vocab, config, device, lr)

NameError: name 'config' is not defined

In [None]:
print("start TRAINING")

# Wall-clock timestamp taken when training starts.
start = time.time()


## 検証

In [51]:
# Inspect the configured log directory; os.path.join with a single argument
# returns that argument unchanged.
os.path.join(config['CmbAttention']['log_dir'])

'./data/model/'

## テスト

## モデル(Cmb Attention)

In [29]:
class CmbAttentionModel(nn.Module):
    """Predicts the next dialogue act from utterance, context and DA history.

    An utterance encoder output is concatenated with the turn feature and fed
    to a context encoder; a second encoder processes the dialogue-act input;
    both outputs are concatenated and decoded into dialogue-act scores.

    Args:
        utt_vocab: Utterance vocabulary; only ``len(utt_vocab.word2id)`` is used.
        da_vocab: Dialogue-act vocabulary; only ``len(da_vocab.word2id)`` is used.
        config: Mapping providing ``UTT_EMBED``, ``UTT_HIDDEN``, ``CON_EMBED``,
            ``CON_HIDDEN``, ``DA_EMBED`` and ``DA_HIDDEN``.
        device: Device on which the initial hidden states are created.
    """

    def __init__(self, utt_vocab, da_vocab, config, device):
        super(CmbAttentionModel, self).__init__()

        self.utter_encoder = UtteraceEncoder(len(utt_vocab.word2id), config['UTT_EMBED'], config['UTT_HIDDEN'])

        self.context_encoder = RNNContextAwareEncoder(config['CON_EMBED'], config['CON_HIDDEN'])

        # The DA encoder embeds dialogue-act ids, so its embedding table is
        # sized by the DA vocabulary (it used to be sized by the utterance
        # vocabulary, which only wasted parameters).
        self.da_encoder = RNNDAAwareEncoder(len(da_vocab.word2id), config['DA_EMBED'], config['DA_HIDDEN'])

        # `DenceDecoder` is the class this file defines; the previous name
        # `DenceEncoder` does not exist here.
        self.de_encoder = DenceDecoder(config['DA_HIDDEN'] + config['CON_HIDDEN'], config['DA_EMBED'], len(da_vocab.word2id))

        # A class-weighted loss was tried earlier:
        # nn.CrossEntropyLoss(reduction='mean', weight=<per-class weights>)
        self.cross_entropy_loss = nn.CrossEntropyLoss(reduction='mean')

        self.device = device

    def _encode(self, X_utter, X_da, mask, context_hidden, da_hidden, turn):
        """Run the shared encoder/decoder pipeline.

        Returns:
            ``(output, context_hidden, da_hidden, utter_weights)`` where
            ``output`` is the decoder result with dimension 1 squeezed out.
        """
        utter_output, utter_weights = self.utter_encoder(X_utter, mask)

        # Append the turn feature to the utterance representation.
        turn_output = torch.cat((utter_output, turn), dim=2)

        context_output, _, context_hidden = self.context_encoder(turn_output, mask, context_hidden)

        da_output, _, da_hidden = self.da_encoder(X_da, mask, da_hidden)

        x_output = torch.cat((context_output, da_output), dim=2)

        # assumes the decoder yields (batch_size, 1, da_dim) -- TODO confirm
        output = self.de_encoder(x_output).squeeze(1)

        return output, context_hidden, da_hidden, utter_weights

    def forward(self, X_utter, X_da, Y_da, mask, utter_hidden, context_hidden, da_hidden, turn):
        """Compute the training loss and back-propagate it.

        Note that ``loss.backward(retain_graph=True)`` is called here, so the
        caller only has to step the optimizer.

        Returns:
            ``(loss_value, utter_hidden, context_hidden, da_hidden)``;
            ``utter_hidden`` is passed through untouched.
        """
        output, context_hidden, da_hidden, _ = self._encode(
            X_utter, X_da, mask, context_hidden, da_hidden, turn)

        # Flatten the targets to 1-D. A bare squeeze() collapsed a batch of
        # one to a 0-d tensor, and evaluate() used a different squeeze(0).
        loss = self.cross_entropy_loss(output, Y_da.reshape(-1))

        loss.backward(retain_graph=True)

        return loss.item(), utter_hidden, context_hidden, da_hidden

    def evaluate(self, X_utter, X_da, Y_da, mask, utter_hidden, context_hidden, da_hidden, turn):
        """Compute the loss without tracking gradients.

        Returns:
            ``(loss_value, utter_hidden, context_hidden, da_hidden)``.
        """
        with torch.no_grad():
            output, context_hidden, da_hidden, _ = self._encode(
                X_utter, X_da, mask, context_hidden, da_hidden, turn)

            # Same target flattening as in forward().
            loss = self.cross_entropy_loss(output, Y_da.reshape(-1))

        return loss.item(), utter_hidden, context_hidden, da_hidden

    def prediction(self, X_utter, X_da, mask, utter_hidden, context_hidden, da_hidden, turn):
        """Return the dialogue-act scores without tracking gradients.

        Returns:
            ``(output, utter_hidden, context_hidden, da_hidden, utter_weights)``.
        """
        with torch.no_grad():
            output, context_hidden, da_hidden, utter_weights = self._encode(
                X_utter, X_da, mask, context_hidden, da_hidden, turn)

        return output, utter_hidden, context_hidden, da_hidden, utter_weights

    def initDAHidden(self, batch_size):
        """Return fresh hidden states for the utterance, context and DA encoders."""
        return self.utter_encoder.initHidden(batch_size, self.device), self.context_encoder.initHidden(batch_size, self.device), self.da_encoder.initHidden(batch_size, self.device)



### Word Embedding

In [16]:
class WordEmbedding(nn.Module):
    """Embeds word ids, projects them to ``w_model`` features and applies tanh."""

    def __init__(self, vocab_size, embed_size, w_model):
        super().__init__()
        self.word_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.linear = nn.Linear(in_features=embed_size, out_features=w_model)

    def forward(self, x_word):
        """Return ``tanh(linear(embedding(x_word)))``."""
        embedded = self.word_embedding(x_word)
        projected = self.linear(embedded)
        return projected.tanh()

### DA Embedding

In [17]:
class DAEmbedding(nn.Module):
    """Embeds dialogue-act ids, projects them to ``d_model`` features and applies tanh."""

    def __init__(self, da_size, embed_size, d_model):
        super().__init__()
        self.da_embedding = nn.Embedding(num_embeddings=da_size, embedding_dim=embed_size)
        self.linear = nn.Linear(in_features=embed_size, out_features=d_model)

    def forward(self, x_da):
        """Return ``tanh(linear(embedding(x_da)))``."""
        embedded = self.da_embedding(x_da)
        projected = self.linear(embedded)
        return projected.tanh()

### Attention

In [23]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned projections."""

    def __init__(self, d_model):
        super().__init__()
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.d_k = d_model

    def forward(self, q, k, v, mask=None):
        """Attend over ``k``/``v`` with queries ``q``.

        Args:
            q, k, v: 3-D tensors whose last dimension is ``d_model``.
            mask: Optional tensor; after ``unsqueeze(1)`` every position where
                it equals 0 is excluded from the softmax.

        Returns:
            ``(output, attention_weights)``.
        """
        # Project the inputs with the fully connected layers.
        key = self.k_linear(k)
        query = self.q_linear(q)
        value = self.v_linear(v)

        # Dot products grow with the feature size, so scale by sqrt(d_k).
        scale = math.sqrt(self.d_k)
        scores = query.matmul(key.transpose(1, 2)) / scale

        # Push masked positions towards -inf so softmax gives them ~0 weight.
        if mask is not None:
            blocked = mask.unsqueeze(1) == 0
            scores = scores.masked_fill(blocked, -1e9)

        # Normalise over the key positions.
        attention_weights = F.softmax(scores, dim=-1)

        # Weighted sum of the values, followed by the output projection.
        context = attention_weights.matmul(value)
        return self.out(context), attention_weights



In [18]:
class ContextAttention(nn.Module):
    """Self-attention over inputs that are first conditioned on a hidden state."""

    def __init__(self, d_model, hidden_size, att_size):
        super().__init__()
        self.q_linear = nn.Linear(att_size, att_size)
        self.v_linear = nn.Linear(att_size, att_size)
        self.k_linear = nn.Linear(att_size, att_size)

        self.fc_1 = nn.Linear(d_model, d_model)
        self.fc_3 = nn.Linear(hidden_size, d_model, bias=True)
        self.fc_2 = nn.Linear(d_model, att_size)

        self.fc_out = nn.Linear(att_size, hidden_size, bias=True)
        self.d_k = att_size

    def forward(self, x, mask, hidden):
        """Mix ``hidden`` into ``x`` and apply scaled dot-product self-attention.

        Args:
            x: 3-D tensor whose last dimension is ``d_model``.
            mask: ``None`` or a tensor; after ``unsqueeze(1)`` every position
                where it equals 0 is excluded from the softmax.
            hidden: Tensor whose last dimension is ``hidden_size`` and that
                broadcasts against ``x`` after projection.

        Returns:
            ``(output, attention_weights)``; ``output`` has ``hidden_size``
            features.
        """
        # Condition the input on the hidden state.
        mixed = torch.tanh(self.fc_1(x) + self.fc_3(hidden))
        features = self.fc_2(mixed)

        query = self.q_linear(features)
        value = self.v_linear(features)
        key = self.k_linear(features)

        scale = math.sqrt(self.d_k)
        scores = query.matmul(key.transpose(1, 2)) / scale

        # Push masked positions towards -inf so softmax gives them ~0 weight.
        if mask is not None:
            blocked = mask.unsqueeze(1) == 0
            scores = scores.masked_fill(blocked, -1e9)

        attention_weights = F.softmax(scores, dim=-1)
        attended = attention_weights.matmul(value)

        return self.fc_out(attended), attention_weights

### Feed Forward

In [19]:
class FeedForward(nn.Module):
    """Two-layer network: linear -> ReLU -> dropout -> linear."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Map ``x`` through the hidden layer and back to ``d_model`` features."""
        hidden = F.relu(self.linear_1(x))
        return self.linear_2(self.dropout(hidden))

### Positional Encoding

In [20]:
class PositinalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings, then applies dropout.

    The table is stored in the ``pe`` buffer with shape
    ``(max_len, 1, d_model)``, so ``forward`` expects the position
    (sequence) axis of its input to be dimension 0.

    Args:
        d_model: Feature size; both even and odd values are supported.
        max_len: Number of positions in the table.
        dropout: Dropout probability applied to the sum.
    """

    def __init__(self, d_model, max_len, dropout=0.1):
        super(PositinalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even ones, so
        # trim the cosine block; the untrimmed assignment raised a shape error.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model): broadcasts over dim 1.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(0)`` positions to ``x``."""
        x = x + self.pe[:x.size(0), :]

        return self.dropout(x)


### Decoder

In [21]:
class DenceDecoder(nn.Module):
    """Two-layer dense head: linear -> tanh -> linear."""

    def __init__(self, da_hidden, da_embed_size, da_input_size):
        super().__init__()
        self.he = nn.Linear(in_features=da_hidden, out_features=da_embed_size)
        self.ey = nn.Linear(in_features=da_embed_size, out_features=da_input_size)

    def forward(self, hidden):
        """Return ``ey(tanh(he(hidden)))``."""
        embedded = self.he(hidden).tanh()
        return self.ey(embedded)


### Context Encoder

#### Self-Attention => RNN

In [26]:
class RNNContextAwareEncoder(nn.Module):
    """Context encoder: linear -> context attention -> GRU -> feed-forward.

    The input carries ``d_model + 1`` features, which the first linear layer
    maps back to ``d_model``. ``emb_dim`` is accepted but not used.
    """

    def __init__(self, emb_dim, d_model):
        super().__init__()
        self.d_model = d_model
        self.linear = nn.Linear(d_model + 1, d_model)
        self.rnn = nn.GRU(d_model, d_model, batch_first=True)
        self.attention = ContextAttention(d_model, d_model, d_model)
        self.ffn = FeedForward(d_model, d_model)

    def forward(self, x, mask, hidden):
        """Encode ``x`` and return ``(output, attention_weights, new_hidden)``."""
        projected = self.linear(x)

        # The attention layer takes the GRU state with its first two axes swapped.
        swapped_hidden = hidden.transpose(0, 1)
        attended, att_weights = self.attention(projected, mask, swapped_hidden)

        rnn_output, rnn_hidden = self.rnn(attended, hidden)

        return self.ffn(rnn_output), att_weights, rnn_hidden

    def initHidden(self, batch_size, device):
        """Return an all-zero GRU state of shape ``(1, batch_size, d_model)``."""
        zeros = torch.zeros(1, batch_size, self.d_model)
        return zeros.to(device)

### DA Encoder

#### Self-Attention => RNN

In [27]:
class RNNDAAwareEncoder(nn.Module):
    """Dialogue-act encoder: embedding -> context attention -> GRU -> feed-forward."""

    def __init__(self, da_size, emb_dim, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = DAEmbedding(da_size, emb_dim, d_model)
        self.rnn = nn.GRU(d_model, d_model, batch_first=True)
        self.attention = ContextAttention(d_model, d_model, d_model)
        self.ffn = FeedForward(d_model, emb_dim)

    def forward(self, X_da, mask, hidden):
        """Encode the ids in ``X_da``; return ``(output, attention_weights, new_hidden)``."""
        embedded = self.embedding(X_da)

        # The attention layer takes the GRU state with its first two axes swapped.
        swapped_hidden = hidden.transpose(0, 1)
        attended, att_weights = self.attention(embedded, mask, swapped_hidden)

        rnn_output, rnn_hidden = self.rnn(attended, hidden)

        return self.ffn(rnn_output), att_weights, rnn_hidden

    def initHidden(self, batch_size, device):
        """Return an all-zero GRU state of shape ``(1, batch_size, d_model)``."""
        zeros = torch.zeros(1, batch_size, self.d_model)
        return zeros.to(device)

### Utterance Encoder

#### PE => Self-Attention

In [24]:
class UtteraceEncoder(nn.Module):
    """Utterance encoder: embedding -> positional encoding -> self-attention
    -> feed-forward -> average over the token axis.

    Args:
        vocab_size: Number of word ids.
        emb_dim: Word-embedding size, also used as the feed-forward hidden size.
        d_model: Feature size of the encoder output.
        max_len: Longest utterance (in tokens) the positional table covers;
            longer inputs fail in the positional encoder.
    """

    def __init__(self, vocab_size, emb_dim, d_model, max_len=200):
        super(UtteraceEncoder, self).__init__()
        self.d_model = d_model
        self.embedding = WordEmbedding(vocab_size, emb_dim, self.d_model)
        self.pe = PositinalEncoding(self.d_model, max_len)
        self.att = Attention(self.d_model)
        self.ffn = FeedForward(d_model, emb_dim)

    def forward(self, x_utter, mask):
        """Encode word ids into one vector per utterance.

        Args:
            x_utter: Word ids; dimension 1 is treated as the token axis
                throughout this method.
            mask: Attention mask handed to the self-attention layer.

        Returns:
            ``(utterance_vector, attention_weights)`` where the vector keeps a
            singleton token axis, e.g. (128, 1, 512).
        """
        emb_output = self.embedding(x_utter)

        # PositinalEncoding indexes positions along dimension 0, while this
        # encoder keeps tokens on dimension 1. Swap the axes around the call so
        # the encoding follows the token position and not the batch index.
        pos_output = self.pe(emb_output.transpose(0, 1)).transpose(0, 1)

        att_output, att_weights = self.att(pos_output, pos_output, pos_output, mask)

        ffn_output = self.ffn(att_output)

        # Average over the token axis (same result as pooling with a
        # (seq_len, 1) window), keeping the axis as size 1.
        avg_output = ffn_output.mean(dim=1, keepdim=True)

        return avg_output, att_weights

    def initHidden(self, batch_size, device):
        """Return an all-zero state of shape ``(1, batch_size, d_model)``."""
        return torch.zeros(1, batch_size, self.d_model).to(device)