In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import tensorflow_datasets as tfds
import os
import numpy as np
import time
import matplotlib.pyplot as plt
%load_ext tensorboard

In [2]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dailog_text = "drive/MyDrive/研究/対話モデル/dailog.txt"

In [4]:
questions, answers = [], []
file_path = ""
for line in open(dailog_text, 'r'):
    idx = line.find('_')
    questions.append(line[:idx-1])
    answers.append(line[idx+1:-1])

In [9]:
len(questions), len(answers)

(22920, 22920)

In [15]:
train_question = []
train_answer = []
test_question = []
test_answer = []

In [16]:
for r in range(len(questions)-5):
  trains=[]
  tests=[]
  for i in range(5):
    trains.append(questions[r+i])
    tests.append(answers[r+i])
  train_question.append(trains)
  train_answer.append(tests)

In [22]:
train_utterance = train_question[0:len(train_question)-int(len(train_question)*0.50)]
val_utterance = train_question[int(len(train_question)*0.50):]

In [23]:
train_response = train_answer[0:len(train_answer)-int(len(train_answer)*0.50)]
val_response = train_answer[int(len(train_answer)*0.50):]

## pip install janome

In [25]:
pip install janome

Collecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 63.4 MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.4.1


In [26]:
from janome.tokenizer import Tokenizer

In [28]:
tokenizer = Tokenizer(wakati=True)
inp_tokens, out_tokens = [], []

for i in questions:
  inp_tokens += tokenizer.tokenize(i)
for i in answers:
  out_tokens += tokenizer.tokenize(i)

In [30]:
inp_vocab_lists = list(set(inp_tokens))
out_vocab_lists = list(set(out_tokens))
tokenizaer_utteranse = tfds.deprecated.text.TokenTextEncoder(inp_vocab_lists, tokenizer=tokenizer)
tokenizaer_response = tfds.deprecated.text.TokenTextEncoder(out_vocab_lists, tokenizer=tokenizer)

In [31]:
sample_text = "台風１１号の影響で二日遅れて甲子園球場で開幕し、決勝は２５日の予定だそうです。"

tokenized_string = tokenizaer_utteranse.encode(sample_text)
print('Tokenized string is {}'.format(tokenized_string))

Tokenized string is [494, 1554, 1554, 912, 4277, 2269, 1017, 3488, 4047, 1501, 5035, 2488, 4848, 1017, 73, 703, 4803, 2885, 2149, 4407, 4222, 4047, 4277, 4162, 2785, 5696, 2852, 319]


In [32]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_utterance, train_response))
val_dataset = tf.data.Dataset.from_tensor_slices((val_utterance, val_response))

## Train dataset 作成

In [34]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [35]:
def encode(lang1,lang2):
    lang1 = [tokenizaer_utteranse.vocab_size] + tokenizaer_utteranse.encode(lang1.numpy()) + [tokenizaer_utteranse.vocab_size + 1]
    lang2 = [tokenizaer_response.vocab_size] + tokenizaer_response.encode(lang2.numpy()) + [tokenizaer_response.vocab_size + 1]
    return lang1,lang2

In [36]:
MAX_LENGTH = 40
def filter_max_length(x, y, max_length=MAX_LENGTH):
    return tf.logical_and(tf.size(x) <= max_length,
                        tf.size(y) <= max_length)

In [37]:
def tf_encoder(ut, re):
    return tf.py_function(encode, [ut,re], [tf.int64, tf.int64])

In [38]:
train_data = train_dataset.map(tf_encoder)
train_data = train_data.filter(filter_max_length)
train_data = train_data.cache()
train_data = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))
train_data.prefetch(tf.data.experimental.AUTOTUNE)

<PrefetchDataset shapes: ((None, None), (None, None)), types: (tf.int64, tf.int64)>

In [39]:
val_data = val_dataset.map(tf_encoder)
val_data = val_data.filter(filter_max_length).padded_batch(BATCH_SIZE, padded_shapes=([-1], [-1]))

## Model 構築

In [40]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

In [41]:
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
  pos_encoding = angle_rads[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [42]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [43]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

### Scaled dot product attention

In [44]:
def scaled_dot_product_attention(q, k, v, mask):
  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

### Multi head attention

In [45]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights

### Fead Fowaed NetWork

In [46]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

### Encoder Layer

In [47]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

### Decoder Layer

In [48]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

### Encoder

In [49]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                            self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    #print(x)
    # adding embedding and position encoding.
    # ここでエラーが発生(970, 128) -> (4450, 128)
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)

    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

### Decoder

In [50]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

### Transformer

In [None]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, pe_input, rate)
    
    self.rnn = tf.keras.layers.GRU()

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [54]:
for (batch, (inp, tar)) in enumerate(train_dataset):
  for i in range(len(inp)):
    print(inp[i])

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
tf.Tensor(b'\xe8\x83\x83\xe3\x82\xab\xe3\x83\xa1\xe3\x83\xa9\xe3\x82\x92\xe5\x8b\xa7\xe3\x82\x81\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b\xe3\x82\x93\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8b\xef\xbc\x9f', shape=(), dtype=string)
tf.Tensor(b'\xe5\xb9\xb41\xe5\x9b\x9e\xe9\xa3\xb2\xe3\x82\x93\xe3\x81\xa7\xe3\x82\x8b\xe3\x82\x88\xe3\x80\x82', shape=(), dtype=string)
tf.Tensor(b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xb0\xe3\x82\x93\xe3\x81\xaf\xef\xbc\x81\xe3\x81\xa1\xe3\x82\x87\xe3\x81\x86\xe3\x81\xa9\xe9\x80\x80\xe5\xb1\x88\xe3\x81\x97\xe3\x81\xa6\xe3\x81\x9f\xe3\x82\x93\xe3\x81\xa0\xe3\x80\x82', shape=(), dtype=string)
tf.Tensor(b'\xe6\x99\x82\xe9\x96\x93\xe3\x81\x8c\xe3\x81\x82\xe3\x82\x8b\xe3\x81\xae\xe3\x81\xaf\xe3\x81\x84\xe3\x81\x84\xe3\x81\x93\xe3\x81\xa8\xe3\x81\xa0', shape=(), dtype=string)
tf.Tensor(b'\xe9\x80\x80\xe5\xb1\x88\xe3\x81\xaf\xe5\xbf\x83\xe3\x81\xa7\xe3\x81\x99\xe3\x81\xad', shape=(), dtype=string)
tf.Tensor(b'\xe5\xb9\xb41\xe5\x9b\x9e\