<a href="https://colab.research.google.com/github/davidr-1123/LyricGeneratorJapanese/blob/main/OriginalJapaneseTextGenGen2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3==0.7

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  aptitude-common libcgi-fast-perl libcgi-pm-perl libclass-accessor-perl
  libcwidget3v5 libencode-locale-perl libfcgi-perl libhtml-parser-perl
  libhtml-tagset-perl libhttp-date-perl libhttp-message-perl libio-html-perl
  libio-string-perl liblwp-mediatypes-perl libparse-debianchangelog-perl
  libsigc++-2.0-0v5 libsub-name-perl libtimedate-perl liburi-perl libxapian30
Suggested packages:
  aptitude-doc-en | aptitude-doc apt-xapian-index debtags tasksel
  libcwidget-dev libdata-dump-perl libhtml-template-perl libxml-simple-perl
  libwww-perl xapian-tools
The following NEW packages will be installed:
  aptitude aptitude-common libcgi-fast-perl libcgi-pm-perl
  libclass-accessor-perl libcwidget3v5 libencode-locale-perl libfcgi-perl
  libhtml-parser-perl libhtml-tagset-perl libhttp-date-perl
  libhttp-message-perl libio-html-perl libio-string

In [14]:
%tensorflow_version 2.x
import MeCab
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, LSTM
import os
from google.colab import files 

files.upload()
text = open('aimyon.txt', 'r', encoding='utf-8').read()
text = text
table = str.maketrans({
    '\u3000': '',
    '…': '。',
    '”': '」',
    '“': '「',
    ',': '、',
    '.': '。'
})
text = text.translate(table)

wakati = MeCab.Tagger('-Owakati')
words = wakati.parse(text).split(' ')
print(words)

vocab = sorted(set(words))

# Creating a mapping from unique characters to indices
char2idx = {u: i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)


def text_to_int(text):
    return np.array([char2idx[c] for c in words])


text_as_int = text_to_int(text)


def int_to_text(ints):
    try:
        ints = ints.numpy()
    except:
        pass
    return ''.join(idx2char[ints])


print(int_to_text(text_as_int[:5]))

seq_length = 10  # length of sequence for a training example
examples_per_epoch = len(text) // (seq_length + 1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)


def split_input_target(chunk):  # for the example: hello
    input_text = chunk[:-1]  # hell
    target_text = chunk[1:]  # ello
    return input_text, target_text  # hell, ello


dataset = sequences.map(split_input_target)  # we use map to apply the above function to every entry

for x, y in dataset.take(2):
    print("\n\nEXAMPLE\n")
    print("INPUT")
    print(int_to_text(x))
    print("\nOUTPUT")
    print(int_to_text(y))

BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # vocab is number of unique characters
EMBEDDING_DIM = 1024
RNN_UNITS = 2048 * 2

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

data = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model


model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE)
model.summary()


def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)


model.compile(optimizer='adam', loss=loss)

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints_aimyon'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    save_freq=int(2300 * 2))



['健康', '的', 'な', '朝', 'だ', 'な', 'こんな', '時', 'に', '君', 'の', '「', '愛し', 'てる', '」', 'が', '聞き', 'たい', 'や', '揺れる', 'カーテン', '少し', '浮い', 'た', '前髪', 'も', 'すべて', '心地', 'いい', 'さ', 'それ', 'に', '割れ', 'て', 'しまっ', 'た', '目玉焼き', 'ついて', 'ない', 'なあ', 'バランス', 'を', 'とっ', 'て', 'も', '溢れ', 'ちゃう', 'や', '少し', '辛く', 'て', '少し', '酸っぱく', 'て', '甘ったるかっ', 'た', 'りさ', 'とりあえず', '今日', 'は', 'バラ', 'の', '花', 'に', '願い', '込め', 'て', 'さ', '馬鹿', 'な', '夢', 'で', '踊ろ', 'う', '愛', 'を', '伝え', 'たい', 'だ', 'と', 'か', '臭い', 'こと', 'ばっか', '考え', 'て', '待っ', 'て', 'て', 'も', 'だんだん', 'ソファ', 'に', '沈ん', 'で', 'いく', 'だけ', '僕', 'が', '明日', '良い', '男', 'に', 'なる', 'わけ', 'で', 'も', 'ない', 'から', 'さ', '焦ら', 'ず', 'に', 'いる', 'よ', '今日', 'は', '日', 'が', '落ちる', '頃', 'に', '会える', 'の', '？', '「', '完璧', 'な', '男', 'に', 'なんて', '惹か', 'れ', 'ない', '」', 'と', '君', 'が', '笑っ', 'て', 'た', 'から', '悔しい', 'や', '腐る', 'ほど', 'に', '話し', 'たい', 'こと', '沢山', 'ある', 'のに', 'な', '寂しい', 'さ', '結局', 'の', 'ところ', '君', 'は', 'さ', 'どう', 'し', 'たい', 'の', '？', 'まじ', 'で', '僕', 'に', '愛さ', 'れる', '気', 'あん', 'の', '？'

In [9]:
history = model.fit(data, epochs=201, callbacks=[checkpoint_callback])

Epoch 1/201
Epoch 2/201
Epoch 3/201
Epoch 4/201
Epoch 5/201
Epoch 6/201
Epoch 7/201
Epoch 8/201
Epoch 9/201
Epoch 10/201
Epoch 11/201
Epoch 12/201
Epoch 13/201
Epoch 14/201
Epoch 15/201
Epoch 16/201
Epoch 17/201
Epoch 18/201
Epoch 19/201
Epoch 20/201
Epoch 21/201
Epoch 22/201
Epoch 23/201
Epoch 24/201
Epoch 25/201
Epoch 26/201
Epoch 27/201
Epoch 28/201
Epoch 29/201
Epoch 30/201
Epoch 31/201
Epoch 32/201
Epoch 33/201
Epoch 34/201
Epoch 35/201
Epoch 36/201
Epoch 37/201
Epoch 38/201
Epoch 39/201
Epoch 40/201
Epoch 41/201
Epoch 42/201
Epoch 43/201
Epoch 44/201
Epoch 45/201
Epoch 46/201
Epoch 47/201
Epoch 48/201
Epoch 49/201
Epoch 50/201
Epoch 51/201
Epoch 52/201
Epoch 53/201
Epoch 54/201
Epoch 55/201
Epoch 56/201
Epoch 57/201
Epoch 58/201
Epoch 59/201
Epoch 60/201
Epoch 61/201
Epoch 62/201
Epoch 63/201
Epoch 64/201
Epoch 65/201
Epoch 66/201
Epoch 67/201
Epoch 68/201
Epoch 69/201
Epoch 70/201
Epoch 71/201
Epoch 72/201
Epoch 73/201
Epoch 74/201
Epoch 75/201
Epoch 76/201
Epoch 77/201
Epoch 78

In [15]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

In [16]:
def generate_text(model, start_string):
    # Evaluation step (generating text using the learned model)
    file = open('Generated_lyrics_aimyon.txt', 'a', encoding='utf-8')
    num_generate = 25

    wordlist = wakati.parse(start_string).split(",")
    for word in wordlist:
      word.replace('\n', '')
    try:
      # Converting our start string to numbers (vectorizing)
      input_eval = [char2idx[s] for s in words]
      input_eval = tf.expand_dims(input_eval, 0)

      # Empty string to store our results
      text_generated = []

      # Low temperatures results in more predictable text.
      # Higher temperatures results in more surprising text.
      # Experiment to find the best setting.
      temperature = 1.0

      # Here batch size == 1
      model.reset_states()
      for i in range(num_generate):
          predictions = model(input_eval)
          # remove the batch dimension

          predictions = tf.squeeze(predictions, 0)

          # using a categorical distribution to predict the character returned by the model
          predictions = predictions / temperature
          predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

          # We pass the predicted character as the next input to the model
          # along with the previous hidden state
          input_eval = tf.expand_dims([predicted_id], 0)

          text_generated.append(idx2char[predicted_id])
    except KeyError:
      print('別の言葉を選んでくれ')
    file.write(start_string + ''.join(text_generated))
    file.write("\n")
    file.close()
    return (start_string + ''.join(text_generated))

In [17]:
inp = input("Type a starting string: ")
inp = wakati.parse(inp).replace(' ', '')
inp = inp.replace('\n', '')
table = str.maketrans({
    '\u3000': '',
    '…': '。',
    '”': '」',
    '“': '「',
    ',': '、',
    '.': '。'
})
inp = inp.translate(table)
print(generate_text(model, inp))

Type a starting string: まだ
まだかすんで私の身体「いらっしゃい」ほらまた聞こえた真夏の夜の匂いが誘ったかな貴方は私
