<a href="https://colab.research.google.com/github/davidr-1123/LyricGeneratorJapanese/blob/main/OriginalJapaneseTextGenGen2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install aptitude, then use it to install MeCab, its development headers,
# the UTF-8 IPA dictionary and the listed helper tools (git, make, curl, xz-utils, file).
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# Python bindings for MeCab, pinned to version 0.7.
!pip install mecab-python3==0.7

In [None]:
%tensorflow_version 2.x
import MeCab
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, LSTM
import os
from google.colab import files 

# Open the Colab upload dialog; the code below expects the corpus to be
# available afterwards as 'aimyon.txt' in the working directory.
files.upload()

# Read the whole corpus. The context manager closes the file handle as soon
# as the text has been read instead of leaving it open.
with open('aimyon.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Punctuation normalisation applied before tokenising:
#   - ideographic spaces (U+3000) are removed,
#   - the ellipsis and the ASCII full stop both become '。',
#   - curly double quotes become Japanese corner brackets,
#   - the ASCII comma becomes '、'.
table = str.maketrans({
    '\u3000': '',
    '…': '。',
    '”': '」',
    '“': '「',
    ',': '、',
    '.': '。'
})
text = text.translate(table)

# Run the normalised corpus through a MeCab tagger created with '-Owakati'
# and cut its output on single spaces to obtain the token list.
wakati = MeCab.Tagger('-Owakati')
words = wakati.parse(text).split(' ')
print(words)

# Every distinct token, in sorted order.
vocab = sorted(set(words))

# Lookup tables between tokens and integer ids. The names say "char", but
# the units here are the tokens produced above, not single characters.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)


def text_to_int(text):
    """Encode text as a NumPy array of vocabulary indices.

    Args:
        text: Either a raw string, which is tokenised with the module-level
            ``wakati`` tagger the same way the corpus was, or an iterable of
            already tokenised words.

    Returns:
        np.ndarray holding the ``char2idx`` index of every token, in order.

    Raises:
        KeyError: If a token is not present in ``char2idx``.
    """
    # Previously the argument was ignored and the global `words` was always
    # encoded; the function now encodes what it is actually given.
    tokens = wakati.parse(text).split(' ') if isinstance(text, str) else text
    return np.array([char2idx[token] for token in tokens])


# Reuse the token list that already exists so the corpus is not sent through
# MeCab a second time.
text_as_int = text_to_int(words)


def int_to_text(ints):
    """Decode a sequence of vocabulary indices back into a single string.

    Args:
        ints: Indices into ``idx2char``. May be an object exposing
            ``.numpy()`` (converted first) or anything NumPy can use to
            index an array directly.

    Returns:
        The looked-up tokens concatenated without a separator.
    """
    try:
        ints = ints.numpy()
    except AttributeError:
        # No .numpy() method: use the value as-is. Only this case is
        # tolerated; any other error is allowed to surface instead of being
        # silently swallowed.
        pass
    return ''.join(idx2char[ints])


# Sanity check: decode the first five token ids back to text.
print(int_to_text(text_as_int[:5]))

seq_length = 10  # number of input tokens in one training example
# Every example consumes seq_length + 1 token ids (the input plus the target
# shifted by one), so the count is taken over the encoded token sequence
# rather than over the characters of the raw text.
examples_per_epoch = len(text_as_int) // (seq_length + 1)

# Create training examples / targets: a stream of single token ids ...
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# ... grouped into chunks of seq_length + 1 ids; an incomplete final chunk
# is dropped.
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)


def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, e.g. "hello" -> ("hell", "ello").
    """
    return chunk[:-1], chunk[1:]


# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Show the first two pairs decoded back to text.
for x, y in dataset.take(2):
    print(
        "\n\nEXAMPLE\n",
        "INPUT",
        int_to_text(x),
        "\nOUTPUT",
        int_to_text(y),
        sep="\n",
    )

# Training hyperparameters.
BATCH_SIZE = 64
VOCAB_SIZE = len(vocab)  # number of distinct entries in vocab
EMBEDDING_DIM = 1024
RNN_UNITS = 4096

# Size of the shuffle buffer. tf.data shuffles within a buffer of this many
# elements rather than loading and shuffling the whole dataset at once.
BUFFER_SIZE = 10000

# Shuffle the pairs, then group them into full batches only.
data = (
    dataset
    .shuffle(buffer_size=BUFFER_SIZE)
    .batch(batch_size=BATCH_SIZE, drop_remainder=True)
)


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> stateful LSTM -> Dense sequential model.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of LSTM units.
        batch_size: Fixed batch dimension of the input; the sequence
            dimension is left as None.

    Returns:
        An uncompiled tf.keras.Sequential model whose Dense layer emits one
        value per vocabulary entry (no activation is applied).
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])


# Training model, built for batches of BATCH_SIZE sequences.
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE,
)
model.summary()


def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and logits.

    `from_logits=True` is passed because the model's final Dense layer is
    created without an activation.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )


model.compile(loss=loss, optimizer='adam')

# Checkpoints are written under this directory ...
checkpoint_dir = './training_checkpoints_aimyon'
# ... using a file name that embeds the epoch number.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')

# Save weights only, with a save frequency of 4600 (2300 * 2).
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
    save_freq=4600,
)



In [None]:
# Train for 201 epochs on the batched dataset, saving weights through the
# checkpoint callback.
history = model.fit(
    data,
    epochs=201,
    callbacks=[checkpoint_callback],
)

In [15]:
# Rebuild the same architecture with a batch size of 1 for text generation.
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, batch_size=1)

# latest_checkpoint() returns None when the directory holds no checkpoint
# (for example when training never reached the callback's save frequency).
# Fail with a clear message instead of passing None on to load_weights().
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        'No checkpoint found in {!r}; train the model first.'.format(checkpoint_dir)
    )

model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

In [16]:
def generate_text(model, start_string, num_generate=25, temperature=1.0,
                  output_path='Generated_lyrics_aimyon.txt'):
    """Generate text continuing ``start_string`` and append it to a file.

    The start string is tokenised with the module-level ``wakati`` tagger,
    mapped to ids through ``char2idx`` and fed to the model as the prime.
    Tokens are then sampled one at a time, each sample being fed back as the
    next input.

    Args:
        model: Model built with a batch size of 1 (see ``build_model``).
        start_string: Seed text.
        num_generate: Number of tokens to sample.
        temperature: Divisor applied to the logits before sampling. Lower
            values give more predictable text, higher values more
            surprising text. Must be positive.
        output_path: File the result line is appended to (UTF-8).

    Returns:
        ``start_string`` followed by the generated tokens. If the seed is
        empty or contains a token missing from ``char2idx``, a message is
        printed and ``start_string`` is returned (and logged) unchanged.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # split() without arguments cuts on any whitespace, so the newline the
    # original code tried to strip from the tagger output is discarded too.
    seed_tokens = wakati.parse(start_string).split()

    # Defined up front so the write/return below works on every path.
    text_generated = []

    if seed_tokens and all(token in char2idx for token in seed_tokens):
        # Vectorise the seed itself (not the training corpus) and add the
        # batch dimension.
        input_eval = tf.expand_dims([char2idx[token] for token in seed_tokens], 0)

        # Here batch size == 1
        model.reset_states()
        for _ in range(num_generate):
            # Drop the batch dimension, scale by the temperature, then
            # sample from the distribution at the last time step.
            predictions = tf.squeeze(model(input_eval), 0)
            predictions = predictions / temperature
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

            # Feed the sampled token back in; the stateful LSTM carries the
            # hidden state over from the previous call.
            input_eval = tf.expand_dims([predicted_id], 0)
            text_generated.append(idx2char[predicted_id])
    else:
        print('別の言葉を選んでくれ')

    result = start_string + ''.join(text_generated)
    # Append one line per call; the context manager closes the file even if
    # the write fails.
    with open(output_path, 'a', encoding='utf-8') as output_file:
        output_file.write(result + "\n")
    return result

In [None]:
inp = input("Type a starting string: ")
# Run the input through the tagger and strip the spaces and the newline from
# its output, leaving one contiguous string.
inp = wakati.parse(inp).replace(' ', '')
inp = inp.replace('\n', '')
# Apply the same punctuation normalisation as the training corpus by reusing
# the `table` built in the data-loading cell (this cell used to redefine an
# identical copy of it).
inp = inp.translate(table)
print(generate_text(model, inp))