### This notebook is a modified version of a tutorial found here:

https://machinelearningmastery.com/how-to-develop-a-word-level-neural-language-model-in-keras/

# URL to Google Colab environment for this notebook:

https://colab.research.google.com/drive/1HQuIGxAEb9SFsil9DnECkoLiMbc9nHtx#scrollTo=zHtL3Qx3ck5z

# Eminem RNN

In [None]:
# Mount Google Drive at /content/drive; the data-loading cells below read their
# input files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Imports
import sys
import numpy as np
import string
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, LSTM, Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer, one_hot, hashing_trick
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences
from keras.preprocessing.text import text_to_word_sequence

from random import randint
from pickle import load, dump

import io
import pandas as pd
import time
import re

In [None]:
# Load the lyrics CSV from Drive, using the file's first column as the index, and preview it.
# Presumably one row per song, judging by the variable name — TODO confirm.
# NOTE(review): this frame is not referenced again in the cells visible here.
lyrics_by_song_with_headers = pd.read_csv('/content/drive/My Drive/Eminem Lyrics with Headers', index_col=0)
lyrics_by_song_with_headers.head()

In [None]:
# Read the whole lyrics corpus into memory as a single string.
# The context manager is the only place the file is opened, so the handle is
# always closed as soon as the read finishes.
with open('/content/drive/My Drive/lyrics_eminem.txt', 'r') as f:
    text = f.read()
type(text) # 1 huge string

In [None]:
# Create function to clean and tokenize text
def clean_doc(doc):
    """Turn raw text into a list of lowercase, purely alphabetic word tokens.

    Every '--' is treated as a word separator, the text is split on whitespace,
    all characters in ``string.punctuation`` are stripped from each token, and
    any token that is not entirely alphabetic afterwards is discarded.

    Args:
        doc: the raw text as one string.

    Returns:
        list of cleaned, lower-cased tokens in their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw_token in doc.replace('--', ' ').split():
        bare = raw_token.translate(strip_punctuation)
        # drops numbers, mixed alphanumerics and tokens that were only punctuation
        if bare.isalpha():
            cleaned.append(bare.lower())
    return cleaned

In [None]:
# Create function to save tokens to file, one dialog per line
def save_doc(lines, filename):
    """Write ``lines`` to ``filename``, one entry per line.

    Entries are joined with '\\n'; no trailing newline is written, so splitting
    the file on '\\n' gives back exactly the entries that were saved.

    Args:
        lines: iterable of strings to write.
        filename: path of the file to create or overwrite.
    """
    data = '\n'.join(lines)
    # The context manager closes the handle even if the write raises.
    with open(filename, 'w') as file:
        file.write(data)

In [None]:
# Clean the raw lyrics text into a flat list of word tokens
tokens_list = clean_doc(text)
# Preview only the first 200 tokens rather than dumping the whole corpus
print(tokens_list[:200])
# Corpus size and vocabulary size (number of distinct tokens)
print('Total Tokens: %d' % len(tokens_list))
print('Unique Tokens: %d' % len(set(tokens_list)))

In [None]:
# Organizing tokens into sequences
# Each training line is a sliding window of 50 context words plus the 1 word to predict.
seq_length = 50 + 1
# Window k covers tokens_list[k : k + seq_length]. The "+ 1" on the range bound keeps
# the final window, whose last word is the last token of the corpus; without it that
# token would never be used as a prediction target. A corpus shorter than one window
# produces no sequences at all.
sequences = [
    ' '.join(tokens_list[start:start + seq_length])
    for start in range(len(tokens_list) - seq_length + 1)
]
print('Total Sequences: %d' % len(sequences))

In [None]:
# Save the sequences to a local file, one sequence per line, so they can be reloaded later
out_filename = 'final_eminem_lyrics.txt'
save_doc(sequences, out_filename)

In [None]:
# load doc into memory
def load_doc(filepath):
    """Return the entire contents of the text file at ``filepath`` as one string.

    Args:
        filepath: path of the file to read.

    Returns:
        The file's full text.
    """
    # The context manager closes the handle even if the read raises.
    with open(filepath, 'r') as file:
        return file.read()

# Load the saved sequences back from disk and split them into one string per sequence
in_filename = 'final_eminem_lyrics.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [None]:
# Use the Keras Tokenizer to build the vocabulary and integer-encode every line
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# Tokenizer word indices start at 1 (0 is reserved), so layers sized by the
# vocabulary need one extra slot.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Split every encoded window into its input words (all but the last column)
# and its target word (the last column)
sequences = np.array(sequences)
X = sequences[:, :-1]
# One-hot encode the target word index across the whole vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab_size)
# From here on seq_length is the number of input words per sample
seq_length = X.shape[1]

In [None]:
# Designing LSTM model
model = Sequential()
# Map each word index to a trainable 300-dimensional embedding vector
model.add(Embedding(vocab_size, 300, input_length=seq_length))
# Two stacked LSTM layers; the first returns the full sequence so the second can consume it
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# One output per vocabulary entry; softmax turns the scores into a probability distribution
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output
model.summary()

In [None]:
# Compile model: categorical cross-entropy pairs with the one-hot targets produced by
# to_categorical and the softmax output layer; accuracy is tracked alongside the loss
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Finally... FIT THIS MODEL!

In [None]:
# Fit model: 30 passes over the data in mini-batches of 128 sequences
# NOTE(review): ModelCheckpoint is imported but not used in the cells visible here, so
# an interrupted run keeps no intermediate weights — consider passing it via callbacks.
model.fit(X, y, batch_size=128, epochs=30)

In [None]:
# save the model to file
model.save('model.h5')
# save the tokenizer; the context manager flushes and closes the pickle file
# instead of leaving the handle open
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

Re-Loading Data and using Trained Model to Generate Text

In [None]:
# load doc into memory
def load_doc(filename):
    """Return the entire contents of the text file at ``filename`` as one string.

    NOTE(review): a function named load_doc is already defined earlier in this
    notebook; this later definition shadows it. Presumably it is repeated so the
    generation section can run without the training cells — confirm, or keep a
    single definition.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text.
    """
    # The context manager closes the handle even if the read raises.
    with open(filename, 'r') as file:
        return file.read()

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate ``n_words`` words by repeatedly feeding the model its own output.

    At every step the running text (seed plus the words generated so far) is
    integer-encoded, cut or padded to ``seq_length`` indices keeping the most
    recent words, and the highest-scoring word index is appended to the text.

    Args:
        model: trained model whose ``predict`` returns one row of per-word
            scores for each input row.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: number of word indices the model expects as input.
        seed_text: text used to start generation.
        n_words: how many words to generate.

    Returns:
        The generated words joined by single spaces (the seed text is not included).
    """
    # Build the index -> word lookup once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length, dropping the oldest words first
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict_classes() is no longer available on Keras models; taking the argmax
        # over predict() output is the equivalent for a softmax classifier
        scores = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(scores, axis=-1)[0])
        # an index with no word (e.g. 0, the padding index) maps to an empty string
        out_word = index_to_word.get(yhat, '')
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# load cleaned text sequences
in_filename = 'final_eminem_lyrics.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# every saved line holds the input words plus one target word, so the model's
# input length is one less than the number of words on a line
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('model.h5')

# load the tokenizer
# NOTE: unpickling can execute arbitrary code; only load a tokenizer.pkl that this
# notebook wrote itself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# select a seed text
# randint is inclusive at both ends, so the upper bound must be the last valid
# index; len(lines) itself would raise IndexError
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)