In [1]:
import tensorflow as tf

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import random
import numpy as np
import os
from keras.models import Sequential
from keras.layers import LSTM

Using TensorFlow backend.


In [3]:
import string

In [5]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text

In [6]:
# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [7]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [8]:
# load document
in_filename = 'lyrics_rb.txt'
doc = load_doc(in_filename)
print(doc[:200])



Let the little girl dance let the little girl dance
She never danced before so let her on the floor 
Let the little girl dance let the little girl dance
She wants to give it a try so let the little 


In [9]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['let', 'the', 'little', 'girl', 'dance', 'let', 'the', 'little', 'girl', 'dance', 'she', 'never', 'danced', 'before', 'so', 'let', 'her', 'on', 'the', 'floor', 'let', 'the', 'little', 'girl', 'dance', 'let', 'the', 'little', 'girl', 'dance', 'she', 'wants', 'to', 'give', 'it', 'a', 'try', 'so', 'let', 'the', 'little', 'girl', 'by', 'shes', 'been', 'a', 'little', 'wallflower', 'on', 'the', 'shelf', 'standing', 'by', 'herself', 'now', 'she', 'got', 'the', 'nerve', 'to', 'take', 'a', 'chance', 'so', 'let', 'the', 'little', 'girl', 'dance', 'let', 'the', 'little', 'girl', 'through', 'she', 'wants', 'to', 'pass', 'by', 'you', 'buddy', 'cant', 'you', 'see', 'she', 'wants', 'to', 'dance', 'with', 'me', 'shes', 'been', 'a', 'little', 'wallflower', 'on', 'the', 'shelf', 'standing', 'by', 'herself', 'now', 'she', 'got', 'the', 'nerve', 'to', 'take', 'a', 'chance', 'so', 'let', 'the', 'little', 'girl', 'dance', 'let', 'the', 'little', 'girl', 'through', 'she', 'wants', 'to', 'pass', 'by', 'you',

In [10]:
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 647721


In [11]:
# save sequences to file
out_filename = 'rb_sequences.txt'
save_doc(sequences, out_filename)

## Train

In [12]:
# load
in_filename = 'rb_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [13]:
lines

['let the little girl dance let the little girl dance she never danced before so let her on the floor let the little girl dance let the little girl dance she wants to give it a try so let the little girl by shes been a little wallflower on the shelf',
 'the little girl dance let the little girl dance she never danced before so let her on the floor let the little girl dance let the little girl dance she wants to give it a try so let the little girl by shes been a little wallflower on the shelf standing',
 'little girl dance let the little girl dance she never danced before so let her on the floor let the little girl dance let the little girl dance she wants to give it a try so let the little girl by shes been a little wallflower on the shelf standing by',
 'girl dance let the little girl dance she never danced before so let her on the floor let the little girl dance let the little girl dance she wants to give it a try so let the little girl by shes been a little wallflower on the shelf 

In [15]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [16]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [19]:
sequences

[[57,
  3,
  115,
  60,
  354,
  57,
  3,
  115,
  60,
  354,
  71,
  52,
  2058,
  192,
  24,
  57,
  86,
  16,
  3,
  452,
  57,
  3,
  115,
  60,
  354,
  57,
  3,
  115,
  60,
  354,
  71,
  438,
  4,
  104,
  9,
  7,
  158,
  24,
  57,
  3,
  115,
  60,
  81,
  168,
  80,
  7,
  115,
  9002,
  16,
  3,
  2059],
 [3,
  115,
  60,
  354,
  57,
  3,
  115,
  60,
  354,
  71,
  52,
  2058,
  192,
  24,
  57,
  86,
  16,
  3,
  452,
  57,
  3,
  115,
  60,
  354,
  57,
  3,
  115,
  60,
  354,
  71,
  438,
  4,
  104,
  9,
  7,
  158,
  24,
  57,
  3,
  115,
  60,
  81,
  168,
  80,
  7,
  115,
  9002,
  16,
  3,
  2059,
  431],
 [115,
  60,
  354,
  57,
  3,
  115,
  60,
  354,
  71,
  52,
  2058,
  192,
  24,
  57,
  86,
  16,
  3,
  452,
  57,
  3,
  115,
  60,
  354,
  57,
  3,
  115,
  60,
  354,
  71,
  438,
  4,
  104,
  9,
  7,
  158,
  24,
  57,
  3,
  115,
  60,
  81,
  168,
  80,
  7,
  115,
  9002,
  16,
  3,
  2059,
  431,
  81],
 [60,
  354,
  57,
  3,
  115,
  60,
  354,

In [20]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [21]:
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [22]:
X

array([[  57,    3,  115, ..., 9002,   16,    3],
       [   3,  115,   60, ...,   16,    3, 2059],
       [ 115,   60,  354, ...,    3, 2059,  431],
       ...,
       [   6,  426,   39, ...,  320,   27,   30],
       [ 426,   39,   21, ...,   27,   30,   19],
       [  39,   21, 2429, ...,   30,   19,   44]])

In [20]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            864850    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 17297)             1746997   
Total params: 2,762,747
Trainable params: 2,762,747
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
# fit model
model.fit(X, y, batch_size=2048, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12c0c7eb8>

In [27]:
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

## Use Model

In [1]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [9]:
seq_length = len(lines[0].split()) - 1

In [10]:
# load the model
model = load_model('model.h5')
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [11]:
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

let me sing sha la la la la la la oh baby sha la la la la la la oh babyyour love is just for me for me its just for me its just for me for me for me its just for me i stepped out to say that i



In [12]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [16]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, "in the", 1)
print(generated)

way
