## imports

In [77]:
from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
from  tensorflow.keras.preprocessing import text_dataset_from_directory
import numpy
from keras.preprocessing.sequence import pad_sequences
import os
import numpy
from keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.metrics import 


 ## methods to get data 

In [78]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)


## load all of the country music data into memory

In [79]:
songs = []
for file in getAllFilesInFolder("./data/Pop"):
    filePath = "./data/Pop/{}".format(file)
    if "DS_Store" not in filePath:
        songs.append(importData(filePath))

len(songs)


100

## turn each word in text into integer-index representation 

In [80]:

# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(songs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(songs)
len(encoded)


Vocabulary Size: 2795


100

## get all of the triples and thier target word in the music data

In [81]:

# create word -> word sequences 
# get sequences of words together, so given "a b c d e"
#  you should give [a,b,c,d], [b,c,d,e] adn all the adjacent triples and thier target word
sequences = []
for i in range(len(encoded)):
	for j in range(2, len(encoded[i])):
		sequence = encoded[i][j-2:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 34664


[[26, 21, 3],
 [21, 3, 308],
 [3, 308, 13],
 [308, 13, 1],
 [13, 1, 1505],
 [1, 1505, 15],
 [1505, 15, 7],
 [15, 7, 1506],
 [7, 1506, 5],
 [1506, 5, 21],
 [5, 21, 3],
 [21, 3, 1507],
 [3, 1507, 1],
 [1507, 1, 49],
 [1, 49, 10],
 [49, 10, 1508],
 [10, 1508, 7],
 [1508, 7, 214],
 [7, 214, 1],
 [214, 1, 117],
 [1, 117, 1],
 [117, 1, 1158],
 [1, 1158, 7],
 [1158, 7, 67],
 [7, 67, 16],
 [67, 16, 97],
 [16, 97, 26],
 [97, 26, 1509],
 [26, 1509, 1510],
 [1509, 1510, 1],
 [1510, 1, 117],
 [1, 117, 11],
 [117, 11, 1511],
 [11, 1511, 15],
 [1511, 15, 7],
 [15, 7, 300],
 [7, 300, 136],
 [300, 136, 2],
 [136, 2, 55],
 [2, 55, 5],
 [55, 5, 2],
 [5, 2, 248],
 [2, 248, 64],
 [248, 64, 512],
 [64, 512, 9],
 [512, 9, 286],
 [9, 286, 62],
 [286, 62, 2],
 [62, 2, 248],
 [2, 248, 37],
 [248, 37, 629],
 [37, 629, 123],
 [629, 123, 2],
 [123, 2, 137],
 [2, 137, 287],
 [137, 287, 2],
 [287, 2, 19],
 [2, 19, 22],
 [19, 22, 1],
 [22, 1, 137],
 [1, 137, 353],
 [137, 353, 7],
 [353, 7, 630],
 [7, 630, 39],
 [630

### make all senteances of equal length (pad them with space - keras req.)

In [82]:
# pad sequences to equal lenth (Keras requirement)
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


## Get the word embeddings from GLOVE and add them to local memory

In [83]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('./glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = numpy.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


## create weight matrix for glove embeddings

In [84]:
# create a weight matrix for words in training docs
embedding_matrix = numpy.zeros((vocab_size, 100))
for word, i in  tokenizer.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

### prepare data and labels for model 

In [85]:
# convert to numpy and split into data (triples) and labels (target word)
sequences = numpy.array(sequences)
testData, testLabels = sequences[:,:-1],sequences[:,-1]

# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

### define and fit the model 

In [86]:
# define model
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length-1, trainable=False)
model = Sequential()
model.add(e)
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=60, verbose=2)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2, 100)            279500    
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_5 (Dense)              (None, 2795)              282295    
Total params: 642,195
Trainable params: 362,695
Non-trainable params: 279,500
_________________________________________________________________
None
Epoch 1/60
1084/1084 - 20s - loss: 6.0593 - accuracy: 0.0565
Epoch 2/60
1084/1084 - 11s - loss: 5.4154 - accuracy: 0.0914
Epoch 3/60
1084/1084 - 11s - loss: 4.9545 - accuracy: 0.1296
Epoch 4/60
1084/1084 - 10s - loss: 4.5696 - accuracy: 0.1676
Epoch 5/60
1084/1084 - 10s - loss: 4.2471 - accuracy: 0.2030
Epoch 6/60
1084/1084 - 10s - loss: 3.9711 - accuracy: 0.2347
E

<keras.callbacks.History at 0x7fea8acd8d00>

### method to generate sequences of text from model

In [87]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [88]:
import random

def getSentanceStarter(sentanceStarterLenth, songs):
    randomSongUpperBound = len(songs)
    #
    randomSongIndex = random.randint(0, randomSongUpperBound-1)
    randomSong = songs[randomSongIndex]
    splitSong = randomSong.split("\n")
    randomLineIndex = random.randint(0, len(splitSong)-1)
    line = splitSong[randomLineIndex]
    splitLine = line.split(" ")
    joinedSelection = " ".join(splitLine[0:sentanceStarterLenth])
    return joinedSelection



In [89]:
for i in range(10):
    print(generate_seq(model, tokenizer, max_length-1, getSentanceStarter(2, songs), 10))

It's a sucker for you that's what i like that's what i
I'ma leave the door open i'ma leave the door open i'ma leave
We're going down swinging i'll be there and watch me burn well
Nah-nah-nah (in my life cause you know that i want your love
I will always love you i can't wanna feel i feel like
No doubt in my life cause you know that i want your
I'm flyin', pallivaalu pallivaalu pallivaalu pallivaalu pallivaalu pallivaalu pallivaalu pallivaalu pallivaalu pallivaalu
I'm sorry, for breaking your heart and i know how it feels
'Cause somewhere in the arms of the rain boom boom boom boom
And in your eyes on me i said you're holding back she


In [90]:
# print(generate_seq(model, tokenizer, "I", 6))
print(generate_seq(model, tokenizer, max_length-1, "if i", 100))

if i had to do the same again i would never do this but and i know how it feels that i want your love and i know how it feels that i want your love and i know how it feels that i want your love and i know how it feels that i want your love and i know how it feels that i want your love and i know how it feels that i want your love and i know how it feels that i want your love and i know how it feels that i want your love


## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-texxt-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1


In [91]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu


def bleu(ref, gen):
    ''' 
    calculate pair wise bleu score. uses nltk implementation
    Args:
        references : a list of reference sentences 
        candidates : a list of candidate(generated) sentences
    Returns:
        bleu score(float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for i,l in enumerate(ref):
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0, 1, 0, 0), smoothing_function=cc.method4)
    return score_bleu

In [92]:
bleu(["It's hittin' rock bottom smoke 'em if you got 'em"],["nothing's going right makin' i just wish you were a"])

0.025584278811044955