## imports

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
from  tensorflow.keras.preprocessing import text_dataset_from_directory
import numpy
from keras.preprocessing.sequence import pad_sequences
import os
import numpy
from keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.metrics import 


 ## methods to get data 

In [2]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)


## load all of the country music data into memory

In [3]:
songs = []
for file in getAllFilesInFolder("./data/Country"):
    filePath = "./data/Country/{}".format(file)
    if "DS_Store" not in filePath:
        songs.append(importData(filePath))

len(songs)


100

## turn each word in text into integer-index representation 

In [4]:

# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(songs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(songs)
len(encoded)


Vocabulary Size: 2803


100

## get all of the triples and thier target word in the music data

In [5]:

# create word -> word sequences 
# get sequences of words together, so given "a b c d e"
#  you should give [a,b,c,d], [b,c,d,e] adn all the adjacent triples and thier target word
sequences = []
for i in range(len(encoded)):
	for j in range(2, len(encoded[i])):
		sequence = encoded[i][j-2:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 25367


[[2, 458, 14],
 [458, 14, 516],
 [14, 516, 2],
 [516, 2, 32],
 [2, 32, 408],
 [32, 408, 3],
 [408, 3, 5],
 [3, 5, 459],
 [5, 459, 460],
 [459, 460, 1439],
 [460, 1439, 34],
 [1439, 34, 1],
 [34, 1, 161],
 [1, 161, 118],
 [161, 118, 169],
 [118, 169, 18],
 [169, 18, 5],
 [18, 5, 43],
 [5, 43, 320],
 [43, 320, 112],
 [320, 112, 1440],
 [112, 1440, 3],
 [1440, 3, 89],
 [3, 89, 3],
 [89, 3, 169],
 [3, 169, 94],
 [169, 94, 38],
 [94, 38, 110],
 [38, 110, 4],
 [110, 4, 95],
 [4, 95, 110],
 [95, 110, 156],
 [110, 156, 95],
 [156, 95, 58],
 [95, 58, 2],
 [58, 2, 458],
 [2, 458, 14],
 [458, 14, 516],
 [14, 516, 2],
 [516, 2, 32],
 [2, 32, 408],
 [32, 408, 3],
 [408, 3, 5],
 [3, 5, 459],
 [5, 459, 1104],
 [459, 1104, 73],
 [1104, 73, 261],
 [73, 261, 3],
 [261, 3, 178],
 [3, 178, 29],
 [178, 29, 188],
 [29, 188, 1105],
 [188, 1105, 1441],
 [1105, 1441, 19],
 [1441, 19, 3],
 [19, 3, 23],
 [3, 23, 116],
 [23, 116, 611],
 [116, 611, 1442],
 [611, 1442, 12],
 [1442, 12, 1443],
 [12, 1443, 11],
 [144

### make all senteances of equal length (pad them with space - keras req.)

In [6]:
# pad sequences to equal lenth (Keras requirement)
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


## Get the word embeddings from GLOVE and add them to local memory

In [7]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('./glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = numpy.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400001 word vectors.


## create weight matrix for glove embeddings

In [8]:
# create a weight matrix for words in training docs
embedding_matrix = numpy.zeros((vocab_size, 100))
for word, i in  tokenizer.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

### prepare data and labels for model 

In [9]:
# convert to numpy and split into data (triples) and labels (target word)
sequences = numpy.array(sequences)
testData, testLabels = sequences[:,:-1],sequences[:,-1]

# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

### define and fit the model 

In [25]:
# define model
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length-1, trainable=False)
model = Sequential()
model.add(e)
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=100, verbose=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 100)            280300    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 2803)              283103    
Total params: 643,803
Trainable params: 363,503
Non-trainable params: 280,300
_________________________________________________________________
None
Epoch 1/100
793/793 - 5s - loss: 6.3286 - accuracy: 0.0513
Epoch 2/100
793/793 - 3s - loss: 5.7974 - accuracy: 0.0718
Epoch 3/100
793/793 - 3s - loss: 5.4601 - accuracy: 0.0933
Epoch 4/100
793/793 - 3s - loss: 5.1506 - accuracy: 0.1117
Epoch 5/100
793/793 - 3s - loss: 4.8614 - accuracy: 0.1311
Epoch 6/100
793/793 - 3s - loss: 4.5920 - accuracy: 0.1529
Epoch 7/100
7

<keras.callbacks.History at 0x7feaa09180a0>

### method to generate sequences of text from model

In [11]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [None]:
import random

def getSentanceStarter(sentanceStarterLenth, songs):
    randomSongUpperBound = len(songs)
    #
    randomSongIndex = random.randint(0, randomSongUpperBound-1)
    randomSong = songs[randomSongIndex]
    splitSong = randomSong.split("\n")
    randomLineIndex = random.randint(0, len(splitSong)-1)
    line = splitSong[randomLineIndex]
    splitLine = line.split(" ")
    joinedSelection = " ".join(splitLine[0:sentanceStarterLenth])
    return joinedSelection



In [28]:
# print(generate_seq(model, tokenizer, "I", 6))
print(generate_seq(model, tokenizer, max_length-1, "if i", 100))

if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i didn't love you if i


## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-texxt-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1


In [13]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu


def bleu(ref, gen):
    ''' 
    calculate pair wise bleu score. uses nltk implementation
    Args:
        references : a list of reference sentences 
        candidates : a list of candidate(generated) sentences
    Returns:
        bleu score(float)
    '''
    ref_bleu = []
    gen_bleu = []
    for l in gen:
        gen_bleu.append(l.split())
    for i,l in enumerate(ref):
        ref_bleu.append([l.split()])
    cc = SmoothingFunction()
    score_bleu = corpus_bleu(ref_bleu, gen_bleu, weights=(0, 1, 0, 0), smoothing_function=cc.method4)
    return score_bleu

In [14]:
bleu(["It's hittin' rock bottom smoke 'em if you got 'em"],["nothing's going right makin' i just wish you were a"])

0.025584278811044955