### imports

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
from  tensorflow.keras.preprocessing import text_dataset_from_directory
import os
import numpy
from keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.metrics import 


### methods for importing the data

In [10]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)

def cleanText(text: str):
    splitText = text.split(" ")
    return splitText


### pull country songs into local mem

In [11]:
countrySongs = []
for file in getAllFilesInFolder("./data/Country"):
    filePath = "./data/Country/{}".format(file)
    if "DS_Store" not in filePath:
        countrySongs.append(importData(filePath))

len(countrySongs)


100

### use tokenizer to encode words to numerical representation

In [12]:

# source text -> allCountryLyrics
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(countrySongs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(countrySongs)
len(encoded)


Vocabulary Size: 2803


100

### get the doubles in the text (2 words and their target word/ next word)

In [13]:

# create word -> word sequences 
# get sequences of words together, so given "a b c d"
#  you should give [a,b], [b,c], [c,d] all the adjacent pairs
sequences = []
for i in range(len(encoded)):
	for j in range(2, len(encoded[i])):
		sequence = encoded[i][j-2:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 25367


[[2, 458, 14],
 [458, 14, 516],
 [14, 516, 2],
 [516, 2, 32],
 [2, 32, 408],
 [32, 408, 3],
 [408, 3, 5],
 [3, 5, 459],
 [5, 459, 460],
 [459, 460, 1439],
 [460, 1439, 34],
 [1439, 34, 1],
 [34, 1, 161],
 [1, 161, 118],
 [161, 118, 169],
 [118, 169, 18],
 [169, 18, 5],
 [18, 5, 43],
 [5, 43, 320],
 [43, 320, 112],
 [320, 112, 1440],
 [112, 1440, 3],
 [1440, 3, 89],
 [3, 89, 3],
 [89, 3, 169],
 [3, 169, 94],
 [169, 94, 38],
 [94, 38, 110],
 [38, 110, 4],
 [110, 4, 95],
 [4, 95, 110],
 [95, 110, 156],
 [110, 156, 95],
 [156, 95, 58],
 [95, 58, 2],
 [58, 2, 458],
 [2, 458, 14],
 [458, 14, 516],
 [14, 516, 2],
 [516, 2, 32],
 [2, 32, 408],
 [32, 408, 3],
 [408, 3, 5],
 [3, 5, 459],
 [5, 459, 1104],
 [459, 1104, 73],
 [1104, 73, 261],
 [73, 261, 3],
 [261, 3, 178],
 [3, 178, 29],
 [178, 29, 188],
 [29, 188, 1105],
 [188, 1105, 1441],
 [1105, 1441, 19],
 [1441, 19, 3],
 [19, 3, 23],
 [3, 23, 116],
 [23, 116, 611],
 [116, 611, 1442],
 [611, 1442, 12],
 [1442, 12, 1443],
 [12, 1443, 11],
 [144

### make all senteances of equal length (pad them with space - keras req.)

In [14]:
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


### split into test data (leading word) and the test label (adjacent/ target word)

In [15]:
# split into X and y elements (data and label)
sequences = numpy.array(sequences)
testData, testLabels = sequences[:,:-1],sequences[:,-1]
# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

### define and fit the model

In [16]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=100, verbose=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 10)             28030     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_1 (Dense)              (None, 2803)              283103    
Total params: 355,533
Trainable params: 355,533
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
793/793 - 6s - loss: 6.4657 - accuracy: 0.0399
Epoch 2/100
793/793 - 3s - loss: 6.0790 - accuracy: 0.0407
Epoch 3/100
793/793 - 3s - loss: 5.9755 - accuracy: 0.0428
Epoch 4/100
793/793 - 3s - loss: 5.8496 - accuracy: 0.0529
Epoch 5/100
793/793 - 3s - loss: 5.6556 - accuracy: 0.0591
Epoch 6/100
793/793 - 3s - loss: 5.4773 - accuracy: 0.0673
Epoch 7/100
793/793

<keras.callbacks.History at 0x7ffa27bb3910>

In [17]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

### method to get a random sentance starter


In [33]:
import random

def getSentanceStarter(sentanceStarterLenth, songs):
    randomSongUpperBound = len(songs)
    #
    randomSongIndex = random.randint(0, randomSongUpperBound-1)
    randomSong = songs[randomSongIndex]
    splitSong = randomSong.split("\n")
    randomLineIndex = random.randint(0, len(splitSong)-1)
    line = splitSong[randomLineIndex]
    splitLine = line.split(" ")
    joinedSelection = " ".join(splitLine[0:sentanceStarterLenth])
    return joinedSelection



In [34]:
getSentanceStarter(3, countrySongs)

'in that sleepy'

In [18]:
# print(generate_seq(model, tokenizer, "I", 6))
print(generate_seq(model, tokenizer, max_length-1, 'I can', 100))

#After the model is fit, we test it by passing it a given word from the vocabulary and having the model predict the next word. 
# Here we pass in ‘Jack‘ by encoding it and calling model.predict_classes() to get the integer output for the predicted word. 
# This is then looked up in the vocabulary mapping to give the associated word.

I can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in my dreams i can never find you in


In [19]:
# they went precious times there i know who holds
# who holds my handev'ry step is my wish i
# wish i was born and raisedthe place where i
# where i grew content to be my queen
# my queen you're the one i want to thank

## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1
