## imports

In [36]:
import numpy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
import os


### methods to import the data into memory

In [38]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)


### pull country songs into local mem

In [45]:
countrySongs = []
for file in getAllFilesInFolder("./data/Country"):
    filePath = "./data/Country/{}".format(file)
    if "DS_Store" not in filePath:
        countrySongs.append(importData(filePath))

len(countrySongs)


100

### use tokenizer to convert text into integer-index representation

In [59]:

# source text -> allCountryLyrics
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(countrySongs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(countrySongs)
len(encoded)


Vocabulary Size: 2803


100

### get the pairs in the text ie a word, and its adjacent word (the target word)

In [63]:
# create word -> word sequences 
# get sequences of words together, so given "a b c d"
#  you should give [a,b], [b,c], [c,d] all the adjacent pairs
sequences = []
for i in range(len(encoded)):
	for j in range(1, len(encoded[i])):
		sequence = encoded[i][j-1:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 25467


[[2, 458],
 [458, 14],
 [14, 516],
 [516, 2],
 [2, 32],
 [32, 408],
 [408, 3],
 [3, 5],
 [5, 459],
 [459, 460],
 [460, 1439],
 [1439, 34],
 [34, 1],
 [1, 161],
 [161, 118],
 [118, 169],
 [169, 18],
 [18, 5],
 [5, 43],
 [43, 320],
 [320, 112],
 [112, 1440],
 [1440, 3],
 [3, 89],
 [89, 3],
 [3, 169],
 [169, 94],
 [94, 38],
 [38, 110],
 [110, 4],
 [4, 95],
 [95, 110],
 [110, 156],
 [156, 95],
 [95, 58],
 [58, 2],
 [2, 458],
 [458, 14],
 [14, 516],
 [516, 2],
 [2, 32],
 [32, 408],
 [408, 3],
 [3, 5],
 [5, 459],
 [459, 1104],
 [1104, 73],
 [73, 261],
 [261, 3],
 [3, 178],
 [178, 29],
 [29, 188],
 [188, 1105],
 [1105, 1441],
 [1441, 19],
 [19, 3],
 [3, 23],
 [23, 116],
 [116, 611],
 [611, 1442],
 [1442, 12],
 [12, 1443],
 [1443, 11],
 [11, 1444],
 [1444, 38],
 [38, 3],
 [3, 111],
 [111, 65],
 [65, 13],
 [13, 1445],
 [1445, 30],
 [30, 166],
 [166, 858],
 [858, 3],
 [3, 73],
 [73, 55],
 [55, 13],
 [13, 124],
 [124, 170],
 [170, 2],
 [2, 104],
 [104, 94],
 [94, 3],
 [3, 1],
 [1, 64],
 [64, 96],

### split into test data (leading word) and the test label (adjacent/ target word)

In [64]:
# split into X and y elements (data and label)
sequences = numpy.array(sequences)
testData, testLabels = sequences[:,0],sequences[:,1]
# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             28030     
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 2803)              142953    
Total params: 183,183
Trainable params: 183,183
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
796/796 - 4s - loss: 6.5822 - accuracy: 0.0399
Epoch 2/100
796/796 - 2s - loss: 6.0478 - accuracy: 0.0396
Epoch 3/100
796/796 - 2s - loss: 5.9921 - accuracy: 0.0410
Epoch 4/100
796/796 - 2s - loss: 5.9272 - accuracy: 0.0464
Epoch 5/100
796/796 - 2s - loss: 5.8129 - accuracy: 0.0501
Epoch 6/100
796/796 - 2s - loss: 5.6840 - accuracy: 0.0574
Epoch 7/100
796/796 -

<keras.callbacks.History at 0x7faa28818f40>

### define and fit the model 

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=100, verbose=2)

### method for generating a new sequence

In [91]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
	in_text, result = seed_text, seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = numpy.array(encoded)
		# print(encoded)
		# predict a word in the vocabulary
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# print(yhat)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [None]:
import random

def getSentanceStarter(sentanceStarterLenth, songs):
    randomSongUpperBound = len(songs)
    #
    randomSongIndex = random.randint(0, randomSongUpperBound-1)
    randomSong = songs[randomSongIndex]
    splitSong = randomSong.split("\n")
    randomLineIndex = random.randint(0, len(splitSong)-1)
    line = splitSong[randomLineIndex]
    splitLine = line.split(" ")
    return splitLine[0:sentanceStarterLenth]

In [105]:
# print(generate_seq(model, tokenizer, "I", 6))
print(generate_seq(model, tokenizer, 'Unclasp', 8))

#After the model is fit, we test it by passing it a given word from the vocabulary and having the model predict the next word. 
# Here we pass in ‘Jack‘ by encoding it and calling model.predict_classes() to get the integer output for the predicted word. 
# This is then looked up in the vocabulary mapping to give the associated word.

[2468]
[14]
[14]
[145]
[145]
[4]
[4]
[2]
[2]
[23]
[23]
[24]
[24]
[1]
[1]
[64]
Unclasp your eyes and i don't know the world


## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1
