## imports

In [None]:
import numpy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
import os
from keras.preprocessing.sequence import pad_sequences


### methods for importing the data

In [3]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)

### pull country songs into local mem

In [4]:
countrySongs = []
for file in getAllFilesInFolder("./data/Country"):
    filePath = "./data/Country/{}".format(file)
    if "DS_Store" not in filePath:
        countrySongs.append(importData(filePath))

len(countrySongs)


100

### use tokenizer to encode words to numerical representation

In [5]:

# source text -> allCountryLyrics
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(countrySongs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(countrySongs)
len(encoded)


Vocabulary Size: 2803


100

### get the triples in the text (3 words and their target word/ next word)

In [6]:

# create word -> word sequences 
# get sequences of words together, so given "a b c d e"
#  you should give [a,b,c,d], [b,c,d e] and all of the triples and thier target word
sequences = []
for i in range(len(encoded)):
	for j in range(3, len(encoded[i])):
		sequence = encoded[i][j-3:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 25267


[[2, 458, 14, 516],
 [458, 14, 516, 2],
 [14, 516, 2, 32],
 [516, 2, 32, 408],
 [2, 32, 408, 3],
 [32, 408, 3, 5],
 [408, 3, 5, 459],
 [3, 5, 459, 460],
 [5, 459, 460, 1439],
 [459, 460, 1439, 34],
 [460, 1439, 34, 1],
 [1439, 34, 1, 161],
 [34, 1, 161, 118],
 [1, 161, 118, 169],
 [161, 118, 169, 18],
 [118, 169, 18, 5],
 [169, 18, 5, 43],
 [18, 5, 43, 320],
 [5, 43, 320, 112],
 [43, 320, 112, 1440],
 [320, 112, 1440, 3],
 [112, 1440, 3, 89],
 [1440, 3, 89, 3],
 [3, 89, 3, 169],
 [89, 3, 169, 94],
 [3, 169, 94, 38],
 [169, 94, 38, 110],
 [94, 38, 110, 4],
 [38, 110, 4, 95],
 [110, 4, 95, 110],
 [4, 95, 110, 156],
 [95, 110, 156, 95],
 [110, 156, 95, 58],
 [156, 95, 58, 2],
 [95, 58, 2, 458],
 [58, 2, 458, 14],
 [2, 458, 14, 516],
 [458, 14, 516, 2],
 [14, 516, 2, 32],
 [516, 2, 32, 408],
 [2, 32, 408, 3],
 [32, 408, 3, 5],
 [408, 3, 5, 459],
 [3, 5, 459, 1104],
 [5, 459, 1104, 73],
 [459, 1104, 73, 261],
 [1104, 73, 261, 3],
 [73, 261, 3, 178],
 [261, 3, 178, 29],
 [3, 178, 29, 188],
 

### make all senteances of equal length (pad them with space - keras req.)

In [None]:
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

### split into test data (leading word) and the test label (adjacent/ target word)

In [None]:

sequences = numpy.array(sequences)
testData, testLabels = sequences[:,:-1],sequences[:,-1]
# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

### define and fit the model

In [31]:


# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=100, verbose=2)

Max Sequence Length: 4
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             28030     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 2803)              142953    
Total params: 183,183
Trainable params: 183,183
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
790/790 - 4s - loss: 6.4840 - accuracy: 0.0385
Epoch 2/100
790/790 - 3s - loss: 6.1206 - accuracy: 0.0398
Epoch 3/100
790/790 - 3s - loss: 5.9905 - accuracy: 0.0412
Epoch 4/100
790/790 - 2s - loss: 5.8958 - accuracy: 0.0449
Epoch 5/100
790/790 - 3s - loss: 5.8006 - accuracy: 0.0526
Epoch 6/100
790/790 - 2s - loss: 5.6897 - accuracy: 0.0

<keras.callbacks.History at 0x7fe0462b8e50>

### method to generate sequences of text from model

In [33]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [39]:
# print(generate_seq(model, tokenizer, "I", 6))
print(generate_seq(model, tokenizer, max_length-1, 'I am', 70))

#After the model is fit, we test it by passing it a given word from the vocabulary and having the model predict the next word. 
# Here we pass in ‘Jack‘ by encoding it and calling model.predict_classes() to get the integer output for the predicted word. 
# This is then looked up in the vocabulary mapping to give the associated word.

I am so many stories of where i've been and i'd be no chance to grayi you smile gets me on the job from nine to five that mistake i lost your shot of whiskey you be my honeysuckle i'll be your honey bee i'll be your honey bee i'll be your honey bee i'll be your honey bee i'll be your honey bee i'll be your honey bee i'll be your


## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1
