## imports

In [42]:
import numpy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
import os
from keras.preprocessing.sequence import pad_sequences


### methods for importing the data

In [43]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)

### pull country songs into local mem

In [44]:
songs = []
for file in getAllFilesInFolder("./data/Pop"):
    filePath = "./data/Pop/{}".format(file)
    if "DS_Store" not in filePath:
        songs.append(importData(filePath))

len(songs)


100

### use tokenizer to encode words to numerical representation

In [45]:

# source text -> allCountryLyrics
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(songs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(songs)
len(encoded)


Vocabulary Size: 2795


100

### get the triples in the text (3 words and their target word/ next word)

In [46]:

# create word -> word sequences 
# get sequences of words together, so given "a b c d e"
#  you should give [a,b,c,d], [b,c,d e] and all of the triples and thier target word
sequences = []
for i in range(len(encoded)):
	for j in range(3, len(encoded[i])):
		sequence = encoded[i][j-3:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 34564


[[26, 21, 3, 308],
 [21, 3, 308, 13],
 [3, 308, 13, 1],
 [308, 13, 1, 1505],
 [13, 1, 1505, 15],
 [1, 1505, 15, 7],
 [1505, 15, 7, 1506],
 [15, 7, 1506, 5],
 [7, 1506, 5, 21],
 [1506, 5, 21, 3],
 [5, 21, 3, 1507],
 [21, 3, 1507, 1],
 [3, 1507, 1, 49],
 [1507, 1, 49, 10],
 [1, 49, 10, 1508],
 [49, 10, 1508, 7],
 [10, 1508, 7, 214],
 [1508, 7, 214, 1],
 [7, 214, 1, 117],
 [214, 1, 117, 1],
 [1, 117, 1, 1158],
 [117, 1, 1158, 7],
 [1, 1158, 7, 67],
 [1158, 7, 67, 16],
 [7, 67, 16, 97],
 [67, 16, 97, 26],
 [16, 97, 26, 1509],
 [97, 26, 1509, 1510],
 [26, 1509, 1510, 1],
 [1509, 1510, 1, 117],
 [1510, 1, 117, 11],
 [1, 117, 11, 1511],
 [117, 11, 1511, 15],
 [11, 1511, 15, 7],
 [1511, 15, 7, 300],
 [15, 7, 300, 136],
 [7, 300, 136, 2],
 [300, 136, 2, 55],
 [136, 2, 55, 5],
 [2, 55, 5, 2],
 [55, 5, 2, 248],
 [5, 2, 248, 64],
 [2, 248, 64, 512],
 [248, 64, 512, 9],
 [64, 512, 9, 286],
 [512, 9, 286, 62],
 [9, 286, 62, 2],
 [286, 62, 2, 248],
 [62, 2, 248, 37],
 [2, 248, 37, 629],
 [248, 37, 62

### make all senteances of equal length (pad them with space - keras req.)

In [47]:
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 4


### split into test data (leading word) and the test label (adjacent/ target word)

In [48]:

sequences = numpy.array(sequences)
testData, testLabels = sequences[:,:-1],sequences[:,-1]
# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

### define and fit the model

In [49]:


# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=50, verbose=2)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 3, 10)             27950     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_4 (Dense)              (None, 2795)              282295    
Total params: 354,645
Trainable params: 354,645
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
1081/1081 - 22s - loss: 6.2501 - accuracy: 0.0421
Epoch 2/50
1081/1081 - 13s - loss: 5.9012 - accuracy: 0.0435
Epoch 3/50
1081/1081 - 13s - loss: 5.6868 - accuracy: 0.0512
Epoch 4/50
1081/1081 - 13s - loss: 5.4463 - accuracy: 0.0624
Epoch 5/50
1081/1081 - 13s - loss: 5.1943 - accuracy: 0.0802
Epoch 6/50
1081/1081 - 12s - loss: 4.9277 - accuracy: 0.1106
Epoch 7

<keras.callbacks.History at 0x7fbd04310a90>

### method to generate sequences of text from model

In [50]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [51]:
import random

def getSentanceStarter(sentanceStarterLenth, songs):
    randomSongUpperBound = len(songs)
    #
    randomSongIndex = random.randint(0, randomSongUpperBound-1)
    randomSong = songs[randomSongIndex]
    splitSong = randomSong.split("\n")
    randomLineIndex = random.randint(0, len(splitSong)-1)
    line = splitSong[randomLineIndex]
    splitLine = line.split(" ")
    joinedSelection = " ".join(splitLine[0:sentanceStarterLenth])
    return joinedSelection



In [52]:
for i in range(10):
    print(generate_seq(model, tokenizer, max_length-1, getSentanceStarter(3, songs), 10))
    getSentanceStarter(3, songs)

I want you are the one i want ho ho ho the one
If you want me say you want me say you want me say
So beautiful in white tonightwhat i don't wanna be needing a way you
Kayyilentum thampuratti lighting thirumumpi grease get high lights you come it and
Fill up your bag and i fill my nights with the way you
Baby, what the fuck is now love yourself and if you think that
You are supreme the chicks'll scream for grease lightning go grease lighting you're
You're crazy, and i'm out of my mind'cause all of me and you
Don't think you still used to find me you look and much won't
Tonight, I'ma fight 'til we see the sunlight tick tock on the clock


## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1
