### imports

In [63]:
from tensorflow.keras.preprocessing.text import Tokenizer
# from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
from  tensorflow.keras.preprocessing import text_dataset_from_directory
import os
import numpy
from keras.preprocessing.sequence import pad_sequences

# from tensorflow.keras.metrics import 


### methods for importing the data

In [64]:
def importData(directory: str) -> list:
    with open(directory) as f:
        return f.read()

def getAllFilesInFolder(dir: str) -> list:
    return os.listdir(dir)

def cleanText(text: str):
    splitText = text.split(" ")
    return splitText


### pull country songs into local mem

In [65]:
songs = []
for file in getAllFilesInFolder("./data/Pop"):
    filePath = "./data/Pop/{}".format(file)
    if "DS_Store" not in filePath:
        songs.append(importData(filePath))

len(songs)


100

### use tokenizer to encode words to numerical representation

In [66]:

# source text -> allCountryLyrics
# integer encode text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(songs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# Transforms each text in texts to a sequence of integers.
encoded = tokenizer.texts_to_sequences(songs)
len(encoded)


Vocabulary Size: 2795


100

### get the doubles in the text (2 words and their target word/ next word)

In [67]:

# create word -> word sequences 
# get sequences of words together, so given "a b c d"
#  you should give [a,b], [b,c], [c,d] all the adjacent pairs
sequences = []
for i in range(len(encoded)):
	for j in range(2, len(encoded[i])):
		sequence = encoded[i][j-2:j+1]
		sequences.append(sequence)
		
		
print('Total Sequences: %d' % len(sequences))
sequences

Total Sequences: 34664


[[26, 21, 3],
 [21, 3, 308],
 [3, 308, 13],
 [308, 13, 1],
 [13, 1, 1505],
 [1, 1505, 15],
 [1505, 15, 7],
 [15, 7, 1506],
 [7, 1506, 5],
 [1506, 5, 21],
 [5, 21, 3],
 [21, 3, 1507],
 [3, 1507, 1],
 [1507, 1, 49],
 [1, 49, 10],
 [49, 10, 1508],
 [10, 1508, 7],
 [1508, 7, 214],
 [7, 214, 1],
 [214, 1, 117],
 [1, 117, 1],
 [117, 1, 1158],
 [1, 1158, 7],
 [1158, 7, 67],
 [7, 67, 16],
 [67, 16, 97],
 [16, 97, 26],
 [97, 26, 1509],
 [26, 1509, 1510],
 [1509, 1510, 1],
 [1510, 1, 117],
 [1, 117, 11],
 [117, 11, 1511],
 [11, 1511, 15],
 [1511, 15, 7],
 [15, 7, 300],
 [7, 300, 136],
 [300, 136, 2],
 [136, 2, 55],
 [2, 55, 5],
 [55, 5, 2],
 [5, 2, 248],
 [2, 248, 64],
 [248, 64, 512],
 [64, 512, 9],
 [512, 9, 286],
 [9, 286, 62],
 [286, 62, 2],
 [62, 2, 248],
 [2, 248, 37],
 [248, 37, 629],
 [37, 629, 123],
 [629, 123, 2],
 [123, 2, 137],
 [2, 137, 287],
 [137, 287, 2],
 [287, 2, 19],
 [2, 19, 22],
 [19, 22, 1],
 [22, 1, 137],
 [1, 137, 353],
 [137, 353, 7],
 [353, 7, 630],
 [7, 630, 39],
 [630

### make all senteances of equal length (pad them with space - keras req.)

In [68]:
# pad sequences
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 3


### split into test data (leading word) and the test label (adjacent/ target word)

In [69]:
# split into X and y elements (data and label)
sequences = numpy.array(sequences)
testData, testLabels = sequences[:,:-1],sequences[:,-1]
# one hot encode outputs -> Converts a class vector (integers) to binary class matrix.
testLabels= to_categorical(testLabels, num_classes=vocab_size)

### define and fit the model

In [70]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(testData, testLabels, epochs=60, verbose=2)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 2, 10)             27950     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dense_4 (Dense)              (None, 2795)              282295    
Total params: 354,645
Trainable params: 354,645
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/60
1084/1084 - 23s - loss: 6.2578 - accuracy: 0.0408
Epoch 2/60
1084/1084 - 11s - loss: 5.9099 - accuracy: 0.0423
Epoch 3/60
1084/1084 - 12s - loss: 5.7505 - accuracy: 0.0477
Epoch 4/60
1084/1084 - 12s - loss: 5.5247 - accuracy: 0.0587
Epoch 5/60
1084/1084 - 12s - loss: 5.3208 - accuracy: 0.0692
Epoch 6/60
1084/1084 - 11s - loss: 5.0776 - accuracy: 0.0933
Epoch 7

<keras.callbacks.History at 0x7ffa27eda880>

In [71]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		yhat = numpy.argmax(model.predict(encoded), axis=-1)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

### method to get a random sentance starter


In [72]:
import random

def getSentanceStarter(sentanceStarterLenth, songs):
    randomSongUpperBound = len(songs)
    #
    randomSongIndex = random.randint(0, randomSongUpperBound-1)
    randomSong = songs[randomSongIndex]
    splitSong = randomSong.split("\n")
    randomLineIndex = random.randint(0, len(splitSong)-1)
    line = splitSong[randomLineIndex]
    splitLine = line.split(" ")
    joinedSelection = " ".join(splitLine[0:sentanceStarterLenth])
    return joinedSelection



In [73]:
for i in range(10):
    print(generate_seq(model, tokenizer, max_length-1, getSentanceStarter(2, songs), 10))
    getSentanceStarter(3, songs)

But I love it but i love it but i love it
Now he took my chick up to the ground to get out
I get my weed from california that's that shit i get my
Will have on it bet on it bet on it bet on
Un mondo di giustizia e di speranzaognuno dia la mano al suo
Ask myself why had you everyday i hope you understandi'm trying to
Oh, oh oh oh oh oh oh oh oh oh oh oh
 boy i cannot pretend i'm not fazed only here to
La-la-la, la-la-la-la la la la la la la la la la la
And that we won't be long 'til i hit the dancefloor i


In [74]:
# print(generate_seq(model, tokenizer, "I", 6))
print(generate_seq(model, tokenizer, max_length-1, 'I can', 100))

#After the model is fit, we test it by passing it a given word from the vocabulary and having the model predict the next word. 
# Here we pass in ‘Jack‘ by encoding it and calling model.predict_classes() to get the integer output for the predicted word. 
# This is then looked up in the vocabulary mapping to give the associated word.

I can feel the beat i don't wanna be friends i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all i need it all


In [75]:
# they went precious times there i know who holds
# who holds my handev'ry step is my wish i
# wish i was born and raisedthe place where i
# where i grew content to be my queen
# my queen you're the one i want to thank

## eval with BLEU score
https://towardsdatascience.com/how-to-evaluate-text-generation-models-metrics-for-automatic-evaluation-of-nlp-models-e1c251b04ec1
