### N-Gram Language Model

In [44]:
import pandas as pd
import ngram_laplace_lm_model as lm
import nltk
import numpy as np
from nltk.lm.preprocessing import padded_everygram_pipeline

import utils 

In [45]:
# constants 
NGRAM = 3
VERBOSE = True
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"
NEWLINE = " NEW "

GENRE = "Heavy Metal"
SONG_LIMIT = None

# define a percentage of the data to use for training
SPLIT_PC = .80

# True to train by groups of lines, False to train by single lines 
BY_VERSE = True 

In [46]:
# read in cleaned data
song_df = pd.read_csv('clean_data.csv')
print(song_df.shape)
song_df.head(5)

(163020, 5)


Unnamed: 0,artist,song_name,lyrics,language,genres
0,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,"['Pop', ' Axé', ' Romântico']"
1,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,"['Pop', ' Axé', ' Romântico']"
2,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en,"['Pop', ' Axé', ' Romântico']"
3,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en,"['Pop', ' Axé', ' Romântico']"
4,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en,"['Pop', ' Axé', ' Romântico']"


In [47]:
# get song lyrics of the given genre 
from collections import Counter


song_lyrics = utils.get_lyrics_in_genre(song_df, GENRE, verbose=VERBOSE, by_verse=BY_VERSE, song_limit=SONG_LIMIT)

# calculate the last index for the training data
END = int(len(song_lyrics) * SPLIT_PC) 

# separate train and test data
train_lyrics = song_lyrics[0:END]
test_lyrics = song_lyrics[END:]

# check dimensions
print("Number of lines in training data:", len(train_lyrics))
print("Number of lines in test data:", len(test_lyrics))

print("Lyric example:", train_lyrics[0])


Selected 18849 / 18849 in the genre Heavy Metal
Total sequences: 142294
Number of lines in training data: 113835
Number of lines in test data: 28459
Lyric example: become one with imagination NEW no more fairytales (no more fairytales) NEW our souls will unite together NEW we will lift the veil (lift the veil)


In [48]:
# method to help format generated sentences 
def clean_lyric(lyric_tokens: list) -> str:
    """
    Return the given sequence of tokens as a single string without special tokens like <s> or </s>

    Args:
        lyric_tokens (list): list of tokens for the generated sequence

    Returns:
        The tokens joined in a single string without special characters 

    """
    lyric_str = ' '.join(lyric_tokens)
    lyric_str = lyric_str.replace(NEWLINE, '\n')
    return lyric_str.replace(SENTENCE_BEGIN, '').replace(SENTENCE_END, '').strip()

#### Using NLTK Model with Kneser-Ney Smoothing 

In [123]:
def create_ngram_kneser_ney_model(sequences: list, ngram: int = NGRAM, verbose: bool = True):
	"""
	 Creates a trained n-gram language model using Kneser-Ney Smoothing. Model will be trained on songs in the given 
	 music genre. 

	 Args:
		sequences (list): a list of training sequence strings, not tokenized 
		ngram (int): the n-gram order of the language model to create
		verbose (bool): if True, prints information about the training data 
				
	Returns:
		A trained KneserNeyInterpolated
	"""
	# split each line into tokens 
	tokens = [utils.tokenize_line(seq, ngram) for seq in sequences]

	# allow padded_everygram_pipeline to create ngrams for the model 
	ngrams_generator, padded_sents = padded_everygram_pipeline(ngram, tokens)

	model = nltk.lm.Laplace(ngram)
	model.fit(ngrams_generator, padded_sents)
     
	if verbose:
		print("Number of tokens:", len(tokens))
		print("Vocabulary Size:", len(model.vocab))
	
	return model

In [124]:
kneser_ney_model = create_ngram_kneser_ney_model(train_lyrics)

Number of tokens: 1344
Vocabulary Size: 1172


In [93]:
# Generate 10 lines and print them out 
NUM_SEQ = 15 
seed = [SENTENCE_BEGIN] * (NGRAM - 1)
max_len = 15 # nltk's models generate sequences of a fixed length 

print("Sample Generated Lyrics:", GENRE, "\n")
for i in range(NUM_SEQ):
    lyric_tokens = list(kneser_ney_model.generate(max_len, seed))
    print(clean_lyric(lyric_tokens))

Sample Generated Lyrics: Pop 

i got pounds , jungkook ]
you should crown me
[ chorus
we 're on and the phony innocence
and pleasure is a long time to act , i need your head my girl
i want to relay
so high flyer
liberation
education


KeyboardInterrupt: 

In [49]:
perplexities = []

for lyric in test_lyrics:
    lyric_test_tokens = utils.tokenize_line(lyric, 2)
    print(lyric_test_tokens)

    #test_ngrams = list(nltk.bigrams(nltk.lm.preprocessing.pad_both_ends(lyric.split(' '), n=2)))
    test_ngrams = list(nltk.bigrams(lyric_test_tokens,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>"))
    print(list(test_ngrams))

    #test_ngrams = [('<s>',), ('<s>',)]
    #grams = list(nltk.ngrams(nltk.lm.preprocessing.pad_both_ends(lyric.split(), n=NGRAM-1), n=NGRAM))
    #grams = [('<UNK>', 'hello')]
    
    print([tuple(lyric_test_tokens)])
    print(kneser_ney_model.perplexity([tuple(lyric_test_tokens)]))
    perplexities.append(kneser_ney_model.perplexity([tuple(lyric_test_tokens)]))

np.median(perplexities)


['<s>', 'man', 'condemned', 'again', 'NEW', 'fallen', 'at', 'the', 'image', 'NEW', 'man', 'condemned', 'again', 'of', 'pain', 'NEW', 'i', 'am', 'the', 'son', 'of', 'the', 'earth', 'NEW', 'pain', '!', '</s>']
[('<s>', 'man'), ('man', 'condemned'), ('condemned', 'again'), ('again', 'NEW'), ('NEW', 'fallen'), ('fallen', 'at'), ('at', 'the'), ('the', 'image'), ('image', 'NEW'), ('NEW', 'man'), ('man', 'condemned'), ('condemned', 'again'), ('again', 'of'), ('of', 'pain'), ('pain', 'NEW'), ('NEW', 'i'), ('i', 'am'), ('am', 'the'), ('the', 'son'), ('son', 'of'), ('of', 'the'), ('the', 'earth'), ('earth', 'NEW'), ('NEW', 'pain!'), ('pain!', '</s>')]
[('<s>', 'man', 'condemned', 'again', 'NEW', 'fallen', 'at', 'the', 'image', 'NEW', 'man', 'condemned', 'again', 'of', 'pain', 'NEW', 'i', 'am', 'the', 'son', 'of', 'the', 'earth', 'NEW', 'pain', '!', '</s>')]


NameError: name 'kneser_ney_model' is not defined

#### Using Own N-Gram Language Model with Laplace Smoothing

In [50]:
def create_ngram_laplace_model(sequences: list, ngram: int = NGRAM, verbose: bool = True):
	"""
	 Creates a trained n-gram language model using Laplace Smoothing. Model will be trained on songs in the given 
	 music genre. 

	 Args:
		sequences (list): a list of training sequence strings, not tokenized
		ngram (int): the n-gram order of the language model to create
		verbose (bool): if True, prints information about the training data 

	Returns:
		A trained NGramLaplaceLanguageModel
	"""
	tokens = utils.tokenize(sequences, ngram)
	model = lm.NGramLaplaceLanguageModel(ngram)
	model.train(tokens, verbose=verbose)

	return model

In [51]:
laplace_model = create_ngram_laplace_model(train_lyrics)

Number of tokens: 4352028
N-gram examples: [('<s>', '<s>', 'become'), ('<s>', 'become', 'one'), ('become', 'one', 'with'), ('one', 'with', 'imagination'), ('with', 'imagination', 'NEW')]
Vocabulary Size: 26552


In [52]:
NUM_SEQ = 5

# Generate lines and print them out 
print("Sample Generated Lyrics:", GENRE, "\n")
for i in range(NUM_SEQ):
    lyric_tokens = laplace_model.generate_sentence()
    print(clean_lyric(lyric_tokens))
    print()

Sample Generated Lyrics: Heavy Metal 

a medication for the last breath 's in the ruthless cold
valhalla , i 'm god everybody dies

damned whore
strip me down
into the looking glass

what can i come to life
take my horse
i want you at the window , know it 's the stars
looking for a sign
that the pain and the hare he bounds across the world
the mad dog howling at the start till the end of my confusion
out of my pain
have a gun
the anger
cruelty

thanks for the loss sinks in fear

( solo )



In [None]:
"""
Sample Generated Lyrics: Heavy Metal

the seeking calm
you 're killing us ,
you 're breaking us ,
to all these meaningless feelings
you 're countdown , you oooeh suicide momentary with thoughts and narration
love is gone
when i try to open up my heart
do you hear me now ?
there 's a soul reaching out in fear
no reply in the time were here
there 's a soul reaching out in fear
so stand up and be satisfaction
as a man he was a danger to himself
it should be all we talk about
got me so i do n't know torn away


Sample Generated Lyrics: Pop
a million eyes stare into space
and she 'll always be my macarroni girl
we 'll be alright
is it the only defense against the wilderness ?
i am your lover-to-be
i ca n't take it
i wrecking with you once upon a love
come over
in the spirit of loves –
cause baby i
break
i 'm infected tonight
any night , any day
one foot in front of the other
you 're gon na be all right
"""

"\nSample Generated Lyrics: Heavy Metal\n\nthe seeking calm\nyou 're killing us ,\nyou 're breaking us ,\nto all these meaningless feelings\nyou 're countdown , you oooeh suicide momentary with thoughts and narration\nlove is gone\nwhen i try to open up my heart\ndo you hear me now ?\nthere 's a soul reaching out in fear\nno reply in the time were here\nthere 's a soul reaching out in fear\nso stand up and be satisfaction\nas a man he was a danger to himself\nit should be all we talk about\ngot me so i do n't know torn away\n\n\nSample Generated Lyrics: Pop\na million eyes stare into space\nand she 'll always be my macarroni girl\nwe 'll be alright\nis it the only defense against the wilderness ?\ni am your lover-to-be\ni ca n't take it\ni wrecking with you once upon a love\ncome over\nin the spirit of loves –\ncause baby i\nbreak\ni 'm infected tonight\nany night , any day\none foot in front of the other\nyou 're gon na be all right\n"

In [143]:
# EVALUATE PERPLEXITY 
def laplace_evaluate_perplexity(model, test_lines: list, ngram: int=NGRAM): 
    """
    Evaluates the given model by finding  the average perplexity of the given test sequences. 
    """
    perplexities = []

    for line in test_lines:
        test_tokens = utils.tokenize_line(line, ngram)
        perplexities.append(laplace_model.perplexity(test_tokens))

    return np.mean(perplexities)


print(laplace_evaluate_perplexity(laplace_model, test_lyrics))

157.0527937128197
