### N-Gram Language Model

In [1]:
import pandas as pd
import ngram_laplace_lm_model as lm
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline

import utils 

In [2]:
# constants 
NGRAM = 4
VERBOSE = True
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"

GENRE = "Heavy Metal"
SONG_LIMIT = 400

# True to train by groups of lines, False to train by single lines 
BY_VERSE = False 

In [3]:
# read in cleaned data
song_df = pd.read_csv('clean_data.csv')
print(song_df.shape)
song_df.head(5)

(171855, 5)


Unnamed: 0,artist,song_name,lyrics,language,genres
0,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,"['Pop', ' Axé', ' Romântico']"
1,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,"['Pop', ' Axé', ' Romântico']"
2,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en,"['Pop', ' Axé', ' Romântico']"
3,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en,"['Pop', ' Axé', ' Romântico']"
4,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en,"['Pop', ' Axé', ' Romântico']"


In [4]:
# get song lyrics of the given genre 
song_lyrics = utils.get_lyrics_in_genre(song_df, GENRE, verbose=VERBOSE, by_verse=BY_VERSE, song_limit=SONG_LIMIT)

Selected 400 / 19817 in the genre Heavy Metal
Total sequences: 13322


#### Using NLTK Model with Kneser-Ney Smoothing 

In [5]:
def create_ngram_kneser_ney_model(sequences: list, ngram: int, verbose: bool = False, song_limit: int = None):
	"""
	 Creates a trained n-gram language model using Kneser-Ney Smoothing. Model will be trained on songs in the given 
	 music genre. 

	 Args:
				sequences (list): a list of training sequence strings, not tokenized 
				ngram (int): the n-gram order of the language model to create
        		verbose (bool): if True, prints information about the training data 
				song_limit (int): if present, the number of songs to include in the training data (used to cut down on training/generation time)

		Returns:
				A trained KneserNeyInterpolated
	"""
	tokens = [utils.tokenize_line(seq, ngram) for seq in sequences]

	# allowing padded_everygram_pipeline to create ngrams for the model 
	ngrams_generator, padded_sents = padded_everygram_pipeline(ngram, tokens)

	model = nltk.lm.KneserNeyInterpolated(ngram)
	model.fit(ngrams_generator, padded_sents)
     
	if verbose:
		print("Number of tokens:", len(tokens))
		print("Vocabulary Size:", len(model.vocab))
	
	return model

In [6]:
kneser_ney_model = create_ngram_kneser_ney_model(song_lyrics, NGRAM, verbose=VERBOSE)

Number of tokens: 13322
Vocabulary Size: 6347


In [18]:
# Generate 10 lines and print them out 
seqs = []
for i in range(10):
    print("Generating sequence", i, "out of 10")
    seqs.append(list(kneser_ney_model.generate(20, "<s> <s>".split())))

print("\nSample Generated Lyrics:", GENRE)
for seq in seqs:
    seq = " ".join(seq)
    seq = seq.replace('</s>', '').strip() # clean to be more readable 
    print(seq)

Generating sequence 0 out of 10
Generating sequence 1 out of 10
Generating sequence 2 out of 10
Generating sequence 3 out of 10
Generating sequence 4 out of 10
Generating sequence 5 out of 10
Generating sequence 6 out of 10
Generating sequence 7 out of 10
Generating sequence 8 out of 10
Generating sequence 9 out of 10

Sample Generated Lyrics: Heavy Metal
shut up! shut up!
(out of the best of me
riding steadfast (on the)
b|-----------------------|-----10-7---------------------------------------|
preparing for the prime
god say's file taxes late
eternally my soul will rot [rot... rot]
e|-------------|------------|--------------|------------|-------------||
a|------------------------|------------------------|
scratching furiously at scabbed and oozing wounds


#### Using Own N-Gram Language Model with Laplace Smoothing

In [19]:
def create_ngram_laplace_model(sequences: list, ngram: int, verbose: bool = False, song_limit: int = None):
	"""
	 Creates a trained n-gram language model using Laplace Smoothing. Model will be trained on songs in the given 
	 music genre. 

	 Args:
		sequences (list): a list of training sequence strings, not tokenized
		ngram (int): the n-gram order of the language model to create
		verbose (bool): if True, prints information about the training data 
		song_limit (int): if present, the number of songs to include in the training data (used to cut down on training/generation time)

	Returns:
		A trained NGramLaplaceLanguageModel
	"""
	tokens = utils.tokenize(sequences, ngram)
	model = lm.NGramLaplaceLanguageModel(ngram)
	model.train(tokens, verbose=verbose)

	return model

In [20]:
laplace_model = create_ngram_laplace_model(song_lyrics, NGRAM, verbose=VERBOSE)

Number of tokens: 160682
N-gram examples: [('<s>', '<s>', '<s>', "pushin'"), ('<s>', '<s>', "pushin'", 'the'), ('<s>', "pushin'", 'the', 'law'), ("pushin'", 'the', 'law', 'again'), ('the', 'law', 'again', '</s>')]
Vocabulary Size: 4085


In [21]:
print("Sample Generated Lyrics:", GENRE)

# Generate 10 lines and print them out 
lyrics = laplace_model.generate(10)
for lyric in lyrics:
    lyric = ' '.join(lyric)
    lyric = lyric.replace('<s>', '').replace('</s>', '').strip() # remove sentence start / stop tokens to be more readable 
    print(lyric)

Sample Generated Lyrics: Heavy Metal
every curse like violation of our lives,
<UNK> for showing an <UNK>
upon us - starless <UNK>
somewhere, somehow
take you and your blade and break you both in two
i'll do it my own way home
and i know you're waiting for your midnite lover
it's your life make it right
universe open wide
babez for breakfast,
