### N-Gram Language Model

In [54]:
import pandas as pd
import ngram_laplace_lm_model as lm
import nltk
import numpy as np
from nltk.lm.preprocessing import padded_everygram_pipeline

import utils 

In [100]:
# constants 
NGRAM = 3
NUM_SEQ_TO_GENERATE = 10 # how many lines to generate 
VERBOSE = True

SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"
NEWLINE = " NEW "

# filepaths 
country_train_filepath = "country_train.csv"
country_val_filepath = "country_val.csv"

metal_train_filepath = "metal_train.csv"
meta_val_filepath = "metal_val.csv"

# savepaths 


# True to train by groups of lines, False to train by single lines 
BY_VERSE = False 

In [46]:
# read in data
country_train_lyrics = pd.read_csv(country_train_filepath, header=None)[0].to_list()
country_val_lyrics = pd.read_csv(country_val_filepath, header=None)[0].to_list()
print("Number of training lines for Country:", len(country_train_lyrics))
print("Number of validation lines for Country:", len(country_val_lyrics))
print()

metal_train_lyrics = pd.read_csv(metal_train_filepath, header=None)[0].to_list()
metal_val_lyrics = pd.read_csv(meta_val_filepath, header=None)[0].to_list()
print("Number of training lines for Heavy Metal:", len(metal_train_lyrics))
print("Number of validation lines for Heavy Metal:", len(metal_val_lyrics))

Number of training lines for Country: 149771
Number of validation lines for Country: 18610

Number of training lines for Heavy Metal: 149771
Number of validation lines for Heavy Metal: 18610


#### Function to Format Generated Sequences to be more Readable

In [33]:
# method to help format generated sentences 
def clean_lyric(lyric_tokens: list) -> str:
    """
    Return the given sequence of tokens as a single string without special tokens like <s> or </s>

    Args:
        lyric_tokens (list): list of tokens for the generated sequence

    Returns:
        The tokens joined in a single string without special characters 

    """
    lyric_str = ' '.join(lyric_tokens)
    lyric_str = lyric_str.replace(NEWLINE, '\n')
    return lyric_str.replace(SENTENCE_BEGIN, '').replace(SENTENCE_END, '').strip()

#### Using NLTK Model with Kneser-Ney Smoothing 

In [101]:
def create_ngram_kneser_ney_model(sequences: list, ngram: int = NGRAM, verbose: bool = True):
	"""
	 Creates a trained n-gram language model using Kneser-Ney Smoothing. Model will be trained on songs in the given 
	 music genre. 

	 Args:
		sequences (list): a list of training sequence strings, not tokenized 
		ngram (int): the n-gram order of the language model to create
		verbose (bool): if True, prints information about the training data 
				
	Returns:
		A trained KneserNeyInterpolated
	"""
	# split each line into tokens 
	tokens = [utils.tokenize_line(seq, ngram) for seq in sequences]

	# allow padded_everygram_pipeline to create ngrams for the model 
	ngrams_generator, padded_sents = padded_everygram_pipeline(ngram, tokens)

	model = nltk.lm.KneserNeyInterpolated(ngram)
	model.fit(ngrams_generator, padded_sents)
     
	if verbose:
		print("Number of tokens:", len(tokens))
		print("Vocabulary Size:", len(model.vocab))
	
	return model

In [102]:
print("Country KneserNey Model:")
kneser_ney_country_model = create_ngram_kneser_ney_model(country_train_lyrics)
print()

print("Heavy Metal KneserNey Model:")
kneser_ney_metal_model = create_ngram_kneser_ney_model(metal_train_lyrics)

Country KneserNey Model:
Number of tokens: 149771
Vocabulary Size: 18866

Heavy Metal KneserNey Model:
Number of tokens: 149771
Vocabulary Size: 25274


In [103]:
seed = [SENTENCE_BEGIN] * (NGRAM - 1)
max_len = 20 # nltk's models generate sequences of a fixed length 

print("Country Generated Lyrics:\n")
for i in range(NUM_SEQ_TO_GENERATE):
    lyric_tokens = list(kneser_ney_country_model.generate(max_len, seed))
    print(clean_lyric(lyric_tokens))

print()
print("Heavy Metal Generated Lyrics:\n")
for i in range(NUM_SEQ_TO_GENERATE):
    lyric_tokens = list(kneser_ney_metal_model.generate(max_len, seed))
    print(clean_lyric(lyric_tokens))

Country Generated Lyrics:

homeless , will buoy me on .


KeyboardInterrupt: 

In [104]:
def kn_median_perplexity(model, lines: list, ngram: int=NGRAM): 
    """
    Evaluates the given model by finding the median perplexity of the given test sequences. 
    """
    perplexities = []

    for line in lines:
        test_tokens = utils.tokenize_line(line, ngram)
        test_ngrams = list(nltk.ngrams(test_tokens, n=ngram))
        perplexities.append(model.perplexity(test_ngrams))

    return np.median(perplexities)

print("Median Validation Perplexity for Country Model:", kn_median_perplexity(kneser_ney_country_model, country_val_lyrics))
print("Median Validation Perplexity for Heavy Metal Model:", kn_median_perplexity(kneser_ney_metal_model, metal_val_lyrics))

KeyboardInterrupt: 

#### Using Own N-Gram Language Model with Laplace Smoothing

In [89]:
def create_ngram_laplace_model(sequences: list, ngram: int = NGRAM, verbose: bool = True):
	"""
	 Creates a trained n-gram language model using Laplace Smoothing. Model will be trained on songs in the given 
	 music genre. 

	 Args:
		sequences (list): a list of training sequence strings, not tokenized
		ngram (int): the n-gram order of the language model to create
		verbose (bool): if True, prints information about the training data 

	Returns:
		A trained NGramLaplaceLanguageModel
	"""
	tokens = utils.tokenize(sequences, ngram)
	model = lm.NGramLaplaceLanguageModel(ngram)
	model.train(tokens, verbose=verbose)

	return model

In [90]:
print("Country Laplace Model:")
laplace_country_model = create_ngram_laplace_model(country_train_lyrics)

Country Laplace Model:
Number of tokens: 2399485
N-gram examples: [('<s>', '<s>', '<s>', '<s>', 'i'), ('<s>', '<s>', '<s>', 'i', "'ve"), ('<s>', '<s>', 'i', "'ve", 'seen'), ('<s>', 'i', "'ve", 'seen', 'how'), ('i', "'ve", 'seen', 'how', 'you')]
Vocabulary Size: 11236


In [91]:
print("Heavy Metal Laplace Model:")
laplace_metal_model = create_ngram_laplace_model(metal_train_lyrics)

Heavy Metal Laplace Model:
Number of tokens: 2146433
N-gram examples: [('<s>', '<s>', '<s>', '<s>', 'my'), ('<s>', '<s>', '<s>', 'my', 'journey'), ('<s>', '<s>', 'my', 'journey', 'began'), ('<s>', 'my', 'journey', 'began', 'after'), ('my', 'journey', 'began', 'after', 'the')]
Vocabulary Size: 14350


In [92]:
print("Country Generated Lyrics:\n")
for i in range(NUM_SEQ_TO_GENERATE):
    lyric_tokens = laplace_country_model.generate_sentence()
    print(clean_lyric(lyric_tokens))

Country Generated Lyrics:

lord above me knows i love you
a beautiful sight , weæš®e happy tonight
i get along with you
however you look at it , whatever you believe
she can crawl it
'cause it 's beer thirty , and it 's time to go out on a huntin ' spree
since she up and walked away
when he holds me i can feel he 's hard giving love 's true and real
the sleeping child , you 're holding
( <UNK> solo )


In [93]:
print("Heavy Metal Generated Lyrics:\n")
for i in range(NUM_SEQ_TO_GENERATE):
    lyric_tokens = laplace_metal_model.generate_sentence()
    print(clean_lyric(lyric_tokens))

Heavy Metal Generated Lyrics:

and now i close the door
you will forger the pain
ca n't somebody tell me am i the top of the chain
just be my human hand
death from above
before their <UNK>
if you just forget
you called me up the other day just when i thought you would
ticket at the other , ‘ cos i 'm no <UNK> in distress
with the mentor ? s anger


In [94]:
# EVALUATE PERPLEXITY 
def laplace_median_perplexity(model, lines: list, ngram: int=NGRAM): 
    """
    Evaluates the given model by finding the median perplexity of the given test sequences. 
    """
    perplexities = []

    for line in lines:
        test_tokens = utils.tokenize_line(line, ngram)
        perplexities.append(model.perplexity(test_tokens))

    return np.median(perplexities)


# seeing perplexity on training data as reference to compare against validation perplexity
print("Median Training Perplexity for Country Model:", laplace_median_perplexity(laplace_country_model, country_train_lyrics))
print("Median Training Perplexity for Heavy Metal Model:", laplace_median_perplexity(laplace_metal_model, metal_train_lyrics))

print()

# perplexity on data that the models have not seen yet 
print("Median Validation Perplexity for Country Model:", laplace_median_perplexity(laplace_country_model, country_val_lyrics))
print("Median Validation Perplexity for Heavy Metal Model:", laplace_median_perplexity(laplace_metal_model, metal_val_lyrics))

Median Validation Perplexity for Country Model: 2531.502204342474
Median Validation Perplexity for Heavy Metal Model: 3581.757352722282
Median Training Perplexity for Country Model: 1346.5277568873
Median Training Perplexity for Heavy Metal Model: 1904.5755312503975


### Experimentation - Testing out Different NGRAM values

Train models on train dataset, report perplexity on validation dataset, final model will be evaluated on test dataset 

##### Country Laplace Model 

__ngram=1__
1. perplexity on training set: 469.48482936007724
1. perplexity on validation set: 458.24079917067286
2. time to train: 15.7s
3. time to generate 10 lines: 1m, 18.9s
4. example lyrics:\
and by we wo this in undress that last of\
i loveable to of , we missin in sweet barney hello\
wake 'm from she honky those get babies '' that , you\
stranger to , just who ooh\
not all in worth house who i ] something out many it countryside i records just and [ i 's

__ngram=2__
1. perplexity on training set: 287.90516137012776
1. perplexity on validation set: 337.8503322927694
2. time to train: 16.5s
3. time to generate 10 lines: 23.3s
4. example lyrics:\
love , oh wipe each other one without wishin that 's the by\
except what to realize\
put you take you\
he loved her mother\
no chance\
on and who i was there


__ngram=3__
1. perplexity on training set: 880.498661231573
1. perplexity on validation set: 1205.002805205404
2. time to train: 16.2s
3. time to generate 10 lines: 23.0s
4. example lyrics:\
i said i will\
and little jeanie 's sake .\
technicolor , river queen , three on high\
when a road with my fiddle\
yes everything i have shown

__ngram=4__
1. perplexity on training set:  1139.1474901855402
1. perplexity on validation set: 2046.0507074046266
2. time to train: 17.7s
3. time to generate 10 lines: 35.3s
4. example lyrics:\
if heaven 's real\
i do n't know\
one night at a time\
well you nursed me through the valley filled with snow\
we always wear a great big world are we\
on a cloud nine ride

__ngram=5__
1. perplexity on training set: 1346.5277568873
1. perplexity on validation set: 2531.502204342474
2. time to train: 16.2s
3. time to generate 10 lines: 39.2s
4. example lyrics:\
lord above me knows i love you\
a beautiful sight , weæš®e happy tonight\
i get along with you\
however you look at it , whatever you believe\
she can crawl it\
'cause it 's beer thirty , and it 's time to go out on a huntin ' spree\


##### Heavy Metal Laplace Model

__ngram=1__
1. perplexity on training set: 550.9829570794548
1. perplexity on validation set: 516.3249810238908
2. time to train: 14.4s
3. time to generate 10 lines: 2m, 2.4s
4. example lyrics:\
are me be , melt times , black 've the\
, cause these rain disappear blue look diseased so for soul sinister fear do approach my dance\
inside everything all run i into\
loneliness gon the i turns and mind , i the love time it hands\
overmastered own love 's 're it

__ngram=2__
1. perplexity on training set: 512.2497432426746
1. perplexity on validation set: 606.7330883081569
2. time to train: 15.0s
3. time to generate 10 lines: 49.5s
4. example lyrics:\
out all you\
but i hide , hey , dokken ,\
lay you pain\
colder than you make me in fire\
learning life go


__ngram=3__
1. perplexity on training set: 1057.3627882221006
1. perplexity on validation set: 1944.8382552263183
2. time to train: 14.0s
3. time to generate 10 lines: 45.3s
4. example lyrics:\
oppressions wall they will learn\
mad magicians tinsel nightmares\
a thousand young\
wherever you are too much abuse of wasted human ... debris\
turning bottled water into wine

__ngram=4__
1. perplexity on training set: 1649.1497465256639
1. perplexity on validation set: 2972.0462423487797
2. time to train: 16.4s
3. time to generate 10 lines: 53.5s
4. example lyrics:\
but now we retaliate\
devoid the fake with full disdain\
boiling in rage - sophisticated cage\
well , i know\
you feel it

__ngram=5__
1. perplexity on training set: 1904.5755312503975
1. perplexity on validation set: 3581.757352722282
2. time to train: 15.4s
3. time to generate 10 lines: 1m, 0.3s
4. example lyrics:\
and now i close the door\
you will forger the pain\
ca n't somebody tell me am i the top of the chain\
just be my human hand\
death from above