### N-Gram Language Model

In [54]:
#imports 
import pandas as pd
import ngram_laplace_lm_model as lm
import numpy as np

import utils 

In [55]:
# constants
# n-gram order; used as the default `ngram` argument of the helper functions in this notebook
NGRAM = 3
# how many lines to generate with each model
NUM_SEQ_TO_GENERATE = 10
# verbosity switch for diagnostic printing
VERBOSE = True

# special tokens
# sentence boundary markers; clean_lyric() strips these from generated text
# (presumably they must match the markers emitted by utils.tokenize -- confirm)
SENTENCE_BEGIN = "<s>"
SENTENCE_END = "</s>"

# filepaths
# header-less CSV files; only the first column is read (relative to the working directory)
country_train_filepath = "country_train.csv"
country_val_filepath = "country_val.csv"

metal_train_filepath = "metal_train.csv"
metal_val_filepath = "metal_val.csv"

# savepaths 

In [56]:
# read in data
def load_lyric_lines(csv_path: str) -> list:
    """Return the first column of the header-less CSV at `csv_path` as a list."""
    lyrics_frame = pd.read_csv(csv_path, header=None)
    return lyrics_frame[0].to_list()


country_train_lyrics = load_lyric_lines(country_train_filepath)
country_val_lyrics = load_lyric_lines(country_val_filepath)
print("Number of training lines for Country:", len(country_train_lyrics))
print("Number of validation lines for Country:", len(country_val_lyrics))
print()

metal_train_lyrics = load_lyric_lines(metal_train_filepath)
metal_val_lyrics = load_lyric_lines(metal_val_filepath)
print("Number of training lines for Heavy Metal:", len(metal_train_lyrics))
print("Number of validation lines for Heavy Metal:", len(metal_val_lyrics))

Number of training lines for Country: 149771
Number of validation lines for Country: 18610

Number of training lines for Heavy Metal: 149771
Number of validation lines for Heavy Metal: 18610


In [63]:
def create_ngram_laplace_model(training_sequences: list, ngram: int = NGRAM, verbose: bool = True):
    """
    Build an n-gram language model with Laplace smoothing and train it on the given data.

    Args:
        training_sequences (list): training sequences, one string per entry
        ngram (int): the n-gram order of the model to build
        verbose (bool): forwarded to the model's train() call; if True,
                        information about the training data is printed

    Returns:
        The trained NGramLaplaceLanguageModel
    """
    # Tokenize first so the model is only constructed once the data is ready.
    training_tokens = utils.tokenize(training_sequences, ngram)

    laplace_model = lm.NGramLaplaceLanguageModel(ngram)
    laplace_model.train(training_tokens, verbose=verbose)
    return laplace_model

In [64]:
print("Country Laplace Model:")
# Pass the configuration explicitly: the helper's default arguments were bound
# when its defining cell ran, so relying on them would silently ignore a later
# change to NGRAM / VERBOSE in the constants cell.
laplace_country_model = create_ngram_laplace_model(
    country_train_lyrics, ngram=NGRAM, verbose=VERBOSE
)

Country Laplace Model:
Number of tokens: 1800401
N-gram examples: [('<s>', '<s>', 'i'), ('<s>', 'i', "'ve"), ('i', "'ve", 'seen'), ("'ve", 'seen', 'how'), ('seen', 'how', 'you')]
Vocabulary Size: 11236


In [65]:
print("Heavy Metal Laplace Model:")
# Pass the configuration explicitly: the helper's default arguments were bound
# when its defining cell ran, so relying on them would silently ignore a later
# change to NGRAM / VERBOSE in the constants cell.
laplace_metal_model = create_ngram_laplace_model(
    metal_train_lyrics, ngram=NGRAM, verbose=VERBOSE
)

Heavy Metal Laplace Model:
Number of tokens: 1547349
N-gram examples: [('<s>', '<s>', 'my'), ('<s>', 'my', 'journey'), ('my', 'journey', 'began'), ('journey', 'began', 'after'), ('began', 'after', 'the')]
Vocabulary Size: 14350


#### Generate New Sequences 

In [66]:
# method to help format generated sentences 
def clean_lyric(lyric_tokens: list) -> str:
    """
    Join the tokens of a generated sequence into one string with the
    sentence-boundary markers removed.

    Args:
        lyric_tokens (list): list of tokens for the generated sequence

    Returns:
        The space-joined tokens with every occurrence of SENTENCE_BEGIN and
        SENTENCE_END removed and surrounding whitespace stripped
    """
    cleaned = ' '.join(lyric_tokens)
    # Remove the begin marker first, then the end marker.
    for marker in (SENTENCE_BEGIN, SENTENCE_END):
        cleaned = cleaned.replace(marker, '')
    return cleaned.strip()

In [67]:
# Sample NUM_SEQ_TO_GENERATE sentences from the country model, one per line.
print("Country Generated Lyrics:\n")
for seq_idx in range(NUM_SEQ_TO_GENERATE):
    print(clean_lyric(laplace_country_model.generate_sentence()))

Country Generated Lyrics:

oh no
i wo n't get to where i belong
if jesus walked the world
<UNK> never thought id be obliged to any one woman
come on , got a single reindeer and his rope 's pulled way to show your affection
the way it made
on my ear
why do n't feel like this town
stand still , can i stand in line
no all around me and you 're out in the wind started to rain


In [68]:
# Sample NUM_SEQ_TO_GENERATE sentences from the heavy metal model, one per line.
print("Heavy Metal Generated Lyrics:\n")
for seq_idx in range(NUM_SEQ_TO_GENERATE):
    print(clean_lyric(laplace_metal_model.generate_sentence()))

Heavy Metal Generated Lyrics:

carniwar
i do
purgatory !
i 'm alive
i 'm watching you
take a look at the ground , crused immortal
epidemic of addiction , my sweet revenge for the first floor of your life
every minute a shattered silence
that 's haunting me
mysteries of time


#### Evaluate Perplexity

In [69]:
def median_perplexity(model, lines: list, ngram: int=NGRAM, should_truncate: bool=False, truncate_length: int=10):
    """
    Evaluates the given model by finding the median perplexity of the given test sequences.

    Args:
        model : the N-gram Language Model; must provide a perplexity(tokens) method
        lines (list): a list of strings of data to evaluate perplexity on
        ngram (int): the n-gram order used by the model
        should_truncate (bool): an optional truncation parameter that shortens the sequences to the same length used by RNNs
                                (to make perplexity more comparable between models)
        truncate_length (int): how much of each line to keep when should_truncate is True
                               (defaults to 10, the value that was previously hard-coded)

    Returns: median perplexity over the given sequences
             (numpy returns NaN, with a warning, when `lines` is empty)

    Raises:
        ValueError: if should_truncate is True and truncate_length is negative
    """
    if should_truncate and truncate_length < 0:
        # A negative bound would slice from the end of the line instead of truncating it.
        raise ValueError("truncate_length must be non-negative")

    perplexities = []
    for line in lines:
        if should_truncate:
            # NOTE(review): the slice is applied to the raw line *before* tokenization,
            # so for string inputs it keeps the first `truncate_length` characters,
            # not tokens -- confirm this matches how the RNN sequences are truncated.
            line = line[:truncate_length]

        test_tokens = utils.tokenize_line(line, ngram)
        perplexities.append(model.perplexity(test_tokens))

    return np.median(perplexities)


# perplexity on data that the models have not seen yet
# NGRAM is passed explicitly so the evaluation always uses the current config value
# rather than the default that was bound when median_perplexity was defined.
print("Median Validation Perplexity for Country Model:", median_perplexity(laplace_country_model, country_val_lyrics, ngram=NGRAM))
print("Median Validation Perplexity for Heavy Metal Model:", median_perplexity(laplace_metal_model, metal_val_lyrics, ngram=NGRAM))

Median Validation Perplexity for Country Model: 1205.002805205404
Median Validation Perplexity for Heavy Metal Model: 1944.8382552263183


### Experimentation - Testing out Different NGRAM values

Training perplexity is listed only for comparison; validation perplexity should be used when choosing an NGRAM value.

##### Country Model 

__ngram=2__
1. perplexity on training set: 287.90516137012776
1. perplexity on validation set: 337.8503322927694
4. example lyrics:\
love , oh wipe each other one without wishin that 's the by\
except what to realize\
put you take you\
he loved her mother\
no chance\
on and who i was there


__ngram=3__
1. perplexity on training set: 880.498661231573
1. perplexity on validation set: 1205.002805205404
4. example lyrics:\
i said i will\
and little jeanie 's sake .\
technicolor , river queen , three on high\
when a road with my fiddle\
yes everything i have shown

__ngram=4__
1. perplexity on training set:  1139.1474901855402
1. perplexity on validation set: 2046.0507074046266
4. example lyrics:\
if heaven 's real\
i do n't know\
one night at a time\
well you nursed me through the valley filled with snow\
we always wear a great big world are we\
on a cloud nine ride

__ngram=5__
1. perplexity on training set: 1346.5277568873
1. perplexity on validation set: 2531.502204342474
4. example lyrics:\
lord above me knows i love you\
a beautiful sight , we're happy tonight\
i get along with you\
however you look at it , whatever you believe\
she can crawl it\
'cause it 's beer thirty , and it 's time to go out on a huntin ' spree


##### Heavy Metal Model

__ngram=2__
1. perplexity on training set: 512.2497432426746
1. perplexity on validation set: 606.7330883081569
4. example lyrics:\
out all you\
but i hide , hey , dokken ,\
lay you pain\
colder than you make me in fire\
learning life go


__ngram=3__
1. perplexity on training set: 1057.3627882221006
1. perplexity on validation set: 1944.8382552263183
4. example lyrics:\
oppressions wall they will learn\
mad magicians tinsel nightmares\
a thousand young\
wherever you are too much abuse of wasted human ... debris\
turning bottled water into wine

__ngram=4__
1. perplexity on training set: 1649.1497465256639
1. perplexity on validation set: 2972.0462423487797
4. example lyrics:\
but now we retaliate\
devoid the fake with full disdain\
boiling in rage - sophisticated cage\
well , i know\
you feel it

__ngram=5__
1. perplexity on training set: 1904.5755312503975
1. perplexity on validation set: 3581.757352722282
4. example lyrics:\
and now i close the door\
you will forger the pain\
ca n't somebody tell me am i the top of the chain\
just be my human hand\
death from above