### N-Gram Language Model

In [1]:
import pandas as pd
import ngram_utils as utils 

In [2]:
# Notebook-wide settings shared by the cells below
NGRAM = 4        # n-gram size handed to the utils.create_ngram_* model builders
VERBOSE = True   # forwarded as verbose= so the builders print their summary stats
SENTENCE_BEGIN, SENTENCE_END = "<s>", "</s>"  # sentence boundary marker tokens

In [3]:
# Load the cleaned song table, report its (rows, columns) shape,
# then preview the first five rows as the cell's rich output.
song_df = pd.read_csv("clean_data.csv")
print(song_df.shape)
song_df.head()

(171855, 5)


Unnamed: 0,artist,song_name,lyrics,language,genres
0,ivete sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,"['Pop', ' Axé', ' Romântico']"
1,ivete sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,"['Pop', ' Axé', ' Romântico']"
2,ivete sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",en,"['Pop', ' Axé', ' Romântico']"
3,ivete sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",en,"['Pop', ' Axé', ' Romântico']"
4,ivete sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,en,"['Pop', ' Axé', ' Romântico']"


#### Using NLTK Model with Kneser-Ney Smoothing 

In [4]:
# Power-Pop is the demo genre here: it has relatively few songs,
# which keeps the Kneser-Ney model small.
power_pop_model = utils.create_ngram_kneser_ney_model(
    song_df,
    "Power-Pop",
    NGRAM,
    verbose=VERBOSE,
)

Selected 802 / 802 in the genre Power-Pop
Total lines: 30587
Number of tokens: 30587
Vocabulary Size: 10710


In [5]:
# Generate 10 lines and print them out.
# Each line is seeded with NGRAM - 1 begin-of-sentence markers so the model is
# queried with a context as long as its order allows. The previous hard-coded
# two-token seed was shorter than that for NGRAM = 4 and would go stale if
# NGRAM changed. The markers come from the config-cell constants rather than
# repeated string literals.
for _ in range(10):
    seed_context = [SENTENCE_BEGIN] * (NGRAM - 1)
    # Ask for up to 15 tokens continuing the seed (NLTK-style generate(num_words, text_seed)).
    sentence = list(power_pop_model.generate(15, seed_context))
    sentence = " ".join(sentence)
    # Strip end-of-sentence markers so only the lyric text is printed.
    sentence = sentence.replace(SENTENCE_END, '').strip()
    print(sentence)

hop in a submersibel and you can argue that you've been misdirected
(as if he didn't know you)
away from all the falling bombs
(both): brainscan! brainscan!
onto a floor that's freshly waxed
whoever's in the room
hockey teams have playoff dreams
>>>he's a whore, (ooo, the stories i can tell)
overboard and down
stifler's mom has got it goin' on


#### Using Our Own N-Gram Language Model with Laplace Smoothing

In [6]:
# Build the Laplace-smoothed model for the Blues genre
blues_model = utils.create_ngram_laplace_model(
    song_df,
    "Blues",
    NGRAM,
    verbose=VERBOSE,
)

Selected 4664 / 4664 in the genre Blues
Total lines: 156337
Number of tokens: 1643963
N-gram examples: [('<s>', '<s>', '<s>', 'gravity'), ('<s>', '<s>', 'gravity', 'is'), ('<s>', 'gravity', 'is', 'working'), ('gravity', 'is', 'working', 'against'), ('is', 'working', 'against', 'me')]
Vocabulary Size: 15578


In [7]:
# Sample 10 token sequences from the Laplace model and print each as plain text
lyrics = blues_model.generate(10)
for tokens in lyrics:
    text = ' '.join(tokens)
    # remove the sentence boundary markers, begin marker first, then end marker
    for marker in ('<s>', '</s>'):
        text = text.replace(marker, '')
    print(text.strip())

for i know that it's not wrong what you did
all the colors in the shadows
from zero to friends
break away and leave me
i'm acting like a baby child
look out.
what's wrong, dear mother
just friends, but not like before
cher:
playing naturally <UNK>
