In [37]:
import numpy as np
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import bigrams, ngrams, everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import MLE, KneserNeyInterpolated, Lidstone, Laplace, AbsoluteDiscountingInterpolated

import lyrics

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/dzionek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df

Unnamed: 0,artist,song,lyric
0,Logic,​man i is,knockin doors down showin parts around\nima co...
1,The Roots,The Seed,i\nknocked up 9 months ago\nand what she finna...
2,Fabolous,Diced Pineapples,shawty so cold pussy winter fresh\nreservation...
3,Nicki Minaj,Roman Reloaded,bang my shit bang it bangbang\nbang my shit ba...
4,MC Lyte,Ride Wit Me,lytro yeah hm yo\nyeah fuck the rest be nobody...
...,...,...,...
16872,Joey BadA$$,Satellite,and i guess its my turn to shine\nthe bright l...
16873,Ludacris,Southern Fried Intro,hey yeah i want all you proud sistas to stand ...
16874,Dr. Dre,Lyrical Gangbang,this should be played at high volume\npreferab...
16875,J Cole,LAnd of the Snakes,yeah uhhuh\nthis the shit i used to roll down ...


## Tokenization

Default word tokenizer removes new line symbols.

In [7]:
# # Test
# lyric = [word_tokenize(line) + ['\n'] for line in train['lyric'][0].split('\n')]
# flat_list = list(itertools.chain(*lyric))[:-1]  # remove last new line symbol

In [8]:
def tokenize(dataset):
    return [
        nltk.flatten(([
            word_tokenize(line) + ['\n']
            for line in lyric.split('\n')
        ]))[:-1]
        for lyric in list(dataset['lyric'])
        if not isinstance(lyric, float)  # remove nan values
    ]

tokenized_train = tokenize(train_df)
tokenized_test = tokenize(test_df)

## Setup and stats

In [9]:
N = 3

In [10]:
def generate_ngrams(dataset, every=False):
    if every:
        return [list(everygrams(pad_both_ends(example, n=N), max_len=N)) for example in dataset]
    else:
        return [list(ngrams(pad_both_ends(example, n=N), n=N)) for example in dataset]

train_ngrams = generate_ngrams(tokenized_train, every=True)
test_ngrams = generate_ngrams(tokenized_test)

In [11]:
def generate_vocabulary(dataset):
    return list(nltk.lm.preprocessing.flatten(pad_both_ends(example, n=N) for example in dataset))

train_vocabulary = generate_vocabulary(tokenized_train)
test_vocabulary = generate_vocabulary(tokenized_test)

In [12]:
print("Training set contains", len(set(train_vocabulary)), "distinct tokens.")
print("Test set contains", len(set(test_vocabulary)), "distinct tokens.")
print("Test set has", len(set(test_vocabulary) - set(train_vocabulary)), "tokens not in train test.")

Training set contains 101176 distinct tokens.
Test set contains 52342 distinct tokens.
Test set has 11684 tokens not in train test.


In [13]:
from collections import Counter
Counter(train_vocabulary).most_common(10)

[('\n', 1155857),
 ('the', 373895),
 ('i', 311584),
 ('you', 239336),
 ('a', 208618),
 ('and', 183033),
 ('to', 173764),
 ('my', 148695),
 ('it', 127058),
 ('me', 123936)]

## Training

In [14]:
lm = Lidstone(1e-4, N)
lm.fit(train_ngrams, train_vocabulary)

In [15]:
len(lm.vocab)

101177

## Quantitative evaluation

### Perplexity (PPL)

In [16]:
lm.perplexity(nltk.lm.preprocessing.flatten(train_ngrams))

127.59839790243956

In [17]:
lm.perplexity(nltk.lm.preprocessing.flatten(test_ngrams))

943.0695898938937

### Generate songs after first line

In [18]:
re.sub(r'[ ]+\n[ ]+', r'\n', ' '.join(lm.generate(100, text_seed=['I', 'found'])))

'me a chance roll the dice over hang it back\nthe child will be illest in your clothes slugs ripping through your\neast new york citycity new york ridin on the cock\nho you want me to the polo fleece to the block i walk into the palms\nremain calm\nheaded to the toes and never keep it in the jacket she smoke trees considerably\ni just paused to scratch a record label told him switch dials\nwoo yeah\nhot boys\nthe judge here comes the hot whips ahh\ni'

In [19]:
first_lines = [song[:song.index('\n')+1] for song in tokenized_test if '\n' in song]
len(first_lines)

4200

In [20]:
first_lines[0]

['godfathers', 'in', 'the', 'house', '\n']

Generate all continuations of rap songs

In [21]:
results = []
for first_line in first_lines:
    result = ' '.join(first_line)[:-2] + '\n'
    result += re.sub(r'[ ]+\n[ ]+', r'\n', ' '.join(lm.generate(100, text_seed=first_line, random_seed=0)))
    result = re.sub(' </s>.*', '', result)
    results.append(result)

In [47]:
print(results[8])

alright we gather here around the midnight hour
this rap shit
like i still fuck the plug told her hello and then i walk with tall tops
silver tops tan tops aqua tops orange tops
just to get this money
so if you out
kool moe dee
two cups nigga
you see me
how many wan na know what it seems like your mans missing plans thicken
while im eating 50 fried clam
but there were two perfect kids in the funky output
five thousand volt thunderbolts
you got ta worry dont worry about my issue


### Rhyme Density (RD)

In [48]:
results_rd = np.array([lyrics.get_rhyme_density(song) for song in results])

In [53]:
round(np.mean(results_rd),2)

0.55

In [55]:
round(np.std(results_rd),2)

0.16

### Syllable Count Difference (SCD)

In [56]:
results_scd = np.array([lyrics.get_syllable_count_difference(song) for song in results])

In [93]:
round(np.mean(results_scd),2)

4.52

In [94]:
round(np.std(results_scd),2)

2.34

### Longest Rhyme (LR)

In [95]:
results_lr = np.array([lyrics.get_longest_rhyme(song) for song in results])

In [96]:
print(round(np.mean(results_lr),2), round(np.std(results_lr),2))

3.31 0.67


### Unique Words (UW)

In [97]:
results_uw = np.array([lyrics.get_unique_words(song) for song in results])

In [98]:
print(round(np.mean(results_uw),2), round(np.std(results_uw),2))

0.8 0.06


### Get those statistics for training and test sets

#### Train

In [63]:
train_rd = np.array([
    lyrics.get_rhyme_density(song)
    for song in train_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [64]:
round(np.mean(train_rd),2)

0.72

In [65]:
round(np.std(train_rd),2)

0.14

In [69]:
train_scd = np.array([
    lyrics.get_syllable_count_difference(song)
    for song in train_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [70]:
round(np.mean(train_scd),2)

1.76

In [71]:
round(np.std(train_scd),2)

3.4

In [81]:
train_lr = np.array([
    lyrics.get_longest_rhyme(song)
    for song in train_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [82]:
print(round(np.mean(train_lr),2), round(np.std(train_lr),2))

5.15 1.2


In [86]:
train_uw = np.array([
    lyrics.get_unique_words(song)
    for song in train_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [89]:
print(round(np.mean(train_uw),2), round(np.std(train_uw),2))

0.44 0.1


#### Test

In [66]:
test_rd = np.array([
    lyrics.get_rhyme_density(song)
    for song in test_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [67]:
round(np.mean(test_rd),2)

0.72

In [68]:
round(np.std(test_rd),2)

0.14

In [72]:
test_scd = np.array([
    lyrics.get_syllable_count_difference(song)
    for song in test_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [73]:
round(np.mean(test_scd),2)

1.74

In [74]:
round(np.std(test_scd),2)

2.28

In [83]:
test_lr = np.array([
    lyrics.get_longest_rhyme(song)
    for song in test_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [84]:
print(round(np.mean(test_lr),2), round(np.std(test_lr),2))

5.11 1.2


In [90]:
test_uw = np.array([
    lyrics.get_unique_words(song)
    for song in test_df['lyric']
    if isinstance(song, str) and '\n' in song])

In [91]:
print(round(np.mean(test_uw),2), round(np.std(test_uw),2))

0.44 0.1


In [103]:
test_df

Unnamed: 0,artist,song,lyric
0,Ice-T,Pimpin’ Ain’t Easy [Godfather Theme],godfathers in the house\ngrab yo bitches\npimp...
1,Jean Grae,What You Came For,tell me what ya\ntell me what ya\ntell me what...
2,Lil' Wayne,How to Love,cut the music up\na little louder yeah\nyou ha...
3,E-40,"Gas, Break, Dip",calling all hustlers calling all players\nple...
4,Lil' Kim,Spicy,muy caliente\ntime to keep it cool thats what...
...,...,...,...
4215,Gang Starr,Check the Technique,you puny protozoa youre so minute you didnt kn...
4216,Big Sean,Jit/Juke,ho i got my cell phone ringin nowadays its har...
4217,Gang Starr,Put Up or Shut Up,stupid you know its time to sit and think befo...
4218,Drake,Free Smoke,is it the strength of your feelings\noverthrow...


## Qualitative evaluation

In [101]:
print(re.sub(r'[ ]+\n[ ]+', r'\n', ' '.join(lm.generate(100, text_seed=['cut', 'the', 'music', 'up', '\n']))))

what what what
got fly hoes kneel
stand your loneliness
im holding you down to a 50000
check it yo
jump from gee to gee
so im like all about bacon or sausage
the ultimate freak off
wave em round before you fuck that we just laying
and im the best
my lady say its up there to settle
now we pawns in this shit for me baby its okay
alright one more hit
ayy put two pills and im so thankful
for gods were el


In [102]:
print(re.sub(r'[ ]+\n[ ]+', r'\n', ' '.join(lm.generate(100, text_seed=['godfather', 'in', 'the', 'house', '\n']))))

hey yo cause im on that
i know my language
but i stay on the top down riding clean
its goin down
about the fresh trim at
bitch im a savage
aint nothing wrong with the vibe aint no guards playin cards aint no more exceptions
losing my balance
just rockin it
having you be hangin in bars naked for dollars
breakin all these bitches is insane
a young og then i see you looking for me
day ta day
clown dont make you do baby is


In [99]:
first_lines[0]

['godfathers', 'in', 'the', 'house', '\n']

In [51]:
for i in range(10):
    print(f'### Sample number {seed+1} ###')
    print(' '.join(first_lines[i])[:-2])
    print(re.sub(r'[ ]+\n[ ]+', r'\n', ' '.join(lm.generate(100, text_seed=first_lines[i]))))
    print()

### Sample number 1 ###
godfathers in the house
i guess she realized living was hard times
as we move too fast
eds dead </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>

### Sample number 1 ###
tell me what ya
you got ta follow
anywhere im at the crap tables down
the shit that i rhyme i get it ive been running from nothing
i got ta get back is the film gon feel different
pretend im not sexy
but im makin sure youre gon na stay and listen to me
you aint care
nobody wants to get high with my eyes sparkly like a cloak cause murder is murder
momma found four shells
yea we party right all damn night i get pussy like a groupie
i need