In [1]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, CuDNNLSTM, LSTM, Dense, Dropout,Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import string, os
import pandas as pd
import numpy as np
from keras import utils as np_utils
from tqdm import tqdm
import re

In [2]:
def get_lyrics(songs, artist, title=None):
    if title is None: return songs[(songs.artist == artist)]
    return songs[(songs.artist == artist) & (songs.title == title)]

In [3]:
songs = pd.read_csv('./datasets/labeled_lyrics_cleaned.csv',
                    usecols=["artist", "seq", "song"])
songs.rename(columns={"seq": "lyrics", "song": "title"}, inplace=True)
songs.drop_duplicates(inplace=True, ignore_index=True)

In [4]:
# Bob Marley, Bon Jovi, Boney M., Eminem, Iron Maiden, Madonna,
# R.E.M., Red Hot Chilli Peppers, The Beatles, The Rolling Stones, U2
# for name, his_songs in songs.groupby("artist"):
#     if len(his_songs) > 95:
#         print(name, len(his_songs))
# print('\n'.join(sorted(set(songs.artist))))
# del name, his_songs
print(next(iter(get_lyrics(songs, "Bob Marley", "Three Little Birds").lyrics)))
print(len(list(get_lyrics(songs, "Bob Marley").lyrics)))

Don't worry about a thing
'Cause every little thing gonna be alright
Singing' don't worry about a thing
'Cause every little thing gonna be alright

Rise up this mornin'
Smiled with the risin' sun
Three little birds
Pitch by my doorstep
Singin' sweet songs
Of melodies pure and true
Saying', (this is my message to you)

Singing' don't worry 'bout a thing
'Cause every little thing gonna be alright
Singing' don't worry (don't worry) 'bout a thing
'Cause every little thing gonna be alright

Rise up this mornin'
Smiled with the risin' sun
Three little birds
Pitch by my doorstep
Singin' sweet songs
Of melodies pure and true
Sayin', this is my message to you

Singin' don't worry about a thing, worry about a thing, oh
Every little thing gonna be alright, don't worry
Singin' don't worry about a thing, I won't worry
"'Cause every little thing gonna be alright

Singin' don't worry about a thing
'Cause every little thing gonna be alright, I won't worry
Singin', don'

In [5]:

# all_lyrics = list(set(get_lyrics(songs, "Bob Marley").lyrics))
all_lyrics = list(set(songs.lyrics))[0:2000]
lyrics_cnt = len(all_lyrics)
'''
all_lyrics = list(set(songs.lyrics))[0:1000]

all_lyrics = []
all_lyrics.extend(list(lyrics.values))
#print(all_lyrics)

for a in all_lyrics:
  if not isinstance(a, str):
    print("found one")
    all_lyrics.remove(a)

print(len(all_lyrics))
# all_lyrics = all_lyrics[:200]

translator = str.maketrans('', '', string.punctuation)

all_lyrics = all_lyrics[0:1000]
'''
None
# poetry = pd.read_csv('/content/gdrive/My Drive/Kaggle/PoetryFoundationData.csv',quotechar='"')

# poetry.head()

# poem = poetry.Poem
# poem_cnt = len(poem)

# all_poems = []
# all_poems.extend(list(poem.values))


# for a in all_poems:
#   if not isinstance(a, str):
#     all_poems.remove(a)

# all_poems = all_poems[:50]
# joined = all_poems+all_lyrics







In [6]:
import enchant
d = enchant.Dict("en_US")


all_lyrics_en = []
for song in tqdm(all_lyrics):
    _cnt_y = 0
    _cnt_n = 0
    for line in song.split('\r\n'):
        for word in line.split():
            if d.check(word):
                _cnt_y += 1
            else:
                _cnt_n += 1
    if _cnt_n > (_cnt_y + _cnt_n) * 0.4:
        # print(song)
        continue
    all_lyrics_en.append(song)


100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:59<00:00, 33.61it/s]


In [7]:
all_lyrics = all_lyrics_en[:200]

In [8]:

def clean_text(txt):
    # txt = "".join(v for v in txt if v not in string.punctuation).lower()
    # txt = txt.encode("utf8").decode("ascii",'ignore')
    rgx = re.compile(r"\[[^\n\]]*]")
    return rgx.sub('', txt) 

def get_sequence_of_tokens(corpus,
                           lines_in_n_gram=2, 
                           endl_as_token=False,
                           ignoring_empty_lines=False):
    if endl_as_token:
        corpus.append(["endl"])
    tokenizer.fit_on_texts(corpus)
    corpus.pop()
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for song in tqdm(corpus):
        if ignoring_empty_lines:
            rgx = re.compile(r"(\r\n)+")
            song = rgx.sub("\r\n", song)
        if endl_as_token:
            song = song.replace("\r\n", " endl\r\n")
        # print(song.split('\r\n'))
        token_lists = tokenizer.texts_to_sequences(song.split('\r\n'))
        # print(token_lists)
        for i in range(lines_in_n_gram, len(token_lists)):
            flat = [jt for it in token_lists[i-lines_in_n_gram:i] for jt in it]
            for j in range(1, len(flat)):
                n_gram_sequence = flat[:j+1]
                input_sequences.append(n_gram_sequence)
        '''
        prev_token_list = []
        for line in song.split('\r\n'):
            if not line:
                continue
#             print(line)
            token_list = list(tokenizer.texts_to_sequences([line])[0])
            #print(token_list)
            # input_sequences.append(list(prev_token_list) + list(token_list))
            two_lines = prev_token_list + token_list
            for i in range(1, len(two_lines)):
                n_gram_sequence = two_lines[:i+1]
                input_sequences.append(n_gram_sequence)
#             print(input_sequences)
            prev_token_list = token_list
        '''
    return input_sequences, total_words

def get_padded_sequences(input_sequences):
    max_seq_len = max( [len(x) for x in input_sequences])
    padded_input_sequences =  np.array(pad_sequences(input_sequences, 
                                                     maxlen=max_seq_len, padding='pre'))
    return padded_input_sequences, max_seq_len


def prep_train_data(padded_input_sequences,total_words):
    
    x_train = padded_input_sequences[:,:-1]
    labels = padded_input_sequences[:,-1]
    labels = keras.utils.np_utils.to_categorical(labels, num_classes=total_words)
    
    return x_train, labels

In [9]:
# corpus = [clean_text(x) for x in all_lyrics]
corpus = all_lyrics
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r')
input_sequences, total_words = get_sequence_of_tokens(corpus, 4, True, False)
padded_input_sequences, max_seq_len = get_padded_sequences(input_sequences)
x_train, labels = prep_train_data(padded_input_sequences,total_words)

100%|██████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 1542.58it/s]


In [10]:
print('\n'.join(sorted(tokenizer.word_index)), end="\n\n\n")
print('\n'.join(tokenizer.sequences_to_texts(input_sequences[:20])))
print(max_seq_len)

'91
'ave'
'bought
'bout
'cause
'cos
'coz
'crae
'cross
'cuz
'do
'em
'go
'goodnight
'i
'lax
'll
'n
'neath
'round
's
'specially
'there
'til
'tis
'twas
1
10
100
112
12
14
15
16
1st
2
20
21
24
25
2nd
2x
3
31
33
350
35th
380's
3x
3x's
4
441
45
5
5'9
6x
8
808's
85
a
aand
able
about
above
abroad
absence
absolute
abyss
accelerator
ache
achievin
aching
across
act
acted
actin'
added
address
adeline
adios
admit
adopt
adore
adored
adverts
advice
af
affair
affect
affection
afford
afraid
africa
after
afterglow
again
against
age
agenda
aging
ago
agony
agreed
ah
ahead
ahh
aim
ain
ain't
air
al
alabaster
aladdin's
alarm
alight
aline
alive
all
allowed
almond
almost
alone
along
aloud
already
alright
also
although
always
am
ambassador
ambushed
amen
america's
american
amigo
ammunition
amnesia
among
amongst
amount
amsterdam
amusing
an
an'
anarchists
anchor
and
andy
anemic
anew
angel
angels
anger
angers
angry
animal
animals
ann
another
answer
answers
anthem
antone
any
anybody
anyhow
anymore
anyone
anyplace
any

In [11]:
def lstm_model(max_seq_len,total_words):
    
    input_len = max_seq_len - 1 #zadnju predvidam
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add an LSTM Layer
    model.add(Bidirectional(LSTM(150, return_sequences=True)))  # A dropout layer for regularisation
    model.add(Dropout(0.2))# Add another LSTM Layer
    model.add(LSTM(100))
    model.add(Dense(total_words/2, activation='relu'))
    # In the last layer, the shape should be equal to the total number of words present in our corpus
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')
    #(# Pick a loss function and an optimizer)print(model.summary())
    
    return model
'''
def lstm_model(max_seq_len,total_words):
    
    input_len = max_seq_len - 1 #zadnju predvidam
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(CuDNNLSTM(128))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

'''

model = lstm_model(max_seq_len,total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 10)            39750     
                                                                 
 cu_dnnlstm (CuDNNLSTM)      (None, 128)               71680     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 3975)              512775    
                                                                 
Total params: 624,205
Trainable params: 624,205
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(x_train, labels, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
  49/3828 [..............................] - ETA: 21s - loss: 3.1623

In [None]:
def generate_lyrics(seed_txt, next_words_cnt , max_seq_len, model):
  
    for i in range(0, next_words_cnt):
        token_list = tokenizer.texts_to_sequences([seed_txt])[0]
        padded_token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        
        prediction = np.argmax(model.predict(padded_token_list), axis=-1)

        #print('prediction',prediction)
        #tokenizer.word_index.items() --> rijecnik (rijec,index)
        
        
        for (word, index) in tokenizer.word_index.items():
            output_word = ""
            if (prediction == index):
                seed_txt += " " + word
                break
      
        
    return seed_txt.title()

def generate_lyrics_endl_as_token(seed_txt,
                                  lines_in_n_gram,
                                  num_lines,
                                  max_line_len,
                                  max_seq_len,
                                  model):
        st_list = seed_txt.replace('\r', "").split('\n')
        st_list = st_list[max(0, len(st_list) - lines_in_n_gram):]
        token_list = tokenizer.texts_to_sequences(st_list)
        lines_remaining = num_lines - len(token_list)
        flat = [jt for it in token_list for jt in it]
        endl_token = tokenizer.texts_to_sequences(["endl"])[0][0]
        tokenized_song = []
        curr_line = []
        while lines_remaining:
            # print(lines_remaining)
            if len(curr_line) >= max_line_len:
                prediction = endl_token
            else:
                padded_token_list = pad_sequences([flat], maxlen=max_seq_len-1, padding='pre')
                prediction = np.argmax(model.predict(padded_token_list), axis=-1)[0]
            curr_line.append(prediction)
            flat.append(prediction)
            if prediction == endl_token:
                lines_remaining -= 1
                if len(token_list) >= lines_in_n_gram: del token_list[0]
                token_list.append(curr_line)
                tokenized_song.append(curr_line[:-1])
                curr_line = []
                flat = [jt for it in token_list for jt in it]
        # print(tokenized_song[1:])
        print('\n'.join(
            tokenizer.sequences_to_texts(tokenized_song)
        ))

In [None]:
# model.save("model_200")

In [None]:
import random
from collections import defaultdict
unique = []
for i in range(0,len(corpus)):
    unique.append(corpus[i].split())

'''
for i in range(0,20):
  random_list = unique[random.randint(0,len(unique)-1)]
  seed = random_list[random.randint(0,len(random_list))]
  print( generate_lyrics(seed,random.randint(3,15),max_seq_len, model ) )
'''
# duljina pjesme iz iste distribucije kao ostale
# koristenje corpusa tu?
num_lines = random.choice([it.count("\r\n") for it in all_lyrics])
max_line_len = max([max([len(it.split()) for it in song.split("\r\n")]) for song in all_lyrics])


seed_txt = """Here's a little song I wrote
You might want to sing it note for note
Don't worry, be happy"""

generate_lyrics_endl_as_token(seed_txt,
                              lines_in_n_gram=4,
                              num_lines=num_lines,
                              max_line_len=max_line_len,
                              max_seq_len=max_seq_len,
                              model=model)
