In [1]:
import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, SimpleRNN, Dropout, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import string, os
import random
import re
import pandas as pd
import numpy as np
# from keras import utils as np_utils
from tqdm import tqdm # pip3 install tqdm
import enchant # pip3 install pyenchant

In [2]:
# --- Notebook configuration -------------------------------------------------
# When True, every lyric line is terminated with the literal word "endl" so
# that line breaks become an ordinary token of the vocabulary.
ENDL_AS_TOKEN = True # False not supported
# Which songs to use. Either "<percent>%<enchant language tag>" (keep songs in
# which at least that percentage of words pass the spell-checker dictionary)
# or "Artist - <artist name>" (keep the songs of a single artist).
DATA_SUBSET = "60%en_US" # or "Artist - Bob Dylan"
# Maximum number of songs to collect.
LYRICS_LIMIT = 200 # 0 for no limit, don't use negative numbers
# When True, bracketed tags ("[...]" on a single line) are stripped from the lyrics.
REMOVE_TAGS = True
# Intended value for the `lines_in_n_gram` arguments used later in the notebook.
N_GRAMS_LINES = 4
# When True, songs with identical lyrics are kept only once.
DROP_DUPLICATES = True
# When True, every run of newlines is collapsed to a single newline.
REMOVE_EMPTY_ROWS = False
# When True (and REMOVE_EMPTY_ROWS is False), runs of two or more newlines are
# collapsed to exactly two, i.e. at most one empty row in a row.
REMOVE_DOUBLE_EMPTY_ROWS = True

In [3]:
def get_lyrics(songs, artist, title=None):
    """Select the rows of ``songs`` that belong to ``artist``.

    Args:
        songs: DataFrame with at least ``artist`` and ``title`` columns.
        artist: Value the ``artist`` column must equal.
        title: Optional value the ``title`` column must equal as well.

    Returns:
        The matching rows of ``songs`` (possibly empty).
    """
    by_artist = songs.artist == artist
    if title is not None:
        return songs[by_artist & (songs.title == title)]
    return songs[by_artist]

def is_correct_language(lyrics, enchant_dict, retain_rate):
    """Tell whether enough of the words in ``lyrics`` are known to a dictionary.

    Args:
        lyrics: Text of one song.
        enchant_dict: Object whose ``check(word)`` reports whether a word is known.
        retain_rate: Required fraction of known words, e.g. ``0.6``.

    Returns:
        True when the number of known words is at least ``retain_rate`` times
        the total number of words (which also holds for a text with no words).
    """
    words = keras.preprocessing.text.text_to_word_sequence(lyrics)
    known = sum(1 for word in words if enchant_dict.check(word))
    return known >= retain_rate * len(words)

In [4]:
# Read only the three columns that are needed and give them the names used by
# the rest of the notebook ("seq" -> "lyrics", "song" -> "title").
songs = pd.read_csv(
    './datasets/labeled_lyrics_cleaned.csv',
    usecols=["artist", "seq", "song"],
).rename(columns={"seq": "lyrics", "song": "title"})

In [5]:

# --- Clean the lyrics text ---------------------------------------------------
songs.dropna(inplace=True)
lyrics_clean = songs.lyrics.str.strip()
if REMOVE_TAGS:
    # Remove carriage returns and bracketed tags ("[...]" on a single line).
    lyrics_clean = lyrics_clean.str.replace(r"\r|(\[[^\n\]]*])", '', regex=True)
else:
    # Series.replace('\r', '') would only touch cells that are exactly '\r';
    # .str.replace removes the character inside every string.
    lyrics_clean = lyrics_clean.str.replace('\r', '', regex=False)
# `regex=` is passed explicitly everywhere: its default differs between
# pandas versions.
if REMOVE_EMPTY_ROWS:
    lyrics_clean = lyrics_clean.str.replace(r"\n+", '\n', regex=True)
if REMOVE_DOUBLE_EMPTY_ROWS and not REMOVE_EMPTY_ROWS:
    lyrics_clean = lyrics_clean.str.replace(r"\n\n+", '\n\n', regex=True)
if ENDL_AS_TOKEN:
    # Terminate every line (including the last one) with the "endl" marker.
    lyrics_clean = lyrics_clean.str.replace('\n', ' endl\n', regex=False) + ' endl'
songs["lyrics"] = lyrics_clean
if DROP_DUPLICATES:
    songs.drop_duplicates(subset="lyrics", inplace=True, ignore_index=True)

# --- Pick the songs to train on ----------------------------------------------
all_lyrics = []
if '%' in DATA_SUBSET:
    # DATA_SUBSET has the form "<percent>%<enchant language tag>".
    percent_text, language = DATA_SUBSET.split('%', 1)
    percentage = int(percent_text) * 0.01
    d = enchant.Dict(language)
    wanted = LYRICS_LIMIT or len(songs)
    # Walk the songs once and stop as soon as enough of them passed the
    # language filter. If fewer than `wanted` songs pass, the loop simply ends
    # with what was found instead of running past the end of the data.
    with tqdm(total=wanted) as progress:
        for lyrics in songs.lyrics:
            if not is_correct_language(lyrics, d, percentage):
                continue
            all_lyrics.append(lyrics)
            progress.update(1)
            if len(all_lyrics) >= wanted:
                break
elif DATA_SUBSET.startswith("Artist - "):
    artists_songs = get_lyrics(songs, DATA_SUBSET[9:])
    if LYRICS_LIMIT:
        all_lyrics = list(artists_songs.lyrics.iloc[:LYRICS_LIMIT])
    else:
        all_lyrics = list(artists_songs.lyrics)
else:
    raise ValueError(
        f"Unsupported DATA_SUBSET {DATA_SUBSET!r}: expected "
        "'<percent>%<language>' or 'Artist - <name>'"
    )
lyrics_cnt = len(all_lyrics)

100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:05<00:00, 39.25it/s]


In [17]:
# Bob Marley, Bon Jovi, Boney M., Eminem, Iron Maiden, Madonna,
# R.E.M., Red Hot Chilli Peppers, The Beatles, The Rolling Stones, U2
# for name, his_songs in songs.groupby("artist"):
#     if len(his_songs) > 95:
#         print(name, len(his_songs))
# print('\n'.join(sorted(set(songs.artist))))
# del name, his_songs
# print(next(iter(get_lyrics(songs, "Bob Marley", "Three Little Birds").lyrics)))
# print(len(list(get_lyrics(songs, "Bob Dylan").lyrics)))
# _temp_t = Tokenizer()
# _temp_t.fit_on_texts(songs.lyrics)
# NOTE(review): this cell reads `tokenizer`, which is only created by the
# `tokenizer = Tokenizer()` cell further down. On a fresh kernel executed top
# to bottom it raises NameError; run it after the tokenizer has been fitted.
from pprint import pprint
# Index assigned to the word "hello" (raises KeyError if the word is not in the vocabulary).
pprint(tokenizer.word_index["hello"])
# Names of the attributes stored on the tokenizer instance.
pprint(vars(tokenizer).keys())

1788
dict_keys(['word_counts', 'word_docs', 'filters', 'split', 'lower', 'num_words', 'document_count', 'char_level', 'oov_token', 'index_docs', 'word_index', 'index_word'])


In [7]:
def get_sequence_of_tokens(corpus, tokenizer, lines_in_n_gram=2):
    """Turn songs into next-token training sequences.

    Every song is split into lines and tokenized line by line. A window of
    ``lines_in_n_gram`` consecutive lines slides over the song one line at a
    time; the tokens of each window are concatenated and every prefix of that
    concatenation with at least two tokens becomes one training sequence (the
    last token of a prefix is the value to predict).

    Args:
        corpus: Iterable of song texts whose lines are separated by ``'\\n'``.
        tokenizer: Object providing ``texts_to_sequences(list_of_str)``.
        lines_in_n_gram: Number of consecutive lines per window.

    Returns:
        List of token-id lists. Songs with fewer than ``lines_in_n_gram``
        lines contribute nothing.
    """
    input_sequences = []
    for song in tqdm(corpus):
        token_lists = tokenizer.texts_to_sequences(song.split('\n'))
        # `end` is the exclusive end of the window. It must be allowed to reach
        # len(token_lists), otherwise the window that ends on the final line of
        # the song is never generated and that line is never trained on.
        for end in range(lines_in_n_gram, len(token_lists) + 1):
            window = token_lists[end - lines_in_n_gram:end]
            flat = [token for line in window for token in line]
            input_sequences.extend(flat[:stop] for stop in range(2, len(flat) + 1))
    return input_sequences

def get_padded_sequences(input_sequences):
    """Left-pad all sequences to the length of the longest one.

    Args:
        input_sequences: Non-empty list of token-id lists.

    Returns:
        Tuple ``(padded, max_seq_len)``: the padded sequences as a NumPy array
        and the common sequence length.
    """
    max_seq_len = max(map(len, input_sequences))
    padded = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
    return np.array(padded), max_seq_len


def prep_train_data(padded_input_sequences, total_words):
    """Split padded sequences into model inputs and one-hot targets.

    Args:
        padded_input_sequences: 2-D array with one padded sequence per row.
        total_words: Number of classes for the one-hot encoding.

    Returns:
        Tuple ``(x_train, labels)``: every column but the last of each row,
        and the one-hot encoding of the last column.
    """
    x_train = padded_input_sequences[:, :-1]
    next_tokens = padded_input_sequences[:, -1]
    # Use the public `keras.utils.to_categorical`; the `keras.utils.np_utils`
    # module path is not available in newer Keras releases.
    labels = keras.utils.to_categorical(next_tokens, num_classes=total_words)
    return x_train, labels

In [8]:
# Fit the vocabulary on the selected songs. Keras word indices start at 1
# (0 is used for padding), hence the +1 on the vocabulary size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_lyrics)
total_words = len(tokenizer.word_index) + 1
# Use the configured window size rather than a second, hard-coded copy of it.
input_sequences = get_sequence_of_tokens(all_lyrics, tokenizer, N_GRAMS_LINES)
padded_input_sequences, max_seq_len = get_padded_sequences(input_sequences)
x_train, labels = prep_train_data(padded_input_sequences, total_words)

100%|███████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 950.41it/s]


In [9]:
# print('\n'.join(sorted(tokenizer.word_index)), end="\n\n\n")
# print('\n'.join(tokenizer.sequences_to_texts(input_sequences[:20])))
# Length of the longest training sequence (input tokens plus the one target token).
print(max_seq_len)

58


In [15]:
'''
def lstm_model(max_seq_len,total_words):
    
    input_len = max_seq_len - 1 #zadnju predvidam
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add an LSTM Layer
    model.add(Bidirectional(LSTM(150, return_sequences=True)))  # A dropout layer for regularisation
    model.add(Dropout(0.2))# Add another LSTM Layer
    model.add(LSTM(100))
    model.add(Dense(total_words/2, activation='relu'))
    # In the last layer, the shape should be equal to the total number of words present in our corpus
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')
    #(# Pick a loss function and an optimizer)print(model.summary())
    
    return model
'''
def lstm_model(max_seq_len,total_words):
    """Build and compile the next-word prediction network.

    Architecture: Embedding (10 dimensions) -> LSTM (128 units) ->
    Dropout (0.1) -> Dense softmax over the vocabulary.

    Args:
        max_seq_len: Length of the padded training sequences, target included.
        total_words: Vocabulary size, i.e. the number of output classes.

    Returns:
        The compiled model (categorical cross-entropy loss, Adam optimizer).
    """
    # The last token of every sequence is the one being predicted, so the
    # network input is one token shorter.
    input_len = max_seq_len - 1

    # A SimpleRNN(128) layer can be swapped in for the LSTM layer.
    model = Sequential([
        Embedding(total_words, 10, input_length=input_len),
        LSTM(128),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model
# Build the network for the current sequence length and vocabulary size,
# then print its layer-by-layer summary.
model = lstm_model(max_seq_len,total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 57, 10)            31580     
                                                                 
 lstm (LSTM)                 (None, 128)               71168     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 3158)              407382    
                                                                 
Total params: 510,130
Trainable params: 510,130
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train for 20 epochs on all (prefix -> next token) pairs; no validation data
# or callbacks are passed.
model.fit(x_train, labels, epochs=20, verbose=1, initial_epoch=0)
# Alternative to training: restore a previously saved model.
# model = keras.models.load_model("model_200_50")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ec83944190>

In [17]:
import random
def generate_lyrics(seed_txt, next_words_cnt , max_seq_len, model):
    """Greedily extend ``seed_txt`` word by word.

    Uses the module-level ``tokenizer``. In each step the whole text so far is
    tokenized, left-padded to ``max_seq_len - 1`` tokens and fed to ``model``;
    the word with the highest predicted score is appended.

    Args:
        seed_txt: Text to start from.
        next_words_cnt: Number of prediction steps to perform.
        max_seq_len: Sequence length the model was trained with (target included).
        model: Object whose ``predict`` returns one score per vocabulary index.

    Returns:
        The extended text, title-cased.
    """
    for _ in range(next_words_cnt):
        token_list = tokenizer.texts_to_sequences([seed_txt])[0]
        padded_token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')

        predicted_index = int(np.argmax(model.predict(padded_token_list), axis=-1)[0])

        # Direct index -> word lookup instead of scanning the whole vocabulary.
        # An index without a word (e.g. 0) adds nothing to the text.
        word = tokenizer.index_word.get(predicted_index)
        if word is not None:
            seed_txt += " " + word

    return seed_txt.title()

def generate_lyrics_endl_as_token(seed_txt,
                                  lines_in_n_gram,
                                  num_lines,
                                  max_line_len,
                                  max_seq_len,
                                  model,
                                  no_anaphora=False,
                                  keep_seed=False):
    """Generate lyrics line by line and print them.

    Uses the module-level ``tokenizer``, whose vocabulary must contain the
    end-of-line marker ``"endl"``. The model is queried greedily with the last
    ``lines_in_n_gram`` lines as context; a line ends when the model predicts
    ``"endl"`` or when it already holds ``max_line_len`` tokens.

    Args:
        seed_txt: Text used as the initial context (may be empty).
        lines_in_n_gram: Number of lines kept as context; must be at least 1.
        num_lines: Number of lines to output.
        max_line_len: Maximum number of tokens per line before a line break
            is forced.
        max_seq_len: Sequence length the model was trained with (target included).
        model: Object whose ``predict`` returns one score per vocabulary index.
        no_anaphora: When True, a line may not continue exactly like one of the
            context lines that starts with the same tokens.
        keep_seed: When True, the (tokenized) seed lines used as context are
            printed first and count towards ``num_lines``.

    Returns:
        None. The generated text is printed.

    Raises:
        ValueError: If ``lines_in_n_gram`` is smaller than 1.
        KeyError: If ``"endl"`` is not part of the tokenizer vocabulary.
    """
    if lines_in_n_gram < 1:
        raise ValueError("lines_in_n_gram must be at least 1")

    # Normalise the seed: strip it, drop '\r' and terminate every line with "endl".
    st_list = (seed_txt.strip().replace('\r', "").replace('\n', " endl\n")
               + " endl").split('\n')
    # Only the last `lines_in_n_gram` seed lines fit into the context.
    st_list = st_list[max(0, len(st_list) - lines_in_n_gram):]
    # One list of token ids per seed line.
    token_list = tokenizer.texts_to_sequences(st_list)
    endl_token = tokenizer.word_index["endl"]

    if keep_seed:
        # The seed lines are part of the output, so fewer lines are generated.
        # Each tokenized seed line ends with the "endl" token; drop it for printing.
        lines_remaining = num_lines - len(token_list)
        tokenized_song = [line[:-1] for line in token_list]
    else:
        lines_remaining = num_lines
        tokenized_song = []

    # Context fed to the model: all tokens of the context lines, flattened.
    flat = [token for line in token_list for token in line]
    curr_line = []
    # `> 0` (not mere truthiness) so a negative budget cannot loop forever.
    while lines_remaining > 0:
        if len(curr_line) >= max_line_len:
            # The line is long enough: force a line break.
            prediction = endl_token
        else:
            padded_token_list = pad_sequences([flat], maxlen=max_seq_len-1, padding='pre')
            predictions = model.predict(padded_token_list)
            if no_anaphora:
                lowest_score = predictions.min()
                for prev_line in token_list:
                    if (len(curr_line) < len(prev_line)
                            and curr_line == prev_line[:len(curr_line)]):
                        # The current line repeats the start of a context line:
                        # push the token that would continue the repetition
                        # down to the lowest score.
                        predictions[0][prev_line[len(curr_line)]] = lowest_score
            prediction = int(np.argmax(predictions, axis=-1)[0])
        curr_line.append(prediction)
        flat.append(prediction)
        if prediction == endl_token:
            lines_remaining -= 1
            # Slide the context window by one line.
            if len(token_list) >= lines_in_n_gram: del token_list[0]
            token_list.append(curr_line)
            tokenized_song.append(curr_line[:-1])
            curr_line = []
            flat = [token for line in token_list for token in line]
    print('\n'.join(
        tokenizer.sequences_to_texts(tokenized_song)
    ))

In [18]:
# Persist the trained model under "cudnn model_200_10" (relative to the working directory).
model.save("cudnn model_200_10")



INFO:tensorflow:Assets written to: cudnn model_200_10\assets


INFO:tensorflow:Assets written to: cudnn model_200_10\assets


In [19]:

'''unique = []
for i in range(0,len(corpus)):
    unique.append(corpus[i].split())'''

'''
for i in range(0,20):
  random_list = unique[random.randint(0,len(unique)-1)]
  seed = random_list[random.randint(0,len(random_list))]
  print( generate_lyrics(seed,random.randint(3,15),max_seq_len, model ) )
'''
# Draw the song length (in lines) from the same distribution as the training
# songs. A text with k newline characters has k + 1 lines.
# TODO: should the corpus be used here?
num_lines = random.choice([lyrics.count("\n") + 1 for lyrics in all_lyrics])
# Longest training line, in whitespace-separated tokens (this includes any
# "endl" marker appended during preprocessing).
max_line_len = max(len(line.split())
                   for lyrics in all_lyrics
                   for line in lyrics.split("\n"))


seed_txt = """Here's a little song I wrote
You might want to sing it note for note
Don't worry, be happy"""

# seed_txt = ""

# The context size must match the window size used to build the training data,
# so it is taken from the configuration instead of being repeated as a literal.
generate_lyrics_endl_as_token(seed_txt,
                              lines_in_n_gram=N_GRAMS_LINES,
                              num_lines=num_lines,
                              max_line_len=max_line_len,
                              max_seq_len=max_seq_len,
                              model=model,
                              no_anaphora=True)



i know i shouldn't
but that's the way it is
with the touch of your hand
you can't fool why i do

i don't know what i really am i
if i saw you
everyone's wrong but me
you know it'll work alright
lady be good do what you should

'cause i found my heart in a crowd
i prefer to live my side
to feel my fisty cuffs
you know you were the best he ever had oh oh oh

