In [4]:
import json, time, re, string, keras, adanet, pickle
import pandas as pd
import psycopg2 as pg2
import numpy as np


from numpy import random
from psycopg2.extras import RealDictCursor, Json
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

import matplotlib.pyplot as plt

%matplotlib inline
%run ../assets/sql_cred.py

In [7]:
def filename_format_log(file_path, 
                        logfile = '../assets/file_log.txt', 
                        now = None, 
                        file_description = None): 
    """Prefix a file name with a Unix timestamp and record it in a log file.

    Parameters
    ----------
    file_path : str
        Path whose final component contains a file extension,
        e.g. ``'../assets/LSTM_Model.pkl'``.
    logfile : str
        Text file that receives one ``'<name>: <description>'`` line per call
        (opened in append mode).
    now : int, optional
        Timestamp to embed in the name. Defaults to the current time,
        evaluated on every call.
    file_description : str, optional
        Text written next to the name in the log. Defaults to
        ``'Saved at: <UTC time of now>'``.

    Returns
    -------
    tuple
        ``(formatted_name, now, file_description)``.

    Raises
    ------
    NameError
        If ``file_path`` has no recognisable file extension.
    """
    import os  # local import: only needed to locate the file-name component

    # Resolve the timestamp at call time. A `round(time.time())` default in
    # the signature would be frozen when the function is defined.
    if now is None:
        now = round(time.time())

    # A '.' that is not at the start of the string and is not part of a '..'
    # run counts as the extension separator.
    try:
        has_extension = re.search(r'(?<!^)(?<!\.)\.(?!\.)', file_path) is not None
    except TypeError:
        has_extension = False
    if not has_extension:
        raise NameError('Please enter a relative path with a file extension.')

    # Insert the stamp right before the file name. This also covers names
    # without an underscore and bare file names without a directory.
    stamp = len(file_path) - len(os.path.basename(file_path))
    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'

    if not file_description:
        file_description = f'Saved at: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

In [8]:
def con_cur_to_db(dbname=DBNAME, dict_cur=None):
    """Open a database connection and create a cursor on it.

    Connects with the module-level IP_ADDRESS, USER and PASSWORD settings.
    When ``dict_cur`` is truthy the cursor is built with ``RealDictCursor``;
    otherwise the connection's default cursor is used.

    Returns the ``(connection, cursor)`` pair; closing the connection is the
    caller's job.
    """
    connection = pg2.connect(
        host=IP_ADDRESS,
        dbname=dbname,
        user=USER,
        password=PASSWORD,
    )
    cursor_options = {'cursor_factory': RealDictCursor} if dict_cur else {}
    return connection, connection.cursor(**cursor_options)
    
def execute_query(query, dbname=DBNAME, dict_cur=None, command=False, params=None):
    """Run a single SQL statement on a fresh connection and close it.

    Parameters
    ----------
    query : str
        SQL text to execute.
    dbname : str
        Database to connect to (defaults to DBNAME).
    dict_cur : bool, optional
        Passed to ``con_cur_to_db``; truthy selects a ``RealDictCursor``.
    command : bool
        False (default): fetch and return all result rows.
        True: commit the statement and return None.
    params : sequence or mapping, optional
        Values bound to placeholders in ``query`` by the driver. Prefer this
        over building the SQL string from untrusted input.

    Returns
    -------
    list or None
        All fetched rows when ``command`` is False, otherwise None.
    """
    con, cur = con_cur_to_db(dbname, dict_cur)
    try:
        cur.execute(query, params)
        if not command:
            return cur.fetchall()
        con.commit()  # persist the change on the server
    finally:
        # Always release the connection, even when execute/fetch raises.
        con.close()

In [9]:
lyric_df = pd.read_csv('../assets/1548873539_clean_lyrics.csv')

In [10]:
lyric_df.head()

Unnamed: 0,lyrics,clean_text
0,\n\nIf your needle is near\nNeedle is near\nYo...,if your needle is near \n needle is near \n yo...
1,\n\n[Verse 1]\nBrown skin girl on the other si...,brown skin girl on the other side of the room ...
2,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ...",its simple i love it \n having you near me hav...
3,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...,a great big bang and dinosaurs \n fiery rainin...
4,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...,isnt she lovely \n isnt she wonderful \n isnt ...


In [11]:
# Remove the row labelled 193; `index=` already targets the row axis.
lyric_df = lyric_df.drop(index=[193])

In [12]:
lyric_df.describe()

Unnamed: 0,lyrics,clean_text
count,1800,1800
unique,1800,1800
top,\n\nAnd I wake up today\nAnd I’m feeling so go...,i wont go livin in the past \n but i believe t...
freq,1,1


In [13]:
# Show tracks whose cleaned text contains a run of six or more whitespace
# characters. The pattern has no capturing group, so pandas does not emit its
# "This pattern has match groups" UserWarning; the selected rows are the same.
lyric_df[lyric_df['clean_text'].str.contains(r'\s{6,}')]

  """Entry point for launching an IPython kernel.


Unnamed: 0,lyrics,clean_text
31,\n\n3/3\nBoy Rex - The Bloodmonths - 8/8\nJoey...,boy rex the bloodmonths \n joey fatts ill ...
142,"\n\n[Verse 1]\nJenny, Jenny, who can I turn to...",jenny jenny who can i turn to \n you give me s...
195,\n\n[NEW] 1. TWICE - What Is Love?\n[NEW] 2. ...,twice what is love \n exocbx blooming day ...
260,\n\n[Verse 1]:\nElectric lights\nBlow my mind\...,electric lights \n blow my mind \n i feel alri...
552,"\n\nBaby, baby yea\nYou run on my mind yea\nSa...",baby baby yea \n you run on my mind yea \n sam...
938,\n\n[Verse 1]\nThere are two of us on the run\...,there are two of us on the run \n going so fas...
960,"\n\n[Verse 1]\nYou're my baby, my lover, my la...",youre my baby my lover my lady \n all night yo...
1106,\n\nORIGINALTracklist1. Let's Go Crazy\n2. T...,originaltracklist lets go crazy \n take me ...
1169,\n\n[Intro-Live Acoustic]\nKnow your place amo...,know your place among the dark arms of the woo...
1495,\n\n[Verse 1]\nIt was a day\nJust like any oth...,it was a day \n just like any other day \n i w...


In [14]:
# Remove the row labelled 1768; `index=` already targets the row axis.
lyric_df = lyric_df.drop(index=1768)

In [15]:
def split_sequence(text, sequence_length = 7, output_length = 4):
    """Slice a lyric string into sliding token windows and their continuations.

    The text is tokenised on whitespace. Newlines and bracketed tags such as
    ``[Verse 1]`` are kept as tokens of their own; other whitespace is
    discarded.

    Parameters
    ----------
    text : str
        Raw lyric text.
    sequence_length : int
        Number of tokens in each input window.
    output_length : int
        Maximum number of tokens in each target.

    Returns
    -------
    (X, y) : tuple of lists
        ``X[i]`` is a list of ``sequence_length`` tokens and ``y[i]`` holds the
        tokens that follow it. Targets near the end of the text may contain
        fewer than ``output_length`` tokens.
    """
    # re.split yields None for capture groups that did not take part in a
    # match and '' between adjacent separators; drop both.
    tokens = [tok for tok in re.split(r'(\n)|(\[.+\])|\s', text) if tok]

    X, y = [], []
    for i in range(len(tokens) - sequence_length):
        X.append(tokens[i:i + sequence_length])
        y.append(tokens[i + sequence_length:i + sequence_length + output_length])

    return X, y

In [16]:
# Settings for the tokenisation step (apparently the arguments of a former
# `tokenize_lyrics(...)` helper that was inlined into this cell).
df = lyric_df
lyrics_col = ['clean_text']
seq_len = 4       # tokens per input window
output_len = 1    # tokens per target; keep at 1, `y` below is a flat list
save_dir = '../assets'

X = []
y = []

corpus = []

print('Processing lyrics...')
# Iterate the text column directly. This avoids the slow `iterrows()` and the
# deprecated positional `track[0]` lookup on a label-indexed Series.
for lyrics in df[lyrics_col[0]]:
    # Collapse runs of spaces, then split on single spaces.
    lyrics_split = re.sub(r' +', ' ', lyrics).split(' ')
    corpus.extend(lyrics_split)

    for i in range(len(lyrics_split) - seq_len):
        X.append(np.array(lyrics_split[i:i + seq_len]))
        y.extend(np.array(lyrics_split[i + seq_len:i + seq_len + output_len]))

print('Creating encoding dicts from corpus...')
words = sorted(set(corpus))

Processing lyrics...
Creating encoding dicts from corpus...


In [24]:
print(f'Count of unique words (i.e., features): {len(words)}')
# Number the sorted vocabulary from 1 upwards and keep the inverse mapping
# alongside it; index 0 is never assigned to a word.
words_index = {word: position for position, word in enumerate(words, start=1)}
index_words = {position: word for position, word in enumerate(words, start=1)}

Count of unique words (i.e., features): 13468


In [18]:
#         formatted_name, now, file_description= filename_format_log(f'{save_dir}/tokenizer.pkl')

#         with open(formatted_name, 'wb+') as f:
#             pickle.dump(tokenizer, f)
#         print(f'Tokenizer saved to {formatted_name}.')          

In [19]:
print('Indexing sequences...')
# Replace every token with its integer id from `words_index`.
X_indexed = [[words_index[word] for word in row] for row in X]
y_indexed = [words_index[word] for word in y]
# Report the actual count rather than a bare label.
print(f'Number of sequences: {len(X_indexed)}')
print('Partitioning and converting to labels...')

Indexing sequences...
Number of sequences
Partitioning and converting to labels...


In [33]:
#     partition, labels = generate_samples(X_indexed, y_indexed)

#     np.save(f'{save_dir}/data.npy', partition)
# Stack the indexed windows into a 2-D array with one row per sequence.
X_reshape = np.asarray(X_indexed).reshape(len(X_indexed), seq_len)

# One-hot encode the integer targets.
y_cat = to_categorical(y_indexed)

print('Lyrics successfully tokenized, sequenced, and indexed.')

Lyrics successfully tokenized, sequenced, and indexed.


In [21]:
def generate_samples(X_indexed, y_cat, seq_len=4, random_seed = 42):
    """Shuffle the samples and split them 80/20 into train and test sets.

    Parameters
    ----------
    X_indexed : sequence
        Input sequences, each of length ``seq_len``.
    y_cat : sequence
        Targets aligned with ``X_indexed``.
    seq_len : int
        Length of every input sequence.
    random_seed : int or None
        Seed for the shuffle; the same seed gives the same split. ``None``
        gives a different shuffle on each call.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Arrays of shape ``(n, seq_len, 1)``.
    y_train, y_test : numpy.ndarray
        Targets for the corresponding inputs.
    """
    n_samples = len(X_indexed)

    # Use a local generator so the split is reproducible and the global NumPy
    # random state is left untouched.
    shuffled = np.random.RandomState(random_seed).permutation(n_samples)

    # Everything after the training slice is the test set, so the two parts
    # never overlap and no sample is lost, whatever n_samples is.
    n_train = int(np.ceil(n_samples * .8))
    train_ind, test_ind = shuffled[:n_train], shuffled[n_train:]

    X_train = [X_indexed[i] for i in train_ind]
    y_train = [y_cat[i] for i in train_ind]
    X_test = [X_indexed[i] for i in test_ind]
    y_test = [y_cat[i] for i in test_ind]

    return (np.reshape(X_train, (len(X_train), seq_len, 1)),
            np.reshape(X_test, (len(X_test), seq_len, 1)),
            np.array(y_train),
            np.array(y_test))

In [None]:
# Pass the notebook's window length explicitly so the split keeps working if
# `seq_len` is changed from the helper's default of 4.
X_train, X_test, y_train, y_test = generate_samples(X_indexed, y_cat, seq_len=seq_len)

In [49]:
type(X_reshape)

numpy.ndarray

In [46]:
X_reshape.shape

(576641, 4, 1)

In [48]:
type(y_cat)

numpy.ndarray

In [51]:
type(y_train)

list

In [31]:
# Build and compile the model: embedding -> two stacked LSTMs -> dense softmax.
model = Sequential([
    # One embedding row per word id plus the unused id 0, which is masked.
    Embedding(len(words) + 1, 3000, mask_zero=True),
    LSTM(150, return_sequences=True),
    LSTM(50),
    Dense(100, activation='relu'),
    # One output unit per column of the one-hot targets.
    Dense(y_cat.shape[1], activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
# Fit on every sequence with the library's default epoch and batch settings.
model.fit(x=X_reshape, y=y_cat, verbose=1)

# Save model
# formatted_name, now, file_description= filename_format_log('../assets/LSTM_Model.pkl')

# with open(formatted_name, 'wb+') as f:
#     pickle.dump(model, f)

Epoch 1/1
  2400/576641 [..............................] - ETA: 4:25:13 - loss: 7.6608 - acc: 0.1125

KeyboardInterrupt: 

In [None]:
# Print model summary. `summary()` writes the table itself and returns None,
# so wrapping it in print() would append a stray "None" line.
model.summary()

In [None]:
# Train on the training split and keep the History object for plotting.
# NOTE(review): generate_samples returns X with a trailing singleton axis,
# whereas X_reshape is built as a 2-D array — confirm that the Embedding
# layer accepts this shape.
history = model.fit(
    X_train, y_train,
    epochs = 150,
    batch_size = 2500,
    verbose = 1,
)

# formatted_name, now, file_description= filename_format_log('../assets/LSTM_Model.pkl')

# with open(formatted_name, 'wb+') as f:
#     pickle.dump(model, f)

# `summary()` prints the table itself and returns None; do not wrap it in print().
model.summary()

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(18, 6))

# The accuracy metric is logged as 'acc' by older Keras releases and as
# 'accuracy' by newer ones; accept either.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

ax[0].plot(history.history['loss'])
ax[0].set_title("Loss", fontsize=15)
ax[0].set_xlabel("epochs", fontsize=15)

ax[1].plot(history.history[acc_key])
ax[1].set_title("Accuracy", fontsize=15)
ax[1].set_xlabel("epochs", fontsize=15);

In [None]:
def generate_lyrics(seed, model=model, seq_len=4, song_len=50):
    """Generate lyrics word by word from a seed phrase.

    Each step encodes the last ``seq_len`` words with the notebook's
    ``words_index`` mapping, asks the model for the most probable next word
    and appends it to the running context.

    Parameters
    ----------
    seed : str
        Space-separated starting phrase; it is lower-cased before encoding.
    model : keras model
        Trained next-word model (defaults to the notebook's ``model``).
    seq_len : int
        Number of context words fed to the model at each step.
    song_len : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not included).
    """
    seed_clean = seed.lower().split(' ')
    doc = []

    while len(doc) < song_len:
        # Encode the most recent words. Unknown words map to 0, the id that is
        # never assigned to a word and that the Embedding layer masks.
        context = seed_clean[-seq_len:]
        encoded = [words_index.get(word, 0) for word in context]
        padded = pad_sequences([encoded], maxlen=seq_len, truncating='pre')

        # Take the arg-max over the class probabilities.
        probabilities = model.predict(padded, verbose=0)
        yhat = int(np.argmax(probabilities, axis=-1)[0])

        next_word = index_words.get(yhat)
        if next_word is None:
            # The prediction maps to no vocabulary word (e.g. id 0). Stop
            # rather than loop forever on the same context.
            break

        seed_clean.append(next_word)
        doc.append(next_word)

    return ' '.join(doc)

In [None]:
lyrics = generate_lyrics('needles are for lovers', song_len=150)

In [None]:
lyrics