# Notebook 03: Preprocessing

In [None]:
# ! pip install psycopg2

In [2]:
import keras
print(keras.__version__)

Using TensorFlow backend.


2.2.4


In [3]:
import tensorflow as tf
print(tf.__version__)

1.12.0


In [126]:
import json, time, re, string, keras, adanet, pickle
import pandas as pd
import psycopg2 as pg2
import numpy as np

from numpy import random
from psycopg2.extras import RealDictCursor, Json
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

import matplotlib.pyplot as plt

%matplotlib inline
%run ../assets/sql_cred.py

In [8]:
def filename_format_log(file_path, 
                        logfile = '../assets/file_log.txt', 
                        now = round(time.time()), 
                        file_description = None): 
   
    try:
        ext = re.search('(?<!^)(?<!\.)\.(?!\.)', file_path).start() 
    except:
        raise NameError('Please enter a relative path with a file extension.') 
    
    stamp = re.search('(?<!^)(?<!\.)[0-z]+_[0-z]+(?=\.)', file_path).start()
    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'  
    if not file_description:
        file_description = f'Saved at: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

In [9]:
def con_cur_to_db(dbname=DBNAME, dict_cur=None):
    con = pg2.connect(host=IP_ADDRESS,
                  dbname=dbname,
                  user=USER,
                  password=PASSWORD)
    if dict_cur:
        cur = con.cursor(cursor_factory=RealDictCursor)
    else:
        cur = con.cursor()
    return con, cur
    
def execute_query(query, dbname=DBNAME, dict_cur=None, command=False):
    con, cur = con_cur_to_db(dbname, dict_cur)
    cur.execute(f'{query}')
    if not command:
        data = cur.fetchall()
        con.close()
        return data
    con.commit() #sends to server
    con.close() #closes server connection

In [10]:
lyric_df = pd.read_csv('../assets/1548873539_clean_lyrics.csv')

In [11]:
lyric_df.head()

Unnamed: 0,lyrics,clean_text
0,\n\nIf your needle is near\nNeedle is near\nYo...,if your needle is near \n needle is near \n yo...
1,\n\n[Verse 1]\nBrown skin girl on the other si...,brown skin girl on the other side of the room ...
2,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ...",its simple i love it \n having you near me hav...
3,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...,a great big bang and dinosaurs \n fiery rainin...
4,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...,isnt she lovely \n isnt she wonderful \n isnt ...


In [12]:
lyric_df = lyric_df.drop(index=[193], axis=0)

In [13]:
lyric_df.describe()

Unnamed: 0,lyrics,clean_text
count,1800,1800
unique,1800,1800
top,\n\n[Verse 1]\nSee the stone set in your eyes\...,ive making time for tunes \n making tunes a pl...
freq,1,1


In [14]:
lyric_df[lyric_df['clean_text'].str.contains(r'(\s{6,})')]

  """Entry point for launching an IPython kernel.


Unnamed: 0,lyrics,clean_text
31,\n\n3/3\nBoy Rex - The Bloodmonths - 8/8\nJoey...,boy rex the bloodmonths \n joey fatts ill ...
142,"\n\n[Verse 1]\nJenny, Jenny, who can I turn to...",jenny jenny who can i turn to \n you give me s...
195,\n\n[NEW] 1. TWICE - What Is Love?\n[NEW] 2. ...,twice what is love \n exocbx blooming day ...
260,\n\n[Verse 1]:\nElectric lights\nBlow my mind\...,electric lights \n blow my mind \n i feel alri...
552,"\n\nBaby, baby yea\nYou run on my mind yea\nSa...",baby baby yea \n you run on my mind yea \n sam...
938,\n\n[Verse 1]\nThere are two of us on the run\...,there are two of us on the run \n going so fas...
960,"\n\n[Verse 1]\nYou're my baby, my lover, my la...",youre my baby my lover my lady \n all night yo...
1106,\n\nORIGINALTracklist1. Let's Go Crazy\n2. T...,originaltracklist lets go crazy \n take me ...
1169,\n\n[Intro-Live Acoustic]\nKnow your place amo...,know your place among the dark arms of the woo...
1495,\n\n[Verse 1]\nIt was a day\nJust like any oth...,it was a day \n just like any other day \n i w...


In [15]:
lyric_df = lyric_df.drop(index=1768, axis=0)

In [16]:
def split_sequence(text, sequence_length = 7, output_length = 4):
    
    X, y = [], []
    
    split_text = re.split('(\n)|(\[.+\])|\s', text)
    split_text = list(filter(None, split_text))
    split_text = text
    
    for i in range(len(split_text) - sequence_length):
        X.append(split_text[i:i + sequence_length])
        y.append(split_text[i + sequence_length:i + sequence_length + output_length])
        
    return X, y

In [22]:
def generate_samples(X_indexed, y_indexed):
    partition = {}
    labels = {}

    random_seed = 42
    rand_ind = random.choice(range(len(X_indexed)), len(X_indexed), replace=False)
    
    partition['train'] = rand_ind[:int(np.ceil(len(rand_ind)*.8))]
    partition['validation'] = rand_ind[-int(np.ceil(len(rand_ind)*.2))+1:]

    for i in rand_ind:
        labels[i] = y_indexed[i]
        
    return partition, labels

In [134]:
def tokenize_lyrics(df, lyrics_col, seq_len, output_len, save_dir='../assets'):
    X = []
    y = []
    
    corpus = []
    
    print('Processing lyrics...')
    for _, track in df[lyrics_col].iterrows():
        lyrics = track[0]
        lyrics_spaced = re.sub(r'( +)', ' ', lyrics)
        lyrics_split = lyrics_spaced.split(' ')
        corpus.extend(lyrics_split)
                
        for i in range(len(lyrics_split) - seq_len):
            X.append(np.array(lyrics_split[i:i + seq_len]))
            y.extend(np.array(lyrics_split[i + seq_len:i + seq_len + output_len]))
            
    print('Fitting Tokenizer...')
    tokenizer = Tokenizer(oov_token=0)
    tokenizer.filters = tokenizer.filters.replace('\n', '')
    tokenizer.fit_on_texts(corpus)

    vocab_size = len(tokenizer.word_index) + 1
        
    print(f'Vocab size = {vocab_size}')
#         formatted_name, now, file_description= filename_format_log(f'{save_dir}/tokenizer.pkl')

#         with open(formatted_name, 'wb+') as f:
#             pickle.dump(tokenizer, f)
#         print(f'Tokenizer saved to {formatted_name}.')          

    print('Indexing sequences...')
    X_indexed = [[tokenizer.texts_to_sequences([word])[0] for word in row] for row in X]
    y_indexed = [tokenizer.texts_to_sequences([word])[0] for word in y]
    
    print('Partitioning and converting to labels...')

    partition, labels = generate_samples(X_indexed, y_indexed)
    
    np.save(f'{save_dir}/data.npy', partition)
#     X_reshape = np.reshape(X_indexed, (len(X_indexed), seq_len, 1))
 
#     y_cat = to_categorical(y_indexed)
    
    print('Lyrics successfully tokenized, sequenced, and indexed.') 
    
    return partition, labels, vocab_size

In [135]:
partition, labels, vocab_size = tokenize_lyrics(df=lyric_df,
                                   lyrics_col=['clean_text'],
                                   seq_len=4,
                                   output_len=1,
                                   save_dir='../assets'
                                  )

Processing lyrics...
Fitting Tokenizer...
Vocab size = 13470
Indexing sequences...
Partitioning and converting to labels...
Lyrics successfully tokenized, sequenced, and indexed.


In [96]:
partition

{'train': array([356649, 109240,   2957, ...,  87503, 436021,  78552]),
 'validation': array([435192, 493343, 395344, ..., 133438, 191157, 320662])}

In [97]:
len(partition)

2

In [114]:
type(partition['train'][0])

numpy.int64

In [117]:
not_int = []
for i, part in enumerate(partition['train']):
    if not np.issubdtype(partition['train'][0],np.int64):
        not_int.append(i)

In [118]:
not_int

[]

In [45]:
test = X_indexed[103231]

In [49]:
len(X_indexed)

576641

In [48]:
test

[[2], [4], [57], [127]]

In [53]:
labels[363226]

[101]

In [146]:
np.load('../assets/data.npy')

array({'train': array([471140, 539156, 183285, ..., 148349,  67356,  86803]), 'validation': array([449682, 181791, 197150, ..., 369885, 349343,  46369])}, dtype=object)

In [55]:
partition['train'].shape[0]

461313

In [181]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, list_IDs, labels, batch_size=5000, dim=(5000,1,4), n_channels=1,
                 n_classes=10, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        'Generates data containing batch_size samples' # X : (n_samples, *dim, n_channels)
        # Initialization
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Store sample
            X[i] = self.list_IDs[ID]

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [182]:
# Parameters
params = {'dim': (5000,1,4),
          'batch_size': 5000,
          'n_classes': vocab_size,
          'n_channels': 1,
          'shuffle': True
         }

# Datasets
partition = partition
labels = labels

# Generators
training_generator = DataGenerator(partition['train'], labels, **params)
validation_generator = DataGenerator(partition['validation'], labels, **params)

In [183]:
# Compile model
model = Sequential()
model.add(LSTM(32, input_shape=(4, 1)))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'])

In [184]:
# Train model on dataset
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    verbose=1,
                    workers=6
                   )

# Save model
# formatted_name, now, file_description= filename_format_log('../assets/LSTM_Model.pkl')

# with open(formatted_name, 'wb+') as f:
#     pickle.dump(model, f)

Epoch 1/1


ValueError: setting an array element with a sequence.

In [None]:
# Print model summary
print(model.summary())

In [None]:
history = model.fit (
    X_train, y_train,
    epochs = 150,
    batch_size = 2500,
    verbose = 1,
    
)

# formatted_name, now, file_description= filename_format_log('../assets/LSTM_Model.pkl')

# with open(formatted_name, 'wb+') as f:
#     pickle.dump(model, f)

print(model.summary())

In [None]:
fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(18,6))

ax[0].plot(history.history['loss'])
ax[0].set_title("Loss", fontsize=15);
ax[0].set_xlabel("epochs",fontsize=15);

ax[1].plot(history.history['acc'])
ax[1].set_title("Accuracy",fontsize=15);
ax[1].set_xlabel("epochs",fontsize=15);

In [None]:
def generate_lyrics(seed, model=model, seq_len=4, song_len=50):
    seed_clean = seed.lower().split(' ')
    doc = []

    while len(doc) < song_len:
        text = [seed_clean]
        sequence = [tokenizer.texts_to_sequences([word])[0] for word in text]
        pad_sequence = pad_sequences(sequence, maxlen=seq_len, truncating='pre')
        sequence_reshape = np.reshape(pad_sequence, (len(test_indexed), 4, 1))

        yhat = model.predict_classes(sequence_reshape, verbose=0)

        for word, index in tokenizer.word_index.items():
            if index == yhat:
                seed_clean.append(word)
                doc.append(word)

    return ' '.join(doc)

In [None]:
lyrics = generate_lyrics('needles are for lovers', song_len=150)

In [None]:
lyrics

In [None]:
# formatted_name, now, file_description= filename_format_log(file_path = '../assets/lyric_X74.pkl')

# with open(formatted_name, 'wb+') as f:
#     pickle.dump(X, f)

In [None]:
# formatted_name, now, file_description= filename_format_log(file_path = '../assets/lyric_y74.pkl')

# with open(formatted_name, 'wb+') as f:
#     pickle.dump(y, f)

lyric_df['cleaner_text'] = lyric_df['clean_text'].map(lambda x: re.sub(r'( +)', ' ', x).split(' '))

[corpus.add(word) for song in lyric_df['clean_text'].map(lambda x: re.sub(r'( +)', ' ', x).split(' ')).values for word in song];

seq_len, output_len = 4, 1

def sequencer(lyrics_split, seq_len=4, output_len=1):
    X, y = [], []
    for i in range(len(lyrics_split) - seq_len):
        X.append(lyrics_split[i:i + seq_len])
        y.extend(lyrics_split[i + seq_len:i + seq_len + output_len])
    return X, y

foo = pd.DataFrame([1,2,3])

result = lyric_df.cleaner_text.map(sequencer)

result[0][1]


lyrics_split[i + 4:i + 4 + 1]