# Notebook 03: Preprocessing

In [None]:
# ! pip install psycopg2

In [71]:
import keras
print(keras.__version__)

2.2.4


In [72]:
import tensorflow as tf
print(tf.__version__)

1.12.0


In [73]:
import json, time, re, string, keras, adanet, pickle
import pandas as pd
import psycopg2 as pg2
import numpy as np

from numpy import random
from psycopg2.extras import RealDictCursor, Json
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

import matplotlib.pyplot as plt

%matplotlib inline
%run ../assets/sql_cred.py

In [49]:
def filename_format_log(file_path,
                        logfile = '../assets/file_log.txt',
                        now = None,
                        file_description = None):
    """Prefix a file name with a Unix timestamp and record it in a log file.

    Parameters
    ----------
    file_path : str
        Relative path that includes a file extension,
        e.g. '../assets/lyric_df.csv'.
    logfile : str
        Text file to which the line '<formatted_name>: <description>' is appended.
    now : int, optional
        Timestamp to embed in the name. Defaults to the current time in whole
        seconds, evaluated on every call (a `round(time.time())` default in the
        signature would be frozen at definition time).
    file_description : str, optional
        Log text for the file. Defaults to 'Saved at: <UTC time of now>'.

    Returns
    -------
    tuple
        (formatted_name, now, file_description)

    Raises
    ------
    NameError
        If `file_path` has no extension separator.
    """
    if now is None:
        now = round(time.time())

    # An extension dot is one that is not the first character and is not part
    # of a run of dots such as the leading '..' of a relative path.
    try:
        has_extension = re.search(r'(?<!^)(?<!\.)\.(?!\.)', file_path) is not None
    except TypeError:
        has_extension = False
    if not has_extension:
        raise NameError('Please enter a relative path with a file extension.')

    # Preferred insertion point: the start of a 'word_word' base name that sits
    # directly before a dot. Names without an underscore do not match, so fall
    # back to the first character after the last path separator.
    base_name = re.search(r'(?<!^)(?<!\.)[0-z]+_[0-z]+(?=\.)', file_path)
    if base_name is not None:
        stamp = base_name.start()
    else:
        stamp = max(file_path.rfind('/'), file_path.rfind('\\')) + 1

    formatted_name = f'{file_path[:stamp]}{now}_{file_path[stamp:]}'
    if not file_description:
        file_description = f'Saved at: {time.asctime(time.gmtime(now))}'
    with open(logfile, 'a+') as f:
        f.write(f'{formatted_name}: {file_description}\n')
    return formatted_name, now, file_description

In [50]:
def con_cur_to_db(dbname=DBNAME, dict_cur=None):
    """Open a PostgreSQL connection and create a cursor on it.

    The connection uses the module-level IP_ADDRESS, USER and PASSWORD values
    together with `dbname`. When `dict_cur` is truthy the cursor is created
    with RealDictCursor as its factory; otherwise the default cursor is used.

    Returns
    -------
    tuple
        (connection, cursor)
    """
    connection = pg2.connect(
        host=IP_ADDRESS,
        dbname=dbname,
        user=USER,
        password=PASSWORD,
    )
    # Only pass cursor_factory when a dict-style cursor was requested.
    cursor_options = {'cursor_factory': RealDictCursor} if dict_cur else {}
    return connection, connection.cursor(**cursor_options)
    
def execute_query(query, dbname=DBNAME, dict_cur=None, command=False):
    """Run one SQL statement on a fresh connection and always close it.

    Parameters
    ----------
    query : str
        SQL text, executed verbatim. Do not build it from untrusted input.
    dbname : str
        Database to connect to.
    dict_cur : bool, optional
        Forwarded to con_cur_to_db to request a dict-style cursor.
    command : bool
        False (default): fetch and return all rows.
        True: commit the statement and return None.

    Returns
    -------
    list or None
        The fetched rows for a query, None for a command.
    """
    con, cur = con_cur_to_db(dbname, dict_cur)
    try:
        cur.execute(f'{query}')
        if not command:
            return cur.fetchall()
        con.commit()  # send the change to the server
    finally:
        # Close on every path, including when execute/fetchall raises, so a
        # failed statement does not leave a connection open.
        con.close()

In [51]:
# Load the lyric data frame from the timestamp-prefixed CSV in ../assets.
# The file's first column has no header, so pandas names it `Unnamed: 0`.
lyric_df = pd.read_csv('../assets/1549341381_lyric_df.csv')

In [52]:
lyric_df.head()

Unnamed: 0,lyrics,clean_lyrics,total_words_track,unique_words_track,total_lines_track,unique_lines_track,mean_words_line,mean_unique_words_line
0,\n\nIf your needle is near\nNeedle is near\nYo...,if your needle is near \n needle is near \n yo...,57,20,14,8,5.9,5.1
1,\n\n[Verse 1]\nBrown skin girl on the other si...,brown skin girl on the other side of the room ...,132,52,24,13,7.4,5.8
2,"\n\n[Verse 1]\nIt's simple, I love it\nHaving ...",its simple i love it \n having you near me hav...,151,63,29,21,7.1,5.8
3,\n\n[Intro: Whistling]\n\n[Verse 1]\nA great b...,a great big bang and dinosaurs \n fiery rainin...,126,76,20,18,8.2,7.2
4,\n\n[Verse 1]\nIsn't she lovely\nIsn't she won...,isnt she lovely \n isnt she wonderful \n isnt ...,108,66,21,20,7.0,6.1


In [59]:
# Summary statistics for the numeric columns.
# NOTE(review): in the output the maximum of mean_words_line (2230) equals the
# maximum of total_words_track, and the minimum total_lines_track is 1 —
# presumably some tracks were never split into lines; confirm before relying
# on the per-line statistics.
lyric_df.describe()

Unnamed: 0,total_words_track,unique_words_track,total_lines_track,unique_lines_track,mean_words_line,mean_unique_words_line
count,1785.0,1785.0,1785.0,1785.0,1785.0,1785.0
mean,284.514286,96.840896,40.661064,27.560224,11.542241,8.510756
std,136.291119,44.114334,18.013478,12.20159,62.028609,24.885099
min,13.0,5.0,1.0,1.0,4.2,3.6
25%,195.0,74.0,28.0,20.0,7.9,6.6
50%,267.0,92.0,38.0,25.0,8.8,7.4
75%,346.0,112.0,51.0,33.0,9.9,8.3
max,2230.0,956.0,224.0,189.0,2230.0,956.0


In [68]:
def split_sequence(text, sequence_length = 7, output_length = 4):
    """Cut lyrics into sliding token windows and the tokens that follow each one.

    The text is split on whitespace. Newlines and bracketed tags such as
    '[Verse 1]' are kept as tokens of their own.

    Parameters
    ----------
    text : str
        Raw lyrics of one track.
    sequence_length : int
        Number of tokens in each input window.
    output_length : int
        Number of tokens in each target.

    Returns
    -------
    X : list of list of str
        Windows of `sequence_length` consecutive tokens.
    y : list of list of str
        For each window, the `output_length` tokens that come right after it.
        Both lists are empty when the text has fewer than
        `sequence_length + output_length` tokens.
    """
    # The capturing groups make re.split keep newlines and bracketed tags;
    # filter(None, ...) drops the None and '' entries the split leaves behind.
    tokens = list(filter(None, re.split(r'(\n)|(\[.+\])|\s', text)))

    # Stop early enough that every target holds exactly output_length tokens.
    n_windows = len(tokens) - sequence_length - output_length + 1

    X, y = [], []
    for start in range(n_windows):
        stop = start + sequence_length
        X.append(tokens[start:stop])
        y.append(tokens[stop:stop + output_length])

    return X, y

In [69]:
def generate_samples(X_indexed, y_indexed, random_seed=42):
    """Shuffle the sample ids into an 80/20 train/validation split.

    Parameters
    ----------
    X_indexed : sequence
        Indexed input sequences; only its length is used here.
    y_indexed : sequence
        Target for each sample, aligned with `X_indexed` by position.
    random_seed : int or None
        Seed for the shuffle, so the split is reproducible. None gives a
        different split on every call.

    Returns
    -------
    partition : dict
        {'train': ids, 'validation': ids} as numpy arrays. The two arrays are
        disjoint and together contain every id exactly once.
    labels : dict
        {sample id: y_indexed[id]} for every sample.
    """
    n_samples = len(X_indexed)

    # A local RandomState keeps the shuffle reproducible without touching
    # numpy's global random state.
    shuffled_ids = np.random.RandomState(random_seed).permutation(n_samples)

    # Slice both parts at one boundary so no id is dropped or shared.
    n_train = int(np.ceil(n_samples * .8))
    partition = {
        'train': shuffled_ids[:n_train],
        'validation': shuffled_ids[n_train:],
    }

    labels = {i: y_indexed[i] for i in shuffled_ids}

    return partition, labels

In [70]:
def tokenize_lyrics(df, lyrics_col, seq_len, output_len, save_dir='../assets'):
    """Turn a column of lyrics into indexed training windows and a data split.

    Runs of spaces in each track are collapsed and the text is split on single
    spaces. Every run of `seq_len` consecutive tokens becomes one input
    sequence, and the `output_len` tokens after it become its target. A Keras
    Tokenizer fitted on all tokens maps them to integer indices, and
    generate_samples partitions the sample ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the lyrics.
    lyrics_col : str or list of str
        Column with the lyrics. For a list, only the first listed column is read.
    seq_len : int
        Tokens per input sequence.
    output_len : int
        Tokens per target.
    save_dir : str
        Directory in which the partition dict is written as 'data.npy'.

    Returns
    -------
    partition : dict
        Train/validation sample ids from generate_samples.
    labels : dict
        {sample id: list of target token indices}.
    vocab_size : int
        len(tokenizer.word_index) + 1.
    """
    # Accept a single column name as well as the list form.
    if isinstance(lyrics_col, (list, tuple)):
        lyrics_series = df[list(lyrics_col)].iloc[:, 0]
    else:
        lyrics_series = df[lyrics_col]

    X = []
    y = []

    corpus = []

    print('Processing lyrics...')
    for lyrics in lyrics_series:
        lyrics_spaced = re.sub(r'( +)', ' ', lyrics)
        lyrics_split = lyrics_spaced.split(' ')
        corpus.extend(lyrics_split)

        # Stop early enough that every window has a full target, and store one
        # target per window so X and y stay aligned for any output_len.
        for i in range(len(lyrics_split) - seq_len - output_len + 1):
            X.append(lyrics_split[i:i + seq_len])
            y.append(lyrics_split[i + seq_len:i + seq_len + output_len])

    print('Fitting Tokenizer...')
    # NOTE(review): oov_token is the integer 0 here — presumably meant to
    # reserve index 0 for unknown words; confirm against the Tokenizer docs.
    tokenizer = Tokenizer(oov_token=0)
    # Keep '\n' as a token by removing it from the characters the tokenizer strips.
    tokenizer.filters = tokenizer.filters.replace('\n', '')
    tokenizer.fit_on_texts(corpus)

    vocab_size = len(tokenizer.word_index) + 1

    print(f'Vocab size = {vocab_size}')
    # NOTE(review): the fitted tokenizer and the indexed input sequences are
    # neither returned nor saved by this function (only the partition dict is
    # written below) — confirm how later steps are expected to obtain them.
#         formatted_name, now, file_description= filename_format_log(f'{save_dir}/tokenizer.pkl')

#         with open(formatted_name, 'wb+') as f:
#             pickle.dump(tokenizer, f)
#         print(f'Tokenizer saved to {formatted_name}.')

    print('Indexing sequences...')
    X_indexed = [[tokenizer.texts_to_sequences([word])[0] for word in row] for row in X]
    # Joining the target words with spaces gives one flat list of indices per
    # sample; for output_len == 1 this equals indexing the single word.
    y_indexed = [tokenizer.texts_to_sequences([' '.join(target)])[0] for target in y]

    print('Partitioning and converting to labels...')

    partition, labels = generate_samples(X_indexed, y_indexed)

    # The partition is a dict, so np.save stores it as a pickled object array.
    np.save(f'{save_dir}/data.npy', partition)

    print('Lyrics successfully tokenized, sequenced, and indexed.')

    return partition, labels, vocab_size

In [135]:
# Build 4-token input windows with a single next-token target for every track.
# NOTE(review): the `lyric_df.head()` output lists a `clean_lyrics` column and
# no `clean_text` column — confirm the column name matches the CSV that is
# loaded in this notebook.
partition, labels, vocab_size = tokenize_lyrics(df=lyric_df,
                                   lyrics_col=['clean_text'],
                                   seq_len=4,
                                   output_len=1,
                                   save_dir='../assets'
                                  )

Processing lyrics...
Fitting Tokenizer...
Vocab size = 13470
Indexing sequences...
Partitioning and converting to labels...
Lyrics successfully tokenized, sequenced, and indexed.


In [96]:
partition

{'train': array([356649, 109240,   2957, ...,  87503, 436021,  78552]),
 'validation': array([435192, 493343, 395344, ..., 133438, 191157, 320662])}

In [117]:
# Sanity check: positions of any training ids that are not integers.
# Each element is tested itself (not just the first one), and any integer
# width counts as an integer.
not_int = [
    i for i, part in enumerate(partition['train'])
    if not np.issubdtype(np.asarray(part).dtype, np.integer)
]

In [181]:
class DataGenerator(keras.utils.Sequence):
    """Serve (X, one-hot y) batches to Keras from samples addressed by id.

    Parameters
    ----------
    list_IDs : sequence
        Sample ids that belong to this split (e.g. partition['train']).
    labels : mapping
        Class index for each sample id.
    batch_size : int
        Samples per batch. Ids that do not fill a whole batch are not served.
    dim : tuple
        Shape of one sample without the batch and channel axes. Each batch has
        shape (batch_size, *dim, n_channels).
        NOTE(review): the default (5000, 1, 4) makes every batch
        batch_size * 5000 * 1 * 4 * n_channels values — presumably a
        per-sample shape such as (seq_len,) was intended; confirm.
    n_channels : int
        Size of the trailing channel axis.
    n_classes : int
        Number of classes for the one-hot encoding of y.
    shuffle : bool
        Reshuffle the sample order at the end of every epoch.
    data : mapping or sequence, optional
        Sample values addressable by id (e.g. the indexed input sequences).
        When omitted, the value stored for a sample is `list_IDs[ID]`, which
        is only meaningful if the ids are valid positions into `list_IDs`.
    """
    def __init__(self, list_IDs, labels, batch_size=5000, dim=(5000,1,4), n_channels=1,
                 n_classes=10, shuffle=True, data=None):
        """Store the configuration and build the first epoch's sample order."""
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.data = data
        self.on_epoch_end()

    def __len__(self):
        """Return the number of whole batches per epoch."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Generate batch number `index` as (X, y)."""
        # Positions into list_IDs that make up this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Translate positions into sample ids.
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Rebuild the sample order, shuffled when `shuffle` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Build X of shape (n_samples, *dim, n_channels) and one-hot y."""
        # Size the buffers from the ids actually served so no row is left
        # uninitialised.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        y = np.empty(n_samples, dtype=int)

        # Read sample values from `data` when it was supplied.
        source = self.list_IDs if self.data is None else self.data

        for i, ID in enumerate(list_IDs_temp):
            # Store sample (broadcast into the sample's slot)
            X[i] = source[ID]

            # Store class
            y[i] = self.labels[ID]

        return X, keras.utils.to_categorical(y, num_classes=self.n_classes)

In [182]:
# Generator settings shared by the training and validation splits.
# NOTE(review): DataGenerator allocates X as (batch_size, *dim, n_channels), so
# this `dim` yields batches of shape (5000, 5000, 1, 4, 1) — confirm that a
# per-sample shape was not intended.
params = dict(
    dim=(5000, 1, 4),
    batch_size=5000,
    n_classes=vocab_size,
    n_channels=1,
    shuffle=True,
)

# One generator per split; both look their targets up in the same `labels` dict.
training_generator, validation_generator = (
    DataGenerator(partition[split], labels, **params)
    for split in ('train', 'validation')
)

In [None]:
# lyric_df['cleaner_text'] = lyric_df['clean_text'].map(lambda x: re.sub(r'( +)', ' ', x).split(' '))

In [None]:
# Collect the distinct tokens across every track's lyrics.
# NOTE(review): the `lyric_df.head()` output lists `clean_lyrics`, not
# `clean_text` — confirm the column name.
corpus = set()
for song in lyric_df['clean_text'].map(lambda x: re.sub(r'( +)', ' ', x).split(' ')):
    corpus.update(song)

In [None]:
# seq_len, output_len = 4, 1

In [None]:
# def sequencer(lyrics_split, seq_len=4, output_len=1):
#     X, y = [], []
#     for i in range(len(lyrics_split) - seq_len):
#         X.append(lyrics_split[i:i + seq_len])
#         y.extend(lyrics_split[i + seq_len:i + seq_len + output_len])
#     return X, y

In [None]:
# foo = pd.DataFrame([1,2,3])

In [None]:
# result = lyric_df.cleaner_text.map(sequencer)

In [None]:
# result[0][1]

In [None]:
# lyrics_split[i + 4:i + 4 + 1]

In [None]:
def sample(yhat, temperature=1.0):
    """Draw one class index from a predicted distribution.

    The probabilities are re-weighted as p ** (1 / temperature) and
    renormalised before sampling. Temperatures below 1 sharpen the
    distribution; temperatures above 1 flatten it.

    Parameters
    ----------
    yhat : array-like
        1-D class probabilities, e.g. one row of a softmax output.
    temperature : float
        Positive scaling factor.

    Returns
    -------
    int
        Index of the sampled class.

    Raises
    ------
    ValueError
        If `temperature` is not positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    # Work in float64: float32 probabilities can sum to slightly more than 1,
    # which np.random.multinomial rejects.
    probs = np.asarray(yhat, dtype='float64')

    # Zero probabilities become -inf logits (weight 0 after exp); silence the
    # divide warning that np.log emits for them.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temperature

    # Subtract the maximum before exponentiating so small temperatures do not
    # underflow every weight to zero.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)

    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

In [150]:
# Scratch inputs for lyric generation (apparently drafted as the body of a
# `generate_lyrics` method — see the commented signature below).
# def generate_lyrics(self,
seed = 'This is an example seed'
seq_len = 4
song_len = 50
temperature = 1.0

# Lower-cased seed split on spaces, wrapped in an outer list: one inner list of words.
seed_clean = [seed.lower().split(' ')]
# `doc` is bound to the same list object as `seed_clean`, not to a copy.
doc = seed_clean

# Number of seed words, and how many rows of seq_len are needed to hold them (rounded up).
seed_len = len(seed_clean[0])
num_seq = int(np.ceil(seed_len/seq_len))


In [159]:
# while len(doc) < np.abs((song_len-seed_len)):
# `text` has a single element (the whole list of seed words), so `word` below is
# that list and `sequence` ends up holding one list of indices.
# NOTE(review): `tokenizer` is not assigned at top level anywhere in the visible
# notebook (tokenize_lyrics keeps its tokenizer local) — confirm where it comes from.
text = seed_clean
sequence = [tokenizer.texts_to_sequences([word])[0] for word in text]

In [160]:
sequence

[[35, 30, 184, 10668, 2806]]

In [161]:
num_seq

2

In [162]:
# Zeros needed to fill the last row of seq_len. This is 0 when the seed already
# fills whole rows, so `pad` is always defined for the cells that use it.
pad = (num_seq * seq_len) - seed_len

In [163]:
# Append `pad` zeros to the end of the index list.
# list.extend mutates the list in place, so re-running this cell appends
# another `pad` zeros.
sequence[0].extend(np.zeros(pad, dtype='int'))

In [164]:
sequence

[[35, 30, 184, 10668, 2806, 0, 0, 0]]

In [165]:
# Reshape the padded index list into num_seq rows of seq_len. This rebinds
# `sequence` from a nested list to a 2-D array.
sequence = np.reshape(sequence[0], (num_seq, seq_len))

In [166]:
sequence

array([[   35,    30,   184, 10668],
       [ 2806,     0,     0,     0]])

In [None]:
# Draft of one prediction step: pad the indices, predict, sample the next word.
# NOTE(review): `self` is not defined at notebook top level, so `self.model`,
# `self.sample` and `self.output` cannot run here as written — presumably this
# belongs inside a class method.
pad_sequence = pad_sequences(sequence, maxlen=seq_len, truncating='pre')
# NOTE(review): `sequence` was reshaped to (num_seq, seq_len) earlier; reshaping
# to (1, seq_len) only fits when num_seq == 1 — confirm the intent.
sequence_reshape = np.reshape(pad_sequence, (1, seq_len))

yhat = self.model.predict(sequence_reshape, verbose=0)[0]
next_index = self.sample(yhat, temperature)

# Reverse lookup: scan word_index for the word whose index was sampled.
# NOTE(review): `doc` and `seed_clean` were bound to the same list earlier, so
# these two appends add the word to that one list twice — verify.
for word, index in tokenizer.word_index.items():
    if index == next_index:
        seed_clean.append(word)
        doc.append(word)

# NOTE(review): the first element of `doc` is itself a list of words, which
# str.join would reject — confirm the intended structure.
self.output = ' '.join(doc)
print(self.output)

In [None]:
def track_structure(track):
    """Split one track's lyrics into its words and its lines.

    Runs of spaces are collapsed to a single space first.

    Parameters
    ----------
    track : str
        Lyrics of one track, with lines separated by newlines.

    Returns
    -------
    words_track : list
        One-element list holding the track's words. The words come from
        splitting on spaces after every newline-plus-space pair is removed.
    lines_track : list
        One-element list holding the track's lines, split on newlines.
    """
    lyrics_spaced = re.sub(r'( +)', ' ', track)

    lines_track = [lyrics_spaced.split('\n')]

    # Remove the line breaks before splitting into words.
    no_nl = re.sub(r'\n ', '', lyrics_spaced)
    words_track = [no_nl.split(' ')]

    return words_track, lines_track

In [None]:
# print('Creating encoding dicts from corpus...')
# words = sorted(list(set(corpus)))
# print(f'Count of unique words (i.e., features): {len(words)}')
# words_index = dict((c, i+1) for i, c in enumerate(words))
# index_words = dict((i+1, c) for i, c in enumerate(words))

In [11]:
# def split_sequence(text, sequence_length = 7, output_length = 4):
    
#     X, y = [], []
    
#     split_text = re.split('(\n)|(\[.+\])|\s', text)
#     split_text = list(filter(None, split_text))
#     split_text = text
    
#     for i in range(len(split_text) - sequence_length):
#         X.append(split_text[i:i + sequence_length])
#         y.append(split_text[i + sequence_length:i + sequence_length + output_length])
        
#     return X, y

In [None]:
# print('Indexing sequences...')
# X_indexed = [[words_index[word] for word in row] for row in X]
# y_indexed = [words_index[word] for word in y]
# print('Number of sequences')
# print('Partitioning and converting to labels...')

In [None]:
#     partition, labels = generate_samples(X_indexed, y_indexed)

#     np.save(f'{save_dir}/data.npy', partition)
# X_reshape = np.reshape(X_indexed, (len(X_indexed), seq_len))

# y_cat = to_categorical(y_indexed)

# print('Lyrics successfully tokenized, sequenced, and indexed.') 

In [21]:
# def generate_samples(X_indexed, y_cat, seq_len=4, random_seed = 42):
#     X_train = [] 
#     X_test = []
#     y_train = []
#     y_test = []
    
#     rand_ind = random.choice(range(len(X_indexed)), len(X_indexed), replace=False)
    
#     train_ind = rand_ind[:int(np.ceil(len(rand_ind)*.8))]
#     test_ind = rand_ind[-int(np.ceil(len(rand_ind)*.2))+1:]

#     for i in train_ind:
#         X_train.append(X_indexed[i])
#         y_train.append(y_cat[i])
        
#     for i in test_ind:
#         X_test.append(X_indexed[i])
#         y_test.append(y_cat[i])
        
#     return np.reshape(X_train, (len(X_train), seq_len)) , np.reshape(X_test, (len(X_test), seq_len)), np.array(y_train), np.array(y_test)

In [22]:
# X_train, X_test, y_train, y_test = generate_samples(X_indexed, y_cat)