In [56]:
import pandas as pd 
import numpy as np 
# https://github.com/petrosDemetrakopoulos/RNN-Beatles-lyrics-generator
# https://github.com/starry91/Lyric-Generator#2-lyric-generator-based-on-word-level-rnn

In [57]:
data = pd.read_csv("sample-dataset.csv")
data.head()

Unnamed: 0,artist,genre,title,lyrics
0,Migos,rap,Stir Fry,"Woo, woo, woo, woo\nWoo, woo, woo, woo\n\nDanc..."
1,Snoop Dogg,rap,Drop It Like It‚Äôs Hot,"Snoop\nSnoop\n\nWhen the pimp's in the crib, m..."
2,Drake,rap,Headlines,I might be too strung out on compliments\nOver...
3,Lil Uzi Vert,rap,XO TOUR Llif3,"Are you alright?\nI'm alright, I'm quite alrig..."
4,Lil Uzi Vert,rap,The Way Life Goes,"That's true (That's true), that's right (That ..."


In [58]:
data['t-lyric'] = data['title'] + " @@@ " + data['lyrics']
data['t-lyric'] = data['t-lyric'].str.lower()
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,Migos,rap,Stir Fry,"Woo, woo, woo, woo\nWoo, woo, woo, woo\n\nDanc...","stir fry @@@ woo, woo, woo, woo\nwoo, woo, woo..."
1,Snoop Dogg,rap,Drop It Like It‚Äôs Hot,"Snoop\nSnoop\n\nWhen the pimp's in the crib, m...",drop it like it‚äôs hot @@@ snoop\nsnoop\n\nwh...
2,Drake,rap,Headlines,I might be too strung out on compliments\nOver...,headlines @@@ i might be too strung out on com...
3,Lil Uzi Vert,rap,XO TOUR Llif3,"Are you alright?\nI'm alright, I'm quite alrig...",xo tour llif3 @@@ are you alright?\ni'm alrigh...
4,Lil Uzi Vert,rap,The Way Life Goes,"That's true (That's true), that's right (That ...",the way life goes @@@ that's true (that's true...


In [59]:
stopChars = [',','(',')','.','-','[',']','"']
# preprocessing the corpus by converting all letters to lowercase, 
# replacing blank lines with blank string and removing special characters
def preprocessText(text):
#     text = text.replace('\n', ' ').replace('\t','')
    processedText = text.lower()
    for char in stopChars:
        processedText = processedText.replace(char,'')
    return processedText
data['t-lyric'] = data['t-lyric'].apply(preprocessText)

In [40]:
# tokenization 
def corpusToList(corpus):
    corpusList = [w for w in corpus.split(' ')] 
    corpusList = [i for i in corpusList if i] #removing empty strings from list
    return corpusList
data['t-lyric'] = data['t-lyric'].apply(corpusToList)

In [41]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,Migos,rap,Stir Fry,"Woo, woo, woo, woo\nWoo, woo, woo, woo\n\nDanc...","[stir, fry, @@@, woo, woo, woo, woo, woo, woo,..."
1,Snoop Dogg,rap,Drop It Like It‚Äôs Hot,"Snoop\nSnoop\n\nWhen the pimp's in the crib, m...","[drop, it, like, it‚äôs, hot, @@@, snoop, snoo..."
2,Drake,rap,Headlines,I might be too strung out on compliments\nOver...,"[headlines, @@@, i, might, be, too, strung, ou..."
3,Lil Uzi Vert,rap,XO TOUR Llif3,"Are you alright?\nI'm alright, I'm quite alrig...","[xo, tour, llif3, @@@, are, you, alright?, i'm..."
4,Lil Uzi Vert,rap,The Way Life Goes,"That's true (That's true), that's right (That ...","[the, way, life, goes, @@@, that's, true, that..."


In [13]:
# trim each word for leading or trailing spaces / tabs.
map(str.strip, data['t-lyric']) # trim words

<map at 0x12a404910>

In [14]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,Migos,rap,Stir Fry,"Woo, woo, woo, woo\nWoo, woo, woo, woo\n\nDanc...","[stir, fry, @@@, woo, woo, woo, woo, woo, woo,..."
1,Snoop Dogg,rap,Drop It Like It‚Äôs Hot,"Snoop\nSnoop\n\nWhen the pimp's in the crib, m...","[drop, it, like, it‚äôs, hot, @@@, snoop, snoo..."
2,Drake,rap,Headlines,I might be too strung out on compliments\nOver...,"[headlines, @@@, i, might, be, too, strung, ou..."
3,Lil Uzi Vert,rap,XO TOUR Llif3,"Are you alright?\nI'm alright, I'm quite alrig...","[xo, tour, llif3, @@@, are, you, alright?, i'm..."
4,Lil Uzi Vert,rap,The Way Life Goes,"That's true (That's true), that's right (That ...","[the, way, life, goes, @@@, that's, true, that..."


In [15]:
corpus_words = [x for sublist in data['t-lyric'] for x in sublist]
vocab = sorted(set(corpus_words))
print('vocab length:', len(corpus_words))
print('Unique words in corpus: {}'.format(len(vocab)))

vocab length: 3166
Unique words in corpus: 676


In [16]:
# creating numeric map; representing words with numberes 
# map specific number to each specific word of our corpus, and vice versa 
word2idx = {u: i for i, u in enumerate(vocab)}
idx2words = np.array(vocab)
word_as_int = np.array([word2idx[c] for c in corpus_words])

## Prediction

- User inputs a song title, and how many words they want the song to be. 
- Network does, for example, 100 predictions, and in the training phrase we know what word we need to generate. 
- (genre, song title); have a marker that it's the end of the title 

In [3]:
from torch import nn
from torch import optim
from torchtext import data
from torchtext.datasets import UDPOS
from torch.autograd import Variable as V 

import torch
import torch.nn as nn
import argparse
import time
import reader

In [27]:
class RNN(nn.Module):
  # Simple RNN Language model 
    def __init__(self, embedding_size, num_steps, batch_size, hidden_size, vocab_size):
        super(RNN, self).__init__()
        
        # parameters 
        self.embedding_size = embedding_size
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        
        # passing through network 
        self._embedding = nn.Embedding(vocab_size, embedding_size)
        self._rnn = nn.RNN(input_size=embedding_size, hidden_size=hidden_size)
        self._linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.init_weights()
        self.hidden = self.init_hidden(batch_size)

    def forward(self, inputs, hidden):
        embedding = self._embedding(inputs)
        rnn_output, self.hidden = self._rnn(embedding, hidden)
        decoded = self._linear(rnn_output)
        return decoded.view(rnn_output.size(0), rnn_output.size(1), decoded.size(1))
    
#     def init_hidden(self, batch_size):
#         weight = next(self.parameters()).data
#         return (V(weight.new( batch_size ).zero_().cuda()),
#                 V(weight.new( batch_size ).zero_()).cuda())
#     def init_weights(self):
#         init_range = 0.1
#         self._embedding.weight.data.uniform_(-init_range, init_range)
#         self._linear.bias.data.fill_(0)
#         self._linear.weight.data.uniform_(-init_range, init_range)
#     # cannot retain the entire history, so reset during training 
#     def reset_history(self):
#         self.hidden = tuple(V(v.data) for v in self.hidden)

This part should be simple; we take in a sequence of word tokens, embed them, put them through the RNN, and then emit a probability distribution over the next word for each input word. We're saving the hidden state in the model object and adding a reset history method. 

In [28]:
# creating numeric map; representing words with numbers 
# map specific number to each specific word of our corpus, and vice versa 
word2idx = {u: i for i, u in enumerate(vocab)}
idx2words = np.array(vocab)
word_as_int = np.array([word2idx[c] for c in corpus_words])

In [29]:
word_as_int

array([537, 226,   7, ..., 386, 360, 274])

# Set-up + Training

In [30]:
embedding_size = 20
num_steps = 5
batch_size = 1
hidden_size = 20
vocab_size = len(vocab)

In [54]:
# model = RNN(embedding_size, num_steps, batch_size, hidden_size, vocab_size) 
# model.cuda()
# criterion = nn.CrossEntropyLoss(ignore_index=1)
# optimizer = optim.Adam(model.parameters(), lr=.01)

## Character Level RNN 

Attempting a character level-RNN for a single genre
https://www.kaggle.com/super13579/let-s-auto-write-the-deep-purple-lysics-pytorch

In [60]:
data1 = pd.read_csv("sample-dataset.csv")
data1['lyrics']= data1['lyrics'].apply(preprocessText)
print(data1.head())

         artist genre                    title  \
0         Migos   rap                 Stir Fry   
1    Snoop Dogg   rap  Drop It Like It‚Äôs Hot   
2         Drake   rap                Headlines   
3  Lil Uzi Vert   rap            XO TOUR Llif3   
4  Lil Uzi Vert   rap        The Way Life Goes   

                                              lyrics  
0  woo woo woo woo\nwoo woo woo woo\n\ndance with...  
1  snoop\nsnoop\n\nwhen the pimp's in the crib ma...  
2  i might be too strung out on compliments\nover...  
3  are you alright?\ni'm alright i'm quite alrigh...  
4  that's true that's true that's right that righ...  


In [61]:
DP_text = data1['lyrics'].str.cat(sep='\n').lower()
print('corpus length:', len(DP_text))

corpus length: 15232


In [62]:
# Counting characters appeared in all lyrics
chars = sorted(list(set(DP_text)))
print(chars)
print('total chars:', len(chars))

['\n', ' ', '!', "'", '4', '7', '8', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '´', '¶', 'ä', 'î', 'ò', '‚', '√']
total chars: 41


In [63]:
# Create a dictionary of characters, see the index of characters.
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

print(char_to_int)

{'\n': 0, ' ': 1, '!': 2, "'": 3, '4': 4, '7': 5, '8': 6, '?': 7, 'a': 8, 'b': 9, 'c': 10, 'd': 11, 'e': 12, 'f': 13, 'g': 14, 'h': 15, 'i': 16, 'j': 17, 'k': 18, 'l': 19, 'm': 20, 'n': 21, 'o': 22, 'p': 23, 'q': 24, 'r': 25, 's': 26, 't': 27, 'u': 28, 'v': 29, 'w': 30, 'x': 31, 'y': 32, 'z': 33, '´': 34, '¶': 35, 'ä': 36, 'î': 37, 'ò': 38, '‚': 39, '√': 40}


In [64]:
seq_length = 50 # The sentence window size
step = 1 # The steps between the windows
sentences = []
next_chars = []

# Create Target and sentences window
for i in range(0, len(DP_text) - seq_length, step):
    sentences.append(DP_text[i: i + seq_length]) # range from current index to sequence length charaters 
    next_chars.append(DP_text[i + seq_length]) # the next character
    
sentences = np.array(sentences)
next_chars = np.array(next_chars)

#Print Sentence Window and next charaters
print('Sentence Window')
print (sentences[:5])
print('Target charaters')
print (next_chars[:5])
print('Number of sequences:', len(sentences))

Sentence Window
['woo woo woo woo\nwoo woo woo woo\n\ndance with my dog'
 'oo woo woo woo\nwoo woo woo woo\n\ndance with my dogs'
 'o woo woo woo\nwoo woo woo woo\n\ndance with my dogs '
 ' woo woo woo\nwoo woo woo woo\n\ndance with my dogs i'
 'woo woo woo\nwoo woo woo woo\n\ndance with my dogs in']
Target charaters
['s' ' ' 'i' 'n' ' ']
Number of sequences: 15182


In [65]:
# transferring the character to index 
def getdata(sentences, next_chars):
    X = np.zeros((len(sentences),seq_length))
    y = np.zeros((len(sentences)))
    length = len(sentences)
    index = 0
    for i in range(len(sentences)):
        sentence = sentences[i]
        for t, char in enumerate(sentence):
            X[i, t] = char_to_int[char]
        y[i] = char_to_int[next_chars[i]]
    return X, y

In [66]:
train_x,train_y = getdata(sentences, next_chars)
print('Shape of training_x:', train_x.shape)
print('Shape of training_y:', train_y.shape)

Shape of training_x: (15182, 50)
Shape of training_y: (15182,)


## Building out the model

In [67]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class Simple_LSTM(nn.Module):
    def __init__(self,n_vocab,hidden_dim, embedding_dim,dropout = 0.2):
        super(Simple_LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,dropout = dropout,num_layers = 2)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.fc = nn.Linear(hidden_dim, n_vocab)
    
    def forward(self, seq_in):
        # for LSTM, input should be (Sequnce_length,batchsize,hidden_layer), so we need to transpose the input
        embedded = self.embeddings(seq_in.t()) 
        lstm_out, _ = self.lstm(embedded)
        # Only need to keep the last character 
        ht=lstm_out[-1] 
        out = self.fc(ht)
        return out

In [75]:
X_train_tensor = torch.tensor(train_x, dtype=torch.long).cuda()
Y_train_tensor = torch.tensor(train_y, dtype=torch.long).cuda()

AssertionError: Torch not compiled with CUDA enabled

In [72]:
from torch.utils.data import Dataset, DataLoader
train = torch.utils.data.TensorDataset(X_train_tensor,Y_train_tensor)
train_loader = torch.utils.data.DataLoader(train, batch_size = 128)

In [74]:
model = Simple_LSTM(47,256,256)
model.cuda()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.002) # Using Adam optimizer

AssertionError: Torch not compiled with CUDA enabled

In [None]:
import time # Add time counter
avg_losses_f = []
n_epochs=20

for epoch in range(n_epochs):
    start_time = time.time()
    model.train()
    loss_fn = torch.nn.CrossEntropyLoss()
    avg_loss = 0.
    for i, (x_batch, y_batch) in enumerate(train_loader):
        y_pred = model(x_batch)
        
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        avg_loss+= loss.item()/len(train_loader)
        
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
        epoch + 1, n_epochs, avg_loss, elapsed_time))
    
    avg_losses_f.append(avg_loss)    
    
print('All \t loss={:.4f} \t '.format(np.average(avg_losses_f)))

In [None]:
import matplotlib.pyplot as plt

plt.plot(avg_losses_f)
plt.xlabel('Epoch')
plt.ylabel('Loss value')
plt.show()

creating a function that can sample an index from a probability array 


In [None]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

validate the model

In [None]:
# Define the start sentence
sentence = 'three more millions when you ask how my day go\npoured up a 4, blueberry faygo'

variance = 0.25
generated = ''
original = sentence
window = sentence

for i in range(400):
    x = np.zeros((1, seq_length))
    for t, char in enumerate(window):
        x[0, t] = char_to_int[char] # Change the sentence to index vector shape (1,50)
        
    x_in = Variable(torch.LongTensor(x).cuda())
    pred = model(x_in)
    pred = np.array(F.softmax(pred, dim=1).data[0].cpu())
    next_index = sample(pred, variance)
    next_char = int_to_char[next_index] # index to char

    generated += next_char
    window = window[1:] + next_char # Update Window for next char predict
    
print(original + generated)