In [5]:
import pandas as pd 
import numpy as np 
# https://github.com/petrosDemetrakopoulos/RNN-Beatles-lyrics-generator
# https://github.com/starry91/Lyric-Generator#2-lyric-generator-based-on-word-level-rnn

In [6]:
data = pd.read_csv("master-process-data.csv")
del data['Unnamed: 0']
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"['ego', 'remix', '@@@', 'oh', 'baby', 'how', '..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"['then', 'tell', 'me', '@@@', ""playin'"", 'ever..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"['honesty', '@@@', 'if', 'you', 'search\nfor',..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"['you', 'are', 'my', 'rock', '@@@', 'oh', 'oh'..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"['black', 'culture', '@@@', 'party', 'the', 'p..."


In [7]:
data['t-lyric'] = data['title'] + " @@@ " + data['lyrics']
data['t-lyric'] = data['t-lyric'].str.lower()
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,ego remix @@@ oh baby how you doing?\nyou know...
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,then tell me @@@ playin' everything so easy\ni...
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,honesty @@@ if you search\nfor tenderness\nit ...
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,you are my rock @@@ oh oh oh i oh oh oh i\nver...
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,black culture @@@ party the people the people ...


In [8]:
stopChars = [',','(',')','.','-','[',']','"']
# preprocessing the corpus by converting all letters to lowercase, 
# replacing blank lines with blank string and removing special characters
def preprocessText(text):
#     text = text.replace('\n', ' ').replace('\t','')
    processedText = text.lower()
    for char in stopChars:
        processedText = processedText.replace(char,'')
    return processedText
data['t-lyric'] = data['t-lyric'].astype(str)
data['t-lyric'] = data['t-lyric'].apply(preprocessText)

In [9]:
# tokenization 
def corpusToList(corpus):
    corpusList = [w for w in corpus.split(' ')] 
    corpusList = [i for i in corpusList if i] #removing empty strings from list
    return corpusList
data['t-lyric'] = data['t-lyric'].apply(corpusToList)

In [10]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"[ego, remix, @@@, oh, baby, how, you, doing?\n..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"[then, tell, me, @@@, playin', everything, so,..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"[honesty, @@@, if, you, search\nfor, tendernes..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"[you, are, my, rock, @@@, oh, oh, oh, i, oh, o..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"[black, culture, @@@, party, the, people, the,..."


In [11]:
# trim each word for leading or trailing spaces / tabs.
map(str.strip, data['t-lyric']) # trim words

<map at 0x1b22a2bb90>

In [12]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"[ego, remix, @@@, oh, baby, how, you, doing?\n..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"[then, tell, me, @@@, playin', everything, so,..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"[honesty, @@@, if, you, search\nfor, tendernes..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"[you, are, my, rock, @@@, oh, oh, oh, i, oh, o..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"[black, culture, @@@, party, the, people, the,..."


In [13]:
corpus_words = [x for sublist in data['t-lyric'] for x in sublist]
vocab = sorted(set(corpus_words))
print('vocab length:', len(corpus_words))
print('Unique words in corpus: {}'.format(len(vocab)))

vocab length: 67263415
Unique words in corpus: 3274867


In [None]:
# creating numeric map; representing words with numberes 
# map specific number to each specific word of our corpus, and vice versa 
word2idx = {u: i for i, u in enumerate(vocab)}
idx2words = np.array(vocab)
word_as_int = np.array([word2idx[c] for c in corpus_words])

## Prediction

- User inputs a song title, and how many words they want the song to be. 
- Network does, for example, 100 predictions, and in the training phrase we know what word we need to generate. 
- (genre, song title); have a marker that it's the end of the title 

# Word Level LSTM

In [None]:
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
data1 = pd.read_csv("master-process-data.csv")
data1['lyrics']= data1['lyrics'].astype(str)
data1['lyrics']= data1['lyrics'].apply(preprocessText)
print(data1.head())

In [20]:
DP_text = word_tokenize(data1['lyrics'].str.cat(sep='\n').lower())
print('corpus length:', len(DP_text))
#todo(dlee, tokenize words)

corpus length: 3440


In [24]:
# Counting characters appeared in all lyrics
words = sorted(list(set(DP_text)))
print('total words:', len(words))

total words: 650


In [25]:
# Create a dictionary of characters, see the index of characters.
char_to_int = dict((c, i) for i, c in enumerate(words))
int_to_char = dict((i, c) for i, c in enumerate(words))

In [26]:
seq_length = 10 # The sentence window size
step = 1 # The steps between the windows
sentences = []
next_chars = []

# Create Target and sentences window
for i in range(0, len(DP_text) - seq_length, step):
    # range from current index to sequence length charaters
    sentences.append(DP_text[i: i + seq_length])  
    next_chars.append(DP_text[i + seq_length]) # the next character
    
sentences = np.array(sentences)
next_chars = np.array(next_chars)

#Print Sentence Window and next charaters
print('Sentence Window')
print (sentences[:5])
print('Target characters')
print (next_chars[:5])
print('Number of sequences:', len(sentences))

Sentence Window
[['woo' 'woo' 'woo' 'woo' 'woo' 'woo' 'woo' 'woo' 'dance' 'with']
 ['woo' 'woo' 'woo' 'woo' 'woo' 'woo' 'woo' 'dance' 'with' 'my']
 ['woo' 'woo' 'woo' 'woo' 'woo' 'woo' 'dance' 'with' 'my' 'dogs']
 ['woo' 'woo' 'woo' 'woo' 'woo' 'dance' 'with' 'my' 'dogs' 'in']
 ['woo' 'woo' 'woo' 'woo' 'dance' 'with' 'my' 'dogs' 'in' 'the']]
Target characters
['my' 'dogs' 'in' 'the' 'nighttime']
Number of sequences: 3430


In [27]:
# transferring the character to index 
def getdata(sentences, next_chars):
    X = np.zeros((len(sentences),seq_length))
    y = np.zeros((len(sentences)))
    length = len(sentences)
    index = 0
    for i in range(len(sentences)):
        sentence = sentences[i]
        for t, char in enumerate(sentence):
            X[i, t] = char_to_int[char]
        y[i] = char_to_int[next_chars[i]]
    return X, y

In [28]:
train_x,train_y = getdata(sentences, next_chars)
print(train_x)
print('Shape of training_x:', train_x.shape)
print('Shape of training_y:', train_y.shape)

[[632. 632. 632. ... 632. 145. 629.]
 [632. 632. 632. ... 145. 629. 364.]
 [632. 632. 632. ... 629. 364. 162.]
 ...
 [ 86. 284. 320. ... 284. 628. 284.]
 [284. 320. 547. ... 628. 284. 374.]
 [320. 547. 236. ... 284. 374. 348.]]
Shape of training_x: (3430, 10)
Shape of training_y: (3430,)


In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class Simple_LSTM(nn.Module):
    def __init__(self,n_vocab,hidden_dim, embedding_dim,dropout = 0.2):
        super(Simple_LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,dropout = dropout,num_layers = 2)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.fc = nn.Linear(hidden_dim, n_vocab)
    
    def forward(self, seq_in):
        # for LSTM, input should be (Sequnce_length,batchsize,hidden_layer), so we need to transpose the input
        embedded = self.embeddings(seq_in.t()) 
        lstm_out, _ = self.lstm(embedded)
        # Only need to keep the last character 
        ht=lstm_out[-1] 
        out = self.fc(ht)
        return out

In [30]:
X_train_tensor = torch.tensor(train_x, dtype=torch.long)
Y_train_tensor = torch.tensor(train_y, dtype=torch.long)
print('Shape of training_x:', X_train_tensor.shape)
print('Shape of training_y:', Y_train_tensor.shape)

Shape of training_x: torch.Size([3430, 10])
Shape of training_y: torch.Size([3430])


In [31]:
from torch.utils.data import Dataset, DataLoader
train = torch.utils.data.TensorDataset(X_train_tensor,Y_train_tensor)
train_loader = torch.utils.data.DataLoader(train, batch_size = 128)

In [32]:
model = Simple_LSTM(len(words),256,256)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.002) # Using Adam optimizer

In [33]:
import time # Add time counter
avg_losses_f = []
n_epochs=20

for epoch in range(n_epochs):
    print("Epoch: ", epoch)
    start_time = time.time()
    model.train()
    loss_fn = torch.nn.CrossEntropyLoss()
    avg_loss = 0.
    for i, (x_batch, y_batch) in enumerate(train_loader):
        y_pred = model(x_batch)
        
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        avg_loss+= loss.item()/len(train_loader)
        
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
        epoch + 1, n_epochs, avg_loss, elapsed_time))
    
    avg_losses_f.append(avg_loss)    
    
print('All \t loss={:.4f} \t '.format(np.average(avg_losses_f)))

Epoch:  0
Epoch 1/20 	 loss=5.8391 	 time=3.12s
Epoch:  1
Epoch 2/20 	 loss=5.3785 	 time=2.98s
Epoch:  2
Epoch 3/20 	 loss=4.9413 	 time=2.92s
Epoch:  3
Epoch 4/20 	 loss=4.4266 	 time=3.15s
Epoch:  4
Epoch 5/20 	 loss=3.8698 	 time=3.20s
Epoch:  5
Epoch 6/20 	 loss=3.3261 	 time=3.07s
Epoch:  6
Epoch 7/20 	 loss=2.8072 	 time=3.13s
Epoch:  7
Epoch 8/20 	 loss=2.3601 	 time=3.17s
Epoch:  8
Epoch 9/20 	 loss=1.9756 	 time=3.02s
Epoch:  9
Epoch 10/20 	 loss=1.5993 	 time=3.17s
Epoch:  10
Epoch 11/20 	 loss=1.2876 	 time=3.14s
Epoch:  11
Epoch 12/20 	 loss=0.9869 	 time=3.14s
Epoch:  12
Epoch 13/20 	 loss=0.7217 	 time=3.20s
Epoch:  13
Epoch 14/20 	 loss=0.5157 	 time=3.15s
Epoch:  14
Epoch 15/20 	 loss=0.3938 	 time=3.20s
Epoch:  15
Epoch 16/20 	 loss=0.2901 	 time=3.13s
Epoch:  16
Epoch 17/20 	 loss=0.2170 	 time=3.17s
Epoch:  17
Epoch 18/20 	 loss=0.1737 	 time=3.22s
Epoch:  18
Epoch 19/20 	 loss=0.1460 	 time=3.23s
Epoch:  19
Epoch 20/20 	 loss=0.1319 	 time=3.21s
All 	 loss=2.0694 	

In [34]:
import matplotlib.pyplot as plt

plt.plot(avg_losses_f)
plt.xlabel('Epoch')
plt.ylabel('Loss value')
plt.show()

<Figure size 640x480 with 1 Axes>

In [35]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [42]:
# Define the start sentence
# sentence = 'i read in the news\nthat the average man\nplease kis'
sentence = ["she", "got", "forever", "addicted", "to", "a", "body", "killer", "spillin", "ice"]
variance = 1
generated = []
original = sentence
window = sentence

for i in range(400):
    x = np.zeros((1, seq_length))
    for t, char in enumerate(window):
        x[0, t] = char_to_int[char] # Change the sentence to index vector shape (1,50)
        
    x_in = Variable(torch.LongTensor(x))
    pred = model(x_in)
    pred = np.array(F.softmax(pred, dim=1).data[0].cpu())
    next_index = sample(pred, variance)
    next_char = int_to_char[next_index] # index to char

    generated = generated + [next_char]
    window = window[1:] + [next_char] # Update Window for next char predict
    
print(" ".join(original + generated))

she got forever addicted to a body killer spillin ice with the chickens like suicide wrist red take it i take a second ding cars real you big should four take your rick diamonds on the million hey to give you fuck faded them own look and i got a niggas premiere boss to the left pockets fallin and i roll the best weed ‚äòcause i got it goin ' on i 'm a gangsta but y'all y'all knew that leather big boss yeah i had to do that i keep a blue flag hangin ' all my money but in the way that she treat me gon ' leave you wo n't leave me i call it that casanova she say i 'm insane yeah i might blow my brain out hey xanny help the pain yeah please xanny make it go away i 'm committed not addicted but it keep control of me all the pain now i ca n't feel it i swear it 's listening she to the plan other all way my bands to with the money all my friends are dead push me to the edge all my friends are dead push me to the edge that 's all red inside all white like somethin ' you ride a sled down i just 