In [1]:
import pandas as pd 
import numpy as np 
# https://github.com/petrosDemetrakopoulos/RNN-Beatles-lyrics-generator
# https://github.com/starry91/Lyric-Generator#2-lyric-generator-based-on-word-level-rnn

Training our LSTM (word-level) for the rap genre. 

In [2]:
data = pd.read_csv("master-process-data.csv")
del data['Unnamed: 0']
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"['ego', 'remix', '@@@', 'oh', 'baby', 'how', '..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"['then', 'tell', 'me', '@@@', ""playin'"", 'ever..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"['honesty', '@@@', 'if', 'you', 'search\nfor',..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"['you', 'are', 'my', 'rock', '@@@', 'oh', 'oh'..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"['black', 'culture', '@@@', 'party', 'the', 'p..."


In [3]:
data['t-lyric'] = data['title'] + " @@@ " + data['lyrics']
data['t-lyric'] = data['t-lyric'].str.lower()
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,ego remix @@@ oh baby how you doing?\nyou know...
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,then tell me @@@ playin' everything so easy\ni...
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,honesty @@@ if you search\nfor tenderness\nit ...
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,you are my rock @@@ oh oh oh i oh oh oh i\nver...
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,black culture @@@ party the people the people ...


In [4]:
stopChars = [',','(',')','.','-','[',']','"']
# preprocessing the corpus by converting all letters to lowercase, 
# replacing blank lines with blank string and removing special characters
def preprocessText(text):
#     text = text.replace('\n', ' ').replace('\t','')
    processedText = text.lower()
    for char in stopChars:
        processedText = processedText.replace(char,'')
    return processedText
data['t-lyric'] = data['t-lyric'].astype(str)
data['t-lyric'] = data['t-lyric'].apply(preprocessText)

In [5]:
# tokenization 
def corpusToList(corpus):
    corpusList = [w for w in corpus.split(' ')] 
    corpusList = [i for i in corpusList if i] #removing empty strings from list
    return corpusList
data['t-lyric'] = data['t-lyric'].apply(corpusToList)

In [6]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"[ego, remix, @@@, oh, baby, how, you, doing?\n..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"[then, tell, me, @@@, playin', everything, so,..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"[honesty, @@@, if, you, search\nfor, tendernes..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"[you, are, my, rock, @@@, oh, oh, oh, i, oh, o..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"[black, culture, @@@, party, the, people, the,..."


In [7]:
# trim each word for leading or trailing spaces / tabs.
map(str.strip, data['t-lyric']) # trim words

<map at 0x21876ad10>

In [8]:
data.head()

Unnamed: 0,artist,genre,title,lyrics,t-lyric
0,beyonce-knowles,Pop,ego remix,oh baby how you doing?\nyou know i'm gonna cut...,"[ego, remix, @@@, oh, baby, how, you, doing?\n..."
1,beyonce-knowles,Pop,then tell me,playin' everything so easy\nit's like you seem...,"[then, tell, me, @@@, playin', everything, so,..."
2,beyonce-knowles,Pop,honesty,if you search\nfor tenderness\nit isn't hard t...,"[honesty, @@@, if, you, search\nfor, tendernes..."
3,beyonce-knowles,Pop,you are my rock,oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...,"[you, are, my, rock, @@@, oh, oh, oh, i, oh, o..."
4,beyonce-knowles,Pop,black culture,party the people the people the party it's pop...,"[black, culture, @@@, party, the, people, the,..."


In [9]:
corpus_words = [x for sublist in data['t-lyric'] for x in sublist]
vocab = sorted(set(corpus_words))
print('vocab length:', len(corpus_words))
print('Unique words in corpus: {}'.format(len(vocab)))

vocab length: 67263415
Unique words in corpus: 3274867


In [10]:
# creating numeric map; representing words with numberes 
# map specific number to each specific word of our corpus, and vice versa 
word2idx = {u: i for i, u in enumerate(vocab)}
idx2words = np.array(vocab)
word_as_int = np.array([word2idx[c] for c in corpus_words])

## Prediction

- User inputs a song title, and how many words they want the song to be. 
- Network does, for example, 100 predictions, and in the training phrase we know what word we need to generate. 
- (genre, song title); have a marker that it's the end of the title 

## Character Level LSTM

Attempting a character level-RNN for a single genre
https://www.kaggle.com/super13579/let-s-auto-write-the-deep-purple-lysics-pytorch

In [11]:
data1 = pd.read_csv("master-process-data.csv")
data1['lyrics'] = data1['lyrics'].astype(str)
data1['lyrics']= data1['lyrics'].apply(preprocessText)
data1['t-lyric'] = data1['t-lyric'].astype(str)
print(data1.head())

   Unnamed: 0           artist genre            title  \
0           0  beyonce-knowles   Pop        ego remix   
1           1  beyonce-knowles   Pop     then tell me   
2           2  beyonce-knowles   Pop          honesty   
3           3  beyonce-knowles   Pop  you are my rock   
4           4  beyonce-knowles   Pop    black culture   

                                              lyrics  \
0  oh baby how you doing?\nyou know i'm gonna cut...   
1  playin' everything so easy\nit's like you seem...   
2  if you search\nfor tenderness\nit isn't hard t...   
3  oh oh oh i oh oh oh i\nverse 1:\nif i wrote a ...   
4  party the people the people the party it's pop...   

                                             t-lyric  
0  ['ego', 'remix', '@@@', 'oh', 'baby', 'how', '...  
1  ['then', 'tell', 'me', '@@@', "playin'", 'ever...  
2  ['honesty', '@@@', 'if', 'you', 'search\nfor',...  
3  ['you', 'are', 'my', 'rock', '@@@', 'oh', 'oh'...  
4  ['black', 'culture', '@@@', 'party', 'the'

In [12]:
data1.head()
del data1['Unnamed: 0']

In [13]:
DP_text = data1['lyrics'].str.cat(sep='\n').lower()
print('corpus length:', len(DP_text))

corpus length: 382842506


In [14]:
# Counting characters appeared in all lyrics
chars = sorted(list(set(DP_text)))
print(chars)
print('total chars:', len(chars))

['\x01', '\x02', '\x03', '\x04', '\x05', '\x06', '\x07', '\x08', '\t', '\n', '\x0b', '\x0c', '\r', '\x0e', '\x0f', '\x10', '\x11', '\x12', '\x13', '\x14', '\x15', '\x16', '\x17', '\x18', '\x19', '\x1a', '\x1b', '\x1c', '\x1d', '\x1e', '\x1f', ' ', '!', '#', '$', '%', '&', "'", '*', '+', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '\\', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x80', '\x81', '\x82', '\x83', '\x84', '\x85', '\x86', '\x87', '\x88', '\x89', '\x8a', '\x8b', '\x8c', '\x8d', '\x8e', '\x8f', '\x90', '\x91', '\x92', '\x93', '\x94', '\x95', '\x96', '\x97', '\x98', '\x99', '\x9a', '\x9b', '\x9c', '\x9d', '\x9e', '\x9f', '\xa0', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '\xad', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', '×', 'ß', 'à', 'á', '

In [15]:
want = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l','m', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 
       'w', 'x', 'y', 'z', ',', '.', '\n', '!', '?']
chars = [x for x in want] 


In [16]:
# Create a dictionary of characters, see the index of characters.
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

print(char_to_int)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, ',': 26, '.': 27, '\n': 28, '!': 29, '?': 30}


In [None]:
seq_length = 50 # The sentence window size
step = 1 # The steps between the windows
sentences = []
next_chars = []

# Create Target and sentences window
for i in range(0, len(DP_text) - seq_length, step):
    # range from current index to sequence length charaters
    sentences.append(DP_text[i: i + seq_length])  
    next_chars.append(DP_text[i + seq_length]) # the next character
    
sentences = np.array(sentences)
next_chars = np.array(next_chars)

#Print Sentence Window and next charaters
print('Sentence Window')
print (sentences[:5])
print('Target characters')
print (next_chars[:5])
print('Number of sequences:', len(sentences))

In [None]:
# transferring the character to index 
def getdata(sentences, next_chars):
    X = np.zeros((len(sentences),seq_length))
    y = np.zeros((len(sentences)))
    length = len(sentences)
    index = 0
    for i in range(len(sentences)):
        sentence = sentences[i]
        for t, char in enumerate(sentence):
            X[i, t] = char_to_int[char]
        y[i] = char_to_int[next_chars[i]]
    return X, y

In [None]:
train_x,train_y = getdata(sentences, next_chars)
print(train_x)
print('Shape of training_x:', train_x.shape)
print('Shape of training_y:', train_y.shape)

## Building out the model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class Simple_LSTM(nn.Module):
    def __init__(self,n_vocab,hidden_dim, embedding_dim,dropout = 0.2):
        super(Simple_LSTM, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,dropout = dropout,num_layers = 2)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.fc = nn.Linear(hidden_dim, n_vocab)
    
    def forward(self, seq_in):
        # for LSTM, input should be (Sequnce_length,batchsize,hidden_layer), so we need to transpose the input
        embedded = self.embeddings(seq_in.t()) 
        lstm_out, _ = self.lstm(embedded)
        # Only need to keep the last character 
        ht=lstm_out[-1] 
        out = self.fc(ht)
        return out

In [None]:
X_train_tensor = torch.tensor(train_x, dtype=torch.long)
Y_train_tensor = torch.tensor(train_y, dtype=torch.long)

In [None]:
from torch.utils.data import Dataset, DataLoader
train = torch.utils.data.TensorDataset(X_train_tensor,Y_train_tensor)
train_loader = torch.utils.data.DataLoader(train, batch_size = 128)

In [None]:
model = Simple_LSTM(47,256,256)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.002) # Using Adam optimizer

In [None]:
import time # Add time counter
avg_losses_f = []
n_epochs= 50

for epoch in range(n_epochs):
    print("Epoch: ", epoch)
    start_time = time.time()
    model.train()
    loss_fn = torch.nn.CrossEntropyLoss()
    avg_loss = 0.
    for i, (x_batch, y_batch) in enumerate(train_loader):
        y_pred = model(x_batch)
        
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        
        optimizer.step()
        avg_loss+= loss.item()/len(train_loader)
        
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
        epoch + 1, n_epochs, avg_loss, elapsed_time))
    
    avg_losses_f.append(avg_loss)    
    
print('All \t loss={:.4f} \t '.format(np.average(avg_losses_f)))

In [None]:
import matplotlib.pyplot as plt

plt.plot(avg_losses_f)
plt.xlabel('Epoch')
plt.ylabel('Loss value')
plt.show()

creating a function that can sample an index from a probability array 


In [None]:
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

validate the model

In [None]:
# Define the start sentence
# sentence = 'i read in the news\nthat the average man\nplease kis'
sentence = 'i put the new forgis on the g\ni trap until the blo'
variance = 0.25
generated = ''
original = sentence
window = sentence

for i in range(400):
    x = np.zeros((1, seq_length))
    for t, char in enumerate(window):
        x[0, t] = char_to_int[char] # Change the sentence to index vector shape (1,50)
        
    x_in = Variable(torch.LongTensor(x))
    pred = model(x_in)
    pred = np.array(F.softmax(pred, dim=1).data[0].cpu())
    next_index = sample(pred, variance)
    next_char = int_to_char[next_index] # index to char

    generated += next_char
    window = window[1:] + next_char # Update Window for next char predict
    
print(original + generated)