In [1]:
from os import sysconf
import numpy as np
import pandas as pd
import random
import sys
import os

path = 'Lyrics/lyrics_data/merged_lyrics.txt'
artist_lyrics = open(path, 'r')
lyrics = artist_lyrics.read()

print(lyrics[:300])

lyrics = lyrics[:1000000]

print('Corpus length: ', len(lyrics))

Looking for some education
Made my way into the night
All that bullshit conversation
Baby, can't you read the signs? I won't bore you with the details, baby
I don't even wanna waste your time
Let's just say that maybe
You could help me ease my mind
I ain't Mr. Right But if you're looking for fast lo
Corpus length:  1000000


In [2]:
chars = sorted(list(set(lyrics)))
print(chars)
print('total chars: ', len(chars))

['\t', '\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '¢', 'Ã', 'Þ', 'á', 'é', 'í', 'ð', 'ñ', 'ò', 'ó', 'ú', '–', '—', '‘', '’', '“', '”']
total chars:  100


In [3]:
# Create dictionary of characters to make an index for each char
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

print(char_to_int)

{'\t': 0, '\n': 1, ' ': 2, '!': 3, '"': 4, '&': 5, "'": 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '[': 53, ']': 54, '`': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '¡': 82, '¢': 83, 'Ã': 84, 'Þ': 85, 'á': 86, 'é': 87, 'í': 88, 'ð': 89, 'ñ': 90, 'ò': 91, 'ó': 92, 'ú': 93, '–': 94, '—': 95, '‘': 96, '’': 97, '“': 98, '”': 99}


In [4]:
# Create sentence window

seq_length = 50
step = 1
sentences = []
next_chars = []

#Create target and sentences window
for i in range(0, len(lyrics) - seq_length, step):
    sentences.append(lyrics[i: i+seq_length])
    next_chars.append(lyrics[i+seq_length])

sentences = np.array(sentences)
next_chars = np.array(next_chars)

print('Sentence Window:')
print(sentences[:5])
print('Target charaters')
print (next_chars[:5])
print('Number of sequences:', len(sentences))


Sentence Window:
['Looking for some education\nMade my way into the ni'
 'ooking for some education\nMade my way into the nig'
 'oking for some education\nMade my way into the nigh'
 'king for some education\nMade my way into the night'
 'ing for some education\nMade my way into the night\n']
Target charaters
['g' 'h' 't' '\n' 'A']
Number of sequences: 999950


In [5]:
# Transfer char to index

def get_data(sentences, next_chars):
    x = np.zeros((len(sentences),seq_length))
    y = np.zeros(len(sentences))

    length = len(sentences)
    index = 0

    for i in range(length):
        sentence = sentences[i]
        for t, char in enumerate(sentence):
            x[i, t] = char_to_int[char]
        
        y[i] = char_to_int[next_chars[i]]

    return x, y

train_x, train_y = get_data(sentences, next_chars)

print('Shape of training_x:', train_x.shape)
print('Shape of training_y:', train_y.shape)


Shape of training_x: (999950, 50)
Shape of training_y: (999950,)


In [6]:
# Building the model

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import matplotlib.pyplot as plt

class Simple_LSTM(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, dropout=0.2):
        super(Simple_LSTM, self).__init__()

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, dropout = dropout, num_layers=2)
        self.embeddings = nn.Embedding(n_vocab, embedding_dim)
        self.fc = nn.Linear(hidden_dim, n_vocab)
    def forward(self, seq_in):
        #input should be (Sequnce_length,batchsize,hidden_layer), so we need to transpose the input
        embedded = self.embeddings(seq_in.t())
        lstm_out, _ = self.lstm(embedded)

        ht = lstm_out[-1]
        out = self.fc(ht)
        return out

In [7]:
# Create DataLoader for mini-batch training
device = 'cuda' if torch.cuda.is_available() else 'cpu'

x_train_tensor = torch.tensor(train_x, dtype=torch.long).to(device)
y_train_tensor = torch.tensor(train_y, dtype=torch.long).to(device)

from torch.utils.data import Dataset, DataLoader
train = torch.utils.data.TensorDataset(x_train_tensor,y_train_tensor)
train_loader = torch.utils.data.DataLoader(train, batch_size = 256)


In [8]:
# Training
model = Simple_LSTM(len(chars), 256, 256).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.002)
loss_fn = torch.nn.CrossEntropyLoss()
n_epochs = 40

PATH = 'Lyrics/merged_net.pth'


In [9]:
#Training the network
def train_net(n_epoch, loss_fn, optimizer, model, train_loader, device):

    losses = []
    eval_losses = []
    running_loss = 0
    eval_running_loss = 0

    #loop through n_epoch times
    for epoch in range(n_epoch):

        for i, data in enumerate(train_loader, 0):

            inputs, labels = data[0].to(device), data[1].to(device)

            # Set model to train mode
            model.train()
            #Make predictions
            pred = model(inputs)
            
            # Compute loss
            loss = loss_fn(pred, labels)
            # Compute gradients
            loss.backward()
            # Update params and zero grads
            optimizer.step()
            optimizer.zero_grad()
            
            # print statistics
            running_loss += loss.item()
            if i % len(train_loader) == len(train_loader)-1:    # print every len(train_loader) mini-batches
                print('[%d, %5d] loss: %.5f' %(epoch + 1, i + 1, running_loss / len(train_loader)))
                losses.append(running_loss/len(train_loader)) 
                running_loss = 0.0
       
    plt.subplot(1,2,1)
    plt.plot(losses, label='Training loss')

    print('Finished Training')
    
    torch.save(model.state_dict(), PATH)

    print('Saved the Model!')

In [10]:
#train_net(n_epochs, loss_fn, optimizer, model, train_loader, device)
model.load_state_dict(torch.load(PATH))


<All keys matched successfully>

In [11]:
# Function to choose next char for generated lyrics
# We want some randomness so that the same char is not always picked

def sample(preds, temperature=1.0):
    
    preds = np.asarray(preds).astype('float64')
    
    # Adds a lil bit of randomness
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)

    return np.argmax(probas)


In [12]:
# Generate some lyrics
#This must be 50 chars
start_sentence = 'I am a cannabis man \nGot a joint in each of my han'

variance = 0.3
generated = ''
original = start_sentence
window = start_sentence

for i in range(800):
    x = np.zeros((1, seq_length))
    for t, char in enumerate(window):
       #change input to vec of ints
       # loop through current window and encode it to the vector x
       x[0, t] = char_to_int[char]

    # send x to the gpu to be fed into net
    x_in = Variable(torch.LongTensor(x).to(device))
    pred = model(x_in)
    
    # retrieve data from gpu and apply softmax
    pred = np.array(F.softmax(pred, dim=1).data[0].cpu())
    
    #sample func adds a little but of randomness and returns the next char as int
    next_index = sample(pred, variance)
    #convert new int to char
    next_char = int_to_char[next_index]

    #add new char to generated lyrics
    generated += next_char
    
    #shift window along to include new char and leave first char
    window = window[1:] + next_char

print(original + generated)
    

  preds = np.log(preds) / temperature


I am a cannabis man 
Got a joint in each of my hand
There is no more to get and try to be all right
I'm so much true, I don't wanna, I know that trying to get the room town (One and there is no sunshine)
Oh, Jah la la la la la la la
Oh wo-wop, choo-wop, don' you get all the people and some on the sky
I know the right on the lights so much some of the right (when I got a soul)
And we're jammin' (Lord) (I'm glad)
(I want to get together)
Oh, like a back to some true
And then I'm gonna wake to the Lord is this songs (we're gonna be all right)
(I'm tryna go down) (I don't know why) (coming in) (Can't you get ang love)
(I'm gonna be alright) (I wanna) you told the sun
(Like the trult is a soul)
Oh, what you want to get together now!
(I'm gonna wake up the street)
And there is no sense in the train dem it seems to see
One thing I wanna go
There
