In [224]:
import re
import torch
import string
import torch.nn as nn
import pandas as pd
import numpy as np

from langdetect import detect
import torch.optim as optim
import torch.nn.functional as F
from gensim.models import Word2Vec
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [160]:
df = pd.read_csv('lyrics.csv', sep = '\t')
df.drop('song_id', axis = 1, inplace = True)
df.dropna(inplace = True)
df = df.sample(10000)
df

Unnamed: 0,lyrics
9465,"[""I see the look of evil in your eyes\nYou've ..."
16621,"[""Some sunny day, baby\nWhen everything seems ..."
17852,"['[Chorus]\nThat\'s word, we pray (Pray, pray)..."
16903,"[""[Verse 1: Rufus Thomas]\nTold me that you lo..."
7079,"[""Know you are a friend of mine, babe you been..."
...,...
8185,"[""Sometimes I really think\nI'm going crazy in..."
18773,"[""[Verse 1]\nWell, baby used to stay out all n..."
20074,"[""We were just kids\nPlayin' in the rain\nWhen..."
12838,"[""You're an anchor, I'm uptight\nYou're what g..."


In [161]:
def split_text(x):
    translator = str.maketrans('', '', string.punctuation)
    text = x['lyrics']
    sections = text.split('\\n\\n')
    lyrics = str()
    single_text = []
    res = {}
    
    for s in sections:
        single_text += [x.lower().replace('(','').replace(')','').translate(translator) \
                        for x in s[s.find(']') + 1:].split('\\n') if len(x) > 1]
        res['single_text'] =  ' '.join(single_text)
        
    return pd.Series(res)

In [162]:
df = df.join(df.apply(split_text, axis = 1))
df.drop('lyrics', axis = 1, inplace = True)
df.head(5)

Unnamed: 0,single_text
9465,i see the look of evil in your eyes youve been...
16621,some sunny day baby when everything seems ok b...
17852,thats word we pray pray pray we got to pray ju...
16903,told me that you loved me said that your love ...
7079,know you are a friend of mine babe you been go...


In [173]:
text = df['single_text'].str.cat(sep = '\t')

In [174]:
# getting rid of non-English characters
regex = re.compile('[^a-zA-Z0-9]')
text = regex.sub(' ', text)
text = text.split('\t')

In [175]:
df['single_text'] = text
df

Unnamed: 0,single_text
9465,i see the look of evil in your eyes youve been...
16621,some sunny day baby when everything seems ok b...
17852,thats word we pray pray pray we got to pray ju...
16903,told me that you loved me said that your love ...
7079,know you are a friend of mine babe you been go...
...,...
8185,sometimes i really think im going crazy in the...
18773,well baby used to stay out all night long she ...
20074,we were just kids playin in the rain when you ...
12838,youre an anchor im uptight youre what gets me ...


In [215]:
params = {'learning_rate' : 0.01,
          'input_seq' : 200,
          'batch_size' : 64,
          'epochs' : 30}

In [226]:
alltext = ''.join(df['single_text'])
vocabulary = sorted(set(alltext))

char_to_idx = {u:i for i, u in enumerate(vocabulary)}
idx_to_char = np.array(vocabulary)

text_ints = np.array([char_to_idx[c] for c in alltext])

In [229]:
character_data = tf.data.Dataset.from_tensor_slices(text_ints)
seq_length = 150

In [227]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

In [230]:
sequences = character_data.batch(seq_length + 1, drop_remainder = True)
dataset = sequences.map(split_input_target)

In [231]:
dataset = dataset.shuffle(12000).batch(params['batch_size'], drop_remainder = True)

In [248]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.lin = nn.Linear(hidden_size, output_size)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.lstm(embedded, hidden)
        output = self.lin(output)
        return output, hidden

In [249]:
class Model:
    def __init__(self, network, optimizer, loss_fn, epochs):
        self.network = network
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.epochs = epochs

    def fit(self, dataloader):
        for epoch in range(self.epochs):
            for i, (x, y) in enumerate(dataloader):

                pred, hidden = self.network(x, None)
                loss = self.loss_fn(pred, y)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            print(f'epoch {epoch}, loss {loss}')

        return self.network, loss

    def predict(self, *args, **kwargs):
        return self.network(*args, **kwargs)

In [250]:
class lyrics_dataset(Dataset):
    def __init__(self, dat, vocab):
        self.data = dat
        self.vectorizer = CountVectorizer(vocabulary = vocab, stop_words = None)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        text = self.data.iloc[index]['single_text']
        vector = self.vectorizer.transform([text])
        return vector.toarray()

In [251]:
train_set = lyrics_dataset(train_data, vocabulary)
test_set = lyrics_dataset(test_data, vocabulary)

In [252]:
trainloader = DataLoader(train_set, batch_size = params['batch_size'], shuffle = True)
testloader = DataLoader(test_set, batch_size = params['batch_size'], shuffle = False)

In [253]:
input_size = len(vocabulary)
hidden_size = 256
output_size = len(vocabulary)

network = RNN(input_size, hidden_size, output_size)
model = Model(network, optimizer, F.cross_entropy, epochs = params['epochs'])

In [254]:
loss_fn = F.cross_entropy
optimizer = optim.Adam(network.parameters(), lr = params['learning_rate'])

In [255]:
model.fit(dataset)

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not tensorflow.python.framework.ops.EagerTensor