In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords

In [2]:
import plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go

In [4]:
data = pd.read_csv('songdata.csv')
data.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [4]:
del data['link']

In [4]:
len(list(data['artist'].unique()))

643

## Let us focus on Eminem 

In [5]:
eminem = data[(data['artist'] == 'Eminem')]
eminem.head(3)

Unnamed: 0,artist,song,link,text
5062,Eminem,25 to Life,/e/eminem/25+to+life_20883525.html,Too late for the other side \nCaught in a cha...
5063,Eminem,3 A.M.,/e/eminem/3+am_20789506.html,Oh oh \nOh(yea) oh(yea) oh(yea) \nOh oh \nO...
5064,Eminem,3 Verses,/e/eminem/3+verses_20049939.html,I'm the illest rapper to hold a cordless \nPa...


In [171]:
rap = list(eminem['text'])
word=[]
for i in range(0,len(rap)):
    kk = rap[i].replace("\n"," ")
    s=kk.split(' ')
    o = [x for x in s if x]
    word.append(o)

In [98]:
word = [j for i in word for j in i]
word[:5]

['Too', 'late', 'for', 'the', 'other']

### Remove stop words and other unnecessary words

In [99]:
el = ["i'm","get","got"]
stop = set(stopwords.words('english'))
word = [word.lower() for word in word]
words = [i for i in word if i not in stop]
words = [i for i in words if i not in el]

### Removing punctuation from words in lyrics

In [100]:
for i in range(0,len(words)):
    words[i] = re.sub(r'[^\w\s]','',words[i])

### Remove the empty elements in the list

In [92]:
words = [x for x in words if x]

In [11]:
from collections import Counter
labels, values = zip(*Counter(words).items())

### 20 commonly used words in the lyrics along with the number of times they occur

In [13]:
w = Counter(words)
s = w.most_common(20)

In [14]:
x , y = zip(*(s))

In [159]:
data = [go.Bar(x=x,y=y)]
layout = go.Layout(
    title='Words in Eminem Lyrics ',
    xaxis=dict(
        title='Words Used',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    ),
    yaxis=dict(
        title='Number of times it was used',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        )
    )
)
iplot(go.Figure(data=data, layout = layout))

### Eminem seems to use the word LIKE a lot in his lyrics. He also used bad words frequently

In [13]:
unique_words = sorted(set(words))

In [15]:
len(unique_words)

5204

# Generating Eminem's Rap Lyric

In [22]:
from keras.models import Sequential
from keras.layers.noise import GaussianNoise
from keras.layers import LSTM, Dropout, Dense, Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


### Mapping each unique word in the lyric to a number (5204 unique words)

In [17]:
rap = np.array(rap)

In [18]:
lyric = (''.join(rap))

In [21]:
l = set(lyric)

In [22]:
len(l)

76

### Mapping words to numbers for easier training

In [18]:
vocab= [k for k in l] 
char_ix={c:i for i,c in enumerate(vocab)}
ix_char={i:c for i,c in enumerate(vocab)}

In [317]:
ix_char

{0: '"',
 1: 'L',
 2: 'N',
 3: 'k',
 4: 'M',
 5: 'j',
 6: '4',
 7: '7',
 8: '6',
 9: 'b',
 10: 'P',
 11: 'V',
 12: 'W',
 13: 'i',
 14: 'K',
 15: 'S',
 16: 'y',
 17: ' ',
 18: ')',
 19: 'z',
 20: 'O',
 21: 'C',
 22: 'm',
 23: 'h',
 24: 'f',
 25: 'A',
 26: ',',
 27: '2',
 28: 'd',
 29: 'a',
 30: '-',
 31: 'Q',
 32: '3',
 33: 'v',
 34: ']',
 35: 'l',
 36: '\n',
 37: 'p',
 38: "'",
 39: 'J',
 40: 'Y',
 41: 'q',
 42: 'X',
 43: '0',
 44: 'I',
 45: 'D',
 46: 'E',
 47: 'U',
 48: 'w',
 49: '[',
 50: 'r',
 51: 'G',
 52: 'u',
 53: ':',
 54: '?',
 55: '8',
 56: 'x',
 57: '5',
 58: 'H',
 59: '!',
 60: 'F',
 61: 's',
 62: 'g',
 63: 'B',
 64: 'T',
 65: 'e',
 66: 'c',
 67: 'R',
 68: '.',
 69: 'Z',
 70: 'n',
 71: 't',
 72: '9',
 73: '(',
 74: '1',
 75: 'o'}

In [318]:
maxlen=40
vocab_size=len(vocab)

### Character wise Patterns 

In [314]:
sentences=[]
next_char=[]
for i in range(len(lyric)-maxlen-1):
    sentences.append(lyric[i:i+maxlen])
    next_char.append(lyric[i+maxlen])
sentences

['Too late for the other side  \nCaught in ',
 'oo late for the other side  \nCaught in a',
 'o late for the other side  \nCaught in a ',
 ' late for the other side  \nCaught in a c',
 'late for the other side  \nCaught in a ch',
 'ate for the other side  \nCaught in a cha',
 'te for the other side  \nCaught in a chas',
 'e for the other side  \nCaught in a chase',
 ' for the other side  \nCaught in a chase ',
 'for the other side  \nCaught in a chase  ',
 'or the other side  \nCaught in a chase  \n',
 'r the other side  \nCaught in a chase  \nT',
 ' the other side  \nCaught in a chase  \nTw',
 'the other side  \nCaught in a chase  \nTwe',
 'he other side  \nCaught in a chase  \nTwen',
 'e other side  \nCaught in a chase  \nTwent',
 ' other side  \nCaught in a chase  \nTwenty',
 'other side  \nCaught in a chase  \nTwenty ',
 'ther side  \nCaught in a chase  \nTwenty f',
 'her side  \nCaught in a chase  \nTwenty fi',
 'er side  \nCaught in a chase  \nTwenty fiv',
 'r side  \nCaught in a

In [320]:
X=np.zeros((len(sentences),maxlen,vocab_size))
y=np.zeros((len(sentences),vocab_size))
for ix in range(len(sentences)):
    y[ix,char_ix[next_char[ix]]]=1
    for iy in range(maxlen):
        X[ix,iy,char_ix[sentences[ix][iy]]]=1


In [38]:
from keras.layers import Activation,LSTM,Dense
from keras.optimizers import Adam

In [327]:
model=Sequential()
model.add(LSTM(128,input_shape=(maxlen,vocab_size)))
model.add(Dense(vocab_size))
model.add(Activation('softmax'))
model.summary()
model.compile(optimizer=Adam(lr=0.01),loss='categorical_crossentropy')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 128)               104960    
_________________________________________________________________
dense_3 (Dense)              (None, 76)                9804      
_________________________________________________________________
activation_2 (Activation)    (None, 76)                0         
Total params: 114,764
Trainable params: 114,764
Non-trainable params: 0
_________________________________________________________________


In [328]:
model.fit(X,y,epochs=5,batch_size=128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x24a9c3b57b8>

In [329]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [331]:
import random
generated=''
start_index=random.randint(0,len(lyric)-maxlen-1)
sent=lyric[start_index:start_index+maxlen]
generated+=sent
for i in range(1900):
    x_sample=generated[i:i+maxlen]
    x=np.zeros((1,maxlen,vocab_size))
    for j in range(maxlen):
        x[0,j,char_ix[x_sample[j]]]=1
    probs=model.predict(x)
    probs=np.reshape(probs,probs.shape[1])
    ix=np.random.choice(range(vocab_size),p=probs.ravel())
    generated+=ix_char[ix]

### Computer generated Character based eminem Lyric (Character based)

In [334]:
generated.split("\n")

['',
 "And I have to say this all sound about vestom firmin' from back?  ",
 "I descurted iss clailing, and I can't stepy  ",
 'And dug tof breather duck or scratching, plant  ',
 "I momming comin' to changers  ",
 'Contemple',
 '',
 'Now eating your  ',
 'You aims fucked what is do this in me lets to it ',
 'Unturns, ar. walk  ',
 'Fuck just cause the phast Born till day to I heal no mokn of crimes  ',
 '  ',
 '[Chorus]  ',
 "All the'r strallin it!  ",
 'Flied sometimes motherfucker  ',
 'Never do youw  ',
 'Junt to  ',
 'The bitacker" that use and suttack strang, now fact when housed on your murdd-off)  ',
 'Holdany new day, cold to m-tatiage of there these crime  ',
 'My asty woald over fucking it, our everything a pick they you, tell wetterint baff  ',
 'Laygen in Else  ',
 '  ',
 'Yat me they invacelm toor greats  ',
 'Now I get awaut from the mean exciit, he to there and world  ',
 "Hife to nastin' now you in your sirture  ",
 '  ',
 '[Chorus:]  ',
 "Dip bole bedor praying druisa

### Since this is character based, many words don't make sense....let's check out the word based model

## Word based Model

In [13]:
from unidecode import unidecode
def get_tokenized_lines(df):
    words = []
    
    for index, row in df['text'].iteritems():
        row = str(row).lower()
        for line in row.split('\n'):
            new_words = re.findall(r"\b[a-z']+\b", unidecode(line))
            words = words + new_words
        
    return words

In [16]:
all_lyric_lines = get_tokenized_lines(eminem)

In [17]:
SEQ_LENGTH = 50 + 1
sequences = list()

for i in range(SEQ_LENGTH, len(all_lyric_lines)):
    seq = all_lyric_lines[i - SEQ_LENGTH: i]
    sequences.append(seq)

print('Total Sequences: %d' % len(sequences))

Total Sequences: 36266


In [18]:
vocab = set(all_lyric_lines)

word_to_index = {w: i for i, w in enumerate(vocab)}
index_to_word = {i: w for w, i in word_to_index.items()}
word_indices = [word_to_index[word] for word in vocab]
vocab_size = len(vocab)

print('vocabulary size: {}'.format(vocab_size))

vocabulary size: 5224


In [19]:
def get_tokenized_lines(lines, seq_len):
    tokenized = np.zeros((len(lines), seq_len))
    
    for r, line in enumerate(lines):
        for c, word in enumerate(line):
            tokenized[r, c] = word_to_index[word]

    return tokenized


In [20]:
tokenized_seq = get_tokenized_lines(sequences, SEQ_LENGTH)

In [21]:
tokenized_seq[:, -1].shape

(36266,)

In [23]:
X, y = tokenized_seq[:, :-1], tokenized_seq[:, -1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = len(X[0])

print("X_shape", X.shape)
print("y_shape", y.shape)


X_shape (36266, 50)
y_shape (36266, 5224)


In [65]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=10)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            261200    
_________________________________________________________________
lstm_5 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_3 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_4 (Dense)              (None, 5224)              527624    
Total params: 939,724
Trainable params: 939,724
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1ef39db9b00>

### I'm Using the Mockingbird lyric as the seed

In [66]:
seed_text = "Hailie i know you miss your mom and i know you miss your dad well i'm gone but i'm trying to give you the life that i never had i can see you're sad even when you smile even when you laugh i can see it in your eyes deep inside"

In [29]:
def texts_to_sequences(texts, word_to_index):
    indices = np.zeros((1, len(texts)), dtype=int)
    
    for i, text in enumerate(texts):
        indices[:, i] = word_to_index[text]
        
    return indices


In [30]:
def my_pad_sequences(seq, maxlen):
    start = seq.shape[1] - maxlen
    
    return seq[:, start: start + maxlen]


In [67]:
def generate_seq(model, word_to_index, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text

    for _ in range(n_words):
        encoded = texts_to_sequences(in_text.split()[1:], word_to_index)
        encoded = my_pad_sequences(encoded, maxlen=seq_length)
        
        yhat = model.predict_classes(encoded, verbose=0)
        out_word = ''
    
        for word, index in word_to_index.items():
            if index == yhat:
                out_word = word
                break
        
        in_text += ' ' + out_word
        result.append(out_word)
        
    return ' '.join(result)


In [72]:
generated = generate_seq(model, word_to_index, seq_length, seed_text, 50)
print(generated)


the way i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know i don't know


### The most probable words seem to repeat in word level embedding ( Needs more epochs or data)

# Words2vec

In [149]:
import gensim 

In [197]:
el = ["i'm","get","got","i've","-","a","i"]
stop = set(stopwords.words('english'))
for i in range(0,len(word)):
        word[i] = [word.lower() for word in word[i]]
        words[i] = [z for z in word[i] if z not in stop]
        words[i] = [z for z in words[i] if z not in el]

### 70 songs are analysed

In [199]:
len(word)

70

In [200]:
model = gensim.models.Word2Vec(
        word,
        size=150,
        window=10,
        min_count=2,
        workers=10)
model.train(word, total_examples=len(word), epochs=10)


(229316, 362160)

In [201]:
print(model.similarity('eminem', 'rap'))

0.999327875558


  """Entry point for launching an IPython kernel.


In [202]:
print(model.similarity('eminem', 'marshall'))

0.998361296544


  """Entry point for launching an IPython kernel.


### Sounds About Right

In [203]:
model.most_similar('eminem')

  """Entry point for launching an IPython kernel.


[('old', 0.9998369216918945),
 ('dead', 0.9998131394386292),
 ('obie', 0.9997954964637756),
 ('outta', 0.9997906684875488),
 ('slim', 0.9997847080230713),
 ("'bout", 0.9997763633728027),
 ('mother', 0.9997687339782715),
 ('until', 0.9997647404670715),
 ('high', 0.9997644424438477),
 ('by', 0.9997599124908447)]