### For this notebook we utilize [this](https://ahmedbesbes.com/sentiment-analysis-on-twitter-using-word2vec-and-keras.html) tutorial to use word2vec on our lyric data. 

#### Because we use TensorFlow we do all of this in the virtual environment "pracenv"

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
lyrics = pd.read_csv('lyrics.csv')
lyrics1 = lyrics.query('genre != "Not Available"')
lyrics1 = lyrics1.dropna()

lyrics2 = lyrics1[lyrics1["genre"] == "Rock"].sample(n=20000)
lyrics3 = lyrics1[lyrics1["genre"] == "Pop"].sample(n=20000)
lyrics4 = lyrics1[lyrics1["genre"] == "Hip-Hop"].sample(n=20000)
lyrics0 = pd.concat([lyrics2, lyrics3, lyrics4])
lyrics0 = lyrics0.reset_index(drop=True)

In [3]:
def tokenize(tweet):
    try:
        #tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        return tokens
    except:
        return 'NC'
    
def postprocess(data):
    data['tokens'] = data['lyrics'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(lyrics0)

progress-bar: 100%|██████████| 60000/60000 [00:49<00:00, 1210.09it/s]


In [4]:
x_train, x_test, y_train, y_test = train_test_split(np.array(data.tokens), np.array(data.genre), test_size=0.2)

In [5]:
def labelizeLyrics(lyrics, label_type):
    labelized = []
    for i,v in tqdm(enumerate(lyrics)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeLyrics(x_train, 'TRAIN')
x_test = labelizeLyrics(x_test, 'TEST')

48000it [00:00, 67022.72it/s]
12000it [00:00, 33009.29it/s]


In [6]:
tweet_w2v = Word2Vec(size=200, min_count=5)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)], total_examples=48000, epochs=4)

100%|██████████| 48000/48000 [00:00<00:00, 1503525.63it/s]
100%|██████████| 48000/48000 [00:00<00:00, 1763980.23it/s]


(50652082, 66453904)

### Interestingly, when you put a Spanish word in below it gives spanish words as similar

In [9]:
tweet_w2v.most_similar('color')

[('colour', 0.631252646446228),
 ('colours', 0.5790279507637024),
 ('scent', 0.5638301372528076),
 ('grey', 0.5558927059173584),
 ('complexion', 0.5539994239807129),
 ('colors', 0.5525627136230469),
 ('colored', 0.5359302759170532),
 ('disguise', 0.5311019420623779),
 ('shade', 0.5239685773849487),
 ('pale', 0.5225783586502075)]

In [35]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [36]:
# defining the chart
output_notebook()

plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in list(tweet_w2v.wv.vocab.keys())[:4000]]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE        
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(tweet_w2v.wv.vocab.keys())[:4000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4000 samples in 0.027s...
[t-SNE] Computed neighbors for 4000 samples in 6.124s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4000
[t-SNE] Computed conditional probabilities for sample 2000 / 4000
[t-SNE] Computed conditional probabilities for sample 3000 / 4000
[t-SNE] Computed conditional probabilities for sample 4000 / 4000
[t-SNE] Mean sigma: 0.454658
[t-SNE] KL divergence after 250 iterations with early exaggeration: 80.132668
[t-SNE] KL divergence after 1000 iterations: 2.027988


#### I hadn't noticed this before but that island in the bottom left reveals that there is a German bunch of songs in the data. There are also several Spanish islands but I knew that would happen already

#### Another interesting observation is in the bottom middle there is an island of slang words that usually end in -ing but there end in -in

In [10]:
print('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 30732


In [11]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [12]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, 200) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

48000it [02:53, 276.18it/s]
12000it [00:45, 265.83it/s]


In [17]:
import tensorflow as tf

def convert_to_float(var):
    for i in range(len(var)):
        if var[i] == "Rock":
            var[i] = 0
        if var[i] == "Pop":
            var[i] = 1
        if var[i] == "Hip-Hop":
            var[i] = 2
            
#convert_to_float(y_train)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(32, activation='relu', input_dim=200))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=25, verbose=2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1733efa90>

In [18]:
convert_to_float(y_test)
score = model.evaluate(test_vecs_w2v, y_test, verbose=2)
print(score[1])

0.41583333333333333


#### This is pretty amazing actually. We achieved nearly 90% accuracy with this model. 

#### Does pretty poorly when trying to classify 3 genres

1