## In this notebook we build a map ("word cloud") of the words in our vocabulary using word2vec embeddings projected to 2-D with t-SNE. See below for the final plot.

In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the lyrics table, discard songs whose genre is "Not Available" or that
# have any missing field, then draw an equal-sized random sample per genre.
N_PER_GENRE = 20000  # number of songs sampled from each genre
SAMPLE_SEED = 0      # fixed seed so a fresh run draws the same songs

lyrics = pd.read_csv('lyrics.csv')
lyrics1 = lyrics.query('genre != "Not Available"').dropna()

lyrics2 = lyrics1[lyrics1["genre"] == "Rock"].sample(n=N_PER_GENRE, random_state=SAMPLE_SEED)
lyrics3 = lyrics1[lyrics1["genre"] == "Pop"].sample(n=N_PER_GENRE, random_state=SAMPLE_SEED)
lyrics4 = lyrics1[lyrics1["genre"] == "Hip-Hop"].sample(n=N_PER_GENRE, random_state=SAMPLE_SEED)

# Only the Rock and Pop samples are combined into the working set;
# the Hip-Hop sample (lyrics4) is not part of lyrics0.
lyrics0 = pd.concat([lyrics2, lyrics3]).reset_index(drop=True)

In [3]:
def tokenize(tweet):
    """Split one lyric string into tokens using the module-level ``tokenizer``.

    Parameters
    ----------
    tweet : the text to tokenize (passed straight to ``tokenizer.tokenize``).

    Returns
    -------
    The token list produced by the tokenizer, or the sentinel string ``'NC'``
    when tokenization raises, so the caller can filter that row out.
    """
    try:
        return tokenizer.tokenize(tweet)
    except Exception:
        # Only ordinary errors are turned into the sentinel. A bare `except:`
        # would also swallow KeyboardInterrupt / SystemExit, making a long
        # tokenization run impossible to interrupt.
        return 'NC'
    
def postprocess(data):
    """Tokenize the ``lyrics`` column and drop rows that failed to tokenize.

    Parameters
    ----------
    data : DataFrame with a ``lyrics`` column. It is not modified.

    Returns
    -------
    A new DataFrame with an added ``tokens`` column, without the rows whose
    tokens are the ``'NC'`` failure sentinel, re-indexed from 0.

    Notes
    -----
    Uses ``Series.progress_map``, which only exists after ``tqdm.pandas()``
    has been called.
    """
    # progress_map is Series.map plus a progress bar.
    tokens = data['lyrics'].progress_map(tokenize)

    # assign() returns a new frame, so the caller's DataFrame is left as-is
    # and no in-place operation is performed on a filtered slice.
    tokenized = data.assign(tokens=tokens)
    kept = tokenized[tokenized['tokens'] != 'NC']

    # drop=True discards the old index instead of materialising it as a
    # column that then has to be dropped again.
    return kept.reset_index(drop=True)

# Tokenize the combined Rock + Pop sample; `data` is the frame the later cells split and train on.
data = postprocess(lyrics0)

progress-bar: 100%|██████████| 40000/40000 [00:24<00:00, 1653.90it/s]


In [5]:
# Hold out 20% of the songs. random_state pins the shuffle so a fresh
# Restart & Run All produces the same train/test partition.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(data.tokens), np.array(data.genre), test_size=0.2, random_state=0)

def labelizeLyrics(lyrics, label_type):
    """Wrap each token list in a ``LabeledSentence`` tagged ``'<label_type>_<i>'``.

    Parameters
    ----------
    lyrics : iterable of token lists.
    label_type : prefix for the tag (e.g. ``'TRAIN'`` or ``'TEST'``); the
        position of the item within ``lyrics`` is appended after an underscore.

    Returns
    -------
    list of ``LabeledSentence`` objects, one per input item, in input order.
    """
    # enumerate(tqdm(...)) rather than tqdm(enumerate(...)): an enumerate
    # object has no length, so tqdm could only print a bare counter; wrapping
    # the sequence itself lets tqdm show a bar with the total.
    return [
        LabeledSentence(tokens, ['%s_%s' % (label_type, position)])
        for position, tokens in enumerate(tqdm(lyrics))
    ]

# Tag every training / test document as 'TRAIN_<i>' / 'TEST_<i>'.
# NOTE: this rebinds x_train and x_test, so re-running this cell without
# re-running the split above would wrap the already-labelled items again.
x_train = labelizeLyrics(x_train, 'TRAIN')
x_test = labelizeLyrics(x_test, 'TEST')

32000it [00:00, 104419.53it/s]
8000it [00:00, 241452.64it/s]


In [6]:
# Train a 200-dimensional word2vec model on the training documents only.
# The sentence list is built once and reused for both vocabulary building
# and training.
train_sentences = [x.words for x in tqdm(x_train)]

tweet_w2v = Word2Vec(size=200, min_count=5)  # min_count: drop words seen fewer than 5 times
tweet_w2v.build_vocab(train_sentences)
# total_examples must describe the corpus actually passed to train().
# build_vocab stores that count as corpus_count; the previous hard-coded
# 48000 did not match the 32000 training documents.
tweet_w2v.train(train_sentences, total_examples=tweet_w2v.corpus_count, epochs=4)

100%|██████████| 32000/32000 [00:00<00:00, 1620419.51it/s]
100%|██████████| 32000/32000 [00:00<00:00, 1657520.57it/s]


(23532388, 30846252)

In [7]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [8]:
# defining the chart
output_notebook()

plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# Select the words to plot once, so the vectors and the hover labels are
# guaranteed to refer to the same words in the same order.
N_WORDS_TO_PLOT = 4000
plot_words = list(tweet_w2v.wv.vocab.keys())[:N_WORDS_TO_PLOT]

# getting the word vectors for those words; each is of 200 dimensions.
# Look-ups go through .wv, the same accessor already used for the vocabulary,
# instead of the deprecated model[word] indexing.
word_vectors = [tweet_w2v.wv[w] for w in plot_words]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = plot_words

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4000 samples in 0.030s...
[t-SNE] Computed neighbors for 4000 samples in 5.977s...
[t-SNE] Computed conditional probabilities for sample 1000 / 4000
[t-SNE] Computed conditional probabilities for sample 2000 / 4000
[t-SNE] Computed conditional probabilities for sample 3000 / 4000
[t-SNE] Computed conditional probabilities for sample 4000 / 4000
[t-SNE] Mean sigma: 0.385936
[t-SNE] KL divergence after 250 iterations with early exaggeration: 79.565971
[t-SNE] KL divergence after 1000 iterations: 1.949273


### From the above plot we see many interesting things about the word groupings. For one, on the bottom right there is an island made up entirely of French words and, slightly above it, an island made up entirely of Spanish words. On the bottom left there is also an island containing only words that end in "in", i.e. colloquial spellings of words that usually end in "ing". This is a fascinating way to group words that are similar to each other and a good first step in performing NLP.