# Eminem Data Visualization

## Imports

In [None]:
import pandas as pd
import numpy as np
import random
import keras
import nltk
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras import preprocessing
from keras.preprocessing.text import Tokenizer, one_hot, hashing_trick
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, Flatten, Dense, Dropout, SimpleRNN, GRU, LSTM # if GPU version, check out CUDNN
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator, pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from gensim.models import Word2Vec, KeyedVectors

### Data/EDA/Organizing

In [None]:
lyrics_by_song_with_headers = pd.read_csv('./Eminem Lyrics with Headers', index_col=0)

In [None]:
lyrics_by_song_without_headers = pd.read_csv('./Eminem Lyrics without Headers', index_col=0)

In [None]:
# Read the whole lyrics corpus as one string. Only the context manager opens
# the file, so the handle is always closed (a separate bare open() before it
# would leave a second handle that is never closed).
with open('lyrics_eminem.txt', 'r') as f:
    raw_data = f.read()

# One entry per text line; a trailing newline in the file yields a final empty entry.
split_data = raw_data.split('\n')

# Single-column frame holding one lyric line per row.
lyrics_by_line = pd.DataFrame(split_data)
lyrics_by_line.columns = ['Eminem Lyrics by Line']

lyrics_by_line  # each row is one line of lyrics, hence many more samples than the per-song frames

In [None]:
lyrics_by_song_with_headers # each sample as a row as in each song is a sample

In [None]:
#lyrics_by_song_with_headers['y'] = lyrics_by_song_with_headers['lyrics'] + ??

In [None]:
lyrics_by_song_without_headers # each sample as a row as in each song is a sample

## Setting X & y

In [None]:
X = lyrics_by_song_with_headers
X # lyrics_by_song_with_headers['lyrics']

In [None]:
X_no_headers = lyrics_by_song_without_headers
X_no_headers

In [None]:
X_by_line = lyrics_by_line['Eminem Lyrics by Line']

X_by_line

In [None]:
len(raw_data)

In [None]:
type(raw_data)

In [None]:
y = lyrics_by_song_with_headers

In [None]:
y

# Open question: can the per-song lyrics be turned into sequences, and then into a matrix, with the Keras Tokenizer class?

# Note: we want X to be the vectorized lyrics and y to be the original text, so X and y hold the same lyrics before vectorizing. Is that right?

In [None]:
len([i for i in X.values])

In [None]:
len([str(i) for i in X.values])

In [None]:
# Take the lyric text of each song from the 'lyrics' column. Iterating X.values
# yields one NumPy row per song, and str() of such a row is the array repr
# (brackets, quotes, escaped newlines) rather than the lyrics themselves.
songs = X['lyrics'].astype(str).tolist()  # one string per song (562 in the author's run)

In [None]:
songs[0]

# Word2Vec of corpus with nltk tokenizing

In [None]:
lyrics_to_word2vec = X_no_headers['lyrics'].values # for Word2Vec purposes no headers in lyrics

In [None]:
lyrics_to_word2vec = [nltk.word_tokenize(str(lyrics).lower()) for lyrics in lyrics_to_word2vec]

In [None]:
raw_lyrics_to_word2vec = nltk.word_tokenize(raw_data.lower())

In [None]:
# Word2Vec model with 100-dimensional vectors, ignoring tokens seen fewer than 5 times.
# NOTE(review): this is trained on lyrics_to_word2vec, the same input as `model`;
# raw_lyrics_to_word2vec is not used here — confirm which corpus was intended.
raw_model = Word2Vec(lyrics_to_word2vec, min_count = 5, size=100)

In [None]:
model = Word2Vec(lyrics_to_word2vec, min_count = 5, size=100)

In [None]:
model.vector_size

In [None]:
model.wv.most_similar('eminem', topn=25)

In [None]:
raw_model.wv.most_similar('eminem', topn=25)

# t-SNE Visualization | Code modified from: https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d

In [None]:
# Seed words; each one becomes a cluster made of its most similar words.
keys = ['eminem', 'fuck', 'dick', 'bitch', 'drugs', 'ass', 'violence', 'mom']

embedding_clusters = []  # per seed word: the vectors of its 30 nearest words
word_clusters = []       # per seed word: those nearest words, in the same order
for word in keys:
    embeddings = []
    words = []
    for similar_word, _ in raw_model.wv.most_similar(word, topn=30):
        words.append(similar_word)
        # Look vectors up through the KeyedVectors (.wv), matching the other
        # lookups in this notebook; indexing the model object itself is deprecated.
        embeddings.append(raw_model.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

In [None]:
# Stack the clusters into one array: n clusters, m words per cluster, k dimensions.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape

tsne_model_en_2d = TSNE(
    perplexity=15,
    n_components=2,
    init='pca',
    n_iter=3500,
    random_state=32,
)

# t-SNE takes a 2-D input, so flatten the cluster axis, project every word
# together, then restore the (cluster, word) layout with 2 output coordinates.
embeddings_en_2d = np.array(
    tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
).reshape(n, m, 2)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm


def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot clusters of 2-D embeddings, one colour per label, annotating every point.

    Args:
        title: Title drawn above the plot.
        labels: One legend label per cluster.
        embedding_clusters: Iterable of 2-D arrays, parallel to ``labels``;
            column 0 is used as x and column 1 as y.
        word_clusters: Iterable of word lists, parallel to ``embedding_clusters``;
            each word annotates the point at the same position.
        a: Alpha (opacity) of the scatter markers.
        filename: If truthy, the figure is saved there as a 300-dpi PNG before
            it is shown.
    """
    plt.figure(figsize=(24, 18))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the cluster colour as a single-row 2-D sequence: a bare RGBA
        # vector is ambiguous for `c` and would be value-mapped per point
        # whenever a cluster happens to contain exactly four points.
        plt.scatter(x, y, c=[color], alpha=a, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.8, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=12)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        # Save before show(): the figure may be released once it has been shown.
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')
    plt.show()

# The figure is written to disk by tsne_plot_similar_words itself (its last
# argument is the output filename). A plt.savefig() issued before the plot has
# been drawn would only write a blank image, so none is made here.
tsne_plot_similar_words('Similar words from Eminem Corpus', keys, embeddings_en_2d, word_clusters, 0.7,
                        'similar_words.png')

In [None]:
# Every vocabulary word, and its embedding vector in the same order.
words_ak = list(raw_model.wv.vocab)
embeddings_ak = [raw_model.wv[token] for token in words_ak]

# Project the full vocabulary down to two dimensions.
tsne_ak_2d = TSNE(perplexity=30, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_ak_2d = tsne_ak_2d.fit_transform(embeddings_ak)

def tsne_plot_2d(label, embeddings, words=(), a=1, filename="hhh.png"):
    """Scatter-plot a single set of 2-D embeddings, optionally annotated with words.

    Args:
        label: Legend label for the point cloud.
        embeddings: 2-D array; column 0 is used as x and column 1 as y.
        words: Optional sequence of words; the i-th word annotates the i-th
            point. Defaults to an empty (immutable) tuple, i.e. no annotations.
        a: Alpha (opacity) of the scatter markers.
        filename: Path the figure is saved to as a 150-dpi PNG. Defaults to
            "hhh.png", the path this function has always written to.
    """
    plt.figure(figsize=(16, 9))
    # A single colour from the rainbow map, as a one-row 2-D array.
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embeddings[:, 0]
    y = embeddings[:, 1]
    plt.scatter(x, y, c=colors, alpha=a, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.legend(loc=4)
    plt.grid(True)
    # Save before show(): the figure may be released once it has been shown.
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

# Plot the 2-D projection of the whole vocabulary with faint markers (alpha 0.1).
# `words` is left at its default, so no per-point labels are drawn (words_ak is not passed).
tsne_plot_2d('Eminem Corpus', embeddings_ak_2d, a=0.1)


In [None]:
# Every vocabulary word, and its embedding vector in the same order.
words_wp = list(raw_model.wv.vocab)
embeddings_wp = [raw_model.wv[token] for token in words_wp]

# Project the full vocabulary down to three dimensions.
tsne_wp_3d = TSNE(perplexity=30, n_components=3, init='pca', n_iter=3500, random_state=12)
embeddings_wp_3d = tsne_wp_3d.fit_transform(embeddings_wp)

from mpl_toolkits.mplot3d import Axes3D


def tsne_plot_3d(title, label, embeddings, a=1):
    """Scatter-plot a set of 3-D embeddings on 3-D axes.

    Args:
        title: Title drawn above the plot.
        label: Legend label for the point cloud.
        embeddings: 2-D array; columns 0, 1 and 2 are used as x, y and z.
        a: Alpha (opacity) of the scatter markers.
    """
    fig = plt.figure()
    # Create the 3-D axes through the figure so they are attached to it;
    # an Axes3D built directly from the figure is not reliably registered
    # across Matplotlib versions.
    ax = fig.add_subplot(111, projection='3d')
    # A single colour from the rainbow map, as a one-row 2-D array.
    colors = cm.rainbow(np.linspace(0, 1, 1))
    # Draw on the 3-D axes: pyplot.scatter's third positional argument is the
    # marker size `s`, so the z column would not be used as a coordinate there.
    ax.scatter(embeddings[:, 0], embeddings[:, 1], embeddings[:, 2], c=colors, alpha=a, label=label)
    ax.legend(loc=4)
    ax.set_title(title)
    plt.show()


# Plot the 3-D projection of the whole vocabulary with faint markers (alpha 0.1).
tsne_plot_3d('Visualizing Embeddings using t-SNE', 'Eminem Lyrics Corpus', embeddings_wp_3d, a=0.1)

# Embedding/Pre-Processing/Encoding Text Data/Lyrics with Keras

In [None]:
# Use an underscore as the digit separator: a comma (`1,560`) builds the tuple (1, 560).
max_len = 1_560  # most words in a single song
training_samples = len(songs)  # I want to train the model on every song
max_words = 21_000

In [None]:
t = Tokenizer(num_words=max_words) # instantiating Tokenizer class.

In [None]:
t.document_count

In [None]:
# Learn the vocabulary from the corpus; every encoding call below relies on this fit.
t.fit_on_texts(songs)

# Token -> integer index mapping produced by the fit (a dict).
word_index = t.word_index

# Each song as a list of integer token ids, plus the matrix form of those sequences.
song_sequences = t.texts_to_sequences(songs)
song_matrix = t.sequences_to_matrix(song_sequences)

# The same songs vectorized straight from text in three modes.
one_hot_lyrics = t.texts_to_matrix(songs, mode='binary')  # one hot encoded vectors
cvec_lyrics = t.texts_to_matrix(songs, mode='count')      # count vectors
tfidf_lyrics = t.texts_to_matrix(songs, mode='tfidf')     # tfidf vectors
# Open question from the author: can these vectorized songs serve as X to train
# a model and then predict a new y?

print(f'Found {len(word_index)} unique tokens/words.')

In [None]:
len(word_index.keys())

In [None]:
word_index

In [None]:
len(song_sequences)

In [None]:
type(song_sequences) # list of integers

In [None]:
type(song_matrix)

In [None]:
len(song_matrix)

In [None]:
print(f'Word Counts: {t.word_counts}')
print(f'Document Count: {t.document_count}')
print(f'Word Index: {t.word_index}')
print(f'Word Docs: {t.word_docs}')

In [None]:
data = pad_sequences(sequences=song_sequences, padding='post')
# Turns the list of integer sequences into a 2D NumPy array in which every row has the same
# length, with padding added at the end of each sequence (padding='post').
# NOTE(review): maxlen is not passed, so max_len is not applied here — presumably rows are
# padded to the longest sequence; confirm against the Keras pad_sequences documentation.

In [None]:
len(data)

In [None]:
data