In [1]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.feature_extraction.text import TfidfVectorizer #, CountVectorizer
from sklearn.decomposition import NMF #, LatentDirichletAllocation, TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE

In [2]:
# Directory that receives the TensorFlow checkpoint, the event files and the
# projector configuration.
# NOTE(review): this is an absolute, machine-specific path — adjust before
# running the notebook anywhere else.
LOG_DIR = '/Volumes/BC_Clutch/Dropbox/recommender_system/data'

# Name given to the TensorFlow variable that holds the embedding matrix.
NAME_TO_VISUALISE_VARIABLE = "genre_embedding"

# NOTE(review): not referenced by any cell visible here; it matches the 505
# rows reported by the export cell's shape printout — confirm before removing.
TO_EMBED_COUNT = 505

# Locations inside LOG_DIR for the projector's label and vector files.
path_for_metadata = os.path.join(LOG_DIR, 'metadata.tsv')
path_for_vectors = os.path.join(LOG_DIR, 'vectors.tsv')

In [3]:
# Read the artist/tag table and key it by its `idx` column.
tag_data = pd.read_csv('../data/recsys_tags2.csv').set_index(['idx'])

# Tags are stored pipe-delimited; rewrite each string as a comma-separated
# list so the vectoriser's tokeniser sees ordinary separators.
tag_data['tags'] = tag_data['tags'].apply(lambda raw: raw.replace('|', ', '))

tag_data.head()

Unnamed: 0_level_0,name,tags
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3 Doors Down,"00s, 2008, 90s, acoustic, acoustic rock, alter..."
1,30 Seconds to Mars,"00s, <3, acoustic, addictive, alternative, alt..."
2,3OH!3,"00s, 10s, 2007, 2008, 2009, 2010, <3, addictiv..."
3,50 Cent,"00s, american, awesome, catchy, club, dance, g..."
4,A Day to Remember,"<3, acoustic, alternative, american, christian..."


In [4]:
# Random state handed to NMF, KMeans and t-SNE in the cells below.
seed = 42

# Used both as the NMF component count and as the KMeans cluster count.
n_topics = 6

# Text corpus for vectorisation: the `tags` column of the tag table.
data = tag_data['tags']

In [5]:
# TF-IDF over the tag strings; max_df=0.6 discards terms whose document
# frequency exceeds 60% of the rows.
tfidf = TfidfVectorizer(max_df=0.6)
X = tfidf.fit_transform(data)

# Factorise the TF-IDF matrix into `n_topics` non-negative components.
# `X_reduced` holds one row per input document and one column per component.
nmf = NMF(
    n_components=n_topics,
    init='nndsvd',
    alpha=.1,
    l1_ratio=.5,
    random_state=seed,
)
X_reduced = nmf.fit_transform(X)

In [6]:
# Cluster the NMF representation into `n_topics` groups.
kmeans = KMeans(
    init="k-means++",
    n_clusters=n_topics,
    random_state=seed,
)

# fit() returns the estimator itself, so `kmf` and `kmeans` refer to the
# same fitted object.
kmf = kmeans.fit(X_reduced)
labels = kmeans.labels_

In [7]:
# Project the NMF representation to 2-D; `tsne` ends up bound to the
# resulting coordinate array.
tsne = TSNE(
    n_components=2,
    init='pca',
    perplexity=40,
    n_iter=500,
    random_state=seed,
).fit_transform(X_reduced)

# One row per artist: t-SNE coordinates, KMeans cluster id and artist name.
# The names are passed as a plain list so they are attached by position
# rather than aligned on the `idx` index of `tag_data`.
df_tsne = pd.DataFrame(tsne, columns=["tsne_x", "tsne_y"]).assign(
    Cluster=labels,
    Artist=list(tag_data['name']),
)

In [8]:
# Export the NMF embedding and its labels as header-less, index-less TSV
# files, the layout accepted by the Embedding Projector:
# https://projector.tensorflow.org/
vectors = pd.DataFrame(X_reduced)
meta = pd.DataFrame(df_tsne['Artist'])

# Every vector needs exactly one label row, otherwise the projector
# mislabels or rejects the points.
assert len(vectors) == len(meta), "vectors and metadata must have the same number of rows"
print(vectors.shape, meta.shape)

# Copies for manual upload to the hosted projector.
meta.to_csv('../data/recsys_tags_tab_meta.tsv', sep='\t', index=False, header=False)
vectors.to_csv('../data/recsys_tags_tab_vectors.tsv', sep='\t', index=False, header=False)

# The TensorBoard projector config sets `metadata_path` to `path_for_metadata`,
# so the labels must also be written there; otherwise TensorBoard finds no
# metadata file when the notebook is run from a fresh kernel.
meta.to_csv(path_for_metadata, sep='\t', index=False, header=False)

(505, 6) (505, 1)


In [9]:
# Wrap the exported vectors in a named TensorFlow variable; the projector
# config later refers to this variable by its tensor name.
embedding_var = tf.Variable(
    initial_value=vectors,
    name=NAME_TO_VISUALISE_VARIABLE,
)

In [11]:
# Build the projector configuration with a single embedding entry.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

# Label file for the points (one label per embedding row).
embedding.metadata_path = path_for_metadata

# Tie the entry to the variable that holds the vectors.
embedding.tensor_name = embedding_var.name

In [12]:
# Open a session and run the initialiser so the embedding variable holds
# its values before it is checkpointed.
sess = tf.InteractiveSession()
init_op = tf.global_variables_initializer()
sess.run(init_op)

# Checkpoint the graph's variables into LOG_DIR; global_step=1 produces the
# "-1" suffix on the returned checkpoint path.
saver = tf.train.Saver()
checkpoint_prefix = os.path.join(LOG_DIR, "model.ckpt")
saver.save(sess, checkpoint_prefix, global_step=1)

'/Volumes/BC_Clutch/Dropbox/recommender_system/data/model.ckpt-1'

In [13]:
# Event-file writer targeting LOG_DIR; it also records the session's graph.
summary_writer = tf.summary.FileWriter(logdir=LOG_DIR, graph=sess.graph)

In [14]:
# Register the embedding configuration with the projector plugin for the
# writer's log directory.
projector.visualize_embeddings(
    summary_writer=summary_writer,
    config=config,
)

In [15]:
# Display the log directory so it can be handed to `tensorboard --logdir`.
LOG_DIR

'/Volumes/BC_Clutch/Dropbox/recommender_system/data'

In [18]:
%%sh
tensorboard --logdir=/Volumes/BC_Clutch/Dropbox/recommender_system/data

Process is terminated.
