In [8]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import csv
from sklearn.manifold import TSNE
%matplotlib inline

In [9]:
# Load data
# Pull the artist names, song titles and prediction array out of the HDF5 file.
with h5py.File("data/predictions.hdf5", mode="r") as f:
    # Name/title entries are decoded from UTF-8 bytes into str.
    artists = [raw_artist.decode("utf-8") for raw_artist in f["artist_test"][:]]
    songs = [raw_song.decode("utf-8") for raw_song in f["song_test"][:]]
    # [:] reads the full dataset while the file is still open.
    predictions = f["predictions"][:]

In [10]:
# Prediction sanity check
# Rank every song by component 8 of its prediction vector so the lowest-scoring
# tracks can be inspected by eye.
# Assumes predictions is a 2-D array (one row per song) -- TODO confirm.
y_val = predictions[:, 8]  # column slice instead of a per-row Python loop
# Build the frame from named columns: a transposed list-of-lists construction
# coerces every column (including the numeric "latent") to object dtype.
data = pd.DataFrame({"artist": artists, "songs": songs, "latent": y_val})
data1 = data.sort_values("latent", ascending=True)
# Default integer row labels, i.e. positions in the loaded arrays, of the
# 7000 smallest values.
idx = data1.index.values[:7000]

In [11]:
# Rank every song by component 9 of its prediction vector.
# Assumes predictions is a 2-D array (one row per song) -- TODO confirm.
y_val = predictions[:, 9]  # column slice instead of a per-row Python loop
# Build the frame from named columns: a transposed list-of-lists construction
# coerces every column (including the numeric "latent") to object dtype.
data = pd.DataFrame({"artist": artists, "songs": songs, "latent": y_val})
data2 = data.sort_values("latent", ascending=True)
# Default integer row labels, i.e. positions in the loaded arrays, of the
# 7000 smallest values.
idx2 = data2.index.values[:7000]

In [16]:
# Display the first 20 rows of data1 (sorted ascending by "latent").
data1.iloc[:20]

Unnamed: 0,artist,songs,latent
216605,autoKratz,French Girls Play Guitar,-0.0171852
192155,Steed Lord,Feel The Heat,-0.0141249
46351,"Clarence ""Gatemouth"" Brown",Dangerous Critter,-0.00935444
93116,Fischerspooner,All We Are,-0.00905588
254824,Christina Rosenvinge,Tu Por Mi,-0.00882402
29090,Peter Baumann,Meridian Moorland,-0.00857477
80870,RJD2,Mooore,-0.0084967
39065,Cyan Velvet Project,SOUNDTRACK FOR TRAGEDY,-0.00849468
36128,Motel,"Popurri ""Presente y sutil""_ ""A ti""_ ""Dime ven""...",-0.00842609
236055,Mochipet,Do Geese See God?,-0.0083339


In [15]:
# Display the first 20 rows of data2 (sorted ascending by "latent").
data2.iloc[:20]

Unnamed: 0,artist,songs,latent
259262,Gang Starr,Playtawin (Explicit),-0.00480558
211383,Blue Six,Yeah,-0.00414531
222550,Hot Chip,Tchaparian,-0.00379578
115893,DMX,The Prayer IV,-0.00364065
191671,Loudon Wainwright III,Man Who Couldn't Cry,-0.00351083
30001,Tom Cloud,Isolation,-0.00348402
131408,Freddie McGregor,Why Did You Do It (Album Version),-0.00334159
84128,Guadalupe Pineda;Trio Flores;Martinez & Muoz,Perdon,-0.00323943
4394,Peverelist,Junktion,-0.00318592
56250,Earthling,Humandust,-0.00316321


In [5]:
# Union of the two index selections, de-duplicated and capped at 10000 entries.
# np.unique returns the values sorted, so the entries that survive the cap are
# well defined (the smallest indices) instead of depending on the iteration
# order of a Python set, which is an implementation detail.
idx_all = list(np.unique(np.concatenate([idx, idx2]))[:10000])

In [6]:
# Keep only the rows selected by idx_all, in idx_all order, for both label
# lists and for the prediction array so the three stay aligned row-for-row.
artists_filtered, songs_filtered = (
    list(np.array(labels)[idx_all]) for labels in (artists, songs)
)
predictions_filtered = predictions[idx_all]

In [None]:
# TSNE
# Embed the filtered prediction vectors into two dimensions; random_state is
# pinned to 0.
tisney = TSNE(random_state=0, n_components=2)
vectors2d = tisney.fit_transform(predictions_filtered)

In [7]:
# Save TSNE
# NOTE: writing the 2-D coordinates themselves is currently disabled:
# np.savetxt("tsne_vec.csv",vectors2d,delimiter=",")

# Write the artist labels of the filtered rows as a one-column CSV
# (to_csv defaults are used, so the row index is written as well).
artists_df = pd.DataFrame(data=artists_filtered, columns=["artists"])
artists_df.to_csv(path_or_buf="data/tsne_artists.csv")

# Same for the song titles.
songs_df = pd.DataFrame(data=songs_filtered, columns=["songs"])
songs_df.to_csv(path_or_buf="data/tsne_songs.csv")

In [None]:
# Plot
# Scatter a window of the 2-D embedding and annotate each point with its artist.
# vectors2d was computed from predictions_filtered, so its rows line up with
# artists_filtered (both selected via idx_all) -- not with the full `artists`
# list. Labels must therefore come from artists_filtered, otherwise points are
# annotated with the wrong artist.
plot_start, plot_count = 2000, 301  # rows 2000..2300 inclusive
window = slice(plot_start, plot_start + plot_count)

plt.figure(figsize=(15, 15))
# zip stops at the shorter sequence, so a short embedding plots fewer points
# instead of raising IndexError.
for artist, (x, y) in zip(artists_filtered[window], vectors2d[window]):
    # One scatter call per point keeps the per-point colour cycling.
    plt.scatter(x, y)
    plt.annotate(artist, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()