In [2]:
import numpy as np
import pandas as pd
import h5py
import glob
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
%matplotlib inline

In [None]:
# Aggregate all X
with h5py.File("data/X/X_A.hdf5","r") as f:
    X = f["mfcc"][:]
    artist = [i.decode("utf-8") for i in f["artist"][:]]
    title = [i.decode("utf-8") for i in f["title"][:]]
    song_id = [i.decode("utf-8") for i in f["song_id"][:]]
        
files = glob.glob("data/X/X_*.hdf5")
files.remove("data/X/X_A.hdf5")

for file in files:
    with h5py.File(file,"r") as f:
        X = np.concatenate((X,f["mfcc"][:]))
        artist += [i.decode("utf-8") for i in f["artist"][:]]
        title += [i.decode("utf-8") for i in f["title"][:]]
        song_id += [i.decode("utf-8") for i in f["song_id"][:]]

In [None]:
# Store aggregated X
with h5py.File("data/X/X.hdf5", "w") as f:
    f.create_dataset("mfcc", data=X)
    ascii_artist = [n.encode("ascii", "ignore") for n in artist]
    f.create_dataset('artist', data=ascii_artist)
    ascii_title = [n.encode("ascii", "ignore") for n in title]
    f.create_dataset('title', data=ascii_title)
    ascii_song_id = [n.encode("ascii", "ignore") for n in song_id]
    f.create_dataset('song_id', data=ascii_song_id)

In [None]:
# Load X
with h5py.File("data/X/X.hdf5","r") as f:
    X = f["mfcc"][:]
    artist = [i.decode("utf-8") for i in f["artist"][:]]
    title = [i.decode("utf-8") for i in f["title"][:]]
    song_id = [i.decode("utf-8") for i in f["song_id"][:]]

In [None]:
# Load Y
f = h5py.File("data/Y/U.hdf5", "r")
Y_temp = f["Y"][:]
songs = [i[0].decode("utf-8") for i in f["songs"][:]]
f.close()

In [None]:
# Match X and Y
indices = [songs.index(song) for song in song_id]

Y = Y_temp[indices] 
songz = []
for i in indices:
    songz.append(songs[i])

In [None]:
# Store data
with h5py.File("data/data.hdf5", "w") as f:
    f.create_dataset("X", data=X)
    ascii_artist = [n.encode("ascii", "ignore") for n in artist]
    f.create_dataset('artist', data=ascii_artist)
    ascii_title = [n.encode("ascii", "ignore") for n in title]
    f.create_dataset('song', data=ascii_title)
    f.create_dataset("Y", data=Y)

In [3]:
# Load data
with h5py.File("data/data.hdf5", "r") as f:
    X = f["X"][:]
    artist = f["artist"][:]
    song = f["song"][:]
    Y = f["Y"][:]

In [4]:
# Train test split
X_train, X_test, artist_train, artist_test, song_train, song_test, y_train, y_test=train_test_split(X,
                                                                                                    artist,
                                                                                                    song,Y,
                                                                                                    test_size=.3)
print(X_train.shape)
print(X_test.shape)
print(artist_train.shape)
print(artist_test.shape)
print(song_train.shape)
print(song_test.shape)
print(y_train.shape)
print(y_test.shape)

(264189, 200, 12)
(113225, 200, 12)
(264189,)
(113225,)
(264189,)
(113225,)
(264189, 10)
(113225, 10)


In [8]:
# Store split data
with h5py.File("data/split_data.hdf5", "w") as f:
    f.create_dataset("X_train", data=X_train)
    f.create_dataset("X_test", data=X_test)
    f.create_dataset('artist_train', data=artist_train)
    f.create_dataset('artist_test', data=artist_test)
    f.create_dataset('song_train', data=song_train)
    f.create_dataset('song_test', data=song_test)
    f.create_dataset('Y_train', data=y_train)
    f.create_dataset('Y_test', data=y_test)

In [6]:
# Collaborative filtering sanity check
Y_1 = [i[0] for i in y_train]
data = pd.DataFrame(data=[artist_train,song,Y_1]).T
data.columns = ["artist","song","latent"]
data.sort_values("latent",ascending=True)

Unnamed: 0,artist,song,latent
214283,b'Eminem',"b""Talkin' To Myself""",-0.350762
224893,b'Eminem',b'Smashing Of The Amps',-0.3338
48319,b'Eminem',b'Louisiana Sky',-0.310395
185438,b'Linkin Park',b'So Sure',-0.306577
250938,b'Eminem / Dina Rae',b'Heart Of Stone (Chant Them)',-0.299142
133267,b'Evanescence',"b""Kowaka D'Amour (Live)""",-0.295787
170593,b'Evanescence',b'The Price Is Too High',-0.295787
163498,b'Rise Against',b'Till You Come to Me',-0.279248
179846,b'Eminem',b'Same Girl',-0.264283
27522,b'Eminem',b'Honey Don\x19t You Understand',-0.264283


In [None]:
# Explore data
latent = [i[2] for i in Y]
extremes = [i for i in latent if i > 0.01 or i < -0.01]
print("Num extremes: "+str(len(extreme)))
print("Max: "+str(max(latent)))
print("Max: "+str(min(latent)))
plt.hist(latent);

In [None]:
mfcc_features = [i[100][11] for i in X]
plt.hist(mfcc_feature);