# Explore an HDF5 file that stores the MSS information

The `pandas` module is used to open and look into the file, but requires code from the _PyTables_ package. To load this package into Python from a console:

> `$ conda install --name python3 PyTables`

### Load libraries

In [3]:
##
## SHOW: we need KMeans
##

import pandas as pd
import numpy as np
import itertools as it
import time
import os 
import sqlite3
import re
from sklearn.cluster import KMeans
print("fin")


fin


In [5]:
##
## SHOW (but don't run during presentation)
##

dbpath="/Users/Pierre/Bentley/classes/MA755/project/sqlite/"

conn_tracks = sqlite3.connect(dbpath+'track_metadata.db')
conn_lyrics = sqlite3.connect(dbpath+'mxm_dataset.db')


In [7]:
##
## SHOW:
##  - get 5000 lyric words from sqlite.
##  - These will be the columns of a pandas data frame
##

g_all_col_names = []
g_res_words = conn_lyrics.execute("select word from words")
g_fr_words = g_res_words.fetchall()
g_res_words.close()
#print(type(g_fr_words))
for word_tuple in g_fr_words:
    g_all_col_names.append(word_tuple[0])

print("fin")

len(g_all_col_names)



fin


5000

In [8]:
##
## SHOW (run) not all million only 0.2% of all songs
##
g_t1 = time.time()

g_res_track_id = conn_lyrics.execute("select distinct track_id from lyrics")
g_df_track_id = g_res_track_id.fetchall()
g_res_track_id.close()

#len(g_df_track_id)

#(g_df_track_id[4][0])
#for track_id_tuple in g_df_track_id:

g_all_row_names = []
for x in range(0,len(g_df_track_id)):
    if (0==(x%500)):
        g_all_row_names.append(g_df_track_id[x][0])

g_t2 = time.time()
# time can be 24,27 seconds
print("total time is "+str(g_t2-g_t1))

len(g_all_row_names)


total time is 12.872944116592407


476

In [9]:
##
## SHOW (don't run): make a W-I-D-E pandas data frame; 475 rows and 5000 columns
##

import time

len(g_all_row_names), len(g_all_col_names)
g_t1 = time.time()

g_pandas = pd.DataFrame(index=g_all_row_names, columns=g_all_col_names)
g_pandas = g_pandas.fillna(0) # with 0s rather than NaNs

g_t2 = time.time();

print("has taken between 6 seconds and 132 seconds")
print("total time is: "+str(g_t2-g_t1)) 


has taken between 6 seconds and 132 seconds
total time is: 5.8371593952178955


In [10]:
## Populate the Pandas DF
##
## SHOW (run)
##

g_t1 = time.time();

for row_name in g_all_row_names:
    g_res_this_terms = conn_lyrics.execute("select distinct word from lyrics where track_id = ?",[row_name])
    g_fr_this_terms = g_res_this_terms.fetchall()
    #close result set!
    g_res_this_terms.close()
    for term_idx in range(0,len(g_fr_this_terms)):
        l_col_name = g_fr_this_terms[term_idx][0]

        ## &&& We're assigning One: this array contains only 0 or 1.
        ## Other possibilities are reasonable
        g_pandas[l_col_name][row_name] = 1

g_t2 = time.time();

# between 12 and 42 seconds for 475 artist_ids
print("total time is: "+str(g_t2-g_t1)) 


total time is: 17.852564573287964


In [11]:
##
## SHOW: 5000-column dataframe shape, not typical of data frames we've
##   seen in statistics classes
##

len(g_pandas.index), len(g_pandas.keys())


(476, 5000)

In [12]:
##
## SHOW (run time ~ 1 min)
##

g_t1 = time.time()

g_num_clusters = 80

km = KMeans(n_clusters=g_num_clusters,
            init='k-means++',
            n_init=10,
            max_iter = 100,
            tol = 10e-04,
            random_state=0)
km.fit(g_pandas)

# 1 to 2 minutes
g_t2 = time.time()
print("total time is "+str(g_t2-g_t1))


    

total time is 60.47240209579468


In [13]:
##
## SHOW (run)(yes, KMeans can work if value of 'k' is large; we'll simply see a number
## of clusters with only one member, but meaningful clusters are also
## found
##

g_label_count = [
    0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0
]

for x in km.labels_:
    g_label_count[x] = g_label_count[x] + 1



In [14]:
##
## SHOW: beginnings of sizes of cluster one through custer eighty.  Most are size 1.
##

g_label_count[1:25]


[2, 1, 70, 1, 73, 213, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1]

In [15]:
##
## SHOW (run): but several clusters are useful
##

g_label_count[35:38]

for cl in [3,5,6,36]:
   print("cluster number "+str(cl)+" has "+str(g_label_count[cl])+" members")



cluster number 3 has 70 members
cluster number 5 has 73 members
cluster number 6 has 213 members
cluster number 36 has 34 members


In [16]:
##
## SHOW: what're song titles in Cluster one? Are these reasonable? (assuming one knows the songs)
## (short execution time)

g_cluster_1_track_id = []

for i in range(0,len(km.labels_)):
    g_track_name = g_pandas.index[i]
    g_cluster_num = km.labels_[i]
    if (g_cluster_num==3):
        g_cluster_1_track_id.append(g_track_name)



In [18]:
##
## SHOW: For each track, query the "songs" title.  Be sure to close the g_res "Result Set"
## object between gathering the data, or you may run our of resource 
## (takes 5 seconds)

np_tracksAndTitles=[]
g_cluster_1_track_id = []

for i in range(0,len(km.labels_)):
    g_track_name = g_pandas.index[i]
    g_cluster_num = km.labels_[i]
    if (g_cluster_num==3):
        g_cluster_1_track_id.append(g_track_name)

for g_track_id in g_cluster_1_track_id:
    g_res = conn_tracks.execute("select track_id, title from songs where track_id = ?", [g_track_id])
    g_fra = g_res.fetchall()
    #Close a RESULT SET? A finite resource?
    g_res.close()
    np_tracksAndTitles.append([g_track_id, g_fra[0][1]])

# a list of 70 rows, eac of which is a list of column values of track_id and song title

pd_tracksAndTitle = pd.DataFrame(np_tracksAndTitles)
pd_tracksAndTitle.names = ["TrackID","Song Name"]

len(pd_tracksAndTitle.index), len(pd_tracksAndTitle.keys())


(70, 2)

In [21]:
##
## SHOW (alternate comments)
##
## Now that we can viewi the titles song, do we think Cluster One is reasonable?
##

#len(pd_tracksAndTitle.index), len(pd_tracksAndTitle.keys())
pd_tracksAndTitle



Unnamed: 0,0,1
0,TRAIDIU128F92F11C7,If My Baby (LP Version)
1,TRASYQQ128F425092B,Mr. Brightside (Jacques Lu Cont's Thin White D...
2,TRBKCEC12903CB8FB1,I'm Afraid I Forgot the Feeling
3,TRBPYCG128F932C2AB,Unsolved Mysteries
4,TRBUHJH128E0792A67,Recovering The Satellites - (10 Spot)
5,TRCCRCT128E07819CE,Bright Lights
6,TRCIHSK128F92E22E5,Laundromat
7,TRCMQED128F4261FD0,Vengeance (LP Version)
8,TRCQZLH128F933DBC1,Listen Close
9,TRCWTGI128F9313CCE,Why Do You Think They Call It Dope?
