In [6]:
import pandas as pd
import numpy  as np
import sklearn.neighbors as sn
import os
import re
import itertools as it
import operator 
import functools

In [7]:
def get_filenames(path):
    return([get_filenames(path+"/"+entry.name)
            if entry.is_dir() 
            else path+"/"+entry.name 
            for entry 
            in os.scandir(path)
           ])

def unlist(alist):
    return(list(it.chain.from_iterable(alist)
               )
          )

def var_list(base,numof):
    return([base+str(ndx) for ndx in range(numof)]
          )

def h1d_array(in_array,n): 
    # n1d is the number of elements in `in_array`
    n1d = functools.reduce(operator.mul,
                           list(in_array.shape))
    # return a 1 row 2D array with `n` columns
    b = np.ndarray(shape=(1,n1d),
                   buffer=in_array,
                   dtype=in_array.dtype
                  )[0:1,0:n]
    return(b)

In [8]:
def make_1row_df(filename='', metadata_vars=[], analysis_vars=[], remove=False):
    # open `filename` as a HDF5 file
    store = pd.HDFStore(filename,"r")
    if remove==True:
        # `metadata_vars` and `analysis_vars` contain the variables to remove
        metadata_vars = list({item for item 
                                  in list(store.root.metadata.songs.read().dtype.names) 
                                  if item not in metadata_vars})
        analysis_vars = list({item for item 
                                  in list(store.root.analysis.songs.read().dtype.names) 
                                  if item not in analysis_vars})
    # else: `metadata_vars` and `analysis_vars` contain the variables to keep
    
    # retrieve the first `n` values as a horizontal array of 1 dimension
    segments_pitches = h1d_array(store.root.analysis.segments_pitches.read(),60)
    segments_timbre  = h1d_array(store.root.analysis.segments_timbre.read(),60)
    bars_confidence  = h1d_array(store.root.analysis.bars_confidence.read(),10)
    artist_terms     = h1d_array(store.root.metadata.artist_terms.read(),3)
    
    # store these values as variables in single dataframes
    at_df = pd.DataFrame(artist_terms    ,columns=var_list('at_',artist_terms    .shape[1]))
    bc_df = pd.DataFrame(bars_confidence ,columns=var_list('bc_',bars_confidence .shape[1]))
    sp_df = pd.DataFrame(segments_pitches,columns=var_list('sp_',segments_pitches.shape[1]))
    st_df = pd.DataFrame(segments_timbre ,columns=var_list('st_',segments_timbre .shape[1]))
    
    # merge these single dataframes into one single row dataframe
    ret = pd.concat([
            # retrieve a single row dataframe from `/metadata/songs`
            pd.DataFrame(store.root.metadata.songs.read(), 
                         columns=metadata_vars),
            # retrieve a single row dataframe from `/analysis/songs`
            pd.DataFrame(store.root.analysis.songs.read(), 
                         columns=analysis_vars),
            #at_df, 
            bc_df, 
            sp_df,
            st_df],
            axis=1) # `axes=1` means stack the dataframes horizontally 
    # close the HDF5 file
    store.close()
    # return the merged dataframe
    return(ret)

In [4]:
path = "D:\\millionsongsubset_full\\MillionSongSubset\\data"
filenames = unlist(unlist(unlist(get_filenames(path))))

mss_df_list = [make_1row_df(filename=filename,
                            metadata_vars=['artist_familiarity','artist_hotttnesss',
                                           'song_hotttnesss','title',
                                           'artist_name',
                                           'artist_location','release',
                                           'artist_longitude','artist_latitude',
                                           'artist_id','song_id','track_id'],
                            # Omit: genre
                            analysis_vars=['duration','key','loudness','mode',
                                           'tempo','time_signature'],
                            # Omit: danceability, energy
                            remove=False
                           )
                for filename in filenames[0:10000] # get data from all 10,000 files
              ]

mss_df = pd.concat(mss_df_list,axis=0).reset_index(drop=True)
save_load_path = 'D:\\ML755'
mss_df.to_pickle(save_load_path+'\\mss_df_sr.pk60')


In [9]:
save_load_path = 'D:\\ML755'
mss_df = pd.read_pickle(save_load_path+'\\mss_df_sr.pk60')


mss_df['mode']            = mss_df['mode']           .astype('float64')
mss_df['key']             = mss_df['key']            .astype('category')
mss_df['time_signature']  = mss_df['time_signature'] .astype('category')


mss_df = pd.get_dummies(mss_df, columns=['key','time_signature'], prefix=['k','ts'])



In [10]:
num_rows = 10000
mss_song_artist = pd.concat([mss_df.loc[:num_rows, 'title'],
                            mss_df.loc[:num_rows, 'artist_name'],
                            mss_df.loc[:num_rows,'artist_id'],
                            mss_df.loc[:num_rows,'song_id']
                       ],
                       axis=1
                       )

mss_song_artist

Unnamed: 0,title,artist_name,artist_id,song_id
0,"b""I Didn't Mean To""",b'Casual',b'ARD7TVE1187B99BFB1',b'SOMZWCG12A8C13C480'
1,b'Soul Deep',b'The Box Tops',b'ARMJAGH1187FB546F3',b'SOCIWDW12A8C13D406'
2,b'Amor De Cabaret',b'Sonora Santanera',b'ARKRRTF1187B9984DA',b'SOXVLOJ12AB0189215'
3,b'Something Girls',b'Adam Ant',b'AR7G5I41187FB4CE6C',b'SONHOTT12A8C13493C'
4,b'Face the Ashes',b'Gob',b'ARXR32B1187FB57099',b'SOFSOCN12A8C143F5D'
5,b'The Moon And I (Ordinary Day Album Version)',b'Jeff And Sheri Easter',b'ARKFYS91187B98E58F',b'SOYMRWW12A6D4FAB14'
6,b'Keepin It Real (Skit)',b'Rated R',b'ARD0S291187B9B7BF5',b'SOMJBYD12A6D4F8557'
7,b'Drop of Rain',b'Tweeterfriendly Music',b'AR10USD1187B99F3F1',b'SOHKNRJ12A6701D1F8'
8,b'Pink World',b'Planet P Project',b'AR8ZCNI1187B9A069B',b'SOIAZJW12AB01853F1'
9,b'Insatiable (Instrumental Version)',b'Clp',b'ARNTLGG11E2835DDB9',b'SOUDSGM12AC9618304'


In [11]:
# withdraw data 

num_rows = 10000
mss_num_df = pd.concat([mss_df.loc[:num_rows, 'sp_0':'sp_59'],
                        mss_df.loc[:num_rows, 'st_0':'st_59'],
                        mss_df.loc[:num_rows,'loudness']
                       ],
                       axis=1
                       )
mss_num_df.head()


Unnamed: 0,sp_0,sp_1,sp_10,sp_11,sp_12,sp_13,sp_14,sp_15,sp_16,sp_17,...,st_51,st_52,st_53,st_54,st_55,st_56,st_57,st_58,st_59,loudness
0,0.946,0.684,1.0,0.742,0.01,0.054,0.015,0.021,0.067,0.17,...,-58.292,16.52,-48.17,27.457,42.717,-13.197,3.489,-16.801,-8.547,-11.197
1,1.0,1.0,1.0,1.0,0.018,0.07,0.04,0.044,0.217,0.074,...,-21.399,-71.754,-23.274,32.708,7.204,38.913,20.644,20.334,-11.435,-9.843
2,1.0,0.911,0.096,0.147,0.489,1.0,0.561,0.258,0.153,0.096,...,13.779,-21.774,-7.484,-20.936,15.136,-1.335,-16.819,8.278,18.478,-9.689
3,0.651,0.592,0.693,0.663,0.506,0.135,0.109,0.102,0.104,0.075,...,12.729,-11.11,22.801,-17.896,-17.476,-21.478,10.794,-6.544,-29.117,-9.013
4,1.0,0.529,0.318,0.331,0.534,0.821,0.198,0.155,0.484,0.489,...,17.525,23.192,-1.157,-20.737,29.243,18.097,-3.008,46.894,13.278,-4.501


In [12]:
mss_num_df2 = pd.concat([mss_df.loc[:num_rows, 'sp_0':'sp_59'],
                        mss_df.loc[:num_rows, 'st_0':'st_59'],
                        mss_df.loc[:num_rows, 'k_0' :'k_11'],
                        mss_df.loc[:num_rows, 'ts_0':'ts_7'],
                        mss_df.loc[:num_rows, 'loudness']
                       ],
                       axis=1
                       )

mss_num_df2.head()

Unnamed: 0,sp_0,sp_1,sp_10,sp_11,sp_12,sp_13,sp_14,sp_15,sp_16,sp_17,...,k_9,k_10,k_11,ts_0,ts_1,ts_3,ts_4,ts_5,ts_7,loudness
0,0.946,0.684,1.0,0.742,0.01,0.054,0.015,0.021,0.067,0.17,...,0,0,0,0,0,0,1,0,0,-11.197
1,1.0,1.0,1.0,1.0,0.018,0.07,0.04,0.044,0.217,0.074,...,0,0,0,0,0,0,1,0,0,-9.843
2,1.0,0.911,0.096,0.147,0.489,1.0,0.561,0.258,0.153,0.096,...,0,0,0,0,1,0,0,0,0,-9.689
3,0.651,0.592,0.693,0.663,0.506,0.135,0.109,0.102,0.104,0.075,...,0,0,0,0,0,0,1,0,0,-9.013
4,1.0,0.529,0.318,0.331,0.534,0.821,0.198,0.155,0.484,0.489,...,0,0,0,0,0,0,1,0,0,-4.501


In [13]:
# normalize all columns
from sklearn import preprocessing
mss_num_df = mss_num_df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
mss_num_df2 = mss_num_df2.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))



#get manhatten distance
dm_m1 = sn.DistanceMetric.get_metric('minkowski',p=1)
dm_m2 = sn.DistanceMetric.get_metric('minkowski',p=2)
mss_num_dm = dm_m1.pairwise(mss_num_df[:])         # distance matric of less variables using mahhatan distance
mss_num_dm2 = dm_m2.pairwise(mss_num_df[:])        # distance matric of less variables using Euclidean ditance
 
mss_num_dm3 = dm_m1.pairwise(mss_num_df2[:])       # distance matric of more variables using mahhatan distance
mss_num_dm4 = dm_m2.pairwise(mss_num_df2[:])       # distance matric of more variables using Euclidean distance







In [11]:
from sklearn.metrics.pairwise import cosine_similarity
mss_dm_cosine = cosine_similarity(mss_num_df[:])    # distance matric of less variables using cosine distance
mss_dm_cosine2 = cosine_similarity(mss_num_df2[:])  # distance matric of more variables using cosine distance
mss_dm_cosine[0:3,0:9]

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [14]:
input_song = "Superconfidential"

def find_loc(input_song):
    i = 0
    for title in mss_df['title']:
        title_df = title.decode("utf-8")
        if input_song == title_df:
            return (i)
        i = i + 1

loc_song = find_loc(input_song)
print(loc_song)

def find_loc2(location):
    j = 0
    a = []
    for distance in mss_num_dm[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm[0:num_rows,location],6)[5] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

        
def find_loc3(location):
    j = 0
    for distance in mss_dm_cosine[0:num_rows,location]:
        if distance == np.partition(mss_dm_cosine[0:num_rows,location],9999)[9998]:
            return j
        j = j + 1

loc_re_song= find_loc2(loc_song)
print(loc_re_song)        
recom_songs = mss_df['title'][loc_re_song[:]]

def name(songs):
    b =[]
    for song in songs:
        new_song = song.decode("utf-8")
        b.append(new_song)
    return b

recom_songs_final = name(recom_songs)
print(recom_songs_final)

26
[2550, 3685, 6865, 7318, 9797]
['G G Kah', 'Where Are You', 'Mindless', 'Espana tiene sabor', 'Final Solution']


In [80]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']

def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_num_dm[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return = find_all(input_songs)
songs_return



[26,
 138,
 357,
 364,
 456,
 522,
 672,
 833,
 1267,
 1747,
 1891,
 2269,
 2475,
 2663,
 2786,
 2890,
 3215,
 3375,
 3551,
 3659,
 3719,
 3793,
 4017,
 4224,
 4236,
 4507,
 5460,
 5489,
 5941,
 6205,
 6605,
 6646,
 7131,
 7285,
 7305,
 7318,
 7429,
 7768,
 7803,
 7872,
 8219,
 8733,
 8889,
 9049,
 9279,
 9347,
 9587,
 9818,
 9849,
 9858,
 250,
 326,
 342,
 1052,
 1240,
 1389,
 1532,
 2196,
 2202,
 2451,
 2669,
 2760,
 2805,
 2955,
 3587,
 3797,
 4655,
 4898,
 5291,
 5537,
 5849,
 5936,
 6131,
 6344,
 6616,
 6749,
 6974,
 7225,
 7286,
 7460,
 7713,
 7761,
 8216,
 8336,
 8465,
 8588,
 8601,
 8902,
 9029,
 9035,
 9049,
 9253,
 9309,
 9388,
 9418,
 9679,
 9795,
 9,
 26,
 364,
 420,
 556,
 595,
 1166,
 1346,
 1347,
 1845,
 1967,
 1970,
 2048,
 2081,
 2145,
 2404,
 2540,
 2619,
 2773,
 3088,
 3192,
 3685,
 3715,
 3913,
 4465,
 4496,
 4615,
 4691,
 5183,
 5323,
 5616,
 6129,
 6672,
 6747,
 6757,
 6761,
 6921,
 6939,
 7318,
 7623,
 7714,
 7763,
 7803,
 7914,
 8253,
 9642,
 9741,
 9797,
 9818,

In [78]:
import collections
def match(songs):
    song_find = []
    for item, count in collections.Counter(songs).items():
        if count > 1:
            song_find.append(item)
            # print (song_find)
            return song_find
    if len(song_find) == 0:
        print ("not found")
result = match(songs_return)
print(result)
    

[26]


In [85]:
from collections import Counter
songs_return.sort()
counts = Counter(songs_return)
print(counts)


Counter({26: 2, 9049: 2, 9818: 2, 364: 2, 7803: 2, 7318: 2, 2048: 1, 3587: 1, 7429: 1, 4615: 1, 8336: 1, 9: 1, 522: 1, 8465: 1, 9741: 1, 9309: 1, 3088: 1, 6616: 1, 9795: 1, 9347: 1, 9858: 1, 8216: 1, 3375: 1, 1052: 1, 8733: 1, 2669: 1, 2081: 1, 4898: 1, 7460: 1, 9253: 1, 556: 1, 4655: 1, 5936: 1, 7713: 1, 1845: 1, 6921: 1, 7225: 1, 2619: 1, 6344: 1, 6205: 1, 6974: 1, 5183: 1, 833: 1, 1346: 1, 1347: 1, 9029: 1, 326: 1, 3913: 1, 2890: 1, 3659: 1, 7761: 1, 4691: 1, 5460: 1, 342: 1, 7768: 1, 6747: 1, 6749: 1, 357: 1, 2145: 1, 1891: 1, 2404: 1, 3685: 1, 2663: 1, 6761: 1, 1389: 1, 8253: 1, 5489: 1, 6757: 1, 9587: 1, 7285: 1, 7286: 1, 3192: 1, 9849: 1, 8588: 1, 4224: 1, 7763: 1, 3715: 1, 456: 1, 3719: 1, 7305: 1, 138: 1, 2955: 1, 4236: 1, 1166: 1, 3215: 1, 4496: 1, 2451: 1, 2196: 1, 8601: 1, 2202: 1, 4507: 1, 9885: 1, 9679: 1, 9797: 1, 672: 1, 5537: 1, 6939: 1, 420: 1, 4465: 1, 9642: 1, 2475: 1, 9388: 1, 1967: 1, 4017: 1, 1970: 1, 1267: 1, 5291: 1, 6672: 1, 9279: 1, 8889: 1, 7872: 1, 9035: 1,

In [77]:
mss_song_artist.loc[26,]

title           b'Superconfidential'
artist_name                   b'Clp'
artist_id      b'ARNTLGG11E2835DDB9'
song_id        b'SOZQDIU12A58A7BCF6'
Name: 26, dtype: object

In [12]:
loc_song = 26

def find_loc2(location):
    j = 0
    for distance in mss_num_dm2[0:num_rows,location]:
        if distance == np.partition(mss_num_dm2[0:num_rows,location],2)[1]:
            return j
        j = j + 1

loc_re_song = find_loc2(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")


'Where Are You'

In [15]:
loc_song = 26
def find_loc2(location):
    j = 0
    for distance in mss_num_dm3[0:num_rows,location]:
        if distance == np.partition(mss_num_dm3[0:num_rows,location],2)[1]:
            return j
        j = j + 1

loc_re_song = find_loc2(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")

'Where Are You'

In [14]:
loc_song = 26
def find_loc2(location):
    j = 0
    for distance in mss_num_dm4[0:num_rows,location]:
        if distance == np.partition(mss_num_dm4[0:num_rows,location],2)[1]:
            return j
        j = j + 1

loc_re_song = find_loc2(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")

'Back To Burn'

In [20]:
loc_song = 26

loc_re_song = find_loc3(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")

'Zydeco In D-Minor'

In [22]:
loc_song = 26
def find_loc2(location):
    j = 0
    for distance in mss_dm_cosine2[0:num_rows,location]:
        if distance == np.partition(mss_dm_cosine2[0:num_rows,location],9999)[9998]:
            return j
        j = j + 1

loc_re_song = find_loc2(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")

'Zydeco In D-Minor'