In [17]:
import pickle
from pathlib import Path

import implicit
import numpy as np
import pandas as pd
from google.cloud import storage
from pandas.io import gbq

In [3]:
#client = storage.Client(project='umg-comm-tech-dev')
#bucket = client.get_bucket('umg-comm-tech-dev')

In [4]:
class MacOSFile(object):
    """File-object wrapper that splits very large reads and writes into chunks.

    Any attribute not defined on the wrapper is forwarded to the wrapped file
    object, so an instance can be passed wherever a binary file is expected
    (this notebook hands it to ``pickle.dump`` / ``pickle.load``).

    NOTE(review): the class name suggests this works around a platform limit
    on single I/O calls of 2 GiB or more -- confirm for the target platform.
    """

    # Largest number of bytes moved by one underlying read()/write() call.
    # The previous expression ``1 << 31 - 1`` evaluated to ``1 << 30`` because
    # ``-`` binds tighter than ``<<``; the parentheses give the intended
    # 2**31 - 1.
    _MAX_CHUNK = (1 << 31) - 1

    def __init__(self, f):
        """Wrap the already-open binary file object ``f``."""
        self.f = f

    def __getattr__(self, item):
        # __getattr__ only runs for attributes missing on the wrapper itself.
        # Guard ``f`` so an uninitialised instance raises AttributeError
        # instead of recursing forever through ``self.f``.
        if item == 'f':
            raise AttributeError(item)
        return getattr(self.f, item)

    def read(self, n=-1):
        """Read up to ``n`` bytes from the wrapped file.

        Requests no larger than ``_MAX_CHUNK`` (and "read everything" requests
        with ``n`` negative or ``None``) are passed straight through. Larger
        requests are served in chunks and returned as a ``bytearray``; if the
        file ends early the result is truncated to the bytes actually read.
        """
        if n is None or n < 0:
            return self.f.read()
        if n <= self._MAX_CHUNK:
            return self.f.read(n)

        buffer = bytearray(n)
        idx = 0
        while idx < n:
            chunk = self.f.read(min(n - idx, self._MAX_CHUNK))
            if not chunk:
                # End of file reached before ``n`` bytes were available.
                break
            # Advance by the bytes actually received: a short read must not
            # shrink the pre-sized buffer or leave a gap in it.
            buffer[idx:idx + len(chunk)] = chunk
            idx += len(chunk)
        if idx < n:
            del buffer[idx:]
        return buffer

    def write(self, buffer):
        """Write ``buffer`` to the wrapped file in chunks of at most ``_MAX_CHUNK`` bytes.

        Progress is printed for every chunk. Returns the number of bytes
        handed to the wrapped file.
        """
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, self._MAX_CHUNK)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size
        return n


def pickle_dump(obj, file_path):
    """Pickle ``obj`` to ``file_path`` using the highest available protocol.

    The file is opened in binary write mode and written through ``MacOSFile``.
    Returns whatever ``pickle.dump`` returns.
    """
    with open(file_path, "wb") as raw_handle:
        chunked_writer = MacOSFile(raw_handle)
        outcome = pickle.dump(obj, chunked_writer, protocol=pickle.HIGHEST_PROTOCOL)
    return outcome


def pickle_load(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in binary read mode and read through ``MacOSFile``.

    SECURITY: unpickling can execute arbitrary code, so only call this on
    files from a trusted source.
    """
    with open(file_path, "rb") as raw_handle:
        chunked_reader = MacOSFile(raw_handle)
        loaded = pickle.load(chunked_reader)
    return loaded

In [8]:
# Load the pickled recommender from the current user's Downloads folder
# rather than from a path tied to one specific user account.
# SECURITY: unpickling executes code embedded in the file -- only load a
# model file that comes from a trusted source.
MODEL_PATH = Path.home() / 'Downloads' / 'saved_model.pkl'
model = pickle_load(MODEL_PATH)

In [203]:
# Read the ISRC list from the current user's Downloads folder rather than
# from a path tied to one specific user account.
ISRC_CSV_PATH = Path.home() / 'Downloads' / '500UMeISRCs.csv'
df_orig = pd.read_csv(ISRC_CSV_PATH)

In [235]:
# Display the rows from position 100 onwards.
df_orig.iloc[100:]

Unnamed: 0,isrc,artist,title
100,NLA270300127,The Cats,Lovin’ Arms
101,USEM39100400,Fats Domino,Shu Rah
102,GBEKZ0300200,Katherine Jenkins,Bailero
103,DEF058831109,Howard Crook,"20. ""Ich will bei meinem Jesu wachen"""
104,GBF075813000,George London,"""Leb wohl, du kühnes, herrliches Kind"""
105,GBF079740702,Paul Agnew,"No.2 Evangelist: ""Es begab sich aber zu der Z..."
106,GBAKW8801023,The Style Council,The Gardener Of Eden (A Three Piece Suite)
107,SEUM71700321,Habz,Gör plats
108,USUG11401628,Ryan Adams,Shadows
109,USHOM9000027,Ottmar Liebert,O Holy Nite


In [11]:
# Upload the ISRC list to BigQuery, replacing the table if it already exists.
gbq.to_gbq(
    df_orig,
    destination_table='amplify.ume500_isrcs',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:05,  5.35s/it]


In [12]:
# Query text: selects the tag columns from
# `umg-metadata.ircam.amplify_tem_v3_3_v` for every ISRC that appears in
# `umg-comm-tech-dev.amplify.ume500_isrcs`. The GROUP BY in the subquery
# collapses repeated ISRCs in that table to one row each.
get_existing_tags = """
SELECT
  isrc,
  confidence,
  path,
  source,
  taxonomy_node_id,
  value
FROM
  `umg-metadata.ircam.amplify_tem_v3_3_v`
WHERE
  isrc IN (
  SELECT
    isrc
  FROM
    `umg-comm-tech-dev.amplify.ume500_isrcs`
  GROUP BY
    isrc)
"""

In [13]:
# Run the tag query with the standard-SQL dialect and keep the result as a DataFrame.
tags = pd.read_gbq(
    get_existing_tags,
    project_id='umg-comm-tech-dev',
    dialect='standard',
)

In [14]:
# Preview the first rows of the tag table returned by the query.
tags.head()

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
0,GRE018000028,1.0,Anatomy/Harmony/Root,Auto,acf5ddcf-0890-4647-8823-16522e66b541,A
1,GRE018000028,0.544659,Energy/Emotion,Auto,3adcefb7-b1c1-4f4d-a40c-4032c8d35aca,Negative
2,GRE018000028,0.544659,Energy/Emotion/Negative,Auto,77822969-740e-4f56-9ea0-09de404dc071,Sad
3,GRE018000028,1.0,Energy/Intensity/Medium,Manual,0b5909f2-c61f-43f2-a51d-a473fb01a768,Medium Intensity
4,GRE018000028,0.631046,Performance/Ensemble/Ensemble Configuration,Auto,6df35aeb-4c6f-448e-8d14-1967aa24a098,Voice & Accompaniment


In [15]:
# Summary statistics for every column, including the non-numeric ones.
tags.describe(include='all')

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
count,30518,30518.0,30518,30518,30518,30518
unique,491,,100,2,256,733
top,GBF075820000,,Performance/Ensemble/Ensemble Section,Auto,ca8d3fea-e3cc-4c38-abd3-0a32e49a311f,Percussion
freq,274,,4180,29948,1062,1775
mean,,0.8521,,,,
std,,0.180539,,,,
min,,0.301012,,,,
25%,,0.732352,,,,
50%,,0.939958,,,,
75%,,1.0,,,,


In [16]:
# Distribution of the number of tag rows attached to each ISRC.
tags_per_isrc = tags.groupby('isrc')['value'].count()
tags_per_isrc.describe()

count    491.000000
mean      62.154786
std       26.924090
min        6.000000
25%       50.000000
50%       58.000000
75%       84.000000
max      274.000000
Name: value, dtype: float64

In [30]:
# Build the set once so each membership test is a hash lookup instead of a
# scan over the whole model.isrcs list.
known_isrcs = set(model.isrcs)

# Keep each tagged ISRC the model knows; ISRCs missing from the model are
# mapped to the '' placeholder, so consumers of this list must discard ''.
tags_in_recommender = [isrc if isrc in known_isrcs else '' for isrc in tags.isrc.unique()]

In [31]:
# De-duplicate and sort. Set iteration order for strings is not stable
# between kernel sessions, so a plain list(set(...)) would make any
# positional access to this list irreproducible. Sorting also places the
# '' placeholder (if present) at index 0.
tags_in_recommender = sorted(set(tags_in_recommender))

In [32]:
# Number of distinct entries; this count includes the '' placeholder when
# at least one tagged ISRC was not found in the model.
len(tags_in_recommender)

168

In [71]:
# Spot-check the element at index 1 of the list.
tags_in_recommender[1]

'GBAYE8200083'

In [91]:
def get_similar_items(seed_isrc, n=100, recommender=None):
    """Return the ``n`` catalogue items that score highest against a seed ISRC.

    The score of an item is the dot product between its row in
    ``isrc_vecs`` and the seed's row. The seed is not excluded from the
    ranking, so it can appear in the result.

    Parameters
    ----------
    seed_isrc : str
        ISRC to find neighbours for; must be present in ``recommender.isrcs``.
    n : int, default 100
        Number of top-scoring items to return.
    recommender : object, optional
        Object exposing ``isrcs`` (list of ISRCs), ``isrc_vecs`` (item
        vectors, row-aligned with ``isrcs``) and ``isrc_lookup`` (DataFrame
        with an ``isrc`` column). Defaults to the notebook-level ``model``.
        NOTE(review): the artist and track name are read from the second and
        third columns of ``isrc_lookup`` by position -- confirm that column
        order.

    Returns
    -------
    pandas.DataFrame
        Columns ``isrc``, ``artist``, ``track_name``, ``score``, ``seed_isrc``,
        ordered by descending score.

    Raises
    ------
    ValueError
        If ``seed_isrc`` is not in ``recommender.isrcs``.
    IndexError
        If a ranked ISRC has no row in ``recommender.isrc_lookup``.
    """
    if recommender is None:
        # Fall back to the model loaded earlier in the notebook.
        recommender = model

    item_id = recommender.isrcs.index(seed_isrc)

    # Vector of the seed item.
    item_vec = recommender.isrc_vecs[item_id].T

    # Dot-product score of every item against the seed, flattened to 1-D,
    # then the indices of the n largest scores in descending order.
    scores = recommender.isrc_vecs.dot(item_vec).reshape(1, -1)[0]
    top_n = np.argsort(scores)[::-1][:n]

    lookup = recommender.isrc_lookup
    records = []
    for idx in top_n:
        isrc = recommender.isrcs[idx]
        match = lookup[lookup['isrc'] == isrc].iloc[0]
        records.append({
            'isrc': isrc,
            # Explicit positional access: plain ``match[1]`` is treated as a
            # label lookup first and its positional fallback is deprecated.
            'artist': match.iloc[1],
            'track_name': match.iloc[2],
            'score': scores[idx],
        })

    similar = pd.DataFrame(records, columns=['isrc', 'artist', 'track_name', 'score'])
    similar['seed_isrc'] = seed_isrc

    return similar

In [93]:
result_columns = ['isrc', 'artist', 'track_name', 'score', 'seed_isrc']

# Drop the '' placeholder by value. Skipping "the first element" is only
# correct if the placeholder happens to sit at index 0, which a set-derived
# list does not guarantee.
seed_isrcs = [isrc for isrc in tags_in_recommender if isrc]

# Collect one frame per seed and concatenate once; concatenating inside the
# loop re-copies the accumulated frame on every iteration.
similar_frames = [get_similar_items(isrc) for isrc in seed_isrcs]

if similar_frames:
    df = pd.concat(similar_frames, ignore_index=True)[result_columns]
else:
    # No seeds: keep the same empty, correctly-labelled frame as before.
    df = pd.DataFrame(columns=result_columns)

In [94]:
# (rows, columns) of the combined similar-items frame.
df.shape

(16700, 5)

In [95]:
# Preview the first rows of the combined similar-items frame.
df.head()

Unnamed: 0,isrc,artist,track_name,score,seed_isrc
0,GBAYE8200083,Thomas Dolby,She Blinded Me With Science,0.012011,GBAYE8200083
1,CHD771800006,Alejandro Reyes,Solamente,0.010726,GBAYE8200083
2,ROARA1700019,Andra,Shukar,0.01003,GBAYE8200083
3,USSD11900123,Descemer Bueno,Nos Fuimos Lejos (feat. Ece Seçkin & El Micha)...,0.009937,GBAYE8200083
4,ushm20708304,Emile Pandolfi,Once Upon a December (from Anastasia),0.009774,GBAYE8200083


In [96]:
# Upload the similar-items frame to BigQuery, replacing the table if it exists.
gbq.to_gbq(
    df,
    destination_table='amplify.ume500_100similar',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:00,  6.21it/s]


In [97]:
# Query text: selects the tag columns from
# `umg-metadata.ircam.amplify_tem_v3_3_v` for every ISRC that appears in
# `umg-comm-tech-dev.amplify.ume500_100similar`. The GROUP BY in the subquery
# collapses repeated ISRCs in that table to one row each.
get_existing_tags_similar = """
SELECT
  isrc,
  confidence,
  path,
  source,
  taxonomy_node_id,
  value
FROM
  `umg-metadata.ircam.amplify_tem_v3_3_v`
WHERE
  isrc IN (
  SELECT
    isrc
  FROM
    `umg-comm-tech-dev.amplify.ume500_100similar`
  GROUP BY
    isrc)
"""

In [98]:
# Run the similar-items tag query with the standard-SQL dialect.
tags_similar = pd.read_gbq(
    get_existing_tags_similar,
    project_id='umg-comm-tech-dev',
    dialect='standard',
)

In [99]:
# Preview the first rows of the tags fetched for the similar items.
tags_similar.head()

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
0,UKG4Y1700048,1.0,Anatomy/Harmony/Root,Auto,e5d51dc7-8a75-4cd8-8efd-a3f18eddc866,Bb/A#
1,UKG4Y1700048,0.927146,Genre,Auto,195e1e65-82e9-4a49-96de-fca22301027e,Electronic
2,QZG4T1900011,1.0,Anatomy/Harmony/Root,Auto,71fade6b-1c0e-4b10-9ee3-2178a7548c86,G
3,QZG4T1900011,0.912205,Genre,Auto,3ffa3c14-6b46-4295-9618-77b1de08c9b6,R&B
4,QZG4T1900011,0.912205,Genre/R&B,Auto,d460dddd-2bf3-434b-81de-86b5a60d285a,Contemporary R&B


In [101]:
# Distribution of the number of tag rows attached to each similar-item ISRC.
similar_tags_per_isrc = tags_similar.groupby('isrc')['value'].count()
similar_tags_per_isrc.describe()

count    1235.000000
mean       79.149798
std        70.874631
min         6.000000
25%        50.000000
50%        60.000000
75%        90.000000
max      1118.000000
Name: value, dtype: float64

In [103]:
# Number of distinct tag values per ISRC, as a two-column frame (isrc, value).
original_count = tags.groupby('isrc')['value'].nunique().reset_index()
original_count.head()

Unnamed: 0,isrc,value
0,ARA730600041,26
1,ATT259608440,33
2,ATUM71500361,26
3,AUAB00050243,30
4,AUAB00500696,30


In [104]:
# Number of distinct tag values per similar-item ISRC, as a frame (isrc, value).
similar_count = tags_similar.groupby('isrc')['value'].nunique().reset_index()
similar_count.head()

Unnamed: 0,isrc,value
0,ARF410600414,30
1,ATB158500018,18
2,ATPB51801014,26
3,AUAP08200012,30
4,AUBEC1601210,23


In [107]:
# Attach each seed's distinct-tag count to its similar-item rows. The
# right-hand `isrc` key column comes through as `isrc_original`.
df_all = df.merge(
    original_count,
    left_on='seed_isrc',
    right_on='isrc',
    suffixes=['', '_original'],
)
df_all.head()

Unnamed: 0,isrc,artist,track_name,score,seed_isrc,isrc_original,value
0,GBAYE8200083,Thomas Dolby,She Blinded Me With Science,0.012011,GBAYE8200083,GBAYE8200083,30
1,CHD771800006,Alejandro Reyes,Solamente,0.010726,GBAYE8200083,GBAYE8200083,30
2,ROARA1700019,Andra,Shukar,0.01003,GBAYE8200083,GBAYE8200083,30
3,USSD11900123,Descemer Bueno,Nos Fuimos Lejos (feat. Ece Seçkin & El Micha)...,0.009937,GBAYE8200083,GBAYE8200083,30
4,ushm20708304,Emile Pandolfi,Once Upon a December (from Anastasia),0.009774,GBAYE8200083,GBAYE8200083,30


In [108]:
# Attach each similar item's own distinct-tag count; the clashing `value`
# column from the right-hand frame is suffixed to `value_similar`.
df_all = df_all.merge(
    similar_count,
    on='isrc',
    suffixes=['', '_similar'],
)
df_all.head()

Unnamed: 0,isrc,artist,track_name,score,seed_isrc,isrc_original,value,value_similar
0,GBAYE8200083,Thomas Dolby,She Blinded Me With Science,0.012011,GBAYE8200083,GBAYE8200083,30,30
1,USMC18416586,Harold Faltermeyer,"Axel F - From ""Beverly Hills Cop""",0.009662,GBAYE8200083,GBAYE8200083,30,22
2,USMC18416586,Harold Faltermeyer,"Axel F - From ""Beverly Hills Cop""",0.009969,USAR18400004,USAR18400004,28,22
3,USMC18416586,Harold Faltermeyer,"Axel F - From ""Beverly Hills Cop""",0.006738,USCH38400069,USCH38400069,31,22
4,USMC18416586,Harold Faltermeyer,"Axel F - From ""Beverly Hills Cop""",0.001448,GBUM71601386,GBUM71601386,6,22


In [113]:
# Upload the merged frame to BigQuery, replacing the table if it exists.
gbq.to_gbq(
    df_all,
    destination_table='amplify.ume500_initial_and_similar',
    project_id='umg-comm-tech-dev',
    if_exists='replace',
)

1it [00:00,  3.79it/s]


In [240]:
# Similar items of one seed, highest score first.
seed_rows = df_all.loc[df_all['seed_isrc'] == 'GBF076724460']
seed_rows.sort_values('score', ascending=False)

Unnamed: 0,isrc,artist,track_name,score,seed_isrc,isrc_original,value,value_similar
228,FR0LO1800026,Jean-Michel Jarre,ROBOTS DON'T CRY (movement 3),0.004392,GBF076724460,GBF076724460,27,25
920,USUM70913044,JAY Z,"Money, Cash, Hoes",0.004179,GBF076724460,GBF076724460,27,24
642,CAQ521820001,Lara Fabian,For me… formidable,0.004162,GBF076724460,GBF076724460,27,18
775,CZN271998765,Mikolas Josef,Acapella,0.004055,GBF076724460,GBF076724460,27,22
80,GBUM71807978,James Blake,Barefoot In The Park,0.004051,GBF076724460,GBF076724460,27,21
1122,USUM71810087,The Cranberries,Íosa,0.004036,GBF076724460,GBF076724460,27,23
1763,BEA661800021,Lara Fabian,Par amour,0.004013,GBF076724460,GBF076724460,27,19
355,NOLEH1801010,Fieh,25,0.003999,GBF076724460,GBF076724460,27,84
1665,USFI86600063,Vince Guaraldi Trio,Linus And Lucy,0.003977,GBF076724460,GBF076724460,27,38
1867,USMEX0800099,Washed Out,Feel It All Around,0.003972,GBF076724460,GBF076724460,27,28


In [112]:
# Distinct tag values recorded for one ISRC.
tags.loc[tags['isrc'] == 'GBAYE8200083', 'value'].unique()

array(['C', 'Attitude', 'Sentimental', 'Pop Band', 'Low Intensity',
       'Steady', 'No Strings', 'No Winds', 'Wind Section', 'Positive',
       'Percussion', 'Romantic', 'Major', 'Drum Set', 'No Choir',
       'Medium', 'Low', 'High Vocal Range', 'Simple', 'Persistent Pulse',
       'Slow', '4', 'String Section', 'Dynamic', 'Has Percussion',
       'Electric', '130.54', 'Light', 'Choir', 'Voice & Music'],
      dtype=object)

In [115]:
# Remove rows that are identical across every column.
# NOTE(review): the similar-items tags are later de-duplicated on a column
# subset that leaves out `confidence`; confirm whether the two cells are
# meant to use different keys.
tags = tags.drop_duplicates()

In [237]:
# All tag rows recorded for one ISRC.
tags.loc[tags['isrc'] == 'ITZ041600039']

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
865,ITZ041600039,0.388283,Energy/Emotion/Positive,Auto,1d120cf1-e28a-4c97-b867-117a4662f128,Sweet
866,ITZ041600039,0.756001,Performance/Ensemble/Ensemble Configuration,Auto,6df35aeb-4c6f-448e-8d14-1967aa24a098,Voice & Accompaniment
867,ITZ041600039,0.98208,Performance/Ensemble/Ensemble Section/Choir,Auto,3bb61462-df9f-4f32-b81b-94a85f1a5bb5,Has Choir
868,ITZ041600039,0.756001,Performance/Ensemble/Ensemble Size,Auto,85fb940e-c389-419e-9180-43173c767bc3,Small Ensemble Size
869,ITZ041600039,0.98208,Performance/Voice,Auto,1461d992-66fe-4af0-aeeb-d611d324faa2,Vocal Ensemble
870,ITZ041600039,0.98208,Performance/Voice/Vocal Ensemble,Auto,c4783294-386f-40da-af3a-16e8edaa0396,Choir
4128,ITZ041600039,0.772315,Energy/Intensity/Low,Auto,0045b3a0-e3ce-439a-905b-50ab4e25f578,Calm
4205,ITZ041600039,0.772315,Energy/Intensity/Low,Auto,0045b3a0-e3ce-439a-905b-50ab4e25f578,Low Intensity
6085,ITZ041600039,0.692527,Performance/Ensemble/Ensemble Section/Wind Sec...,Auto,137e3494-28f1-44e0-ad02-625abe2d47bf,No Winds
7278,ITZ041600039,0.692527,Performance/Ensemble/Ensemble Section,Auto,1d145636-1083-46f0-aca0-7b509e892f3e,Wind Section


In [121]:
# For every ISRC, drop tag rows whose path is an ancestor of another path
# carried by the same ISRC, so only the most specific paths remain.
deduped_slices = []

# sort=False keeps ISRCs in order of first appearance.
for _, tag_slice in tags.groupby('isrc', sort=False):

    paths = tag_slice['path'].dropna().unique()

    # A path is an ancestor only when another path extends it by a further
    # '/'-separated segment. A plain substring test would also match paths
    # that merely contain the text somewhere inside them.
    ancestor_paths = {
        parent
        for parent in paths
        for child in paths
        if child.startswith(parent + '/')
    }

    deduped_slices.append(tag_slice[~tag_slice['path'].isin(ancestor_paths)])

# Concatenate once instead of growing the frame inside the loop.
if deduped_slices:
    tags_no_dupl = pd.concat(deduped_slices, ignore_index=True)
else:
    tags_no_dupl = pd.DataFrame(columns=tags.columns)

In [241]:
# Remaining tags of one ISRC, most confident first.
seed_tags = tags_no_dupl.loc[tags_no_dupl['isrc'] == 'GBF076724460']
seed_tags.sort_values('confidence', ascending=False)

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
2748,GBF076724460,1.0,Anatomy/Harmony/Root,Auto,bee72ffb-9c31-4c0a-8cf2-9d1411fe0b78,Ab/G#
2754,GBF076724460,1.0,Anatomy/Rhythm & Time/Meter/Simple,Auto,2ec3702e-5214-43d0-a016-c3ee36502dee,3
2758,GBF076724460,1.0,Anatomy/Harmony/Mode,Auto,600e609b-d06f-46bd-a5e9-40088309315b,Major
2767,GBF076724460,1.0,Anatomy/Rhythm & Time/BPM,Auto,ca8d3fea-e3cc-4c38-abd3-0a32e49a311f,51.65
2756,GBF076724460,0.99469,Performance/Ensemble/Ensemble Section/String S...,Auto,371c6532-8bca-4c2e-bfdb-1ec24b430c2a,Has Strings
2762,GBF076724460,0.99469,Performance/Instrument,Auto,92a26e27-622b-47f6-89a5-09c67c0a402e,Strings
2761,GBF076724460,0.986472,Anatomy/Rhythm & Time/Tempo/Speed,Auto,81d65be3-1438-43fb-9ca1-82414d1e9930,Slow
2766,GBF076724460,0.959383,Performance/Instrument,Auto,ada66447-1442-4b2d-afca-4dfc1b66ac32,Percussion
2764,GBF076724460,0.959383,Performance/Ensemble/Ensemble Section/Percussion,Auto,ab89b187-246b-42bd-8eac-93bde9ea9a98,Has Percussion
2760,GBF076724460,0.898329,Performance/Ensemble/Ensemble Section/Choir,Auto,7523ec70-c471-47a4-8837-8f1e595cf7e6,No Choir


In [142]:
# Remove rows that repeat the same (isrc, path, source, taxonomy_node_id,
# value) combination; `confidence` is not part of the key, so rows that
# differ only in confidence collapse to one.
tags_similar = tags_similar.drop_duplicates(subset=['isrc','path','source','taxonomy_node_id','value'])

In [143]:
# For every similar-item ISRC, drop tag rows whose path is an ancestor of
# another path carried by the same ISRC, so only the most specific paths
# remain.
similar_deduped_slices = []

# sort=False keeps ISRCs in order of first appearance.
for _, tag_slice in tags_similar.groupby('isrc', sort=False):

    paths = tag_slice['path'].dropna().unique()

    # A path is an ancestor only when another path extends it by a further
    # '/'-separated segment. A plain substring test would also match paths
    # that merely contain the text somewhere inside them.
    ancestor_paths = {
        parent
        for parent in paths
        for child in paths
        if child.startswith(parent + '/')
    }

    similar_deduped_slices.append(tag_slice[~tag_slice['path'].isin(ancestor_paths)])

# Concatenate once instead of growing the frame inside the loop.
if similar_deduped_slices:
    similar_no_dupl = pd.concat(similar_deduped_slices, ignore_index=True)
else:
    similar_no_dupl = pd.DataFrame(columns=tags_similar.columns)

In [243]:
# Distinct ISRCs returned as similar items for this seed.
isrcs_GBF076724460 = df_all.loc[df_all['seed_isrc'] == 'GBF076724460', 'isrc'].unique()

In [244]:
# Keep only the de-duplicated tags that belong to this seed's similar items.
in_seed_neighbourhood = similar_no_dupl['isrc'].isin(isrcs_GBF076724460)
tags_similar_GBF076724460 = similar_no_dupl.loc[in_seed_neighbourhood]

In [245]:
# Build a "path - value" label per row. `.assign` returns a new frame, so no
# column is written into a filtered slice (which triggered pandas'
# SettingWithCopyWarning), and the concatenation is vectorised.
tags_similar_GBF076724460 = tags_similar_GBF076724460.assign(
    path_and_tag=(
        tags_similar_GBF076724460['path'].astype(str)
        + ' - '
        + tags_similar_GBF076724460['value'].astype(str)
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [246]:
# Occurrences of each "path - value" label divided by the number of distinct
# ISRCs in the slice.
distinct_track_total = len(tags_similar_GBF076724460['isrc'].unique())
label_occurrences = tags_similar_GBF076724460['path_and_tag'].value_counts()
similar_perc = label_occurrences / distinct_track_total

In [247]:
# Display the per-label ratios computed from the seed's similar items.
similar_perc

Performance/Ensemble/Ensemble Section/Percussion - Has Percussion     1.000000
Anatomy/Rhythm & Time/Meter/Simple - 4                                0.928571
Performance/Ensemble/Ensemble Section/String Section - No Strings     0.928571
Performance/Ensemble/Ensemble Section/Choir - No Choir                0.928571
Performance/Ensemble/Ensemble Section/Wind Section - No Winds         0.857143
Anatomy/Harmony/Mode - Major                                          0.785714
Anatomy/Arrangement - Voice & Music                                   0.642857
Anatomy/Rhythm & Time/Tempo/Speed - Medium                            0.571429
Performance/Instrument - Percussion                                   0.500000
Anatomy/Rhythm & Time/Tempo/Persistent Pulse - Steady                 0.500000
Performance/Ensemble/Ensemble Timbre - Electronic/Synthetic           0.500000
Anatomy/Structure/Dynamic Shape - Steady                              0.357143
Performance/Instrument/Guitar - Electric Guitar     

In [216]:
# Turn the ratio Series into a frame, with the labels moved out of the index
# into a regular column.
similar_perc_df = similar_perc.to_frame().reset_index()

In [230]:
# Boolean mask: True for labels whose text contains 'Speed'.
isstructure = ['Speed' in str(label) for label in similar_perc_df['index']]

In [231]:
# Show only the rows selected by the `isstructure` mask, i.e. labels that
# contain 'Speed'.
similar_perc_df[isstructure]

Unnamed: 0,index,path_and_tag
7,Anatomy/Rhythm & Time/Tempo/Speed - Medium,0.862069
29,Anatomy/Rhythm & Time/Tempo/Speed - Fast,0.155172
32,Anatomy/Rhythm & Time/Tempo/Speed - Slow,0.103448
