In [17]:
import numpy as np
import pandas as pd

import implicit

from google.cloud import storage

from pandas.io import gbq

import pickle

In [3]:
#client = storage.Client(project='umg-comm-tech-dev')
#bucket = client.get_bucket('umg-comm-tech-dev')

In [4]:
class MacOSFile(object):

    def __init__(self, f):
        self.f = f

    def __getattr__(self, item):
        return getattr(self.f, item)

    def read(self, n):
        # print("reading total_bytes=%s" % n, flush=True)
        if n >= (1 << 31):
            buffer = bytearray(n)
            idx = 0
            while idx < n:
                batch_size = min(n - idx, 1 << 31 - 1)
                # print("reading bytes [%s,%s)..." % (idx, idx + batch_size), end="", flush=True)
                buffer[idx:idx + batch_size] = self.f.read(batch_size)
                # print("done.", flush=True)
                idx += batch_size
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, 1 << 31 - 1)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size


def pickle_dump(obj, file_path):
    with open(file_path, "wb") as f:
        return pickle.dump(obj, MacOSFile(f), protocol=pickle.HIGHEST_PROTOCOL)


def pickle_load(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(MacOSFile(f))

In [8]:
model = pickle_load('/Users/meshchd/Downloads/saved_model.pkl')

In [9]:
df = pd.read_csv('/Users/meshchd/Downloads/500UMeISRCs.csv')

In [10]:
df.head()

Unnamed: 0,isrc,artist,title
0,GBAAA8500070,Simple Minds,Alive And Kicking
1,USUG11401961,Madonna,Bitch I'm Madonna
2,GBUM71028665,James Blake,Limit To Your Love
3,USCH38400069,Pat Benatar,Shadows Of The Night
4,USPR36905204,The Velvet Underground,Pale Blue Eyes


In [11]:
df.to_gbq('amplify.ume500_isrcs', project_id='umg-comm-tech-dev', if_exists='replace')

1it [00:05,  5.35s/it]


In [12]:
get_existing_tags = """
SELECT
  isrc,
  confidence,
  path,
  source,
  taxonomy_node_id,
  value
FROM
  `umg-metadata.ircam.amplify_tem_v3_3_v`
WHERE
  isrc IN (
  SELECT
    isrc
  FROM
    `umg-comm-tech-dev.amplify.ume500_isrcs`
  GROUP BY
    isrc)
"""

In [13]:
tags = gbq.read_gbq(get_existing_tags, project_id='umg-comm-tech-dev', dialect='standard')

In [14]:
tags.head()

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
0,GRE018000028,1.0,Anatomy/Harmony/Root,Auto,acf5ddcf-0890-4647-8823-16522e66b541,A
1,GRE018000028,0.544659,Energy/Emotion,Auto,3adcefb7-b1c1-4f4d-a40c-4032c8d35aca,Negative
2,GRE018000028,0.544659,Energy/Emotion/Negative,Auto,77822969-740e-4f56-9ea0-09de404dc071,Sad
3,GRE018000028,1.0,Energy/Intensity/Medium,Manual,0b5909f2-c61f-43f2-a51d-a473fb01a768,Medium Intensity
4,GRE018000028,0.631046,Performance/Ensemble/Ensemble Configuration,Auto,6df35aeb-4c6f-448e-8d14-1967aa24a098,Voice & Accompaniment


In [15]:
tags.describe(include='all')

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
count,30518,30518.0,30518,30518,30518,30518
unique,491,,100,2,256,733
top,GBF075820000,,Performance/Ensemble/Ensemble Section,Auto,ca8d3fea-e3cc-4c38-abd3-0a32e49a311f,Percussion
freq,274,,4180,29948,1062,1775
mean,,0.8521,,,,
std,,0.180539,,,,
min,,0.301012,,,,
25%,,0.732352,,,,
50%,,0.939958,,,,
75%,,1.0,,,,


In [16]:
tags.groupby(by='isrc').value.count().describe()

count    491.000000
mean      62.154786
std       26.924090
min        6.000000
25%       50.000000
50%       58.000000
75%       84.000000
max      274.000000
Name: value, dtype: float64

In [30]:
tags_in_recommender = [isrc if isrc in model.isrcs else '' for isrc in tags.isrc.unique()]

In [31]:
tags_in_recommender = list(set(tags_in_recommender))

In [32]:
len(tags_in_recommender)

168

In [71]:
tags_in_recommender[1]

'GBAYE8200083'

In [91]:
def get_similar_items(seed_isrc, n=100):
    
    item_id = model.isrcs.index(seed_isrc)

    # Get the item row for the isrc
    item_vec = model.isrc_vecs[item_id].T

    # Calculate the similarity score
    # and select the top 100 most similar.
    scores = model.isrc_vecs.dot(item_vec).reshape(1,-1)[0]
    top_100 = np.argsort(scores)[::-1][:n]
    
    similar_isrcs = []
    artists = []
    names = []
    isrc_scores = []
    
    for idx in list(top_100):

        isrc = model.isrcs[idx]
        lookup = model.isrc_lookup[model.isrc_lookup.isrc==isrc].iloc[0]
    
        similar_isrcs.append(isrc)
        artists.append(lookup[1])
        names.append(lookup[2])
        isrc_scores.append(scores[idx])
        
    similar = pd.DataFrame({'isrc': similar_isrcs, 'artist': artists, 'track_name': names, 'score': isrc_scores})
    similar['seed_isrc']=seed_isrc
    
    return similar

In [93]:
df = pd.DataFrame(columns = ['isrc','artist','track_name','score','seed_isrc'])

for isrc in tags_in_recommender[1:]:
    df=pd.concat([df,get_similar_items(isrc)], ignore_index=True)

In [94]:
df.shape

(16700, 5)

In [95]:
df.head()

Unnamed: 0,isrc,artist,track_name,score,seed_isrc
0,GBAYE8200083,Thomas Dolby,She Blinded Me With Science,0.012011,GBAYE8200083
1,CHD771800006,Alejandro Reyes,Solamente,0.010726,GBAYE8200083
2,ROARA1700019,Andra,Shukar,0.01003,GBAYE8200083
3,USSD11900123,Descemer Bueno,Nos Fuimos Lejos (feat. Ece Seçkin & El Micha)...,0.009937,GBAYE8200083
4,ushm20708304,Emile Pandolfi,Once Upon a December (from Anastasia),0.009774,GBAYE8200083


In [96]:
df.to_gbq('amplify.ume500_100similar', project_id='umg-comm-tech-dev', if_exists='replace')

1it [00:00,  6.21it/s]


In [97]:
get_existing_tags_similar = """
SELECT
  isrc,
  confidence,
  path,
  source,
  taxonomy_node_id,
  value
FROM
  `umg-metadata.ircam.amplify_tem_v3_3_v`
WHERE
  isrc IN (
  SELECT
    isrc
  FROM
    `umg-comm-tech-dev.amplify.ume500_100similar`
  GROUP BY
    isrc)
"""

In [98]:
tags_similar = gbq.read_gbq(get_existing_tags_similar, project_id='umg-comm-tech-dev', dialect='standard')

In [99]:
tags_similar.head()

Unnamed: 0,isrc,confidence,path,source,taxonomy_node_id,value
0,UKG4Y1700048,1.0,Anatomy/Harmony/Root,Auto,e5d51dc7-8a75-4cd8-8efd-a3f18eddc866,Bb/A#
1,UKG4Y1700048,0.927146,Genre,Auto,195e1e65-82e9-4a49-96de-fca22301027e,Electronic
2,QZG4T1900011,1.0,Anatomy/Harmony/Root,Auto,71fade6b-1c0e-4b10-9ee3-2178a7548c86,G
3,QZG4T1900011,0.912205,Genre,Auto,3ffa3c14-6b46-4295-9618-77b1de08c9b6,R&B
4,QZG4T1900011,0.912205,Genre/R&B,Auto,d460dddd-2bf3-434b-81de-86b5a60d285a,Contemporary R&B


In [101]:
tags_similar.groupby(by='isrc').value.count().describe()

count    1235.000000
mean       79.149798
std        70.874631
min         6.000000
25%        50.000000
50%        60.000000
75%        90.000000
max      1118.000000
Name: value, dtype: float64

In [103]:
original_count = pd.DataFrame(tags.groupby(by='isrc').value.nunique()).reset_index()
original_count.head()

Unnamed: 0,isrc,value
0,ARA730600041,26
1,ATT259608440,33
2,ATUM71500361,26
3,AUAB00050243,30
4,AUAB00500696,30


In [104]:
similar_count = pd.DataFrame(tags_similar.groupby(by='isrc').value.nunique()).reset_index()
similar_count.head()

Unnamed: 0,isrc,value
0,ARF410600414,30
1,ATB158500018,18
2,ATPB51801014,26
3,AUAP08200012,30
4,AUBEC1601210,23
