### Load Packages

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
tqdm.pandas()
np.set_printoptions(5,)
assert int(tf.__version__[0]) == 2, "tensorflow 2.0 should be installed"

  from ._conv import register_converters as _register_converters


### Download Data

In [2]:
from tensorflow.keras.utils import get_file

ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

play_path = get_file("lastfm_play.csv",
                     ROOT_URL+"lastfm_play.csv")
artist_path = get_file("lastfm_artist.csv",
                       ROOT_URL+"lastfm_artist.csv")

play_df = pd.read_csv(play_path)
artist_df = pd.read_csv(artist_path)

### Load Bayesian Personalized Ranking

* BPR Model Below was trained based on [this script](https://github.com/craftsangjae/recommender-system-dojo)

In [8]:
from tensorflow.keras.models import load_model

# download Model weights
fpath = get_file("bpr_lastfm.h5", 
"https://craftsangjae.s3.ap-northeast-2.amazonaws.com/models/bayesian_personalized_ranking_lastfm.h5")

model = load_model(fpath, compile=False)

# Get artist & User Embedding weights
user_embeddings = model.get_layer('user_embedding').get_weights()[0]
item_embeddings = model.get_layer('item_embedding').get_weights()[0]
item_bias = model.get_layer('item_bias').get_weights()[0]
num_factor = item_embeddings.shape[1]

# Convert numpy array to Dataframe
user_embedding_df = pd.DataFrame(user_embeddings, 
                                 index=user_df.user_id)
user_embedding_df[num_factor] = 1.

artist_embedding_df = pd.DataFrame(item_embeddings,
                                   index=artist_df.artist_name)
artist_embedding_df[num_factor] = item_bias[:,0]

In [29]:
user_embedding_df.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.643246,0.039485,-0.370728,0.392601,-0.724467,0.191898,-0.544087,0.367469,0.627664,0.467874,...,-0.252156,0.004147,-1.062369,0.27079,0.591127,-0.152633,0.038697,0.082105,-0.213111,1.0
1,-0.384695,0.302023,-0.492465,-0.532832,-0.090381,0.433262,-0.364443,0.519421,0.606082,0.72478,...,-0.106053,-0.15252,0.477657,-0.178336,-0.362389,-0.137496,0.432181,0.371543,-0.022037,1.0
2,-0.348102,0.270752,-0.145613,0.699967,-0.071221,0.843833,0.289842,0.688561,0.76208,-0.342442,...,0.275275,-0.290239,0.336594,0.352792,0.736535,0.209032,-0.621285,0.116348,-0.096729,1.0


In [30]:
artist_embedding_df.head(3)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,23,24,25,26,27,28,29,30,31,32
artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
betty blowtorch,0.198954,-0.161507,-0.013325,0.588709,-0.879386,0.064949,-0.149893,0.274045,0.262083,0.584227,...,0.000921,0.261988,-0.892634,0.046933,0.036324,-0.012372,-0.095637,-0.012037,-0.284383,-0.027492
die Ärzte,0.108095,0.362245,-0.132375,0.536328,0.021018,0.030845,-0.578812,-0.188061,-0.105132,-0.06142,...,-0.495304,0.172012,0.028186,0.066016,0.158585,-0.076505,-0.676024,-0.539862,0.335527,0.245553
melissa etheridge,0.340715,-0.090018,-0.07398,-0.074489,-0.254661,0.36032,-0.024921,-0.421455,-0.060424,-0.056207,...,-0.30369,-0.09973,-0.524851,0.071448,0.563454,0.077163,-0.010788,-0.599362,-0.545797,0.172876


## Serving Matrix Factorization Using Annoy

### Build Annoy Tree

In [9]:
from annoy import AnnoyIndex

# use dot-product similarity
tree = AnnoyIndex(num_factor + 1, "dot")

for idx, value in enumerate(artist_embedding_df.values):
    tree.add_item(idx, value)
    
tree.build(50) 

True

### Search Similarity using Annoy


#### case 1. Find Similar Artists

> Which artist is similar to `jason mraz`?

In [56]:
target_artist_id = 3662

artist_df.iloc[target_artist_id]

artist_id            3662
artist_name    jason mraz
Name: 3662, dtype: object

In [100]:
%%time
# recommend 10 artists
artist_indices = tree.get_nns_by_item(target_artist_id, 10)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.93 ms


In [102]:
artist_indices = tree.get_nns_by_item(target_artist_id, 10)
artist_df.loc[artist_indices]

Unnamed: 0,artist_id,artist_name
3662,3662,jason mraz
15100,15100,jason reeves
8344,8344,justin nozuka
10745,10745,matt white
3674,3674,gavin degraw
5568,5568,matt wertz
35,35,john mayer
1587,1587,james morrison
397,397,colbie caillat
10738,10738,teddy geiger


#### case 2. Find artists to recommend to users

> Find artist for dancing songs lovers

In [83]:
target_user_id = 209
listened_artists = play_df[play_df.user_id==target_user_id].artist_id

print("List of artists that the user has heard")
print(artist_df.loc[listened_artists.values,"artist_name"].values)

List of artists that the user has heard
['安室奈美恵' '浜崎あゆみ' 'britney spears' '12012' '中島美嘉' '倖田來未' '이효리' '宇多田ヒカル'
 'madonna' 'michael jackson' '新垣結衣' 'rihanna' 'mariah carey' 'evanescence'
 'linkin park' '久石譲' 'olivia' 'christina aguilera' '鄭秀文' 'boa' 'disney'
 'olivia ong' 'donawhale' '王力宏' 'bee gees' 'gackt' 'enya'
 'the pussycat dolls' 'ashlee simpson' 'm-flo' 'enrique iglesias' 'alan'
 'ガゼット' 'late night alumni' 'michelle branch' 'nelly furtado'
 'vanessa paradis' 'big bang' 'spice girls' 'beyoncé' 'uverworld'
 'frank sinatra' 'avril lavigne' 'mink' 'bon jovi' 'abingdon boys school'
 'jennifer lopez' 'kelly clarkson' 'lady gaga' 'timbaland'
 'justin timberlake' 'ciara']


In [94]:
%%time

# recommend 10 artists
target_vector = user_embedding_df.loc[target_user_id].values
artist_indices = tree.get_nns_by_vector(target_vector, 10)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.22 ms


In [103]:
target_vector = user_embedding_df.loc[target_user_id].values
artist_df.loc[artist_indices]

Unnamed: 0,artist_id,artist_name
3662,3662,jason mraz
15100,15100,jason reeves
8344,8344,justin nozuka
10745,10745,matt white
3674,3674,gavin degraw
5568,5568,matt wertz
35,35,john mayer
1587,1587,james morrison
397,397,colbie caillat
10738,10738,teddy geiger
