### Load Packages

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()

# Print numpy arrays with 5 digits of precision.
np.set_printoptions(precision=5)

# The notebook is written against the TensorFlow 2 API; fail early otherwise.
assert int(tf.__version__[0]) == 2, "tensorflow 2.0 should be installed"

  from ._conv import register_converters as _register_converters


### Download Data

In [2]:
from tensorflow.keras.utils import get_file

# Base location of the three Last.fm CSV files used in this notebook.
ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

# Fetch each CSV via Keras' get_file and keep the local path it returns.
play_path, artist_path, user_path = (
    get_file(file_name, ROOT_URL + file_name)
    for file_name in ("lastfm_play.csv",
                      "lastfm_artist.csv",
                      "lastfm_user.csv")
)

# Load the downloaded files into DataFrames (plays, artists, users).
play_df, artist_df, user_df = map(
    pd.read_csv, (play_path, artist_path, user_path))

## Bayesian Personalized Ranking
---

### Build Model

In [3]:
from tensorflow.keras.layers import Input, Dot, Concatenate
from tensorflow.keras.layers import Embedding, Subtract
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

def bayesian_personalized_ranking(num_user, num_item, num_factor, l2_reg=1e-2):
    """Build a Keras model that scores (user, positive item, negative item) triples.

    The model looks up one latent vector per user and per item plus a scalar
    bias per item, computes ``user . item + item_bias`` for the positive and
    the negative item, and outputs the difference ``pos_score - neg_score``.

    Args:
        num_user: number of rows in the user embedding table.
        num_item: number of rows in the item embedding and item bias tables.
        num_factor: width of the user and item latent vectors.
        l2_reg: coefficient of the L2 penalty applied to the looked-up user,
            positive-item and negative-item vectors (the bias is not penalized).

    Returns:
        An uncompiled ``Model`` with inputs named 'user', 'pos_item' and
        'neg_item' and the score difference as its single output. The L2
        penalty is attached to the model through ``add_loss``.
    """
    # One scalar id per sample for each of the three inputs.
    user_id = Input(shape=(), name='user')
    pos_item_id = Input(shape=(), name='pos_item')
    neg_item_id = Input(shape=(), name='neg_item')

    # Latent factors start uniformly in [-1/num_factor, 1/num_factor].
    bound = 1 / num_factor
    factor_init = RandomUniform(minval=-bound, maxval=bound)

    user_table = Embedding(num_user, num_factor,
                           embeddings_initializer=factor_init,
                           name='user_embedding')
    item_table = Embedding(num_item, num_factor,
                           embeddings_initializer=factor_init,
                           name='item_embedding')
    # Per-item scalar bias, starting at zero.
    bias_table = Embedding(num_item, 1,
                           embeddings_initializer='zeros',
                           name='item_bias')

    # The same item tables serve both the positive and the negative lookup.
    user_vec = user_table(user_id)
    pos_vec = item_table(pos_item_id)
    neg_vec = item_table(neg_item_id)
    pos_bias = bias_table(pos_item_id)
    neg_bias = bias_table(neg_item_id)

    # Score each item for the user, then take the positive-minus-negative gap.
    pos_score = Dot(axes=(1, 1))([user_vec, pos_vec]) + pos_bias
    neg_score = Dot(axes=(1, 1))([user_vec, neg_vec]) + neg_bias
    score = Subtract()([pos_score, neg_score])

    model = Model([user_id, pos_item_id, neg_item_id], score)

    # L2 penalty on the vectors actually looked up for this batch,
    # not on the full embedding tables.
    penalty = l2(l2_reg)
    model.add_loss(penalty(pos_vec) + penalty(neg_vec) + penalty(user_vec))

    return model

In [4]:
# Embedding tables are indexed directly by id, so each needs (max id + 1) rows.
# NOTE(review): assumes ids in play_df are non-negative integers — confirm.
num_user = play_df.user_id.max() + 1
num_item = play_df.artist_id.max() + 1
num_factor = 32

# Pass num_factor rather than a repeated literal: later cells use num_factor
# as the label of the extra bias column, so the two must stay in sync.
model = bayesian_personalized_ranking(num_user, num_item, num_factor, 1e-2)

### Compile Model

In [5]:
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy

# The model outputs the raw score gap (a logit). With an all-ones target,
# binary cross-entropy on logits reduces to -log(sigmoid(pos - neg)).
# threshold=0. makes the accuracy metric count a sample as predicted-positive
# when the score gap is above zero.
model.compile(
    optimizer=Adagrad(learning_rate=1e-1),
    loss=BinaryCrossentropy(from_logits=True, reduction='sum'),
    metrics=[BinaryAccuracy(threshold=0.)],
)

### Build Data Input Pipeline

In [6]:
def bootstrap_dataset(df, batch_size=4096):
    """Build one batched tf.data pipeline of (user, pos_item, neg_item) triples.

    Rows of ``df`` are resampled with replacement to form the (user, positive
    item) pairs. Negative items are a second, independent with-replacement
    draw from ``df.artist_id``; no check is made that a drawn negative is
    absent from the user's own rows in ``df``.

    Args:
        df: DataFrame with ``user_id`` and ``artist_id`` columns.
        batch_size: number of triples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches, where
        ``features`` is a dict keyed by 'user', 'pos_item' and 'neg_item' and
        ``labels`` is an all-ones column.
    """
    # Draw the positives first, then the negatives.
    positives = df.sample(frac=1., replace=True)
    negatives = df.artist_id.sample(frac=1., replace=True)

    features = {
        "user": positives.user_id.values,
        "pos_item": positives.artist_id.values,
        "neg_item": negatives.values
    }
    # Constant target: every triple is labelled 1.
    labels = np.ones((len(positives), 1))

    triples = tf.data.Dataset.from_tensor_slices((features, labels))
    return triples.batch(batch_size)

### Train Model

In [None]:
num_epoch = 50
batch_size = 4096

# fit() is called once per epoch so that every epoch trains on a freshly
# resampled set of (user, positive, negative) triples.
for i in range(num_epoch):
    print(f"{i+1}th epoch")
    model.fit(bootstrap_dataset(play_df, batch_size))

### Recommend Items using Model

* case 1. Find Similar Artists
* case 2. Find artists to recommend to users

#### Get Artist & User Embedding Weights

In [9]:
# Pull the weight matrix out of each named embedding layer.
user_embeddings, item_embeddings, item_bias = (
    model.get_layer(layer_name).get_weights()[0]
    for layer_name in ('user_embedding', 'item_embedding', 'item_bias')
)

# Wrap the matrices in DataFrames indexed by user id / artist name.
# An extra column labelled `num_factor` is appended to each frame: a constant
# 1 for users and the item bias for artists, so that a user-artist dot
# product yields (latent dot product + item bias).
user_embedding_df = pd.DataFrame(user_embeddings, index=user_df.user_id)
user_embedding_df[num_factor] = 1.

artist_embedding_df = pd.DataFrame(item_embeddings, index=artist_df.artist_name)
artist_embedding_df[num_factor] = item_bias[:, 0]

#### case 1. Find Similar Artists

> Which artists are similar to `jason mraz`?

In [10]:
# Rank every artist by dot product with the target artist's vector
# (bias column included) and show the 10 highest-scoring ones.
target_embedding = artist_embedding_df.loc['jason mraz']

(artist_embedding_df @ target_embedding).sort_values(ascending=False).head(10)

artist_name
jason mraz        2.948219
jason reeves      2.874918
justin nozuka     2.836041
matt white        2.792048
gavin degraw      2.775528
matt wertz        2.767827
john mayer        2.709603
james morrison    2.701028
colbie caillat    2.696645
teddy geiger      2.693708
dtype: float32

#### case 2. Find artists to recommend to users

> Find artists for a dance-music lover

In [11]:
target_user_id = 209
# Artist ids appearing in this user's rows of the play log.
listened_artists = play_df.loc[play_df.user_id == target_user_id, "artist_id"]

print("List of artists that the user has heard")
# Look the ids up in artist_df by index label to get the artist names.
print(artist_df.artist_name.loc[listened_artists.values].values)

List of artists that the user has heard
['安室奈美恵' '浜崎あゆみ' 'britney spears' '12012' '中島美嘉' '倖田來未' '이효리' '宇多田ヒカル'
 'madonna' 'michael jackson' '新垣結衣' 'rihanna' 'mariah carey' 'evanescence'
 'linkin park' '久石譲' 'olivia' 'christina aguilera' '鄭秀文' 'boa' 'disney'
 'olivia ong' 'donawhale' '王力宏' 'bee gees' 'gackt' 'enya'
 'the pussycat dolls' 'ashlee simpson' 'm-flo' 'enrique iglesias' 'alan'
 'ガゼット' 'late night alumni' 'michelle branch' 'nelly furtado'
 'vanessa paradis' 'big bang' 'spice girls' 'beyoncé' 'uverworld'
 'frank sinatra' 'avril lavigne' 'mink' 'bon jovi' 'abingdon boys school'
 'jennifer lopez' 'kelly clarkson' 'lady gaga' 'timbaland'
 'justin timberlake' 'ciara']


In [12]:
# Score every artist for the target user (latent dot product + item bias)
# and show the 10 highest-scoring artists.
target_user = user_embedding_df.loc[target_user_id]

(artist_embedding_df @ target_user).sort_values(ascending=False).head(10)

artist_name
蔡依林             4.555263
sweetbox        4.500760
boa             4.447616
倖田來未            4.417671
권보아             4.410030
utada           4.397315
twins           4.368110
宇多田ヒカル          4.359187
wonder girls    4.352513
dream           4.342292
dtype: float64