### Load Packages

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()
# Show numpy arrays with 5 digits of precision.
np.set_printoptions(5,)
# Parse the full major-version component (everything before the first dot)
# instead of only the first character of the version string.
assert int(tf.__version__.split(".")[0]) == 2, "tensorflow 2.0 should be installed"

### Download Data

In [4]:
from tensorflow.keras.utils import get_file

# Base URL under which the three Last.fm CSV files are hosted.
ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

# Fetch the data: get_file returns a path for each CSV, which is read below.
play_path = get_file("lastfm_play.csv",
                     ROOT_URL+"lastfm_play.csv")
artist_path = get_file("lastfm_artist.csv",
                       ROOT_URL+"lastfm_artist.csv")
user_path = get_file("lastfm_user.csv",
                     ROOT_URL+"lastfm_user.csv")

# Load each CSV into a DataFrame; later cells read user_id / artist_id from
# play_df, artist_name from artist_df and user_id from user_df.
play_df = pd.read_csv(play_path)
artist_df = pd.read_csv(artist_path)
user_df = pd.read_csv(user_path)

Downloading data from https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/lastfm_play.csv
Downloading data from https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/lastfm_artist.csv
Downloading data from https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/lastfm_user.csv


### Build Bayesian Personalized Ranking

In [5]:
from tensorflow.keras.layers import Input, Dot, Concatenate
from tensorflow.keras.layers import Embedding, Subtract
from tensorflow.keras.initializers import RandomUniform
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

def bayesian_personalized_ranking(num_user, num_item, num_factor, l2_reg=1e-2):
    """Build a Keras model for Bayesian Personalized Ranking (BPR).

    The model takes a (user, positive item, negative item) triple and outputs
    the difference between the user's score for the positive item and the
    score for the negative item. A score is the dot product of the user and
    item embeddings plus a per-item bias.

    Args:
        num_user: Number of rows in the user embedding table.
        num_item: Number of rows in the item embedding and item bias tables.
        num_factor: Dimension of the user and item embeddings.
        l2_reg: L2 regularization factor applied to the user, positive-item
            and negative-item embeddings looked up for each batch.

    Returns:
        A Keras ``Model`` with inputs named 'user', 'pos_item' and 'neg_item'
        whose output is the positive-minus-negative score difference. The L2
        penalty is attached to the model through ``add_loss``.
    """
    user_id = Input(shape=(), name='user')
    pos_item_id = Input(shape=(), name='pos_item')
    neg_item_id = Input(shape=(), name='neg_item')

    def embedding_initializer():
        # A separate initializer instance is created per layer instead of
        # sharing one; recent Keras releases warn when a single unseeded
        # initializer instance is reused across layers.
        return RandomUniform(minval=-1/num_factor, maxval=1/num_factor)

    user_embedding_layer = Embedding(num_user, num_factor,
                                     embeddings_initializer=embedding_initializer(),
                                     name='user_embedding')
    item_embedding_layer = Embedding(num_item, num_factor,
                                     embeddings_initializer=embedding_initializer(),
                                     name='item_embedding')
    item_bias_layer = Embedding(num_item, 1,
                                embeddings_initializer='zeros',
                                name='item_bias')

    user_embedding = user_embedding_layer(user_id)

    # The same item layers are applied to both the positive and the negative
    # item, so both lookups share one set of weights.
    pos_item_embedding = item_embedding_layer(pos_item_id)
    neg_item_embedding = item_embedding_layer(neg_item_id)

    pos_item_bias = item_bias_layer(pos_item_id)
    neg_item_bias = item_bias_layer(neg_item_id)

    # Calculate the score difference between the positive and negative item
    pos_score = (
        Dot(axes=(1,1))([user_embedding, pos_item_embedding]) + pos_item_bias)
    neg_score = (
        Dot(axes=(1,1))([user_embedding, neg_item_embedding]) + neg_item_bias)

    score = Subtract()([pos_score, neg_score])

    model = Model([user_id, pos_item_id, neg_item_id], score)

    # Add the L2 regularization term, honouring the caller-supplied l2_reg
    # (the item bias is not regularized).
    l2_pos_item = l2(l2_reg)(pos_item_embedding)
    l2_neg_item = l2(l2_reg)(neg_item_embedding)
    l2_user = l2(l2_reg)(user_embedding)
    l2_loss = l2_pos_item + l2_neg_item + l2_user
    model.add_loss(l2_loss)

    return model

In [6]:
# Embedding table sizes: ids are used directly as row indices, so each table
# needs (largest id + 1) rows.
num_user = play_df.user_id.max() + 1
num_item = play_df.artist_id.max() + 1
num_factor = 32

# Pass num_factor itself rather than repeating the literal, so the value reused
# later as a column label always matches the model's embedding width.
model = bayesian_personalized_ranking(num_user, num_item, num_factor, l2_reg=1e-2)

### Compile Model

In [7]:
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.metrics import BinaryAccuracy

def log_sigmoid_loss(y_true, y_pred):
    """BPR loss: summed negative log-sigmoid of the predicted score difference.

    Args:
        y_true: Ignored; present only because Keras passes labels to every
            loss function.
        y_pred: Score difference (positive score minus negative score).

    Returns:
        A scalar tensor: the sum of ``-log(sigmoid(y_pred))`` over all
        elements of ``y_pred``.
    """
    # log_sigmoid evaluates log(sigmoid(x)) in a numerically stable way;
    # composing tf.math.log with tf.sigmoid yields -inf once the sigmoid
    # underflows to 0 for strongly negative inputs.
    return tf.reduce_sum(-tf.math.log_sigmoid(y_pred))

# Compile with Adagrad (learning rate 1e-1) and the custom loss; the loss is
# also listed as a metric alongside a binary accuracy thresholded at 0.
# NOTE(review): with the all-ones dummy labels, this accuracy presumably
# reports the share of pairs whose positive item outscores the negative one
# — confirm.
model.compile(optimizer=Adagrad(1e-1),
              loss=log_sigmoid_loss, 
              metrics=[log_sigmoid_loss, BinaryAccuracy(threshold=0.)])


### Build Data Input Pipeline

In [8]:
def bootstrap_dataset(df, batch_size=4096):
    """Build one batched tf.data.Dataset of (user, pos_item, neg_item) triples.

    Positive pairs are the rows of ``df`` resampled with replacement. Negative
    items are drawn independently, also with replacement, from the
    ``artist_id`` column of ``df``; they are not checked against the items the
    user actually interacted with.

    Args:
        df: DataFrame with ``user_id`` and ``artist_id`` columns.
        batch_size: Number of triples per batch.

    Returns:
        A batched ``tf.data.Dataset`` yielding ``(features, labels)`` where
        ``features`` is a dict keyed by 'user', 'pos_item' and 'neg_item' and
        ``labels`` is a dummy array of ones.
    """
    # Draw the positive pairs first, then the negatives, each as a full-size
    # sample with replacement.
    resampled = df.sample(frac=1., replace=True)
    negatives = df.artist_id.sample(frac=1., replace=True)

    features = {
        "user": resampled.user_id.values,
        "pos_item": resampled.artist_id.values,
        "neg_item": negatives.values,
    }
    # The loss ignores the labels; a column of ones is supplied as a placeholder.
    labels = np.ones((len(resampled), 1))

    triples = tf.data.Dataset.from_tensor_slices((features, labels))
    # Group the records into batches
    return triples.batch(batch_size)

### Train Model

In [None]:
# Number of outer iterations and the batch size handed to bootstrap_dataset.
num_epoch = 100
batch_size = 4096

# Each iteration builds a fresh bootstrap dataset (new positive/negative
# samples) and calls fit on it once, with no epochs argument.
for i in range(num_epoch):
    print(f"{i+1}th epoch")
    dataset = bootstrap_dataset(play_df, batch_size)
    model.fit(dataset)

### Recommend Items using Model

* case 1. 유사한 아티스트 찾기
* case 2. 유저의 취향에 맞는 아티스트 찾기

#### 임베딩 행렬 가져오기

In [13]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
# NOTE(review): this MNIST aside uses nothing from the BPR pipeline above;
# it only illustrates the two label representations.
train, test = mnist.load_data()
train_images, train_labels = train[0], train[1]

# Sparse representation of the labels
train_labels

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [12]:
# Dense representation of the labels (converted with to_categorical, 10 classes)
to_categorical(train_labels,num_classes=10)

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

* 데이터 label이 sparse 꼴(정수 클래스 인덱스)로 되어 있다면 `sparse_categorical_crossentropy`를 사용합니다.
* 데이터 label이 dense 꼴(one-hot 벡터)로 되어 있다면 `categorical_crossentropy`를 사용합니다.

In [24]:
# Extract Embedding weights
user_embeddings = model.get_layer('user_embedding').get_weights()[0]
item_embeddings = model.get_layer('item_embedding').get_weights()[0]
item_bias = model.get_layer('item_bias').get_weights()[0]

# Convert numpy array to Dataframe
user_embedding_df = pd.DataFrame(user_embeddings, 
                                 index=user_df.user_id)
user_embedding_df[num_factor] = 1.

artist_embedding_df = pd.DataFrame(item_embeddings,
                                   index=artist_df.artist_name)
artist_embedding_df[num_factor] = item_bias[:,0]

#### Case 1.  유사한 아티스트 찾기

*제이슨 므라즈와 유사한 아티스트 찾기*

In [42]:
# Row of the query artist (latent factors plus the bias column)
target_embedding = artist_embedding_df.loc['jason mraz']

# Score every artist by its dot product with the query row; show the top 10.
# NOTE(review): the bias column takes part in the dot product, so the score
# includes the product of the two artists' biases — confirm this is intended.
(
    artist_embedding_df
    .dot(target_embedding)
    .sort_values(ascending=False)
    .iloc[:10]
)

artist_name
mozella                   2.814972
josh kelley               2.806064
matt wertz                2.772019
jason reeves              2.766373
teddy geiger              2.703305
justin nozuka             2.701589
jeremy kay                2.651573
jamie scott & the town    2.649342
eric hutchinson           2.634584
gavin degraw              2.634161
dtype: float32

*브리트니 스피어스와 유사한 아티스트 찾기*

In [43]:
# Row of the query artist (latent factors plus the bias column)
target_embedding = artist_embedding_df.loc['britney spears']

# Score every artist by its dot product with the query row; show the top 10.
# NOTE(review): the bias column takes part in the dot product, so the score
# includes the product of the two artists' biases — confirm this is intended.
(
    artist_embedding_df
    .dot(target_embedding)
    .sort_values(ascending=False)
    .iloc[:10]
)

artist_name
girlicious          3.007082
the saturdays       2.964588
billie              2.933447
paris hilton        2.902036
victoria beckham    2.900184
nadia oh            2.896300
kate alexa          2.894922
agnes carlsson      2.880454
basim               2.862728
alesha dixon        2.859251
dtype: float32

#### Case 2.  유저의 취향에 맞는 아티스트 찾기

**메탈, 락과 같은 음악을 많이 들은 사람**

In [59]:
# A user who listened to a lot of metal and rock music
target_id = 300
target_user = user_embedding_df.loc[target_id]

(
    artist_embedding_df
    # Score per artist: factor dot product plus the artist's bias, because the
    # user row holds a constant 1 in the bias column.
    .dot(target_user)
    .sort_values(ascending=False)
    # Explicit positional slice, matching the other recommendation cells.
    .iloc[:10]
)

['angelo badalamenti', 'red hot chili peppers', 'marilyn manson', 'led zeppelin', 'eric clapton', 'metallica', 'iron maiden', 'u2', 't.love', 'slipknot', 'queens of the stone age', "guns n' roses", 'iced earth', 'avril lavigne', 'guano apes', 'the offspring', 'alice in chains', 'in flames', 'pantera', 'john williams', 'daniel licht', 'high and mighty color', 'karmacoma', 'down', 'missile girl scoot', 'akira yamaoka', 'the kilimanjaro darkjazz ensemble', 'mondo generator', 'raging speedhorn', 'graeme revell', 'spiritual beggars', 'as i lay dying', 'frida snell', 'fatboy slim', 'pearl jam', 'isis', 'suicidal tendencies', 'black sabbath', 'stone sour', 'the smashing pumpkins', 'sigur rós', 'godsmack', 'pink', 'no doubt', 'nine inch nails']
