# [Module 6.1] Custom Metric 으로 성능 데이터 및 Cold Start 성능 체크 하기 

이번 모듈에서는 모듈1에서 테스트 용으로 분리했던 데이터를 가지고 Custom 지표를 통해 추가적인 성능을 평가해 보도록 합니다. 
또한 HRNN Coldstart 성능도 추가적으로 확인해 보도록 합니다.Coldstart 아이템은 신규로 등록된 아이템 이기 때문에 성능을 예측하기가 어려운 부분이 있습니다. 

In [1]:
!pip install tqdm

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [12]:
import pandas as pd, numpy as np
import io
import scipy.sparse as ss
import json
import time
import os
import boto3
from botocore.exceptions import ClientError
from metrics import ndcg_at_k, precision_at_k, mean_reciprocal_rank

from tqdm import notebook

In [3]:
%store -r

In [4]:
# Configure the SDK to Personalize:
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

In [5]:
#read holdout data and coldstart data
df_holdout = pd.read_csv(validation_interaction_filename)
df_coldstart=pd.read_csv(coldstart_interation_filename)

## Custom Metric으로 테스트 데이터 세트 평가하기

이번 파트에서는 앞장에 남겨두었던 데이터 세트를 활용하여 모델 성능을 평가 하도록 합니다.
테스트 데이터 셋에 있는 모든 고유한 사용자에 대해 테스트 데이터 세트 Interaction Ground Truth data와 Campaign에서 생성된 결과를 비교 하도록 합니다.


In [6]:
test_users = df_holdout['USER_ID'].unique()
df_holdout


Unnamed: 0,USER_ID,ITEM_ID,EVENT_VALUE,TIMESTAMP,EVENT_TYPE
0,1,2355,5,978824291,RATING
1,1,1907,4,978824330,RATING
2,1,783,4,978824291,RATING
3,2,1687,3,978300174,RATING
4,2,292,3,978300123,RATING
...,...,...,...,...,...
51161,6040,1273,4,964828734,RATING
51162,6040,1674,4,964828706,RATING
51163,6040,1784,3,997454464,RATING
51164,6040,535,4,964828734,RATING


In [31]:
def get_relevance_list(campaign_arn, df_holdout, test_user_list):
    relevance = []
    for user_id in test_user_list:
        true_items = set(df_holdout[df_holdout['USER_ID']==user_id]['ITEM_ID'].values)
        rec_response = personalize_runtime.get_recommendations(
            campaignArn = campaign_arn,
            userId = str(user_id)
        )
        rec_items = [int(x['itemId']) for x in rec_response['itemList']]
        relevance.append([int(x in true_items) for x in rec_items])
    return relevance

def evaluate_relevance(relevance):
    mrr = np.mean([mean_reciprocal_rank(r) for r in relevance])
    print('mean_reciprocal_rank: ',mrr)
    
    p_at_5= np.mean([precision_at_k(r, 5) for r in relevance])
    print('precision_at_5: ',p_at_5)
    
    p_at_10 = np.mean([precision_at_k(r, 10) for r in relevance])
    print('precision_at_10: ',p_at_10)
    p_at_25 = np.mean([precision_at_k(r, 25) for r in relevance])
    print('precision_at_25: ', p_at_25)
    ndcg_at_5 = np.mean([ndcg_at_k(r, 5) for r in relevance])
    print('normalized_discounted_cumulative_gain_at_5: ', ndcg_at_5)
    ndcg_at_10 = np.mean([ndcg_at_k(r, 10) for r in relevance])
    print('normalized_discounted_cumulative_gain_at_10: ',ndcg_at_10 )
    ndcg_at_25 = np.mean([ndcg_at_k(r, 25) for r in relevance])    
    print('normalized_discounted_cumulative_gain_at_25: ',ndcg_at_25 )    

metrics=[]

def build_metric_matrix(solution,response):
    metrics.append([solution,
                response['metrics']['coverage'],
                response['metrics']['mean_reciprocal_rank_at_25'],
                response['metrics']['normalized_discounted_cumulative_gain_at_5'],
                response['metrics']['normalized_discounted_cumulative_gain_at_10'],
                response['metrics']['normalized_discounted_cumulative_gain_at_25'],
                response['metrics']['precision_at_5'],
                response['metrics']['precision_at_10'],
                response['metrics']['precision_at_25']])


![Fig.3.2.metric_summary.png](static/imgs/Fig.3.2.metric_summary.png)

In [34]:
test_user_list = test_users[:10]

In [35]:
relevance = get_relevance_list(user_personalization_campaign_arn, df_holdout, test_user_list)
evaluate_relevance(relevance)

mean_reciprocal_rank:  0.21524743230625581
precision_at_5:  0.08
precision_at_10:  0.09000000000000001
precision_at_25:  0.068
normalized_discounted_cumulative_gain_at_5:  0.13811095333354723
normalized_discounted_cumulative_gain_at_10:  0.2220698649094479
normalized_discounted_cumulative_gain_at_25:  0.3381175584255128


In [36]:
relevance = get_relevance_list(hrnn_campaign_arn, df_holdout, test_user_list)
evaluate_relevance(relevance)

mean_reciprocal_rank:  0.10416666666666667
precision_at_5:  0.02
precision_at_10:  0.04
precision_at_25:  0.024
normalized_discounted_cumulative_gain_at_5:  0.1
normalized_discounted_cumulative_gain_at_10:  0.1967132018086354
normalized_discounted_cumulative_gain_at_25:  0.23029407667552051


In [37]:
relevance = get_relevance_list(hrnn_meta_campaign_arn, df_holdout, test_user_list)
evaluate_relevance(relevance)

mean_reciprocal_rank:  0.12115079365079365
precision_at_5:  0.04
precision_at_10:  0.05
precision_at_25:  0.05600000000000001
normalized_discounted_cumulative_gain_at_5:  0.07807721888661444
normalized_discounted_cumulative_gain_at_10:  0.12298713276783971
normalized_discounted_cumulative_gain_at_25:  0.24449393114592563


## Cold Start 성능 테스트 

이부분에서는 새롭게 더해진 새로운 아이템(ColdStart)에 대한 추천 성능을 테스트 해보도록 합니다. 


In [9]:
metrics=[]

def build_metric_matrix(solution,relevance):
    metrics.append([solution,
                np.mean([mean_reciprocal_rank(r) for r in relevance]),
                np.mean([precision_at_k(r, 5) for r in relevance]),
                np.mean([precision_at_k(r, 10) for r in relevance]),
                np.mean([precision_at_k(r, 10) for r in relevance]),
                np.mean([ndcg_at_k(r, 5) for r in relevance]),
                np.mean([ndcg_at_k(r, 10) for r in relevance]),
                np.mean([ndcg_at_k(r, 25) for r in relevance])])


In [10]:
users = df_coldstart['USER_ID'].unique()
users.shape

(6040,)

In [11]:
relevance = []
for user_id in  tqdm_notebook(users[:1000]):

    true_items = set(df_coldstart[df_coldstart['USER_ID']==user_id]['ITEM_ID'].values)

    rec_response = personalize_runtime.get_recommendations(
            campaignArn = hrnn_coldstart_campaign_arn,
            userId = str(user_id)
        )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    relevance.append([int(x in true_items) for x in rec_items])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [12]:
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))
print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))
print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))
build_metric_matrix('hrnn-coldstart',relevance)

mean_reciprocal_rank 0.3062901684461423
precision_at_5 0.1504
precision_at_10 0.1422
precision_at_25 0.12644
normalized_discounted_cumulative_gain_at_5 0.19831109168815617
normalized_discounted_cumulative_gain_at_10 0.2529745586524605
normalized_discounted_cumulative_gain_at_25 0.38139360801035516


### A baseline

랜덤으로 추천하였을 경우 대비 Coldstart 성능이 얼마나 좋은 것인지 비교하여 보도록 합니다. 

In [13]:
len(rec_items)

25

In [14]:
relevance = []
for user_id in  tqdm_notebook(users[:1000]):

    true_items = set(df_coldstart[df_coldstart['USER_ID']==user_id]['ITEM_ID'].values)
    rec_items = np.random.permutation(cold_items)[:25]
    relevance.append([int(x in true_items) for x in rec_items])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [15]:
print('mean_reciprocal_rank', np.mean([mean_reciprocal_rank(r) for r in relevance]))
print('precision_at_5', np.mean([precision_at_k(r, 5) for r in relevance]))
print('precision_at_10', np.mean([precision_at_k(r, 10) for r in relevance]))
print('precision_at_25', np.mean([precision_at_k(r, 25) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_5', np.mean([ndcg_at_k(r, 5) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_10', np.mean([ndcg_at_k(r, 10) for r in relevance]))
print('normalized_discounted_cumulative_gain_at_25', np.mean([ndcg_at_k(r, 25) for r in relevance]))
build_metric_matrix('random',relevance)


mean_reciprocal_rank 0.11485065805076304
precision_at_5 0.0384
precision_at_10 0.0411
precision_at_25 0.04264
normalized_discounted_cumulative_gain_at_5 0.0713667283987009
normalized_discounted_cumulative_gain_at_10 0.10898545670852522
normalized_discounted_cumulative_gain_at_25 0.1921322157564481


In [16]:
metrics=pd.DataFrame(metrics,columns=['recipe','mrr','p@5','p@10','p@25','ndcg@5','ndcg@10','ndcg@25'])
metrics

Unnamed: 0,recipe,mrr,p@5,p@10,p@25,ndcg@5,ndcg@10,ndcg@25
0,hrnn-coldstart,0.30629,0.1504,0.1422,0.1422,0.198311,0.252975,0.381394
1,random,0.114851,0.0384,0.0411,0.0411,0.071367,0.108985,0.192132


HRNN Cold Start모델은 메타 데이터의 일부 정보를 활용하여 Interaction정보가 없는 새로운 아이템에 대해도 추천을 할수 있습니다. 메타 데이터 정보가 장르밖에 없었음에도 랜덤 추천 대비 약 3~4배의 성능이 있었음을 확인할 수 있습니다. 메타 데이터 성능을 향상 시키거나 Cold-start item비율을 줄인다면 더 좋은 성능을 기대해 볼 수 있습니다. 


## A quick test

In [17]:
# we had saved all the data before deleting the cold items

df=pd.read_csv(interaction_filename)
df = df.sort_values('TIMESTAMP', kind='mergesort').copy()

In [18]:
items_all = pd.read_csv('./ml-1m/movies.dat',sep='::', encoding='latin1',names=['ITEM_ID', '_TITLE', 'GENRE'],)
del items_all['_TITLE']

user_id = users[1]
hist_items = df[df['USER_ID']==user_id]['ITEM_ID'].tail(5).values
items_all.set_index('ITEM_ID').loc[hist_items]

  if __name__ == '__main__':


Unnamed: 0_level_0,GENRE
ITEM_ID,Unnamed: 1_level_1
459,Action
442,Action|Sci-Fi
1597,Action|Mystery|Romance|Thriller
2628,Action|Adventure|Fantasy|Sci-Fi
1690,Action|Horror|Sci-Fi


In [19]:
rec_response = personalize_runtime.get_recommendations(
            campaignArn = hrnn_coldstart_campaign_arn,
            userId = str(user_id)
        )
rec_items = [int(x['itemId']) for x in rec_response['itemList']]

items_all.set_index('ITEM_ID').loc[rec_items[:5]]



Unnamed: 0_level_0,GENRE
ITEM_ID,Unnamed: 1_level_1
1205,Action|Animation|Children's|Sci-Fi|Thriller|War
610,Action|Adventure|Animation|Horror|Sci-Fi
849,Action|Adventure|Sci-Fi|Thriller
1917,Action|Adventure|Sci-Fi|Thriller
1591,Action|Adventure|Sci-Fi|Thriller


In [20]:
rec_response = personalize_runtime.get_recommendations(
            campaignArn = hrnn_coldstart_campaign_arn,
            userId = str(user_id)
        )
rec_items = [int(x['itemId']) for x in rec_response['itemList']]

items_all.set_index('ITEM_ID').loc[rec_items[:5]]



Unnamed: 0_level_0,GENRE
ITEM_ID,Unnamed: 1_level_1
1205,Action|Animation|Children's|Sci-Fi|Thriller|War
610,Action|Adventure|Animation|Horror|Sci-Fi
849,Action|Adventure|Sci-Fi|Thriller
1917,Action|Adventure|Sci-Fi|Thriller
1591,Action|Adventure|Sci-Fi|Thriller


In [21]:
##This code is only testing purpose
'''
def is_cold_item(rec_items):
    count=0
    np_cold_items=np.array(cold_items)
    for i in range(len(rec_items)):
        if np.where(np_cold_items==rec_items[i]):
            count+=1
        else:
            print("Item_id {} is not Coldstart Item".format(rec_items[i]))
    print(count)
is_cold_item(rec_items)
'''

'\ndef is_cold_item(rec_items):\n    count=0\n    np_cold_items=np.array(cold_items)\n    for i in range(len(rec_items)):\n        if np.where(np_cold_items==rec_items[i]):\n            count+=1\n        else:\n            print("Item_id {} is not Coldstart Item".format(rec_items[i]))\n    print(count)\nis_cold_item(rec_items)\n'

이 사용자는액션|어드벤처|스릴러 아이템을 많이 선택하였고 모델도 장르에서 만이 선택하였다는 것을 다는 것을 알았습니다. 콜드 아이템에서 액션 | 어드벤처 | 스릴러 아이템을 추천합니다.

## Another quick test

In [22]:
user_id = users[2]
hist_items = df[df['USER_ID']==user_id]['ITEM_ID'].tail(10).values
items_all.set_index('ITEM_ID').loc[hist_items]

Unnamed: 0_level_0,GENRE
ITEM_ID,Unnamed: 1_level_1
2470,Adventure|Comedy
2115,Action|Adventure
552,Action|Adventure|Comedy
2617,Action|Adventure|Horror|Thriller
2735,Action|Adventure|Comedy
1136,Comedy
3114,Animation|Children's|Comedy
3619,Comedy
1265,Comedy|Romance
2355,Animation|Children's|Comedy


In [23]:
rec_response = personalize_runtime.get_recommendations(
            campaignArn = hrnn_coldstart_campaign_arn,
            userId = str(user_id)
        )
rec_items = [int(x['itemId']) for x in rec_response['itemList']]
items_all.set_index('ITEM_ID').loc[rec_items[:10]]

Unnamed: 0_level_0,GENRE
ITEM_ID,Unnamed: 1_level_1
1566,Adventure|Animation|Children's|Comedy|Musical
588,Animation|Children's|Comedy|Musical
2078,Animation|Children's|Comedy|Musical
239,Animation|Children's|Comedy|Romance
2141,Animation|Children's|Comedy
3611,Animation|Children's|Comedy
3754,Animation|Children's|Comedy
688,Action|Adventure|Comedy|War
1148,Animation|Comedy
2700,Animation|Comedy


다시 한번 테스트를 통해 해당 사용자는 Comedy|Action을 주로 보았고 Amazon personalize 모델이 Comedy|Action 아이템을 추천하는 것을 볼 수 있습니다. 