# [Module 6.1] Custom Metric 으로 성능 데이터 및 Cold Start 성능 체크 하기 

이번 모듈에서는 모듈1에서 테스트 용으로 분리했던 데이터를 가지고 Custom 지표를 통해 추가적인 성능을 평가해 보도록 합니다. 
또한 HRNN Coldstart 성능도 추가적으로 확인해 보도록 합니다.Coldstart 아이템은 신규로 등록된 아이템 이기 때문에 성능을 예측하기가 어려운 부분이 있습니다. 

In [18]:
import pandas as pd, numpy as np
import io
import scipy.sparse as ss
import json
import time
import os
import boto3
from botocore.exceptions import ClientError
from metrics import ndcg_at_k, precision_at_k, mean_reciprocal_rank

from tqdm import notebook

In [19]:
%store -r

In [20]:
# Configure the SDK to Personalize:
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

In [21]:
#read holdout data and coldstart data
df_holdout = pd.read_csv(validation_interaction_filename)
df_coldstart=pd.read_csv(coldstart_interation_filename)

## Validaton(holdout) 데이터 세트 평가하기

이번 파트에서는 앞장에 남겨두었던 데이터 세트를 활용하여 모델 성능을 평가 하도록 합니다.
테스트 데이터 셋에 있는 모든 고유한 사용자에 대해 테스트 데이터 세트 Interaction Ground Truth data와 Campaign에서 생성된 결과를 비교 하도록 합니다.


In [22]:
def get_relevance_list(campaign_arn, df_holdout, test_user_list):
    relevance = []
    for user_id in test_user_list:
        true_items = set(df_holdout[df_holdout['USER_ID']==user_id]['ITEM_ID'].values)
        rec_response = personalize_runtime.get_recommendations(
            campaignArn = campaign_arn,
            userId = str(user_id)
        )
        rec_items = [int(x['itemId']) for x in rec_response['itemList']]
        relevance.append([int(x in true_items) for x in rec_items])
    return relevance

def evaluate_relevance(relevance):
    '''
    relevance 입력 받으면, 지표를 사전으로 리턴 함
    '''
    mrr = np.mean([mean_reciprocal_rank(r) for r in relevance])
    p_at_5= np.mean([precision_at_k(r, 5) for r in relevance])
    p_at_10 = np.mean([precision_at_k(r, 10) for r in relevance])
    p_at_25 = np.mean([precision_at_k(r, 25) for r in relevance])
    ndcg_at_5 = np.mean([ndcg_at_k(r, 5) for r in relevance])
    ndcg_at_10 = np.mean([ndcg_at_k(r, 10) for r in relevance])
    ndcg_at_25 = np.mean([ndcg_at_k(r, 25) for r in relevance])    
#     print('mean_reciprocal_rank: ',mrr)
#     print('precision_at_5: ',p_at_5)
#     print('precision_at_10: ',p_at_10)
#     print('precision_at_25: ', p_at_25)
#     print('normalized_discounted_cumulative_gain_at_5: ', ndcg_at_5)
#     print('normalized_discounted_cu{}mulative_gain_at_10: ',ndcg_at_10 )
#     print('normalized_discounted_cumulative_gain_at_25: ',ndcg_at_25 )    

    metric_dict = {}
    metric_dict['mrr'] = round(mrr,3)
    metric_dict['ndcg_at_5'] = round(ndcg_at_5,3)        
    metric_dict['ndcg_at_10'] = round(ndcg_at_10,3)            
    metric_dict['ndcg_at_25'] = round(ndcg_at_25,3)                
    metric_dict['p_at_5'] = round(p_at_5,3)    
    metric_dict['p_at_10'] = round(p_at_10, 3)
    metric_dict['p_at_25'] = round(p_at_25,3)    
        
    return metric_dict

def build_metric_matrix(solution,metric_dict):
    metrics.append([solution,
                        metric_dict['mrr'],
                        metric_dict['p_at_5'],
                        metric_dict['p_at_10'],
                        metric_dict['p_at_25'],
                        metric_dict['ndcg_at_5'],
                        metric_dict['ndcg_at_10'],
                        metric_dict['ndcg_at_25']

])


![Fig.3.2.metric_summary.png](static/imgs/Fig.3.2.metric_summary.png)

#### Validation 결과 확인 하기

아래 과정은 num_test_user = 1000 으로 하면 약 4분 소요 됩니다.
디폴트로 100을 설정 합니다.

In [23]:
test_users = df_holdout['USER_ID'].unique()

# num_validation_test_user = 6040
num_validation_test_user = 10
test_user_list = test_users[:num_validation_test_user]

In [24]:
%%time

metrics=[] # 변수 선언
# user-perssonalization
relevance = get_relevance_list(user_personalization_campaign_arn, df_holdout, test_user_list)
metrics_eval_rel = evaluate_relevance(relevance)
build_metric_matrix("user-pers",metrics_eval_rel)
# hrnn
relevance = get_relevance_list(hrnn_campaign_arn, df_holdout, test_user_list)
metrics_eval_rel = evaluate_relevance(relevance)
build_metric_matrix("hrnn",metrics_eval_rel)
# hrnn-meta
relevance = get_relevance_list(hrnn_meta_campaign_arn, df_holdout, test_user_list)
metrics_eval_rel = evaluate_relevance(relevance)
build_metric_matrix("hrnn-meta",metrics_eval_rel)
# hrnn-coldstart
relevance = get_relevance_list(hrnn_coldstart_campaign_arn, df_holdout, test_user_list)
metrics_eval_rel = evaluate_relevance(relevance)
build_metric_matrix("hrnn_coldstart",metrics_eval_rel)

CPU times: user 128 ms, sys: 256 µs, total: 128 ms
Wall time: 4.43 s


In [25]:
val_metrics = pd.DataFrame(metrics, 
                           columns=['recipe','mrr','ncdg@5','ncdg@10','ncdg@25','p@5','p@10','p@25'])
val_metrics

Unnamed: 0,recipe,mrr,ncdg@5,ncdg@10,ncdg@25,p@5,p@10,p@25
0,user-pers,0.215,0.08,0.09,0.068,0.138,0.222,0.338
1,hrnn,0.104,0.02,0.04,0.024,0.1,0.197,0.23
2,hrnn-meta,0.121,0.04,0.05,0.056,0.078,0.123,0.244
3,hrnn_coldstart,0.025,0.02,0.01,0.004,0.05,0.05,0.05


![Fig.6.1.validation_metric_summary](static/imgs/Fig.6.1.validation_metric_summary.png)

## Cold Start 성능 테스트 

이부분에서는 새롭게 더해진 새로운 아이템(ColdStart)에 대한 추천 성능을 테스트 해보도록 합니다. 


In [26]:
test_users = df_coldstart['USER_ID'].unique()
# num_coldstart_test_user = 6040
num_coldstart_test_user = 10
test_user_list = test_users[:num_coldstart_test_user]

In [27]:
metrics=[] # 변수 선언

# hrnn-coldstart
relevance = get_relevance_list(hrnn_coldstart_campaign_arn, df_coldstart, test_user_list)
metrics_eval_rel = evaluate_relevance(relevance)
build_metric_matrix("hrnn_coldstart",metrics_eval_rel)
# random
def get_random_relevance_list(df_coldstart, test_user_list):
    relevance = []
    for user_id in  test_user_list:
        true_items = set(df_coldstart[df_coldstart['USER_ID']==user_id]['ITEM_ID'].values)
        rec_items = np.random.RandomState(seed=42).permutation(cold_items)[:25]
        relevance.append([int(x in true_items) for x in rec_items])
        
    return relevance

relevance = get_random_relevance_list(df_coldstart, test_user_list)
metrics_eval_rel = evaluate_relevance(relevance)
build_metric_matrix("random",metrics_eval_rel)

In [28]:
val_metrics = pd.DataFrame(metrics, columns=['recipe','mrr','ncdg@5','ncdg@10','ncdg@25','p@5','p@10','p@25'])
val_metrics

Unnamed: 0,recipe,mrr,ncdg@5,ncdg@10,ncdg@25,p@5,p@10,p@25
0,hrnn_coldstart,0.32,0.14,0.14,0.104,0.216,0.274,0.403
1,random,0.105,0.04,0.04,0.02,0.113,0.168,0.19


![Fig.6.1.coldstart_random_metrics](static/imgs/Fig.6.1.coldstart_random_metrics.png)
HRNN Cold Start모델은 메타 데이터의 일부 정보를 활용하여 Interaction정보가 없는 새로운 아이템에 대해도 추천을 할수 있습니다. 메타 데이터 정보가 장르밖에 없었음에도 랜덤 추천 대비 약 3~4배의 성능이 있었음을 확인할 수 있습니다. 메타 데이터 성능을 향상 시키거나 Cold-start item비율을 줄인다면 더 좋은 성능을 기대해 볼 수 있습니다. 


## ColdStart Recipe 추천 결과의 유추

In [29]:
from utils import get_rich_dataset
df_warm_train = pd.read_csv(warm_train_interaction_filename)
item_meta = pd.read_csv('./ml-1m/movies.dat',sep='::', encoding='latin1',names=['ITEM_ID', 'TITLE', 'GENRE'],)
df_warm_train_rich = get_rich_dataset(df_warm_train, item_meta)
df_warm_train_rich = df_warm_train_rich.sort_values('TIMESTAMP').copy()

  app.launch_new_instance()


In [30]:
def get_recentViews_coldstart(user_id, df_warm_train_rich, hrnn_coldstart_campaign_arn, item_meta):
    history_items = df_warm_train_rich[df_warm_train_rich['USER_ID']==user_id].tail(5)
    
    rec_response = personalize_runtime.get_recommendations(
                campaignArn = hrnn_coldstart_campaign_arn,
                userId = str(user_id)
            )
    rec_items = [int(x['itemId']) for x in rec_response['itemList']]
    rec_items_movies = item_meta.set_index('ITEM_ID').loc[rec_items[:5]]

    return history_items, rec_items_movies



#### 유저 2

이 사용자는액션|어드벤처|스릴러 아이템을 많이 선택하였고 모델도 장르에서 만이 선택하였다는 것을 다는 것을 알았습니다. 콜드 아이템에서 액션 | 어드벤처 | 스릴러 아이템을 추천합니다.

In [31]:
from IPython.display import display, HTML

coldstart_test_user = 1
user_id = df_coldstart.USER_ID.unique()[coldstart_test_user]
history_items , rec_items_movies= get_recentViews_coldstart(user_id, df_warm_train_rich, hrnn_coldstart_campaign_arn, item_meta)

display(HTML("<font color='blue'>The lastest top 5 movies fed into learned model</font>"))
display(history_items)
display(HTML("<font color='blue'>Five movies recommended by ColdStart recipe</font>"))
display(rec_items_movies)

Unnamed: 0,USER_ID,ITEM_ID,TITLE,GENRE,TIMESTAMP,DATE
35539,2,3107,Backdraft (1991),Action|Drama,978300002,2000-12-31 22:00:02
63832,2,1597,Conspiracy Theory (1997),Action|Mystery|Romance|Thriller,978300025,2000-12-31 22:00:25
50473,2,442,Demolition Man (1993),Action|Sci-Fi,978300025,2000-12-31 22:00:25
26815,2,2628,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Fantasy|Sci-Fi,978300051,2000-12-31 22:00:51
47355,2,1690,Alien: Resurrection (1997),Action|Horror|Sci-Fi,978300051,2000-12-31 22:00:51


Unnamed: 0_level_0,TITLE,GENRE
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1205,"Transformers: The Movie, The (1986)",Action|Animation|Children's|Sci-Fi|Thriller|War
610,Heavy Metal (1981),Action|Adventure|Animation|Horror|Sci-Fi
849,Escape from L.A. (1996),Action|Adventure|Sci-Fi|Thriller
1917,Armageddon (1998),Action|Adventure|Sci-Fi|Thriller
1591,Spawn (1997),Action|Adventure|Sci-Fi|Thriller


#### 유저 3
다시 한번 테스트를 통해 해당 사용자는 Animation|Comedy|Children을 주로 보았고 Amazon personalize 모델이 Animation|Comedy|Children 아이템을 추천하는 것을 볼 수 있습니다. 

In [32]:
coldstart_test_user = 2
user_id = df_coldstart.USER_ID.unique()[coldstart_test_user]
history_items , rec_items_movies= get_recentViews_coldstart(user_id, df_warm_train_rich, hrnn_coldstart_campaign_arn, item_meta)

display(HTML("<font color='blue'>The lastest top 5 movies fed into learned model</font>"))
display(history_items)
display(HTML("<font color='blue'>Five movies recommended by ColdStart recipe</font>"))
display(rec_items_movies)

Unnamed: 0,USER_ID,ITEM_ID,TITLE,GENRE,TIMESTAMP,DATE
90723,3,1136,Monty Python and the Holy Grail (1974),Comedy,978298079,2000-12-31 21:27:59
24236,3,3114,Toy Story 2 (1999),Animation|Children's|Comedy,978298103,2000-12-31 21:28:23
84384,3,3619,"Hollywood Knights, The (1980)",Comedy,978298201,2000-12-31 21:30:01
53307,3,1265,Groundhog Day (1993),Comedy|Romance,978298316,2000-12-31 21:31:56
84948,3,2355,"Bug's Life, A (1998)",Animation|Children's|Comedy,978298430,2000-12-31 21:33:50


Unnamed: 0_level_0,TITLE,GENRE
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1566,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical
588,Aladdin (1992),Animation|Children's|Comedy|Musical
2078,"Jungle Book, The (1967)",Animation|Children's|Comedy|Musical
239,"Goofy Movie, A (1995)",Animation|Children's|Comedy|Romance
2141,"American Tail, An (1986)",Animation|Children's|Comedy


#### 유저 500
다시 한번 테스트를 통해 해당 사용자는 Drama 주로 보았고 Amazon personalize 모델이 Drama|Romance|Comedy 아이템을 추천하는 것을 볼 수 있습니다. 

In [33]:
coldstart_test_user = 499
user_id = df_coldstart.USER_ID.unique()[coldstart_test_user]
history_items , rec_items_movies= get_recentViews_coldstart(user_id, df_warm_train_rich, hrnn_coldstart_campaign_arn, item_meta)

display(HTML("<font color='blue'>The lastest top 5 movies fed into learned model</font>"))
display(history_items)
display(HTML("<font color='blue'>Five movies recommended by ColdStart recipe</font>"))
display(rec_items_movies)

Unnamed: 0,USER_ID,ITEM_ID,TITLE,GENRE,TIMESTAMP,DATE
124126,500,497,Much Ado About Nothing (1993),Comedy|Romance,976644137,2000-12-12 18:02:17
152018,500,337,What's Eating Gilbert Grape (1993),Drama,976644171,2000-12-12 18:02:51
17239,500,2762,"Sixth Sense, The (1999)",Thriller,976644186,2000-12-12 18:03:06
1132,500,3408,Erin Brockovich (2000),Drama,979257574,2001-01-11 23:59:34
408772,500,3795,"Five Senses, The (1999)",Drama,979257748,2001-01-12 00:02:28


Unnamed: 0_level_0,TITLE,GENRE
ITEM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1094,"Crying Game, The (1992)",Drama|Romance|War
1944,From Here to Eternity (1953),Drama|Romance|War
912,Casablanca (1942),Drama|Romance|War
224,Don Juan DeMarco (1995),Comedy|Drama|Romance
3343,And God Created Woman (1988),Comedy|Drama|Romance
