## Load Packages

In [None]:
%matplotlib inline
import snappy # Awesome string Compression package
from redis import Redis # key-value storage redis connector package
from datasketch import MinHash # to generate MinHash for set 

import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.utils import get_file
tqdm.pandas()

## Download Data

In [None]:
from tensorflow.keras.utils import get_file

ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

# Fetch the three Last.fm CSV files via get_file (file name + source URL),
# keeping the local path that get_file returns for each of them.
play_path, artist_path, user_path = (
    get_file(csv_name, ROOT_URL + csv_name)
    for csv_name in ("lastfm_play.csv", "lastfm_artist.csv", "lastfm_user.csv")
)

# Load each downloaded CSV into its own DataFrame.
play_df, artist_df, user_df = map(
    pd.read_csv, (play_path, artist_path, user_path))

## Real-Time Collaborative Filtering Using MinHash
---

## [ 추천 시스템 구성 단계 ]

### 1. Generate MinHash Signatures per Artist

In [None]:
def generate_minhash_signatures(user_set, sig_size=128):
    """
    Build the MinHash signature for a collection of user ids.

    Every element of ``user_set`` is converted to ``str``, UTF-8 encoded and
    fed into a ``MinHash`` object constructed with ``sig_size``.  The
    ``hashvalues`` attribute of that object is returned.
    """
    sketch = MinHash(sig_size)
    encoded_ids = (str(uid).encode('utf8') for uid in user_set)
    for token in encoded_ids:
        sketch.update(token)
    return sketch.hashvalues

In [13]:
from functools import partial

# Collect, for every artist, the set of users found in the play data.
listeners_per_artist = (
    play_df
    .groupby('artist_id')['user_id']
    .progress_apply(set)
)

# Convert each listener set into its MinHash signature (sig_size=128).
to_signature = partial(generate_minhash_signatures, sig_size=128)
signatures_per_artist = listeners_per_artist.progress_apply(to_signature)

# Stack the per-artist signatures into a table: one row per artist_id,
# one column per signature position, named sig0, sig1, ...
signature_df = pd.DataFrame(
    np.stack(signatures_per_artist.values),
    index=signatures_per_artist.index,
)
columns = [f"sig{col}" for col in signature_df.columns]
signature_df.columns = columns
signature_df

NameError: name 'play_df' is not defined

### 2. ETL signature values to Redis

In [None]:
def etl_worker(inputs, redis_config):
    """
    Store one signature column in Redis as key-value pairs.

    ``inputs`` is a ``(sig_name, signature_series)`` pair.  The series is
    grouped by its own values; for every distinct value the key
    ``"<sig_name>-<sig_value>"`` is set to the snappy-compressed string form
    of the list of index labels sharing that value.  ``redis_config`` is
    expanded as keyword arguments into ``Redis(...)``.
    """
    sig_name, column = inputs

    client = Redis(**redis_config)
    print(f"start {sig_name} processing")

    buckets = column.groupby(column)
    for bucket_value, bucket in buckets:
        # Index labels of all rows that share this signature value
        member_ids = bucket.index.values.tolist()
        client.set(
            "{}-{}".format(sig_name, bucket_value),
            snappy.compress(str(member_ids)),
        )

In [None]:
from functools import partial
from multiprocessing import Pool, cpu_count

# Connection settings expanded into Redis(...) inside every worker process.
redis_config = {
    "host": 'localhost',
    "port": 6379
}

# Pool.map hands each worker a single item, but etl_worker also requires
# redis_config, so bind it up front with partial.
# signature_df is the per-artist signature table built in the cell above;
# .items() yields (column name, column Series) pairs, which is the shape
# etl_worker unpacks.  (.iteritems() no longer exists in pandas 2.x.)
# The with-block shuts the worker processes down once map() has returned.
with Pool(cpu_count()) as pool:
    pool.map(
        partial(etl_worker, redis_config=redis_config),
        signature_df.items(),
    )

## [ 추천 시스템 운영 단계 ]

### 1. 아이템 추천하기

> 마돈나(353)번과 유사한 아티스트를 추천해 주어야 할 경우

In [52]:
# Recommending for people who listened to Madonna: show her artist row
print(artist_df.loc[artist_df["artist_name"] == 'madonna'])

     artist_id artist_name
353        353     madonna


In [65]:
%%timeit -n 3 -r 3
# Signature row of the target artist (353 = madonna).  signature_df is the
# per-artist signature table built earlier; the name used here before,
# minhv_df, is never defined in this notebook.
target_item = signature_df.loc[353]

# One Redis key per signature column, in the same "<sig_name>-<sig_value>"
# format that etl_worker writes.
querys = [f'{k}-{v}' for k, v in target_item.items()]

# Fetch every bucket with a single MGET round trip.
db = Redis('localhost', port=6379)
# MGET yields None for keys that are absent; skip those instead of crashing
# in snappy.decompress.  Each stored value is a snappy-compressed,
# JSON-parsable list of artist ids.
intersected_ids = np.concatenate(
    [json.loads(snappy.decompress(row).decode('utf8'))
     for row in db.mget(querys)
     if row is not None])

# Count in how many signature buckets each artist co-occurs with the target.
items, counts = np.unique(intersected_ids, return_counts=True)

# NOTE(review): the .loc lookup assumes artist_df's index equals artist_id
# (it does for the madonna row printed above) -- confirm for the whole table.
artist_names = [artist_df.loc[item, "artist_name"] for item in items]
# Keep the five artists with the highest bucket counts.
result = (
    pd.Series(counts, index=artist_names)
    .sort_values(ascending=False)
    .iloc[:5])

31.9 ms ± 3.37 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [66]:
# Display the five highest-count artists computed in the previous cell.
# NOTE(review): that cell runs under %%timeit, which presumably does not
# export `result` to the notebook namespace -- confirm it was also run untimed.
result

madonna            128
michael jackson     28
kylie minogue       24
britney spears      24
rihanna             22
dtype: int64

### 2. 데이터 갱신하기 

> 유저(241)번이 마돈나(353)d