## Load Packages

In [1]:
%matplotlib inline
import snappy # Awesome string Compression package
from redis import Redis # key-value storage redis connector package
from datasketch import MinHash # to generate MinHash for set 

import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow.keras.utils import get_file
tqdm.pandas()

  from ._conv import register_converters as _register_converters


## Download Data

In [2]:
from tensorflow.keras.utils import get_file

ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

# Fetch the three last.fm CSVs (plays, artists, users) in that order and keep
# whatever get_file returns for each; those values are handed to read_csv below.
play_path, artist_path, user_path = (
    get_file(csv_name, ROOT_URL + csv_name)
    for csv_name in ("lastfm_play.csv", "lastfm_artist.csv", "lastfm_user.csv")
)

# Load every downloaded file into its own DataFrame.
play_df, artist_df, user_df = map(
    pd.read_csv, (play_path, artist_path, user_path))

# Real-Time Collaborative Filtering Using MinHash
---

## [ Recommender System Build Step ]

### 1. Generate MinHash Signatures Per Artist

* A MinHash signature is a compact, fixed-size representation of a set.
* Computing the exact (Jaccard) similarity between two large sets is slow, but estimating it by comparing their MinHash signatures is very fast.

In [3]:
def generate_minhash_signatures(target_set, sig_size=128):
    """
    Build the MinHash signature of a collection of values.

    Every element is converted with `str(...)`, encoded as UTF-8 and fed to a
    single `MinHash` object; the object's `hashvalues` are the signature.

    ex) A set hashed to a signature with sig size 4
      {"apple","banana","meat"} -> [12391011, 1291004, 1029322, 101319141]

    :param target_set : set or list whose elements should be hashed
    :param sig_size : the length of the minhash signature
    :return : the `hashvalues` attribute of the populated MinHash object
    """
    hasher = MinHash(sig_size)
    # Stringify first so that non-string elements (e.g. integer ids) can be hashed.
    encoded_items = (str(item).encode('utf8') for item in target_set)
    for token in encoded_items:
        hasher.update(token)
    return hasher.hashvalues

In [4]:
from functools import partial

# Step 1: collect, for every artist, the set of users who have listened to it.
listeners_per_artist = (
    play_df
    .groupby('artist_id')['user_id']
    .progress_apply(set)
)

# Step 2: transform each listener set into its MinHash signature.
signatures_per_artist = listeners_per_artist.progress_apply(
    partial(generate_minhash_signatures, sig_size=128))

# Step 3: stack the per-artist signatures into one table, one row per artist
# and one "sig<i>" column per signature position.
signature_matrix = np.stack(signatures_per_artist.values)
columns = [f"sig{col}" for col in range(signature_matrix.shape[1])]
signature_df = pd.DataFrame(
    signature_matrix,
    index=signatures_per_artist.index,
    columns=columns)
signature_df

100%|██████████| 160110/160110 [00:28<00:00, 5599.83it/s]
100%|██████████| 160110/160110 [11:05<00:00, 240.65it/s]


Unnamed: 0_level_0,sig0,sig1,sig2,sig3,sig4,sig5,sig6,sig7,sig8,sig9,...,sig118,sig119,sig120,sig121,sig122,sig123,sig124,sig125,sig126,sig127
artist_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,15272515,7929136,54786193,220850801,148469233,129193264,151833698,32190803,72030142,106014141,...,285663242,20262139,12990350,21532253,185151218,159232678,34536932,61756701,78849441,119144360
1,219544,472353,153487,136833,212995,346387,505720,208617,218104,593208,...,419089,623455,868879,106014,130336,1134767,232626,409169,1772688,138745
2,2316182,2669925,4108700,1561830,17398608,5668259,4294243,3828143,20816126,3376388,...,741346,2162120,1574153,697069,18495062,2420681,7216669,9287660,2476826,17341546
3,1977307,1054261,1553539,571953,7801169,836069,1598983,1727387,28286,1692707,...,654609,1126684,1999946,3790200,328002,6616527,2228328,302693,40856,190736
4,1652888,4097654,3743117,8838460,3607721,2329176,5720695,8343277,5818621,11115887,...,3837885,1399469,65564,3750797,960596,3244396,4194281,5287100,1590393,15457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160105,951799114,1854879239,4247449365,1547854251,1160565990,3668936329,4156069434,1981523146,991820527,1062032107,...,3366869862,3666061203,529114427,596484597,44481688,3482473300,48484541,3787114399,747956424,3997977448
160106,2423266424,3583945328,1697077480,3953071647,611347776,2423315376,317309556,2976015654,3413969749,3676028536,...,2129998970,978934029,196219302,1312992217,2210018945,3903786975,2853574539,592227410,3611773069,4120859720
160107,720630150,180770797,1768760589,804380883,1830534683,3668193745,3824152446,1903889119,848275811,1196171276,...,2949029734,2160684325,1722904350,2439634735,1448679535,2106267287,2063784359,4246900980,51876737,445005217
160108,1741782383,1286326773,4150332639,1675293374,197421572,1176580331,3191575451,3469730157,1493787956,548116222,...,1626671209,71124821,2120339757,1706043572,728158693,3795006693,3082018583,2259003803,1661831016,3959399716


### 2. ETL signature values to Redis

In [5]:
def etl_worker(inputs, redis_config):
    """
    Store a list of artist id for each signature value (Secondary Indexing)

    For every distinct value found in one signature column, the Redis key
    "<sig_name>-<sig_value>" is set to the snappy-compressed JSON list of the
    index labels (artist ids) of the rows that hold that value.

    :param inputs : (sig_name, signature_series) pair, e.g. one element of
                    the column iterator of the signature DataFrame
    :param redis_config : keyword arguments forwarded to `Redis(...)`
    """
    sig_name, signature_series = inputs

    db = Redis(**redis_config)
    print(f"start {sig_name} processing")
    # Grouping the column by its own values yields, per signature value,
    # the rows (artists) that share it.
    for sig_value, grouped in (
        signature_series.groupby(signature_series)):

        key_string = "{}-{}".format(sig_name, sig_value)

        # Serialise with json.dumps rather than str(): the readers decode with
        # json.loads, and a Python repr is only valid JSON by accident (it
        # breaks for string ids, for instance). For integer ids the stored
        # text is unchanged.
        value_string = snappy.compress(
            json.dumps(grouped.index.values.tolist()))

        db.set(key_string, value_string)

In [7]:
from functools import partial
from multiprocessing import Pool, cpu_count

redis_config = {
    "host": 'redis',
    "port": 6379
}

# One task per signature column: each worker receives a (column name, column)
# pair and writes that column's secondary index to Redis.
# - DataFrame.items() replaces iteritems(), which no longer exists in pandas 2.x.
# - The context manager releases the worker processes once map() has returned
#   (map blocks until every column has been processed).
with Pool(cpu_count()) as pool:
    pool.map(partial(etl_worker, redis_config=redis_config),
             signature_df.items())

start sig0 processing
start sig4 processing
start sig8 processing
start sig12 processing
start sig16 processing
start sig20 processing
start sig24 processing
start sig28 processing
start sig5 processing
start sig9 processing
start sig25 processing
start sig21 processing
start sig1 processing
start sig17 processing
start sig29 processing
start sig13 processing
start sig6 processing
start sig10 processing
start sig2 processing
start sig26 processing
start sig30 processing
start sig22 processing
start sig18 processing
start sig14 processing
start sig7 processing
start sig11 processing
start sig27 processing
start sig31 processing
start sig3 processing
start sig23 processing
start sig19 processing
start sig15 processing
start sig32 processing
start sig36 processing
start sig40 processing
start sig44 processing
start sig48 processing
start sig52 processing
start sig56 processing
start sig60 processing
start sig37 processing
start sig33 processing
start sig41 processing
start sig45 processin

## [ Recommender System Operation Step ]

### 1. Find Similar Artists 

> Which artist is similar to `madonna`?

In [8]:
# Look up the row (and therefore the artist_id) of the artist named 'madonna'.
print(artist_df.loc[artist_df["artist_name"] == 'madonna'])

     artist_id artist_name
353        353     madonna


In [11]:
def find_similar_artists(artist_id, num_recommend):
    """
    Rank artists by how many signature positions they share with `artist_id`.

    One Redis key per signature position ("<sig_name>-<sig_value>") is built
    from the artist's row in `signature_df`; every key holds the ids of the
    artists with that same value. An artist's score is the number of keys it
    appears in.

    :param artist_id : row label of the query artist in `signature_df`
    :param num_recommend : maximum number of artists to return
    :return : pd.Series of match counts indexed by artist name, sorted in
              descending order (empty when no key could be read)
    """
    target_item = signature_df.loc[artist_id]

    # set Query
    querys = [f'{k}-{v}' for k, v in target_item.items()]

    # get related artist id by secondary index
    db = Redis(**redis_config)
    # MGET yields None for keys that do not exist; skip those instead of
    # crashing in snappy.decompress. Flattening into a plain list also avoids
    # np.concatenate failing on zero arrays and upcasting ids to float when a
    # stored list is empty.
    candidate_ids = [
        candidate
        for row in db.mget(querys)
        if row is not None
        for candidate in json.loads(snappy.decompress(row).decode('utf8'))
    ]
    if not candidate_ids:
        return pd.Series([], index=[], dtype='int64')

    items, counts = np.unique(candidate_ids, return_counts=True)

    # Sort by counts and truncate first, so names are only looked up for the
    # rows that are actually returned.
    result = (
        pd.Series(counts, index=items)
        .sort_values(ascending=False)
        .iloc[:num_recommend])

    # NOTE(review): like the original, this looks names up by artist_df's
    # index label, i.e. it assumes the index equals artist_id - confirm.
    result.index = artist_df.loc[result.index, "artist_name"].to_numpy()
    return result

In [12]:
%%timeit -n 3 -r 3
find_similar_artists(353, 10);

54.9 ms ± 3.57 ms per loop (mean ± std. dev. of 3 runs, 3 loops each)


In [13]:
# Top 10 artists sharing the most signature values with artist 353 (madonna).
find_similar_artists(artist_id=353, num_recommend=10)

madonna            128
michael jackson     28
kylie minogue       24
britney spears      24
rihanna             22
lady gaga           18
depeche mode        18
nelly furtado       17
lily allen          14
katy perry          14
dtype: int64

### 2. Update Artists Similarity

> When User(241) listens to Madonna(353), update the signature of Madonna(353)

In [28]:
def _read_artist_ids(db, key_string):
    """Return the artist-id list stored at `key_string`, or [] if the key is absent."""
    raw = db.get(key_string)
    if raw is None:
        return []
    return json.loads(snappy.decompress(raw))


def _write_artist_ids(db, key_string, artist_ids):
    """Store `artist_ids` at `key_string`; delete the key rather than keep an empty list."""
    if artist_ids:
        db.set(key_string, snappy.compress(json.dumps(artist_ids)))
    else:
        db.delete(key_string)


def update_signature(user_id, artist_id, signature_df):
    """
    Fold one new listener into an artist's MinHash signature and re-index it.

    The new signature is the element-wise minimum of the artist's current
    signature and the signature of `[user_id]`. For every position whose
    value changed, the artist is removed from the Redis entry of the old
    value and added to the entry of the new value.

    :param user_id : id of the user who listened to the artist
    :param artist_id : row label of the artist in `signature_df`
    :param signature_df : signature table; the artist's row is updated in place
    :return : the same `signature_df` object, with the artist's row updated
    """
    db = Redis(**redis_config)

    # json.dumps cannot encode numpy scalars, so store a native Python value.
    native_id = artist_id.item() if isinstance(artist_id, np.generic) else artist_id

    old_minhash = signature_df.loc[artist_id]
    # Use the table's own width so the two signatures always line up.
    new_minhash = generate_minhash_signatures(
        [user_id], sig_size=signature_df.shape[1])
    updated_minhash = np.minimum(old_minhash, new_minhash)

    changed = np.asarray(updated_minhash != old_minhash)
    for sig_name in signature_df.columns[changed]:
        # Remove the artist from the entry of its previous value. The entry
        # (or the id inside it) may be missing; that is not an error.
        old_key = f"{sig_name}-{old_minhash[sig_name]}"
        old_list = _read_artist_ids(db, old_key)
        if native_id in old_list:
            old_list.remove(native_id)
        _write_artist_ids(db, old_key, old_list)

        # Add the artist to the entry of its new value. No artist may have
        # had this value before, in which case the key does not exist yet.
        new_key = f"{sig_name}-{updated_minhash[sig_name]}"
        new_list = _read_artist_ids(db, new_key)
        if native_id not in new_list:
            new_list.append(native_id)
        _write_artist_ids(db, new_key, new_list)

    signature_df.loc[artist_id] = updated_minhash
    return signature_df

In [29]:
# User 241 listens to Madonna (artist 353), the scenario described for this
# step. Defining the ids here keeps the cell runnable on a fresh kernel.
user_id, artist_id = 241, 353
signature_df = update_signature(user_id, artist_id, signature_df)