In [None]:
import io
import os
import json
import distutils.dir_util
from collections import Counter
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from scipy.sparse import csr_matrix

from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
pd.options.display.max_rows = 499
pd.options.display.max_columns = 499
pd.options.mode.chained_assignment = None

In [None]:
fname = '../static/fonts/D2Coding.ttc'
font_family = fm.FontProperties(fname=fname).get_name()
plt.rcParams['font.family'] = font_family
plt.rcParams['font.size'] = 14

In [None]:
%matplotlib inline

In [5]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))


def remove_seen(seen, l):
    seen = set(seen)
    return [x for x in l if not (x in seen)]

In [6]:
train_path = '../arena_data/orig/train.json'
val_path = '../arena_data/questions/val.json'

train_json = load_json(train_path)
val_json = load_json(val_path)

In [7]:
def json_to_dataframe(json_data):
    dataframe_dict = {'id': [], 'plylst_title': [], 'tags': [], 'songs': [], 'like_cnt': [], 'updt_date': []}

    for data in tqdm(json_data):
        dataframe_dict['id'].append(data['id'])
        dataframe_dict['plylst_title'].append(data['plylst_title'])
        dataframe_dict['tags'].append(data['tags'])
        dataframe_dict['songs'].append(data['songs'])
        dataframe_dict['like_cnt'].append(data['like_cnt'])
        dataframe_dict['updt_date'].append(data['updt_date'])
    
    dataframe = pd.DataFrame(dataframe_dict)
    dataframe['updt_date'] = pd.to_datetime(dataframe.updt_date)

    return dataframe


In [8]:
train_df = json_to_dataframe(train_json)
train_df

100%|██████████| 92056/92056 [00:00<00:00, 604796.02it/s]


Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
0,147668,To. 힘들고 지친 분들에게,"[힐링, 휴식, 밤, 새벽]","[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27
1,50422,130807-7,[팝],"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11
2,116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,[뉴에이지],"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50
3,55076,당신을 하얗게 불태울 곡들,"[하드락, 록스피릿, 댄스]","[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25
4,125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[힐링, 휴식, 기분전환]","[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50
...,...,...,...,...,...,...
92051,149690,옛날노래 * 좋은노래 8090년생 노래 모음,"[90년생, 회상, 추억, 좋은노래, 80년생, 옛날노래]","[292099, 513963, 174225, 287212, 140444, 62469...",155,2020-01-15 15:15:45
92052,35004,LOVE 1,[팝],"[62596, 359718, 596004, 668790, 291212, 148977...",8,2010-03-23 00:03:00
92053,59765,추억의 2004년 발라드 베스트,"[여행, 발라드, 기분전환, 사랑]","[214372, 145150, 407082, 160552, 102445, 50845...",3,2019-05-15 13:26:07
92054,9867,All Music Guide 선정 90s R&B: 1997,"[소울, 알앤비]","[561958, 397574, 250915, 110345, 426772, 10698...",51,2013-12-24 14:40:01


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92056 entries, 0 to 92055
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            92056 non-null  int64         
 1   plylst_title  92056 non-null  object        
 2   tags          92056 non-null  object        
 3   songs         92056 non-null  object        
 4   like_cnt      92056 non-null  int64         
 5   updt_date     92056 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 4.2+ MB


In [10]:
train_df[train_df.duplicated('id')]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [11]:
val_df = json_to_dataframe(val_json)
val_df

100%|██████████| 23015/23015 [00:00<00:00, 624866.70it/s]


Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
0,18488,요즘 많이듣는 인디 노래,"[카페, 인디음악, 드라이브, 인디뮤직, 사랑, 이별, 인디]","[674442, 131295, 83652, 352919, 233166, 99741,...",3,2017-07-17 11:00:52
1,76254,살랑살랑 불어오는 바람 같은 뉴에이지,"[살랑살랑, 뉴에이지]","[222141, 422934, 4917, 700161, 424495, 683582,...",4,2017-07-20 13:42:37
2,86227,비오는날 감미롭고 우울한 재즈,"[비오는날, 밤, 새벽]","[333034, 638621, 483000, 570730, 442053, 17405...",41,2015-07-10 03:18:46
3,87450,걸크러쉬돋는 여자보컬 락 노래 모음,"[락, 락밴드, 메탈, 락음악]","[229337, 30825, 475737, 672432, 59091, 98657, ...",17,2017-07-10 21:30:25
4,24649,퇴근 후 차분한 인디,"[집중, 휴식, 밤, 카페, 새벽, 차분한, 조용한, 인디]","[13930, 18100, 105626, 310720, 93295, 557891, ...",5,2020-04-09 00:04:15
...,...,...,...,...,...,...
23010,34841,♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥,[발라드],"[55291, 84345, 434639, 129233, 93698, 121309, ...",11,2007-10-25 09:10:55
23011,119043,영원한 가왕 조용필,"[자작곡, 원조, 열창, 조용필, 감동, 명곡]","[38369, 139236, 201546, 134358, 596576, 649107...",10,2020-01-06 09:28:09
23012,65397,2015 SJF 서울 재즈 페스티벌 셋리스트 - 해외 공연팀,"[서울재즈페스티벌, 서재페, SJF, 셋리스트, 공연]","[589131, 695266, 300481, 348950, 165299, 19520...",18,2017-02-03 17:57:37
23013,79292,회복의 찬양(스튜디오 녹음),"[CCM, 항상]","[425704, 404399, 190558, 72325, 54248, 389972,...",67,2018-07-12 09:38:59


In [12]:
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23015 entries, 0 to 23014
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            23015 non-null  int64         
 1   plylst_title  23015 non-null  object        
 2   tags          23015 non-null  object        
 3   songs         23015 non-null  object        
 4   like_cnt      23015 non-null  int64         
 5   updt_date     23015 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 1.1+ MB


In [13]:
val_df[val_df.id.isin(train_df.id.unique())]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [14]:
val_df[val_df.plylst_title.isin(train_df.plylst_title.unique())]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
10,107988,밤에들으면 좋은노래들 ...,[발라드],"[651499, 181309, 217217, 476291, 707621, 64589...",0,2018-09-16 22:34:05
33,37188,OST 명곡 모음,"[기억조작, 그땐그랬지, 기분전환]","[123648, 645489, 482903, 150297, 611706, 49267...",0,2020-02-27 21:22:11
36,76459,chill,"[기분전환, 흔한노래]","[117266, 260973, 90295, 489296, 696901, 283130...",6,2018-09-29 00:31:46
42,65264,나를 사랑하지 않는 그대에게,"[여름밤, 슬픔, 짝사랑, 새벽, 발라드]","[64529, 385603, 624047, 265584, 210, 266745, 6...",12,2017-06-08 00:47:20
48,111869,사랑 그리고 이별,"[이별, 헤어짐]","[226331, 27456, 327365, 278658, 464477, 515115...",10,2018-12-15 10:11:52
...,...,...,...,...,...,...
22936,2205,트로트1,[트로트],"[35848, 638330, 132604, 666824, 231868, 546740...",43,2019-10-16 18:56:01
22948,151510,친구에서 연인으로,"[여행, 사랑, 설렘, 산책]","[516676, 342677, 674580, 320019, 130110, 23993...",35,2016-03-18 13:50:48
22970,72675,힙합패거리 전성시대 71,[힙합],"[477964, 510984, 588832, 315072, 404135, 36267...",0,2019-03-27 15:28:08
22978,72463,Good Night,"[힐링, 휴식, 밤, 새벽]","[339492, 268175, 17558, 669346, 661454, 373893...",81,2016-04-27 00:47:44


In [15]:
train_df[train_df.plylst_title == 'chill']

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
56020,58081,chill,[Chill],"[160901, 70505, 646171, 364758, 427835]",43,2017-12-20 02:41:26
61366,142273,chill,"[힙합, 랩]","[3312, 356590, 388570, 208711, 491476, 606786,...",246,2015-06-06 01:42:25
86248,41496,chill,"[감성, vlog]","[507383, 275673, 406073, 289777, 38513, 363125...",61,2019-10-06 20:24:55


In [16]:
val_df[val_df.plylst_title == 'chill']

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
36,76459,chill,"[기분전환, 흔한노래]","[117266, 260973, 90295, 489296, 696901, 283130...",6,2018-09-29 00:31:46


In [17]:
def get_unique_value(dataframe, column, list_type=True):
    unique_values = set()
    if list_type:
        for c in tqdm(dataframe[column]):
            unique_values |= set(c)
    else:
        unique_values = set(dataframe[column].unique())
    
    return unique_values

In [18]:
unique_tags_train = get_unique_value(train_df, 'tags')
len(unique_tags_train)

100%|██████████| 92056/92056 [00:00<00:00, 648579.16it/s]


25480

In [19]:
train_df[train_df.tags.apply(len) == 0]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [20]:
train_df[train_df.tags.apply(lambda x: '' in x)]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
88542,95032,어떻게 떴냐고? 답은 바로 TikTok!,"[힙합, , 흑인음악, 힙합엘이, 힙합추천, 틱톡, 외힙, HIPHOPLE]","[253222, 270262, 301533, 136007, 238347, 13384...",7,2020-01-08 11:19:11


In [21]:
unique_tags_val = get_unique_value(val_df, 'tags')
len(unique_tags_val)

100%|██████████| 23015/23015 [00:00<00:00, 556318.94it/s]


10802

In [22]:
unique_tags = unique_tags_train | unique_tags_val
len(unique_tags)

29160

In [23]:
unique_songs_train = get_unique_value(train_df, 'songs')
len(unique_songs_train)

100%|██████████| 92056/92056 [00:00<00:00, 115939.70it/s]


549729

In [24]:
unique_songs_val = get_unique_value(val_df, 'songs')
len(unique_songs_val)

100%|██████████| 23015/23015 [00:00<00:00, 116713.12it/s]


261843

In [25]:
unique_songs = unique_songs_train | unique_songs_val
len(unique_songs)

615142

In [26]:
unique_playlists_train = get_unique_value(train_df, 'plylst_title')
len(unique_playlists_train)

100%|██████████| 92056/92056 [00:00<00:00, 342545.56it/s]


2502

In [27]:
unique_playlists_val = get_unique_value(val_df, 'plylst_title')
len(unique_playlists_val)

100%|██████████| 23015/23015 [00:00<00:00, 342383.36it/s]


1842

In [28]:
unique_playlists = unique_playlists_train | unique_playlists_val
len(unique_playlists)

2626

In [273]:
def make_item_index_dictionary(items):
    item2idx = {item:idx for idx, item in enumerate(items)}
    idx2item = {idx:item for idx, item in enumerate(items)}
    return item2idx, idx2item

In [30]:
tag2idx, idx2tag = make_item_index_dictionary(unique_tags)
song2idx, idx2song = make_item_index_dictionary(unique_songs)

In [32]:
len(song2idx)

615142

In [241]:
def _dataframe_to_user_item_matrix(dataframe, item, playlist2idx, item2idx):
    assert item in ['tags', 'songs']

    matrix_shape = (len(playlist2idx), len(item2idx))

    rows = list()
    cols = list()
    data = list()

    for idx, item_list in tqdm(enumerate(dataframe[item])):
        for i in item_list:
            rows.append(playlist2idx[dataframe.loc[idx, 'id']])
            cols.append(item2idx[i])   
            data.append(1)  
    
    rows = np.array(rows)
    cols = np.array(cols)
    data = np.array(data)

    user_item_matrix = csr_matrix((data, (rows, cols)), shape=matrix_shape)

    return user_item_matrix

In [242]:
def dataframe_to_user_item_matrix(dataframe_main, dataframe_secondary=None, item='tags', playlist2idx=None, item2idx=None):
    assert item in ['tags', 'songs']

    # Only For Training Data
    if dataframe_secondary is not None:
        unique_items_main = get_unique_value(dataframe_main, item)
        unique_items_secondary = get_unique_value(dataframe_secondary, item)
        unique_items = unique_items_main | unique_items_secondary

        item2idx, idx2item = make_item_index_dictionary(unique_items)

    if not playlist2idx:
        unique_playlists = get_unique_value(dataframe_main, 'id', False)
        playlist2idx, idx2playlist = make_item_index_dictionary(unique_playlists)
    
    user_item_matrix = _dataframe_to_user_item_matrix(dataframe_main, item, playlist2idx, item2idx)

    return (playlist2idx, idx2playlist), (item2idx, idx2item), user_item_matrix


In [41]:
# def dataframe_to_user_item_matrix(dataframe_train, dataframe_test, item='tags', playlist2idx=None):
#     assert item in ['tags', 'songs']

#     unique_items_train = get_unique_value(dataframe_train, item)
#     unique_items_test = get_unique_value(dataframe_test, item)
#     unique_items = unique_items_train | unique_items_test

#     item2idx = item_to_index(unique_items)

#     if not playlist2idx:
#         unique_playlists = get_unique_value(dataframe_train, 'id', False)
#         playlist2idx = item_to_index(unique_playlists)


#     rows = list()
#     cols = list()
#     data = list()
#     for idx, item_list in tqdm(enumerate(dataframe_train[item])):
#         for i in item_list:
#             rows.append(playlist2idx[dataframe_train.loc[idx, 'id']])
#             cols.append(item2idx[i])   
#             data.append(1)  
    
#     rows = np.array(rows)
#     cols = np.array(cols)
#     data = np.array(data)

#     user_item_matrix = csr_matrix((data, (rows, cols)))

#     return playlist2idx, item2idx, user_item_matrix


In [243]:
(playlist2idx_train, idx2playlist_train), (tag2idx, idx2tag), playlist_tag_matrix_train = dataframe_to_user_item_matrix(
    train_df, 
    val_df, 
    item='tags'
)

(_, _), (song2idx, idx2song), playlist_song_matrix_train = dataframe_to_user_item_matrix(
    train_df, 
    val_df, 
    item='songs', 
    playlist2idx=playlist2idx_train
)

100%|██████████| 92056/92056 [00:00<00:00, 169981.16it/s]
100%|██████████| 23015/23015 [00:00<00:00, 167667.25it/s]
92056it [00:03, 23350.25it/s]
100%|██████████| 92056/92056 [00:00<00:00, 105558.28it/s]
100%|██████████| 23015/23015 [00:00<00:00, 112281.92it/s]
92056it [00:41, 2239.15it/s]


In [244]:
(len(tag2idx), len(song2idx))

(29160, 615142)

In [245]:
playlist_tag_matrix_train

<92056x29160 sparse matrix of type '<class 'numpy.longlong'>'
	with 380794 stored elements in Compressed Sparse Row format>

In [246]:
playlist_song_matrix_train

<92056x615142 sparse matrix of type '<class 'numpy.longlong'>'
	with 4239978 stored elements in Compressed Sparse Row format>

In [247]:
(playlist2idx_val, idx2playlist_val), (_, _), playlist_tag_matrix_val = dataframe_to_user_item_matrix(
    val_df, 
    item='tags', 
    item2idx=tag2idx
)

(_, _), (_, _), playlist_song_matrix_val = dataframe_to_user_item_matrix(
    val_df, 
    item='songs', 
    playlist2idx=playlist2idx_val, 
    item2idx=song2idx
)

23015it [00:00, 23653.18it/s]
23015it [00:10, 2252.56it/s]


In [248]:
(len(tag2idx), len(song2idx))

(29160, 615142)

In [249]:
playlist_tag_matrix_val

<23015x29160 sparse matrix of type '<class 'numpy.longlong'>'
	with 95537 stored elements in Compressed Sparse Row format>

In [250]:
playlist_song_matrix_val

<23015x615142 sparse matrix of type '<class 'numpy.longlong'>'
	with 1045893 stored elements in Compressed Sparse Row format>

In [251]:
def matrix_factorization(user_item_matrix, n_components=100):
    model = NMF(n_components=n_components, init='random', verbose=True, tol=5e-2, max_iter=100, random_state=2020, shuffle=True)
    model.fit(user_item_matrix)
    W = model.transform(user_item_matrix)
    H = model.components_

    return model, W, H

In [252]:
model_tag, W_tag, H_tag = matrix_factorization(playlist_tag_matrix_train)

violation: 1.0
violation: 10.031219999727462
violation: 5.032633123240272
violation: 2.8606228865707415
violation: 1.6938233208402944
violation: 1.0858468999396413
violation: 0.716443114992063
violation: 0.5437725627962213
violation: 0.40790838284373365
violation: 0.31043250916154463
violation: 0.23624660844762002
violation: 0.1954251238372654
violation: 0.20152163718599497
violation: 0.2041245655965717
violation: 0.20306654000778487
violation: 0.18096686496449652
violation: 0.13602163037228346
violation: 0.10729017552135867
violation: 0.10736056600160906
violation: 0.10056312679932365
violation: 0.11121065578247828
violation: 0.1231458809807175
violation: 0.1595481385942812
violation: 0.19544652703972082
violation: 0.1433736581997101
violation: 0.0905547414501437
violation: 0.08779578021394215
violation: 0.10127077057019854
violation: 0.10141061530167898
violation: 0.11532596334873058
violation: 0.11349834666322749
violation: 0.11053876034435081
violation: 0.11382956726666008
violatio

In [253]:
with open('checkpoints/model-v1/model-v1-tag-nmf.pkl', 'wb') as f:
    pickle.dump(model_tag, f)

In [254]:
model_song, W_song, H_song = matrix_factorization(playlist_song_matrix_train)

violation: 1.0
violation: 6.923231816431041
violation: 3.3249560160491276
violation: 1.836157124128675
violation: 1.192462362749963
violation: 0.8870859938251491
violation: 0.7160244072257216
violation: 0.5674657692555136
violation: 0.4685138372836856
violation: 0.38442684241115394
violation: 0.3072268213123712
violation: 0.2339058114900291
violation: 0.18140244528733387
violation: 0.14508710410606468
violation: 0.12660973133082834
violation: 0.10956747778808056
violation: 0.09671586499308156
violation: 0.08992692162203479
violation: 0.08293266889109506
violation: 0.07543562868468079
violation: 0.06925456725729934
violation: 0.06148600280308796
violation: 0.057046846547261165
violation: 0.052241434203767645
violation: 0.04947411992956252
Converged at iteration 26
violation: 1.0
violation: 0.4546062627449705
violation: 0.0927027897338968
violation: 0.023141694374950592
Converged at iteration 5


In [255]:
with open('checkpoints/model-v1/model-v1-song-nmf.pkl', 'wb') as f:
    pickle.dump(model_song, f)

In [256]:
with open('checkpoints/model-v1/model-v1-tag-nmf.pkl', 'rb') as f:
    model_tag = pickle.load(f)

In [257]:
with open('checkpoints/model-v1/model-v1-song-nmf.pkl', 'rb') as f:
    model_song = pickle.load(f)

In [258]:
W_tag = model_tag.transform(playlist_tag_matrix_train)
H_tag = model_tag.components_

W_song = model_song.transform(playlist_song_matrix_train)
H_song = model_song.components_

violation: 1.0
violation: 0.04823401859836165
Converged at iteration 3
violation: 1.0
violation: 0.4546062627449705
violation: 0.0927027897338968
violation: 0.023141694374950592
Converged at iteration 5


In [43]:
def calculate_r_precision(user_item_matrix, W, H):
    nonzero_r = user_item_matrix.nonzero()[0]
    nonzero_c = user_item_matrix.nonzero()[1]

    scores = list()
    for r in tqdm(range(user_item_matrix.shape[0])):
        nonzeros = user_item_matrix[r, :].nonzero()[1]
        n = len(nonzeros)
        top_n = np.argsort(np.dot(W[r, :], H))[::-1][:n]

        counter = len(set(nonzeros) & set(top_n))
        
        score = counter / n if n != 0 else 0
        scores.append(score)
    
    scoes = np.array(scores)
    return np.mean(scores)

In [44]:
calculate_r_precision(playlist_tag_matrix, W_tag, H_tag)

100%|██████████| 92056/92056 [02:54<00:00, 526.46it/s]


0.7194591662537806

In [79]:
nonzero_r = playlist_tag_matrix.nonzero()[0]
nonzero_c = playlist_tag_matrix.nonzero()[1]

random_idx = np.random.choice(len(nonzero_r), size=10)

print('(Original, Imputation)')
for idx in random_idx:
    r = nonzero_r[idx]
    c = nonzero_c[idx]
    print('({}, {})\t{}\t{}'.format(r, c, playlist_tag_matrix[r, c], np.dot(W_tag[r, :], H_tag[:, c])))

    print(playlist_tag_matrix[r, :].nonzero()[1])
    top10 = np.argsort(np.dot(W_tag[r, :], H_tag))[::-1][:10]
    print(top10)
    print(np.dot(W_tag[r, :], H_tag[:, top10]))


(Original, Imputation)
(63658, 11325)	1	1.51312672300729e-07
[ 9083  9188 11325 13133 18418]
[12329 23059 24122  2051 28211 27644 26988  4981 15674 21939]
[3.17208519e-04 2.47513216e-04 2.14783588e-04 1.95777539e-04
 8.88423381e-05 8.86667415e-05 8.45644842e-05 5.96200557e-05
 3.83380207e-05 2.83520515e-05]
(35462, 9267)	1	0.0
[ 1269  1546  4177  6472  9267  9348 17441 17742 18418 21530]
[28211 10208  6820 16003 25366 13754  5386 23817 28349 17662]
[0.00095426 0.00092932 0.00085408 0.00082481 0.00067106 0.00066365
 0.0006533  0.00060422 0.00057475 0.00044031]
(57972, 20439)	1	1.7953526191052652e-06
[20439]
[27612 10785 19640  9958  9838 14327  3781 11598 19083  7158]
[0.00070063 0.00068209 0.00044556 0.00037935 0.00036877 0.00028476
 0.00021908 0.00021876 0.00017476 0.00017113]
(60408, 12587)	1	0.0
[ 2834  4513  5132  9497 10953 10966 12587 17557]
[25366  9270 23059 27387  2988  6623 23869  9958 26791 10208]
[0.00270246 0.00206454 0.00171561 0.00149258 0.00130031 0.00119891
 0.00108964

In [80]:
nonzero_r = playlist_song_matrix.nonzero()[0]
nonzero_c = playlist_song_matrix.nonzero()[1]

random_idx = np.random.choice(len(nonzero_r), size=10)

print('(Original, Imputation)')
for idx in random_idx:
    r = nonzero_r[idx]
    c = nonzero_c[idx]
    print('({}, {})\t{}\t{}'.format(r, c, playlist_song_matrix[r, c], np.dot(W_song[r, :], H_song[:, c])))

    print(playlist_song_matrix[r, :].nonzero()[1])
    top10 = np.argsort(np.dot(W_song[r, :], H_song))[::-1][:10]
    print(top10)
    print(np.dot(W_song[r, :], H_song[:, top10]))


(Original, Imputation)
(72400, 280260)	1	0.5078473044400594
[ 11461  13019  33702  37350  40655  58139  68491 100083 102168 122626
 152045 161496 161847 170670 174380 185143 187241 195258 201278 204863
 230191 230691 237231 242489 263221 280260 293499 310516 312846 315498
 320843 323143 334446 336086 338911 353799 366022 368297 385228 394292
 394609 398371 402573 461974 470492 480326 501633 501822 502925 542180
 547194 550113 585743 591131 601442]
[187241 334446 591131   1309  68491 185143 280260 204863 353799 589759]
[0.52363754 0.51353657 0.51072696 0.51050572 0.50933978 0.50843665
 0.5078473  0.4423804  0.44071618 0.43988013]
(34717, 457214)	1	0.12847192488276127
[   216   1068   4424   6770   7565   7583   9675  10175  10194  11347
  12225  15922  17353  18013  18146  18444  19022  20240  20789  22427
  23078  24030  24753  30315  30887  30890  31401  31404  34631  35986
  36101  38347  38781  39001  39092  39439  42117  42332  43124  43650
  45314  49721  51218  51376  51530  5261

In [259]:
def calculate_cosine_similarity(A, B):
    similarity = cosine_similarity(A, B, dense_output=False)
    return similarity

In [262]:
def find_neigbors(user_item_matrix_src, user_item_matrix_dst, k=10):
    similarity_matrix = calculate_cosine_similarity(user_item_matrix_src, user_item_matrix_dst)
    neighbors = list()
    for r in tqdm(range(similarity_matrix.shape[0])):
        neighbors.append(np.argsort(similarity_matrix[r, :].toarray()[0])[::-1][:k])
    
    neighbors = np.array(neighbors)
    return similarity_matrix, neighbors

In [263]:
_, neighbors_tag = find_neigbors(playlist_tag_matrix_val, playlist_tag_matrix_train)
_, neighbors_song = find_neigbors(playlist_song_matrix_val, playlist_song_matrix_train)

100%|██████████| 23015/23015 [01:18<00:00, 291.98it/s]
100%|██████████| 23015/23015 [01:03<00:00, 360.47it/s]


In [299]:
neighbors_tag.shape

(23015, 10)

In [300]:
neighbors_song.shape

(23015, 10)

In [297]:
def recommend_item(neighbors, user_item_matrix_src, user_item_matrix_dst, n):
    recommendations = list()
    for idx, neighbor in tqdm(enumerate(neighbors)):
        ratings = np.sum(user_item_matrix_dst[neighbor, :].toarray(), axis=0)
        top_rating = np.argsort(ratings)[::-1][:n]
        recommendations.append(np.array(top_rating))
    recommendations = np.array(recommendations)
    return recommendations

In [298]:
recommendations_tag = recommend_item(neighbors_tag, playlist_tag_matrix_val, playlist_tag_matrix_train, 100)
recommendations_song = recommend_item(neighbors_song, playlist_song_matrix_val, playlist_song_matrix_train, 200)

23015it [00:16, 1400.85it/s]
23015it [08:03, 47.57it/s]


In [301]:
recommendations_tag.shape

(23015, 100)

In [302]:
recommendations_song.shape

(23015, 200)

In [235]:
for idx in range(playlist_tag_matrix_val.shape[0]):
    neighbor = np.mean(playlist_tag_matrix_train[n[idx], :], axis=0)[0]
    top = np.argsort(np.array(neighbor))[0][::-1][:100]
        print(top)
        print(neighbor[0, top])



[28500 22462 29159  9722  9712  9713  9714  9715  9716  9717]
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
[16361  4049 27691 18937  4267 14529 10751  6954 14440 15860]
[[0.8 0.7 0.3 0.2 0.2 0.2 0.1 0.1 0.1 0.1]]
[23213 29159  9721  9710  9711  9712  9713  9714  9715  9716]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[22281  4758 28820  4583  6538 29159  9712  9713  9714  9715]
[[1.  1.  1.  0.1 0.1 0.  0.  0.  0.  0. ]]
[19591  5115 19727  5019 26040 17718 19346 11218 18686  9723]
[[1.  1.  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0. ]]
[23078 29159  9722  9711  9712  9713  9714  9715  9716  9717]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[22997 11862  2838  9607  8780  6104  3127 15988  7824  2041]
[[1.  1.  1.  0.8 0.8 0.5 0.5 0.2 0.2 0.2]]
[  692  5115   887 19346   651   855  9716  9717  9718  9719]
[[0.8 0.8 0.1 0.1 0.1 0.1 0.  0.  0.  0. ]]
[ 3127 11862  2041 22997  3372  2838 27924 22281 17362 13696]
[[1.  1.  1.  0.9 0.2 0.2 0.1 0.1 0.1 0.1]]
[ 2838 10660 29159  9724  9713  9714  9715  9716  9717  9718]
[[1. 1. 0. 0. 

In [223]:
playlist_tag_matrix_train.shape

(92056, 29160)

In [227]:
n.shape

(23015, 10)

In [325]:
idx2playlist_val = {idx:playlist for playlist, idx in playlist2idx_val.items()}
idx2tag = {idx:tag for tag, idx in tag2idx.items()}
idx2song = {idx:song for song, idx in song2idx.items()}

In [329]:
def restore_item(idx, idx2item, list_type=True):
    if list_type:
        return list(map(lambda x: [idx2item[i] for i in x], idx))
    else:
        return list(map(lambda x: idx2item[x], idx))

In [332]:
def write_answer(recommendations_tag, recommendations_song, idx2playlist, idx2song, idx2tag):
    
    n_data = recommendations_tag.shape[0]

    playlists = restore_item(range(n_data), idx2playlist, False)
    tags = restore_item(recommendations_tag, idx2tag)
    songs = restore_item(recommendations_song, idx2song)

    answer = list()

    for i, t, s in tqdm(zip(playlists, tags, songs)):
        answer.append({'id': i, 'tags': t, 'songs': s})

    return answer

In [333]:
answer = write_answer(recommendations_tag, recommendations_song, idx2playlist_val, idx2song, idx2tag)

23015it [00:00, 569734.98it/s]
