In [2]:
import os
import pandas as pd
import numpy as np

import json
import distutils.dir_util
import io
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from tqdm.notebook import tqdm
from collections import Counter

In [3]:
path = '../../data/'
tqdm.pandas()

In [4]:
print("All data in path")
print(os.listdir(path))

All data in path
['train.json', 'test.json', 'onehot_matrix.csv', 'new_date', 'genre_all.json', 'song_meta.json', '.ipynb_checkpoints', 'genre_gn_all.json', 'val.json', 'train_genre_count.csv']


# Basic Module

In [5]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)
        print('file save seuccess')

In [6]:
song_meta = pd.read_json(path+'song_meta.json')
train = pd.read_json(path+'train.json')
val = pd.read_json(path+'val.json')

In [7]:
n_train = len(train)
n_val = len(val)

train['istrain'] = 1
val['istrain'] = 0

playlist = pd.concat([train, val], ignore_index=True)

song_list = [song for songs in playlist.songs for song in songs]
song_counter = dict(Counter(song_list))

tag_list = [tag for tags in playlist.tags for tag in tags]
tag_counter = dict(Counter(tag_list))

In [8]:
song_to_sid = dict()
sid_to_song = dict()

for i, s in enumerate(song_counter):
    song_to_sid[s] = i
    sid_to_song[i] = s

tag_to_tid = dict()
tid_to_tag = dict()

for i, t in enumerate(tag_counter):
    tag_to_tid[t] = i
    tid_to_tag[i] = t

In [9]:
id_to_nid = dict(zip(playlist['id'], range(n_train+n_val)))
nid_to_id = dict(zip(range(n_train+n_val) , playlist['id']))

In [10]:
n_song = len(song_counter)
n_tag = len(tag_counter)

train['sid_list'] = train['songs'].progress_apply(lambda x: [song_to_sid[s] for s in x])
train['tid_list'] = train['tags'].progress_apply(lambda x: [tag_to_tid[t] for t in x])
train['nid'] = train['id'].progress_apply(lambda x: id_to_nid[x])
val['sid_list'] = val['songs'].progress_apply(lambda x: [song_to_sid[s] for s in x])
val['tid_list'] = val['tags'].progress_apply(lambda x: [tag_to_tid[t] for t in x])
val['nid'] = val['id'].progress_apply(lambda x: id_to_nid[x])

train['num_songs'] = train['sid_list'].map(len)
train['num_tags'] = train['tid_list'].map(len)
val['num_songs'] = val['sid_list'].map(len)
val['num_tags'] = val['tid_list'].map(len)

  0%|          | 0/115071 [00:00<?, ?it/s]

  0%|          | 0/115071 [00:00<?, ?it/s]

  0%|          | 0/115071 [00:00<?, ?it/s]

  0%|          | 0/23015 [00:00<?, ?it/s]

  0%|          | 0/23015 [00:00<?, ?it/s]

  0%|          | 0/23015 [00:00<?, ?it/s]

In [11]:
train_use = train[['nid', 'sid_list', 'tid_list', 'istrain', 'num_songs', 'num_tags']]
val_use = val[['nid', 'sid_list', 'tid_list', 'istrain', 'num_songs', 'num_tags']]
train_use = train_use.set_index('nid')
val_use = val_use.set_index('nid')

In [12]:
use_data = pd.concat([train_use, val_use])

# Make Sparse Matrix

In [13]:
from scipy.sparse import csr_matrix

def make_sparse(data):
    row = np.repeat(range(n_train+n_val), data['num_songs'])
    col = [sid for songs in data['sid_list'] for sid in songs]
    # (0, 1) --> 0번 playlist는 1번 노래를 갖고 있는 것
    dat = np.repeat(1, data['num_songs'].sum())
    songs_sparse = csr_matrix((dat, (row, col)), shape=(n_train+n_val, n_song))
    
    row = np.repeat(range(n_train+n_val), data['num_tags'])
    col = [tid for tags in data['tid_list'] for tid in tags]
    dat = np.repeat(1, data['num_tags'].sum())
    tags_sparse = csr_matrix((dat, (row, col)), shape=(n_train+n_val, n_tag))
    
    return songs_sparse, tags_sparse

In [14]:
song_sparse, tag_sparse = make_sparse(use_data)

In [15]:
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

In [25]:
bpr_model_item = BPR(factors=100)
bpr_model_item.fit(song_sparse.T)

bpr_model_tag = BPR(factors=100)
bpr_model_tag.fit(tag_sparse.T)

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [26]:
val_song_sparse = song_sparse[n_train:]
val_tag_sparse = tag_sparse[n_train:]

In [27]:
train_song_sparse = song_sparse[:n_train]
train_tag_sparse = tag_sparse[n_train:]

In [28]:
song_ret = []

for s in tqdm(range(val_song_sparse.shape[0])):
    song_recom = bpr_model_item.recommend(s, train_song_sparse, N=100)
    song_recom = [sid_to_song[x[0]] for x in song_recom]
    song_ret.append(song_recom)

  0%|          | 0/23015 [00:00<?, ?it/s]

In [29]:
tag_ret = []

for s in tqdm(range(val_tag_sparse.shape[0])):
    tag_recom = bpr_model_tag.recommend(s, train_tag_sparse, N=100)
    tag_recom = [tid_to_tag[x[0]] for x in tag_recom if x[0] in tid_to_tag]
    tag_ret.append(tag_recom)

  0%|          | 0/23015 [00:00<?, ?it/s]

In [30]:
answer = []

for _id, rec, rec_tag in zip(val.id.tolist(), song_ret, tag_ret):
    answer.append({
        "id":_id,
        "songs":rec[:100],
        "tags":rec_tag[:10]
    })

In [31]:
write_json(answer, "own_result/mf/results.json")

file save seuccess


In [32]:
song_meta.loc[val.loc[0, 'songs']]

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
373313,"[GN2207, GN1501, GN1506, GN1509]",20150101,Walt Disney Records The Legacy Collection: Pin...,2308104,[353277],I&#39;ve Got No Strings,"[GN1500, GN2200]",[Dickie Jones],373313
151080,"[GN2207, GN1509, GN1501, GN1506]",20150312,겨울왕국 열기 OST (Making Today A Perfect Day),2308739,"[232538, 746208, 2758138]",Making Today A Perfect Day,"[GN1500, GN2200]","[Idina Menzel, Kristen Bell, Cast of Frozen Fe...",151080
275346,"[GN1301, GN1302]",20150814,Friend Like Me,2334904,[181769],Friend Like Me (From &#34;Aladdin&#34;),[GN1300],[Ne-Yo],275346
696876,"[GN1301, GN1302]",20151006,Ev&#39;rybody Wants To Be A Cat (From &#34;The...,2643514,[871816],Ev’rybody Wants To Be A Cat (From &#34;The Ari...,[GN1300],[Charles Perry],696876
165237,"[GN2207, GN1501, GN0901, GN1509, GN1506]",20151030,We Love Disney (Deluxe),2647713,[742133],Colors Of The Wind (From &#34;Pocahontas&#34;),"[GN1500, GN0900, GN2200]",[Tori Kelly],165237
525935,"[GN2207, GN0901, GN1509, GN1501, GN1506]",20160212,영화 주토피아 OST (Zootopia OST),2666420,[10379],Try Everything (From &#34;Zootopia&#34;/Soundt...,"[GN1500, GN0900, GN2200]",[Shakira],525935
457812,"[GN2207, GN0901, GN1509, GN1501, GN1506]",20171110,Coco (Original Motion Picture Soundtrack),10105939,[553325],Remember Me (D&#250;o) (From &#34;Coco&#34;/So...,"[GN1500, GN0900, GN2200]",[Miguel],457812
371709,"[GN2207, GN1509, GN1501, GN1506]",20171103,Olaf&#39;s Frozen Adventure (Original Soundtrack),10108314,"[746208, 232538, 746216, 407770]",When We&#39;re Together (From &#34;Olaf&#39;s ...,"[GN1500, GN2200]","[Kristen Bell, Idina Menzel, Josh Gad, Jonatha...",371709
170292,"[GN1503, GN1501, GN0908, GN0901]",20190522,Aladdin (Original Motion Picture Soundtrack),10288448,[27242],Arabian Nights (2019) (From &#34;Aladdin&#34;/...,"[GN1500, GN0900]",[Will Smith],170292
438915,"[GN1503, GN1501, GN1509]",20000513,쿠스코? 쿠스코! OST,41158,[28192],My Funny Friend And Me,[GN1500],[Sting],438915


In [33]:
song_meta.loc[answer[1]['songs']]

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
86502,"[GN0508, GN0501, GN0601, GN0503, GN0605]",20080624,R.A.I.N.B.O.W,386406,[184730],Love Today,"[GN0500, GN0600]",[타루],86502
380361,"[GN0601, GN1501, GN1504]",20141031,가족끼리 왜 이래 OST Part.1,2288486,[232828],내가 말했잖아,"[GN0600, GN1500]",[요조],380361
136857,[GN0101],20091027,환타스틱 프렌즈 (이승환 20주년 기념 앨범),704551,"[7523, 439041]",내가 바라는 나,[GN0100],"[유희열, 김종완 (NELL)]",136857
302043,"[GN0804, GN0801]",20070216,The Light Of Songs,345486,[2800],보이나요?,[GN0800],[루시드폴],302043
206440,"[GN0105, GN0101]",20120420,돌멩이,2114406,[663911],돌멩이,[GN0100],[마시따 밴드],206440
...,...,...,...,...,...,...,...,...,...
27044,"[GN0501, GN0502, GN0801, GN0509]",20140512,작은 위로,2255402,[749751],작은 위로,"[GN0500, GN0800]",[이매진],27044
610473,"[GN0104, GN0101]",20090911,Blossom,661052,[182200],봄이 오면,[GN0100],[허민],610473
157722,"[GN0508, GN0501, GN0601, GN0503, GN0605]",20080325,New Standard,374703,[108707],New Hippie Generation,"[GN0500, GN0600]",[페퍼톤스 (Peppertones)],157722
39357,[GN0101],20010411,윤상 Best,317423,[38],달리기,[GN0100],[윤상],39357
