In [71]:
import os
import pandas as pd
import numpy as np

import json
import distutils.dir_util
import io

from tqdm.notebook import tqdm

In [2]:
path = '../../data/'
tqdm.pandas()

In [3]:
print("All data in path")
print(os.listdir(path))

All data in path
['train.json', 'test.json', 'onehot_matrix.csv', 'new_date', 'genre_all.json', 'song_meta.json', '.ipynb_checkpoints', 'genre_gn_all.json', 'val.json', 'train_genre_count.csv']


In [28]:
song_meta = pd.read_json(path+'song_meta.json')
train = pd.read_json(path+'train.json')
val = pd.read_json(path+'val.json')

In [29]:
from collections import Counter
from scipy.sparse import csr_matrix

In [30]:
train['istrain'] = 1
val['istrain'] = 0

n_train = len(train)
n_val = len(val)

playlist = pd.concat([train, val], ignore_index=True)
playlist['nid'] = range(n_train+n_val)

id_to_nid = dict(zip(playlist['id'], playlist['nid']))
nid_to_id = dict(zip(playlist['nid'], playlist['id']))

In [31]:
song_in_data = [song for songs in playlist['songs'] for song in songs]
song_counter = dict(Counter(song_in_data))

song_to_sid = dict()
sid_to_song = dict()

for i, s in enumerate(song_counter):
    song_to_sid[s] = i
    sid_to_song[i] = s

song_list = list(set(song_in_data))


tag_in_data = [tag for tags in playlist['tags'] for tag in tags]
tag_counter = dict(Counter(tag_in_data))

tag_to_tid = dict()
tid_to_tag = dict()

for i, t in enumerate(tag_counter):
    tag_to_tid[t] = i
    tid_to_tag[i] = t

tag_list = list(set(tag_in_data))

In [32]:
playlist['sid_list'] = playlist['songs'].map(lambda x: [song_to_sid[s] for s in x])
playlist['tid_list'] = playlist['tags'].map(lambda x: [tag_to_tid[t] for t in x])

In [33]:
playlist['song_ranks'] = playlist['sid_list'].progress_apply(lambda x: [1/i for i in range(1, len(x)+1)])

  0%|          | 0/138086 [00:00<?, ?it/s]

In [34]:
playlist.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,sid_list,tid_list,song_ranks
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0],"[1.0, 0.5, 0.3333333333333333, 0.25, 0.2, 0.16..."
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]","[1.0, 0.5, 0.3333333333333333, 0.25, 0.2, 0.16..."
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[3, 4]","[1.0, 0.5, 0.3333333333333333, 0.25, 0.2, 0.16..."
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,1,3,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...","[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[1.0, 0.5, 0.3333333333333333, 0.25, 0.2, 0.16..."
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,4,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",[15],"[1.0, 0.5, 0.3333333333333333, 0.25, 0.2, 0.16..."


In [39]:
playlist_use = playlist[['nid', 'istrain', 'sid_list', 'tid_list', 'song_ranks']]
playlist_use = playlist_use.set_index('nid')
playlist_use['num_songs'] = playlist_use['sid_list'].map(len)
playlist_use['num_tags'] = playlist_use['tid_list'].map(len)

In [41]:
playlist_train = playlist_use.iloc[:n_train, :]
playlist_test = playlist_use.iloc[n_train:, :]

# Making Sparse Matrix

In [43]:
n_songs = len(song_counter)
n_tags = len(tag_counter)

In [49]:
dat = np.array([dat for ranks in playlist_train['song_ranks'].tolist() for dat in ranks])
len(dat)

5285871

In [50]:
def make_sparse(data):
    row = np.repeat(range(n_train), data['num_songs'])
    col = [sid for songs in data['sid_list'] for sid in songs]
    # per (0, 1) --> 0번 playlist는 1번 노래를 1/순위 로 가짐
    dat = np.array([dat for ranks in playlist_train['song_ranks'].tolist() for dat in ranks])
    songs_sparse = csr_matrix((dat, (row, col)), shape=(n_train, n_songs))
    
    row = np.repeat(range(n_train), data['num_tags'])
    col = [tid for tags in data['tid_list'] for tid in tags]
    dat = np.repeat(1, data['num_tags'].sum())
    tags_sparse = csr_matrix((dat, (row, col)), shape=(n_train, n_tags))
    
    return songs_sparse, tags_sparse

In [51]:
song_sparse, tag_sparse = make_sparse(playlist_train)

In [56]:
song_sparse_T, tag_sparse_T = song_sparse.T.tocsr(), tag_sparse.T.tocsr()

In [54]:
song_counter_df = pd.DataFrame(columns=['song', 'count'])
song_counter_df['song'] = song_counter.keys()
song_counter_df['count'] = song_counter.values()
song_counter_df = song_counter_df.sort_values('count', ascending=False, ignore_index=True)

In [55]:
tag_counter_df = pd.DataFrame(columns=['tag', 'count'])
tag_counter_df['tag'] = tag_counter.keys()
tag_counter_df['count'] = tag_counter.values()
tag_counter_df = tag_counter_df.sort_values('count', ascending=False, ignore_index=True)

In [88]:
playlist_train['inverse_rank'] = playlist_train['sid_list'].progress_apply(lambda x: [i for i in reversed(range(1, len(x)+1))])

def make_sparse1(data):
    row = np.repeat(range(n_train), data['num_songs'])
    col = [sid for songs in data['sid_list'] for sid in songs]
    # per (0, 1) --> 0번 playlist는 1번 노래를 1/순위 로 가짐
    dat = np.array([dat for ranks in playlist_train['inverse_rank'].tolist() for dat in ranks])
    songs_sparse = csr_matrix((dat, (row, col)), shape=(n_train, n_songs))
    
    row = np.repeat(range(n_train), data['num_tags'])
    col = [tid for tags in data['tid_list'] for tid in tags]
    dat = np.repeat(1, data['num_tags'].sum())
    tags_sparse = csr_matrix((dat, (row, col)), shape=(n_train, n_tags))
    
    return songs_sparse, tags_sparse

  0%|          | 0/115071 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlist_train['inverse_rank'] = playlist_train['sid_list'].progress_apply(lambda x: [i for i in reversed(range(1, len(x)+1))])


In [90]:
song_sparse, tag_sparse = make_sparse1(playlist_train)
song_sparse_T, tag_sparse_T = song_sparse.T.tocsr(), tag_sparse.T.tocsr()

In [92]:
def recoms(nids):
    ret = []
    count = 1
    default_songs = list(song_counter_df[:100]['song'].values)
    default_tags = list(tag_counter_df[:10]['tag'].values)
    
    for nid in tqdm(nids):
        rec_songs_idx = []
        rec_tags_idx = []
        songs_exist = playlist_test.loc[nid, 'sid_list']
        tags_exist = playlist_test.loc[nid, 'tid_list']
        
        if len(songs_exist) == 0:
            rec_songs_idx = default_songs
        else:
            onehot = np.zeros(n_songs)
            onehot[songs_exist] = 1
        
            inter_songs_count = song_sparse.dot(onehot).reshape(-1)
            cand_songs = song_sparse_T.dot(inter_songs_count)
            cand_songs_idx = cand_songs.reshape(-1).argsort()[-200:][::-1]
            cand_songs_idx = cand_songs_idx[np.isin(cand_songs_idx, songs_exist) == False][:100]
            
            rec_songs_idx = [sid_to_song[sid] for sid in cand_songs_idx]
        
        if len(tags_exist) == 0:
            rec_tags_idx = default_tags
        else:
            onehot = np.zeros(n_tags)
            onehot[tags_exist] = 1
            
            inter_tag_count = tag_sparse.dot(onehot).reshape(-1)
            cand_tags = tag_sparse_T.dot(inter_tag_count)
            cand_tags_idx = cand_tags.reshape(-1).argsort()[-20:][::-1]
            cand_tags_idx = cand_tags_idx[np.isin(cand_tags_idx, tags_exist) == False][:10]
            
            rec_tags_idx = [tid_to_tag[tid] for tid in cand_tags_idx]
            
        ret.append({
            "id":nid_to_id[nid],
            "songs":rec_songs_idx,
            "tags":rec_tags_idx
        })
        
        
        
    return ret

In [93]:
answer = recoms(playlist_test.index)

  0%|          | 0/23015 [00:00<?, ?it/s]

In [94]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)
        print('file save seuccess')

In [95]:
write_json(answer, "own_result/cf/results_rank.json")

file save seuccess


In [96]:
test = pd.read_json(path+'test.json')

In [97]:
test

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[],70107,,"[398985, 449403, 411543, 528044, 143048, 98020...",6,2012-09-29 01:57:26.000
1,"[나만의Best3, 인디아티스트들의추천음악]",7461,,"[196298, 269984, 267805, 175867, 529244, 63825...",0,2019-12-17 14:06:45.000
2,[드라이브],90348,,"[273433, 331003, 68432, 411659, 117793, 616860...",21,2015-05-23 10:44:48.000
3,[분위기],58617,,"[702227, 48152, 440008, 358488, 701041, 540721...",0,2019-03-14 09:47:34.000
4,[],102395,,"[630683, 481582, 528550, 285114, 506667, 17922...",38,2018-07-11 16:43:32.000
...,...,...,...,...,...,...
10735,[추억],137930,,"[323755, 397594, 445908, 570242, 221853, 20018...",16,2016-04-18 11:02:09.000
10736,"[띵곡의, 우울, 분위기, 드라이브, 산책]",936,,"[105140, 582252, 199262, 422915, 547967, 48791...",1,2020-04-08 07:15:59.000
10737,[기분전환],110589,,"[21976, 207746, 40025, 31635, 567462, 641799, ...",6,2016-06-29 00:57:21.000
10738,[여름],2605,,"[234554, 265033, 507260, 83092, 366757, 497097...",4,2015-06-06 09:52:01.000
