# Idea

주어진 플레이리스트와 겹치는 곡의 수를 가중치로 삼아, 학습 플레이리스트에 담긴 노래를 추천하는 User-based Collaborative Filtering을 진행합니다.  

# Import Library

In [25]:
import os
import pandas as pd
import numpy as np

import json
import distutils.dir_util
import io

from tqdm.notebook import tqdm

In [26]:
# Directory that holds the dataset JSON files read later in this notebook.
path = '../'
# Hook tqdm into pandas (provides the progress_* variants of apply/map).
tqdm.pandas()

In [27]:
# Sanity check: show every entry found directly under `path`.
print("All data in path")
print(os.listdir(path))

All data in path
['.DS_Store', 'Untitled.ipynb', 'results.json', 'arena_util.py', 'test.json', '__pycache__', 'train.json', '1.arena_output', '.ipynb_checkpoints', 'song_meta.json', 'genre_gn_all.json', 'val.json']


# Basic module

In [28]:
def write_json(data, fname):
    """Serialize ``data`` as UTF-8 JSON into ``./arena_data/<fname>``.

    Parent directories of the target file are created when missing. The
    payload is serialized *before* the file is opened, so a serialization
    error never leaves a truncated file behind.

    Args:
        data: JSON-serializable object; may contain NumPy integer scalars,
            which are converted to plain ``int``.
        fname: Output path relative to ``./arena_data/``.

    Raises:
        TypeError: If ``data`` contains a value JSON cannot encode.
    """
    def _conv(o):
        # np.integer is the common base of every NumPy integer width
        # (int8 ... int64, unsigned included), not only int32 / int64.
        if isinstance(o, np.integer):
            return int(o)
        raise TypeError(
            f"Object of type {type(o).__name__} is not JSON serializable"
        )

    # Serialize first: failing here must not clobber an existing file.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)

    target = "./arena_data/" + fname
    # os.makedirs replaces distutils.dir_util.mkpath (distutils was removed
    # from the standard library in Python 3.12).
    os.makedirs(os.path.dirname(target), exist_ok=True)

    with io.open(target, "w", encoding="utf8") as f:
        f.write(json_str)
    # Report success only once the file has been flushed and closed.
    print('file save seuccess')

# Data Load

In [29]:
# Load the song metadata and the train / validation playlists from `path`.
song_meta, train, val = (
    pd.read_json(path + json_name)
    for json_name in ('song_meta.json', 'train.json', 'val.json')
)

# Data Preprocessing

In [30]:
from collections import Counter

In [31]:
# Mark where each playlist came from so the split survives concatenation.
train['istrain'], val['istrain'] = 1, 0

n_train, n_val = len(train), len(val)

# Stack train on top of val and give every row a dense id ("nid"): 0..N-1.
data = pd.concat([train, val], ignore_index=True)
data['nid'] = range(len(data))

# Two-way lookup between the original playlist ids and the dense nids.
id_to_nid = {pid: nid for pid, nid in zip(data['id'], data['nid'])}
nid_to_id = {nid: pid for pid, nid in zip(data['id'], data['nid'])}

In [32]:
# --- songs -----------------------------------------------------------------
# Flatten every playlist into one long list of song ids and count them.
song_in_data = [track for playlist in data['songs'] for track in playlist]
song_counter = dict(Counter(song_in_data))

# Dense song ids ("sid") are assigned in the counter's iteration order.
song_to_sid = {track: sid for sid, track in enumerate(song_counter)}
sid_to_song = dict(enumerate(song_counter))

song_list = list(set(song_in_data))


# --- tags ------------------------------------------------------------------
# Same procedure for tags: flatten, count, then assign dense ids ("tid").
tag_in_data = [label for labels in data['tags'] for label in labels]
tag_counter = dict(Counter(tag_in_data))

tag_to_tid = {label: tid for tid, label in enumerate(tag_counter)}
tid_to_tag = dict(enumerate(tag_counter))

tag_list = list(set(tag_in_data))

In [33]:
# Translate each playlist's raw song ids / tag strings into their dense ids.
data['sid_list'] = data['songs'].map(
    lambda playlist: list(map(song_to_sid.__getitem__, playlist))
)
data['tid_list'] = data['tags'].map(
    lambda labels: list(map(tag_to_tid.__getitem__, labels))
)

In [22]:
# Index the frame by nid and keep only the columns used from here on.
data = data.set_index('nid')[['istrain', 'sid_list', 'tid_list', 'updt_date']]

In [23]:
# Per-playlist item counts.
data['num_songs'] = data['sid_list'].apply(len)
data['num_tags'] = data['tid_list'].apply(len)

# The first n_train rows are the training playlists, the remainder the test set.
data_train, data_test = data.iloc[:n_train], data.iloc[n_train:]

In [34]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid,sid_list,tid_list
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[0]
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1,"[19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 3...","[1, 2]"
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2,"[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 7...","[3, 4]"
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,1,3,"[89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 1...","[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]"
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,4,"[127, 128, 129, 130, 131, 132, 133, 134, 135, ...",[15]


## Make Sparse Matrix

In [213]:
from scipy.sparse import csr_matrix

In [214]:
# Vocabulary sizes; these become the column counts of the sparse matrices.
n_songs, n_tags = len(song_counter), len(tag_counter)

In [224]:
def _membership_matrix(id_lists, n_cols):
    """Build a CSR matrix with a 1 at (row, id) for every id in each row's list."""
    lengths = np.asarray([len(ids) for ids in id_lists], dtype=np.int64)
    # Row r is repeated once per id it contains, e.g. (0, 1) means
    # "playlist 0 contains item 1".
    rows = np.repeat(np.arange(len(lengths)), lengths)
    cols = np.fromiter(
        (item for ids in id_lists for item in ids),
        dtype=np.int64,
        count=int(lengths.sum()),
    )
    ones = np.ones(len(cols), dtype=int)
    return csr_matrix((ones, (rows, cols)), shape=(len(lengths), n_cols))


def make_sparse(data, n_song_ids=None, n_tag_ids=None):
    """Create playlist-by-song and playlist-by-tag sparse matrices.

    Row ``r`` of each matrix corresponds to the ``r``-th row of ``data``
    (positional, not by index label). The number of rows is taken from
    ``data`` itself, so the function works for any frame, not only the
    training split.

    Args:
        data: DataFrame with ``sid_list`` and ``tid_list`` columns holding
            lists of dense song / tag ids.
        n_song_ids: Number of columns of the song matrix. Defaults to the
            module-level ``n_songs``.
        n_tag_ids: Number of columns of the tag matrix. Defaults to the
            module-level ``n_tags``.

    Returns:
        Tuple ``(songs_sparse, tags_sparse)`` of ``csr_matrix`` objects with
        shapes ``(len(data), n_song_ids)`` and ``(len(data), n_tag_ids)``.
    """
    if n_song_ids is None:
        n_song_ids = n_songs
    if n_tag_ids is None:
        n_tag_ids = n_tags

    songs_sparse = _membership_matrix(data['sid_list'], n_song_ids)
    tags_sparse = _membership_matrix(data['tid_list'], n_tag_ids)
    return songs_sparse, tags_sparse

In [225]:
# Build the playlist-by-song and playlist-by-tag matrices from the training split.
songs_sparse, tags_sparse = make_sparse(data_train)

In [230]:
# Song popularity table: one row per song, most frequent first.
song_counter_df = pd.DataFrame(
    {
        'song': list(song_counter.keys()),
        'count': list(song_counter.values()),
    },
    columns=['song', 'count'],
).sort_values('count', ascending=False, ignore_index=True)

In [231]:
# Tag popularity table: one row per tag, most frequent first.
tag_counter_df = pd.DataFrame(
    {
        'tag': list(tag_counter.keys()),
        'count': list(tag_counter.values()),
    },
    columns=['tag', 'count'],
).sort_values('count', ascending=False, ignore_index=True)

In [255]:
# Item-major (transposed) CSR copies; recoms uses them to turn per-playlist
# weights into per-item scores.
songs_sparse_T = songs_sparse.transpose().tocsr()
tags_sparse_T = tags_sparse.transpose().tocsr()

In [319]:
def _rank_unseen(scores, seen, k):
    """Return up to ``k`` indices with the highest ``scores``, skipping ``seen``.

    The ranking window is ``k + len(seen)`` wide, so removing the items the
    playlist already has can never leave fewer than ``k`` candidates (as long
    as that many items exist at all).
    """
    window = k + len(seen)
    ranked = scores.argsort()[-window:][::-1]
    return ranked[~np.isin(ranked, seen)][:k]


def _co_occurrence_scores(existing, by_playlist, by_item):
    """Score every item by how strongly it co-occurs with ``existing`` items."""
    onehot = np.zeros(by_playlist.shape[1])
    onehot[existing] = 1
    # Weight of a training playlist = number of items it shares with the query.
    overlap = by_playlist.dot(onehot).reshape(-1)
    # Item score = sum of the weights of the training playlists containing it.
    return by_item.dot(overlap).reshape(-1)


def recoms(nids):
    """Recommend 100 songs and 10 tags for each test playlist in ``nids``.

    For a playlist with known songs (tags), training playlists are weighted
    by the number of songs (tags) they share with it, and the items with the
    highest accumulated weight that the playlist does not already contain are
    recommended. A playlist without songs (tags) falls back to the globally
    most frequent songs (tags).

    Args:
        nids: Iterable of dense playlist ids present in ``data_test``'s index.

    Returns:
        List of dicts with keys ``"id"`` (original playlist id), ``"songs"``
        (original song ids) and ``"tags"`` (tag strings), one per nid.
    """
    # Popularity fallbacks; .tolist() yields native Python scalars.
    default_songs = song_counter_df['song'].head(100).tolist()
    default_tags = tag_counter_df['tag'].head(10).tolist()

    ret = []
    for nid in tqdm(nids):
        songs_exist = data_test.loc[nid, 'sid_list']
        tags_exist = data_test.loc[nid, 'tid_list']

        if len(songs_exist) == 0:
            # Copy so that result entries never share one list object.
            rec_songs = list(default_songs)
        else:
            song_scores = _co_occurrence_scores(
                songs_exist, songs_sparse, songs_sparse_T
            )
            rec_songs = [
                sid_to_song[sid]
                for sid in _rank_unseen(song_scores, songs_exist, 100)
            ]

        if len(tags_exist) == 0:
            rec_tags = list(default_tags)
        else:
            tag_scores = _co_occurrence_scores(
                tags_exist, tags_sparse, tags_sparse_T
            )
            rec_tags = [
                tid_to_tag[tid]
                for tid in _rank_unseen(tag_scores, tags_exist, 10)
            ]

        ret.append({
            "id": nid_to_id[nid],
            "songs": rec_songs,
            "tags": rec_tags
        })

    return ret

In [320]:
# Generate recommendations for every playlist in the test split.
answer = recoms(data_test.index)

  0%|          | 0/23015 [00:00<?, ?it/s]

In [None]:
# Persist the recommendations; write_json prefixes the path with ./arena_data/.
write_json(answer, "own_result/cf/results.json")