# import Library and Data load

In [1]:
import os
import pandas as pd
import numpy as np

from collections import Counter
import re

from tqdm.notebook import tqdm

In [2]:
path = '../../data/'
tqdm.pandas()

In [3]:
print('Data path file list')
print(os.listdir(path))

Data path file list
['train.json', 'test.json', 'onehot_matrix.csv', 'new_date', 'genre_all.json', 'song_meta.json', '.ipynb_checkpoints', 'genre_gn_all.json', 'val.json', 'train_genre_count.csv']


In [4]:
song_meta = pd.read_json(path+'song_meta.json').set_index('id')
train = pd.read_json(path+'train.json')
val = pd.read_json(path+'val.json')

In [5]:
song_meta.head(3)

Unnamed: 0_level_0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists]
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia]
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel]


In [6]:
train.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000


In [7]:
val.head(3)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[],118598,,"[373313, 151080, 275346, 696876, 165237, 52593...",1675,2019-05-27 14:14:33.000
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000
2,[],51464,,"[529437, 516103, 360067, 705713, 226062, 37089...",62,2008-06-21 23:26:22.000


# Genre TF-IDF

$$
\text{TF(w, d)} = \frac{\text{Number of a genre "w" in a song}}{\text{all genres in a song}}
$$

$$
\text{IDF(w)} = \log{\frac{\text{All songs number}}{\text{Number of songs containing "w"}}}
$$

In [8]:
# make train + val dataframe
train['istrain'] = 1
val['istrain'] = 0
data = pd.concat([train, val], ignore_index=True)

## song recom preprocessing

In [9]:
# get all song list in train and val test set
song_meta['all_genres'] = song_meta.progress_apply(lambda x: x['song_gn_dtl_gnr_basket']+x['song_gn_gnr_basket'], axis=1)
genre_list = list(set([genre for genres in song_meta['all_genres'].tolist() for genre in genres]))
print(f"exist genre counts : {len(genre_list)}")
print(f"exist song counts : {len(song_meta)}")

  0%|          | 0/707989 [00:00<?, ?it/s]

exist genre counts : 249
exist song counts : 707989


In [10]:
total_count = len(song_meta)
genre_count = dict(Counter([genre for genres in song_meta['all_genres'].tolist() for genre in genres]))
print(len(genre_count))

249


In [11]:
song_meta.head(3)

Unnamed: 0_level_0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,all_genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,[GN0901],20140512,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,2255639,[2727],Feelings,[GN0900],[Various Artists],"[GN0901, GN0900]"
1,"[GN1601, GN1606]",20080421,"Bach : Partitas Nos. 2, 3 & 4",376431,[29966],"Bach : Partita No. 4 In D Major, BWV 828 - II....",[GN1600],[Murray Perahia],"[GN1601, GN1606, GN1600]"
2,[GN0901],20180518,Hit,4698747,[3361],Solsbury Hill (Remastered 2002),[GN0900],[Peter Gabriel],"[GN0901, GN0900]"


In [12]:
for each_genre in genre_count:
    genre_count[each_genre] = np.log10(total_count/genre_count[each_genre])
genre_count

{'GN0901': 0.9664275270332833,
 'GN0900': 0.9629664992371292,
 'GN1601': 1.0978935176265407,
 'GN1606': 1.5099196497693246,
 'GN1600': 1.0936490780660448,
 'GN1102': 1.6510122391895945,
 'GN1101': 1.0908574462819616,
 'GN1100': 1.0708342017544772,
 'GN1802': 2.4445094031478383,
 'GN1801': 1.2222920693859076,
 'GN1800': 1.2214843267116458,
 'GN1701': 1.168071083970595,
 'GN1700': 1.1669884822377943,
 'GN1602': 1.9988905151126404,
 'GN1614': 1.946827963081236,
 'GN0301': 1.4092118501665332,
 'GN0300': 1.4062317398215085,
 'GN0105': 1.4417186916004616,
 'GN0101': 1.1099402778187106,
 'GN0100': 1.0955252807372973,
 'GN1201': 1.3040394078528414,
 'GN1200': 1.3013177848895756,
 'GN1301': 1.3396557888441003,
 'GN1300': 1.3343645690670947,
 'GN1703': 2.1155467206986374,
 'GN2704': 1.9966934051218792,
 'GN1104': 1.4496612367742756,
 'GN1103': 1.6547635517821528,
 'GN2700': 1.791600485667209,
 'GN1613': 1.7610050093292084,
 'GN2503': 2.1605400617599666,
 'GN0205': 1.9886713499309545,
 'GN2501': 

In [13]:
song_list = [song for songs in data['songs'].tolist() for song in songs]
song_counter = dict(Counter(song_list))
song_count_df = pd.DataFrame(columns=['id', 'counts'])
song_count_df['id'] = song_counter.keys()
song_count_df['counts'] = song_counter.values()
song_count_df = song_count_df.sort_values('counts', ascending=False, ignore_index=True)
song_count_df.head()

Unnamed: 0,id,counts
0,144663,2340
1,116573,2307
2,357367,2137
3,366786,2082
4,133143,1774


In [14]:
# create genre representation
song_genre_df = pd.DataFrame(columns=sorted(genre_list), index=song_meta.index)

for idx in tqdm(song_meta.index):
    genres = song_meta.loc[idx]['all_genres']
    values = list(map(genre_count.get, genres))
    song_genre_df.loc[idx, genres] = values 

  0%|          | 0/707989 [00:00<?, ?it/s]

In [15]:
song_genre_df = song_genre_df.fillna(0)

In [16]:
val['len_song'] = val['songs'].apply(len)

## tag recom preprocessing

In [17]:
tag_list = [tag for tags in data['tags'] for tag in tags]
tag_count = dict(Counter(tag_list))
print(len(tag_count))

30197


In [18]:
tag_to_tid = dict()
tid_to_tag = dict()
for i, t in enumerate(tag_count):
    tag_to_tid[t] = i
    tid_to_tag[i] = t

In [19]:
for each_tag in tag_count:
    tag_count[each_tag] = np.log10(len(data)/tag_count[each_tag])
tid_count = dict((tag_to_tid[k], v) for k, v in tag_count.items())
tid_count

{0: 1.5299154740555665,
 1: 1.300483092507522,
 2: 1.4531560831252766,
 3: 1.6612946818612921,
 4: 1.1074503218973377,
 5: 2.443793260656623,
 6: 3.09875696423173,
 7: 3.2316646305113053,
 8: 1.7187102591699055,
 9: 3.0719637876437935,
 10: 4.295051609375698,
 11: 2.7957573757048446,
 12: 2.141019108102584,
 13: 3.5960816050396796,
 14: 4.295051609375698,
 15: 1.608415340113405,
 16: 1.6914433294848752,
 17: 1.1097523485331933,
 18: 1.604222907994386,
 19: 3.259336057109164,
 20: 1.1254611375176176,
 21: 0.8990765686369685,
 22: 3.09875696423173,
 23: 1.319094894343067,
 24: 2.972832314641779,
 25: 1.7845065991690863,
 26: 2.676256660404048,
 27: 1.9799813564314432,
 28: 1.5661183816622364,
 29: 2.421647960522681,
 30: 1.1761258565499215,
 31: 1.3830574292710225,
 32: 2.1181339095722347,
 33: 2.5661183816622364,
 34: 2.602330554316681,
 35: 1.464462940690554,
 36: 2.222119312605075,
 37: 3.1273124246847828,
 38: 5.140149649389955,
 39: 3.022878353734191,
 40: 2.314723531622132,
 41: 2.

In [20]:
tag_counter = dict(Counter(tag_list))
tag_counter = dict((tag_to_tid[k], v) for k, v in tag_counter.items())
tag_count_df = pd.DataFrame(columns=['tid', 'counts'])
tag_count_df['tid'] = tag_counter.keys()
tag_count_df['counts'] = tag_counter.values()
tag_count_df = tag_count_df.sort_values('counts', ascending=False, ignore_index=True)

In [21]:
tag_count_df.head()

Unnamed: 0,tid,counts
0,21,17421
1,67,12105
2,110,11827
3,122,11114
4,4,10782


In [36]:
data['tids'] = data['tags'].apply(lambda x: [tag_to_tid[t] for t in x])
# create genre representation
tid_list = list(set([tag_to_tid[t] for t in tag_list]))
tag_genre_df = pd.DataFrame(columns=sorted(genre_list), index=tid_list)

In [45]:
song_meta.loc[[525514,129701,383374]]['all_genres'].tolist()

[['GN1402', 'GN1401', 'GN1400'],
 ['GN0901', 'GN0902', 'GN1001', 'GN0900', 'GN1000'],
 ['GN1012', 'GN1005', 'GN1001', 'GN1000']]

In [48]:
data['genres'] = data['songs'].progress_apply(lambda x: list(set([g for gnrs in (song_meta.loc[x]['all_genres'].tolist()) for g in gnrs])))

  0%|          | 0/138086 [00:00<?, ?it/s]

In [61]:
in_data = data[['id', 'songs', 'istrain', 'tids', 'genres']]
in_data

Unnamed: 0,id,songs,istrain,tids,genres
0,61281,"[525514, 129701, 383374, 562083, 297861, 13954...",1,[0],"[GN1302, GN0900, GN1904, GN1901, GN1401, GN110..."
1,10532,"[432406, 675945, 497066, 120377, 389529, 24427...",1,"[1, 2]","[GN1700, GN1803, GN1601, GN0900, GN0103, GN180..."
2,76951,"[83116, 276692, 166267, 186301, 354465, 256598...",1,"[3, 4]","[GN1700, GN0303, GN0401, GN0500, GN0400, GN080..."
3,147456,"[394031, 195524, 540149, 287984, 440773, 10033...",1,"[5, 6, 7, 8, 9, 10, 11, 12, 13, 14]","[GN1302, GN0900, GN1700, GN1803, GN0103, GN180..."
4,27616,"[159327, 553610, 5130, 645103, 294435, 100657,...",1,[15],"[GN0303, GN0103, GN0401, GN0203, GN0400, GN250..."
...,...,...,...,...,...
138081,101722,"[75842, 26083, 244183, 684715, 500593, 508608,...",0,[4],"[GN0900, GN1700, GN0303, GN0103, GN0401, GN090..."
138082,122127,"[450275, 487671, 561031, 663944, 628672, 59121...",0,"[11913, 335, 3162, 455, 23086]","[GN1700, GN0303, GN0401, GN0506, GN0400, GN050..."
138083,77438,"[625875, 464051, 11657, 236393, 358186, 213435...",0,[],"[GN0902, GN0900, GN1200, GN1001, GN1500, GN100..."
138084,36231,"[161094, 665833, 688145, 432735, 439938, 12665...",0,[],"[GN0900, GN1601, GN1613, GN1604, GN1614, GN090..."


# Recommendataion

In [97]:
test = val.set_index('id')

In [213]:
from scipy.sparse import csr_matrix
song_genre_smatrix = csr_matrix(song_genre_df.values)
n_genres = len(genre_count)

In [300]:
def recom_data(pids):
    res = []
    tag_recoms = tag_count_df[:10]['tid'].values
    tag_recoms = [tid_to_tag[tid] for tid in tag_recoms]
    
    for pid in tqdm(pids):
        songs_already = test.loc[pid, 'songs']
        cand_song = []
        
        if len(songs_already) == 0:
            cand_song = list(song_count_df[:100]['id'].values)
        else:
            p = np.zeros((n_genres, 1))
            gnr_cols = list(set(song_genre_smatrix[songs_already].indices))
            p[gnr_cols] = 1
            cand_song = np.argsort(song_genre_smatrix.dot(p).reshape(-1))[-150:][::-1]
            # cand_song = songs_already + list(cand_song[np.isin(cand_song, songs_already) == False][:100-len(songs_already)])
            cand_song = list(cand_song[np.isin(cand_song, songs_already) == False][:100])
        
        res.append({
            "id":pid,
            "songs":cand_song,
            "tags":tag_recoms
        })
    
    return res

In [301]:
answer = recom_data(test.index)

  0%|          | 0/23015 [00:00<?, ?it/s]

In [302]:
import io
import json
import distutils.dir_util

def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)
        print('file save seuccess')

In [303]:
write_json(answer, "own_result/cbf/results.json")

file save seuccess
