In [1]:
import io
import os
import json
import distutils.dir_util
from collections import Counter
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

from scipy.sparse import csr_matrix

from sklearn.decomposition import NMF

In [2]:
pd.options.display.max_rows = 499
pd.options.display.max_columns = 499
pd.options.mode.chained_assignment = None

In [3]:
fname = '../static/fonts/D2Coding.ttc'
font_family = fm.FontProperties(fname=fname).get_name()
plt.rcParams['font.family'] = font_family
plt.rcParams['font.size'] = 14

In [4]:
%matplotlib inline

In [5]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))


def remove_seen(seen, l):
    seen = set(seen)
    return [x for x in l if not (x in seen)]

In [6]:
train_path = '../arena_data/orig/train.json'
val_path = '../arena_data/orig/val.json'

train_json = load_json(train_path)
val_json = load_json(val_path)

In [7]:
def json_to_dataframe(json_data):
    dataframe_dict = {'id': [], 'plylst_title': [], 'tags': [], 'songs': [], 'like_cnt': [], 'updt_date': []}

    for data in tqdm(json_data):
        dataframe_dict['id'].append(data['id'])
        dataframe_dict['plylst_title'].append(data['plylst_title'])
        dataframe_dict['tags'].append(data['tags'])
        dataframe_dict['songs'].append(data['songs'])
        dataframe_dict['like_cnt'].append(data['like_cnt'])
        dataframe_dict['updt_date'].append(data['updt_date'])
    
    dataframe = pd.DataFrame(dataframe_dict)
    dataframe['updt_date'] = pd.to_datetime(dataframe.updt_date)

    return dataframe


In [8]:
train_df = json_to_dataframe(train_json)
train_df

100%|██████████| 92056/92056 [00:00<00:00, 469889.86it/s]


Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
0,147668,To. 힘들고 지친 분들에게,"[힐링, 휴식, 밤, 새벽]","[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27
1,50422,130807-7,[팝],"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11
2,116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,[뉴에이지],"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50
3,55076,당신을 하얗게 불태울 곡들,"[하드락, 록스피릿, 댄스]","[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25
4,125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[힐링, 휴식, 기분전환]","[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50
...,...,...,...,...,...,...
92051,149690,옛날노래 * 좋은노래 8090년생 노래 모음,"[90년생, 회상, 추억, 좋은노래, 80년생, 옛날노래]","[292099, 513963, 174225, 287212, 140444, 62469...",155,2020-01-15 15:15:45
92052,35004,LOVE 1,[팝],"[62596, 359718, 596004, 668790, 291212, 148977...",8,2010-03-23 00:03:00
92053,59765,추억의 2004년 발라드 베스트,"[여행, 발라드, 기분전환, 사랑]","[214372, 145150, 407082, 160552, 102445, 50845...",3,2019-05-15 13:26:07
92054,9867,All Music Guide 선정 90s R&B: 1997,"[소울, 알앤비]","[561958, 397574, 250915, 110345, 426772, 10698...",51,2013-12-24 14:40:01


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92056 entries, 0 to 92055
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            92056 non-null  int64         
 1   plylst_title  92056 non-null  object        
 2   tags          92056 non-null  object        
 3   songs         92056 non-null  object        
 4   like_cnt      92056 non-null  int64         
 5   updt_date     92056 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 4.2+ MB


In [10]:
val_df = json_to_dataframe(val_json)
val_df

100%|██████████| 23015/23015 [00:00<00:00, 606230.53it/s]


Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
0,18488,요즘 많이듣는 인디 노래,"[카페, 인디음악, 드라이브, 인디뮤직, 사랑, 이별, 인디]","[674442, 131295, 83652, 352919, 233166, 99741,...",3,2017-07-17 11:00:52
1,76254,살랑살랑 불어오는 바람 같은 뉴에이지,"[살랑살랑, 뉴에이지]","[222141, 422934, 4917, 700161, 424495, 683582,...",4,2017-07-20 13:42:37
2,86227,비오는날 감미롭고 우울한 재즈,"[비오는날, 밤, 새벽]","[333034, 638621, 483000, 570730, 442053, 17405...",41,2015-07-10 03:18:46
3,87450,걸크러쉬돋는 여자보컬 락 노래 모음,"[락, 락밴드, 메탈, 락음악]","[229337, 30825, 475737, 672432, 59091, 98657, ...",17,2017-07-10 21:30:25
4,24649,퇴근 후 차분한 인디,"[집중, 휴식, 밤, 카페, 새벽, 차분한, 조용한, 인디]","[13930, 18100, 105626, 310720, 93295, 557891, ...",5,2020-04-09 00:04:15
...,...,...,...,...,...,...
23010,34841,♥♥♥♥♥♥♥♥♥♥♥♥♥♥♥,[발라드],"[55291, 84345, 434639, 129233, 93698, 121309, ...",11,2007-10-25 09:10:55
23011,119043,영원한 가왕 조용필,"[자작곡, 원조, 열창, 조용필, 감동, 명곡]","[38369, 139236, 201546, 134358, 596576, 649107...",10,2020-01-06 09:28:09
23012,65397,2015 SJF 서울 재즈 페스티벌 셋리스트 - 해외 공연팀,"[서울재즈페스티벌, 서재페, SJF, 셋리스트, 공연]","[589131, 695266, 300481, 348950, 165299, 19520...",18,2017-02-03 17:57:37
23013,79292,회복의 찬양(스튜디오 녹음),"[CCM, 항상]","[425704, 404399, 190558, 72325, 54248, 389972,...",67,2018-07-12 09:38:59


In [11]:
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23015 entries, 0 to 23014
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            23015 non-null  int64         
 1   plylst_title  23015 non-null  object        
 2   tags          23015 non-null  object        
 3   songs         23015 non-null  object        
 4   like_cnt      23015 non-null  int64         
 5   updt_date     23015 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 1.1+ MB


In [12]:
def get_unique_value(dataframe, column, list_type=True):
    unique_values = set()
    if list_type:
        for c in tqdm(dataframe[column]):
            unique_values |= set(c)
    else:
        unique_values = set(dataframe[column].unique())
    
    return unique_values

In [13]:
unique_tags_train = get_unique_value(train_df, 'tags')
len(unique_tags_train)

100%|██████████| 92056/92056 [00:00<00:00, 692222.88it/s]


25480

In [14]:
train_df[train_df.tags.apply(len) == 0]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [15]:
train_df[train_df.tags.apply(lambda x: '' in x)]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
88542,95032,어떻게 떴냐고? 답은 바로 TikTok!,"[힙합, , 흑인음악, 힙합엘이, 힙합추천, 틱톡, 외힙, HIPHOPLE]","[253222, 270262, 301533, 136007, 238347, 13384...",7,2020-01-08 11:19:11


In [16]:
unique_tags_val = get_unique_value(val_df, 'tags')
len(unique_tags_val)

100%|██████████| 23015/23015 [00:00<00:00, 657668.38it/s]


10802

In [17]:
unique_tags = unique_tags_train | unique_tags_val
len(unique_tags)

29160

In [18]:
unique_songs_train = get_unique_value(train_df, 'songs')
len(unique_songs_train)

100%|██████████| 92056/92056 [00:00<00:00, 125933.83it/s]


549729

In [19]:
unique_songs_val = get_unique_value(val_df, 'songs')
len(unique_songs_val)

100%|██████████| 23015/23015 [00:00<00:00, 120392.80it/s]


261843

In [20]:
unique_songs = unique_songs_train | unique_songs_val
len(unique_songs)

615142

In [21]:
unique_playlists_train = get_unique_value(train_df, 'plylst_title')
len(unique_playlists_train)

100%|██████████| 92056/92056 [00:00<00:00, 363078.60it/s]


2502

In [22]:
unique_playlists_val = get_unique_value(val_df, 'plylst_title')
len(unique_playlists_val)

100%|██████████| 23015/23015 [00:00<00:00, 327853.61it/s]


1842

In [23]:
unique_playlists = unique_playlists_train | unique_playlists_val
len(unique_playlists)

2626

In [24]:
def item_to_index(items):
    return {item:idx for idx, item in enumerate(items)}

In [25]:
tag2idx = item_to_index(unique_tags)
song2idx = item_to_index(unique_songs)

In [26]:
def dataframe_to_user_item_matrix(dataframe_train, dataframe_test, item='tags', playlist2idx=None):
    assert item in ['tags', 'songs']

    unique_items_train = get_unique_value(dataframe_train, item)
    unique_items_test = get_unique_value(dataframe_test, item)
    unique_items = unique_items_train | unique_items_test

    item2idx = item_to_index(unique_items)

    if not playlist2idx:
        unique_playlists_train = get_unique_value(dataframe_train, 'plylst_title', False)
        unique_playlists_test = get_unique_value(dataframe_test, 'plylst_title', False)
        unique_playlists = unique_playlists_train | unique_playlists_test

        playlist2idx = item_to_index(unique_playlists)


    rows = list()
    cols = list()
    data = list()
    for idx, item_list in tqdm(enumerate(dataframe_train[item])):
        for i in item_list:
            rows.append(playlist2idx[dataframe_train.loc[idx, 'plylst_title']])
            cols.append(item2idx[i])   
            data.append(1)  
    
    rows = np.array(rows)
    cols = np.array(cols)
    data = np.array(data)

    user_item_matrix = csr_matrix((data, (rows, cols)))

    return playlist2idx, item2idx, user_item_matrix


In [27]:
playlist2idx, tag2idx, playlist_tag_matrix = dataframe_to_user_item_matrix(train_df, val_df, item='tags')
_, song2idx, playlist_song_matrix = dataframe_to_user_item_matrix(train_df, val_df, item='songs', playlist2idx=playlist2idx)

100%|██████████| 92056/92056 [00:00<00:00, 827177.73it/s]
100%|██████████| 23015/23015 [00:00<00:00, 781755.14it/s]
92056it [00:03, 24151.93it/s]
100%|██████████| 92056/92056 [00:00<00:00, 117121.69it/s]
100%|██████████| 23015/23015 [00:00<00:00, 125054.94it/s]
92056it [00:40, 2280.20it/s]


In [28]:
playlist_tag_matrix

<110531x29160 sparse matrix of type '<class 'numpy.longlong'>'
	with 374930 stored elements in Compressed Sparse Row format>

In [29]:
playlist_song_matrix

<110531x615142 sparse matrix of type '<class 'numpy.longlong'>'
	with 4172234 stored elements in Compressed Sparse Row format>

In [30]:
def matrix_factorization(user_item_matrix, n_components=100):
    model = NMF(n_components=n_components, init='random', verbose=True, tol=5e-2, max_iter=100, random_state=2020, shuffle=True)
    model.fit(user_item_matrix)
    W = model.transform(user_item_matrix)
    H = model.components_

    return model, W, H

In [31]:
model_tag, W_tag, H_tag = matrix_factorization(playlist_tag_matrix)

violation: 1.0
violation: 8.253251793254766
violation: 3.8592201775425834
violation: 2.2502009052053844
violation: 1.358728231310922
violation: 0.9692378599041014
violation: 0.8659229343831552
violation: 0.5785203115456077
violation: 0.3839037684035592
violation: 0.27463257098504057
violation: 0.23915058213399576
violation: 0.23983070970853282
violation: 0.23188682489020687
violation: 0.2140629109887842
violation: 0.2195186069309953
violation: 0.1805632617620039
violation: 0.18925521901967818
violation: 0.164775936212995
violation: 0.1651802280439795
violation: 0.14962911912486876
violation: 0.13552165951465192
violation: 0.12278060118631266
violation: 0.10092647380581429
violation: 0.08510511302809617
violation: 0.07744705351514054
violation: 0.0679154785183234
violation: 0.06578454584353302
violation: 0.062069938907968615
violation: 0.06309585120443338
violation: 0.07221842872895957
violation: 0.08364122058180128
violation: 0.09352876370373493
violation: 0.10886327165831255
violation

In [32]:
with open('checkpoints/model-v1/model-v1-tag-nmf.pkl', 'wb') as f:
    pickle.dump(model_tag, f)

In [33]:
model_song, W_song, H_song = matrix_factorization(playlist_song_matrix)

violation: 1.0
violation: 8.209096773007074
violation: 2.883828377445379
violation: 1.5856331784149784
violation: 1.0448710915465376
violation: 0.7886296080586163
violation: 0.6602227615533662
violation: 0.5343575967657573
violation: 0.41433038068259437
violation: 0.3308719601908608
violation: 0.274193502759267
violation: 0.2240563399250701
violation: 0.2075414694574199
violation: 0.19493122215368114
violation: 0.18000432769204613
violation: 0.17639583546056295
violation: 0.17123665671559113
violation: 0.15781078496273152
violation: 0.14974119895192262
violation: 0.12979287533887152
violation: 0.11885332153246526
violation: 0.10290261508655914
violation: 0.0955550367548632
violation: 0.09206762490950353
violation: 0.08351875696277881
violation: 0.0787769664651518
violation: 0.07686701254554985
violation: 0.0767448589820301
violation: 0.07269858344996909
violation: 0.07175863272755544
violation: 0.06898619693580749
violation: 0.06965013237850394
violation: 0.06696506389681517
violation:

In [34]:
with open('checkpoints/model-v1/model-v1-song-nmf.pkl', 'wb') as f:
    pickle.dump(model_song, f)