### Environment Settings

In [5]:
from collections import Counter
import distutils.dir_util
import io
import json
import os
import pickle

# linear algebra
import numpy as np

# dataframe
import pandas as pd

# progress-bar
from tqdm import tqdm

# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# sparse matrix
import scipy.sparse as sp

# TF-IDF Transformer
from sklearn.feature_extraction.text import TfidfTransformer 

# matrix factorization
from sklearn.decomposition import NMF

# find neighbors
from sklearn.neighbors import NearestNeighbors

# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
pd.options.display.max_rows = 499
pd.options.display.max_columns = 499
pd.options.mode.chained_assignment = None

In [7]:
fname = '../static/fonts/D2Coding.ttc'
font_family = fm.FontProperties(fname=fname).get_name()
plt.rcParams['font.family'] = font_family
plt.rcParams['font.size'] = 14

In [8]:
%matplotlib inline

## Functions

### Basic IO(JSON)

In [9]:
def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))

### Remove Item Already in Dataset

In [10]:
def remove_seen(seen, l):
    seen = set(seen)
    return [x for x in l if not (x in seen)]

### JSON to Dataframe

In [11]:
def json_to_dataframe(json_data):
    dataframe_dict = {'id': [], 'plylst_title': [], 'tags': [], 'songs': [], 'like_cnt': [], 'updt_date': []}

    for data in tqdm(json_data):
        dataframe_dict['id'].append(data['id'])
        dataframe_dict['plylst_title'].append(data['plylst_title'])
        dataframe_dict['tags'].append(data['tags'])
        dataframe_dict['songs'].append(data['songs'])
        dataframe_dict['like_cnt'].append(data['like_cnt'])
        dataframe_dict['updt_date'].append(data['updt_date'])
    
    dataframe = pd.DataFrame(dataframe_dict)
    dataframe['updt_date'] = pd.to_datetime(dataframe.updt_date)

    return dataframe

In [29]:
train_path = '../arena_data/orig/train.json'
val_path = '../arena_data/questions/val.json'

train_json = load_json(train_path)
val_json = load_json(val_path)

train_df = json_to_dataframe(train_json)
val_df = json_to_dataframe(val_json)

100%|██████████| 92056/92056 [00:00<00:00, 652912.40it/s]
100%|██████████| 23015/23015 [00:00<00:00, 611290.29it/s]


### Dataframe to Sparser Matrix

In [30]:
def get_unique_items(dataframe, column, list_type=True):
    unique_items = set()
    if list_type:
        for c in tqdm(dataframe[column]):
            unique_items |= set(c)
    else:
        assert len(dataframe[column].unique()) == len(dataframe[column])
        unique_items = dataframe[column].unique()
    
    return unique_items

In [31]:
unique_tags = get_unique_items(pd.concat([train_df, val_df], ignore_index=True, copy=False), 'tags', list_type=True)
unique_songs = get_unique_items(pd.concat([train_df, val_df], ignore_index=True, copy=False), 'songs', list_type=True)
unique_playlists_tr = get_unique_items(train_df, 'id', list_type=False)
unique_playlists_val = get_unique_items(val_df, 'id', list_type=False)

100%|██████████| 115071/115071 [00:00<00:00, 591171.86it/s]
100%|██████████| 115071/115071 [00:00<00:00, 134303.55it/s]


In [32]:
def make_item_index_dictionary(items):
    item2idx = {item:idx for idx, item in enumerate(items)}
    idx2item = {idx:item for item, idx in item2idx.items()}
    return item2idx, idx2item

In [33]:
tag2idx, idx2tag = make_item_index_dictionary(unique_tags)
song2idx, idx2song = make_item_index_dictionary(unique_songs)
playlist2idx_tr, idx2playlist_tr = make_item_index_dictionary(unique_playlists_tr)
playlist2idx_val, idx2playlist_val = make_item_index_dictionary(unique_playlists_val)

n_train = len(playlist2idx_tr)
playlist2idx = {playlist:idx for playlist, idx in playlist2idx_tr.items()}
for playlist, idx in playlist2idx_val.items():
    playlist2idx[playlist] = (idx + n_train)
idx2playlist = {idx:playlist for playlist, idx in playlist2idx.items()}

In [35]:
assert len(playlist2idx_tr) + len(playlist2idx_val) == len(playlist2idx)

In [92]:
def dataframe_to_matrix(dataframe, item='tags', playlist2idx=None, item2idx=None):
    assert item in ['tags', 'songs']

    matrix_shape = (len(playlist2idx), len(item2idx))

    if 'plylst_id' not in dataframe.columns:
        dataframe['plylst_id'] = dataframe.id.map(playlist2idx)

    column_name = '{}_id'.format(item)
    if column_name not in dataframe.columns:
        dataframe[column_name] = dataframe[item].apply(lambda items: [item2idx[item] for item in items])

    rows = list()
    cols = list()
    data = list()

    for r, cs in tqdm(zip(dataframe.plylst_id, dataframe[column_name])):
        for c in cs:
            rows.append(r)
            cols.append(c)
    
    rows = np.array(rows)
    cols = np.array(cols)
    data = np.ones(rows.shape[0])

    return sp.csr_matrix((data, (rows, cols)), shape=matrix_shape)

In [93]:
pt_matrix_tr = dataframe_to_matrix(train_df, item='tags', playlist2idx=playlist2idx_tr, item2idx=tag2idx)
pt_matrix_val = dataframe_to_matrix(val_df, item='tags', playlist2idx=playlist2idx_val, item2idx=tag2idx)
pt_matrix = dataframe_to_matrix(pd.concat([train_df, val_df], ignore_index=True, copy=False), item='tags', playlist2idx=playlist2idx, item2idx=tag2idx)

92056it [00:00, 680082.73it/s]
23015it [00:00, 940884.30it/s]
115071it [00:00, 699638.41it/s]


In [94]:
ps_matrix_tr = dataframe_to_matrix(train_df, item='songs', playlist2idx=playlist2idx_tr, item2idx=song2idx)
ps_matrix_val = dataframe_to_matrix(val_df, item='songs', playlist2idx=playlist2idx_val, item2idx=song2idx)
ps_matrix = dataframe_to_matrix(pd.concat([train_df, val_df], ignore_index=True, copy=False), item='songs', playlist2idx=playlist2idx, item2idx=song2idx)

92056it [00:01, 85850.87it/s]
23015it [00:00, 180228.65it/s]
115071it [00:01, 105265.41it/s]


### Transform to TF-IDF

In [95]:
def transform_sparse_matrix_tfidf(sparse_matrix, transformer):
    return transformer.transform(sparse_matrix)

In [96]:
transformer_tag = TfidfTransformer(smooth_idf=True)
transformer_tag.fit(pt_matrix)
pt_matrix_idf_tr = transform_sparse_matrix_tfidf(pt_matrix_tr, transformer_tag)
pt_matrix_idf_val = transform_sparse_matrix_tfidf(pt_matrix_val, transformer_tag)

In [97]:
transformer_song = TfidfTransformer(smooth_idf=True)
transformer_song.fit(ps_matrix)
ps_matrix_idf_tr = transform_sparse_matrix_tfidf(ps_matrix_tr, transformer_song)
ps_matrix_idf_val = transform_sparse_matrix_tfidf(ps_matrix_val, transformer_song)

### Matrix Factorization

In [19]:
def matrix_factorization(sparse_matrix, n_components=100):
    model = NMF(n_components=n_components, init='random', verbose=True, tol=5e-2, max_iter=100, random_state=2020, shuffle=True)
    model.fit(sparse_matrix)
    W = model.transform(sparse_matrix)
    H = model.components_

    return model, W, H

In [20]:
train = True
checkpoint_tag = 'checkpoints/model-v1/model-v1-tag-nmf-50.pkl'
checkpoint_song = 'checkpoints/model-v1/model-v1-song-nmf-50.pkl'

if train:
    model_t, W_t, H_t = matrix_factorization(pt_matrix_idf_tr, n_components=50)
    with open(checkpoint_tag, 'wb') as f:
        pickle.dump(model_t, f)

    model_s, W_s, H_s = matrix_factorization(ps_matrix_idf_tr, n_components=50)
    with open(checkpoint_song, 'wb') as f:
        pickle.dump(model_s, f)
else:
    with open(checkpoint_tag, 'rb') as f:
        model_t = pickle.load(f)
    W_t = model_t.transform(pt_matrix_idf_tr)
    H_t = model_t.components_
    
    with open(checkpoint_song, 'rb') as f:
        model_s = pickle.load(f)
    W_s = model_s.transform(ps_matrix_idf_tr)
    H_s = model_s.components_

violation: 1.0
violation: 18.409175796704126
violation: 6.664617536464677
violation: 3.624837120135531
violation: 1.8568948203753723
violation: 0.9754936313300209
violation: 0.8903335291536104
violation: 0.5871592829744191
violation: 0.4331263463815185
violation: 0.44253548082716365
violation: 0.5350062917655157
violation: 0.6666671885011007
violation: 0.38706733867497
violation: 0.25653639495624253
violation: 0.20615753133320336
violation: 0.16712156084234642
violation: 0.1909947419845377
violation: 0.17070984098939668
violation: 0.18996481034800666
violation: 0.20872359701240692
violation: 0.2115675238059274
violation: 0.1775566661492478
violation: 0.1982818296676528
violation: 0.18650963085018904
violation: 0.19875695076143154
violation: 0.09954930541146494
violation: 0.06861738875408392
violation: 0.04653956876336177
Converged at iteration 29
violation: 1.0
violation: 0.03753665264243677
Converged at iteration 3
violation: 1.0
violation: 6.080079039784536
violation: 3.9321771820202

### Playlist Continuation

In [98]:
def calculate_cosine_similarity(A, B):
    return cosine_similarity(A, B, dense_output=False)

In [99]:
### fint neighbors of sparse_matrix_src from sparse_matrix_dst

def find_neigbors(sparse_matrix_src, sparse_matrix_dst, k=5):
    # neigh = NearestNeighbors(n_neighbors=k, n_jobs=4)
    # neigh.fit(sparse_matrix_dst)
    # dist, neighbors = neigh.kneighbors(sparse_matrix_src)
    
    # return neigh, dist, neighbors
    similarity = calculate_cosine_similarity(sparse_matrix_src, sparse_matrix_dst)
    neighbors = list()
    for rid in tqdm(range(similarity.shape[0])):
        neighbors.append(np.argsort(similarity[rid, :].toarray()[0])[::-1][:k])
    
    neighbors = np.array(neighbors)
    return similarity, neighbors

In [100]:
similarity, neighbors = find_neigbors(
    sp.hstack([pt_matrix_idf_val, ps_matrix_idf_val]), 
    sp.hstack([pt_matrix_idf_tr, ps_matrix_idf_tr]),
    k=10
)

100%|██████████| 23015/23015 [01:09<00:00, 329.41it/s]


In [104]:
def recommend_items_for_continuation(pt_matrix_idf_val, pt_matrix_idf_tr, ps_matrix_idf_val, ps_matrix_idf_tr, neighbors, idx2playlist, idx2tag, idx2song):
    recommendations = list()
    collect_counter = 0
    for rid in tqdm(range(pt_matrix_idf_val.shape[0])):
        playlist = idx2playlist[rid]

        rid_neighbors = neighbors[rid, :]

        # rid_similarity = similarity[rid, rid_neighbors].toarray()
        # ratings = sparse_matrix_dst[rid_neighbors, :].multiply(rid_similarity.reshape(-1, 1)).sum(axis=0)

        tags = list()
        counter = 0
        for tid in pt_matrix_idf_tr[rid_neighbors, :].toarray().sum(axis=0).argsort()[::-1]:
            if tid not in pt_matrix_idf_val[rid, :].nonzero()[1]:
                t = idx2tag[tid]
                tags.append(t)
                counter += 1
            if counter == 10:
                break

        songs = list()
        counter = 0
        for sid in ps_matrix_idf_tr[rid_neighbors, :].toarray().sum(axis=0).argsort()[::-1]:
            if sid not in ps_matrix_idf_val[rid, :].nonzero()[1]:
                s = idx2song[sid]
                songs.append(s)
                counter += 1
            if counter == 100:
                break

        recommendations.append({
            "id": playlist,
            "songs": tags,
            "tags": songs,
        })

        collect_counter += 1
        if collect_counter % 1000 == 0:
            gc.collect()
            collect_counter = 0

    return recommendations

In [105]:
import gc
# del similarity
# del pt_matrix
# del ps_matrix
# del transformer_tag
# del transformer_song
# del train_df
# del val_df

# 12913
gc.collect()

32

In [106]:
recommendations = recommend_items_for_continuation(pt_matrix_idf_val, pt_matrix_idf_tr, ps_matrix_idf_val, ps_matrix_idf_tr, neighbors, idx2playlist_val, idx2tag, idx2song)
# recommendations_song = recommend_items_for_continuation(ps_matrix_idf_val, ps_matrix_tr, similarity, neighbors, 100)

100%|██████████| 23015/23015 [17:49<00:00, 21.51it/s]


In [108]:
answers = list()
for r in tqdm(recommendations):
    answers.append({
        'id': r['id'],
        'songs': r['tags'],
        'tags': r['songs']
    })

100%|██████████| 23015/23015 [00:00<00:00, 474064.74it/s]


In [1]:
def restore_item(idx, idx2item, list_type=True):
    if list_type:
        return list(map(lambda x: [idx2item[i] for i in x], idx))
    else:
        return list(map(lambda x: idx2item[x], idx))

In [2]:
def generate_answers(questions, recommendations_tag, recommendations_song):
    answers = []

    for idx, q in tqdm(enumerate(questions)):
        answers.append({
            "id": q["id"],
            "songs": remove_seen(q["songs"], recommendations_song[idx])[:100],
            "tags": remove_seen(q["tags"], recommendations_tag[idx])[:10],
        })

    return answers

In [109]:
write_json(answers, "results/results.json")

In [111]:
# -*- coding: utf-8 -*-
import fire
import numpy as np

class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)


In [112]:
evaluator = ArenaEvaluator()
evaluator.evaluate('../arena_data/answers/val.json', './arena_data/results/results.json')

Music nDCG: 0.151252
Tag nDCG: 0.333397
Score: 0.178573


<hr>

In [6]:
train_path = '../arena_data/orig/train.json'
val_path = '../arena_data/questions/val.json'

train_json = load_json(train_path)
val_json = load_json(val_path)

In [7]:
def json_to_dataframe(json_data):
    dataframe_dict = {'id': [], 'plylst_title': [], 'tags': [], 'songs': [], 'like_cnt': [], 'updt_date': []}

    for data in tqdm(json_data):
        dataframe_dict['id'].append(data['id'])
        dataframe_dict['plylst_title'].append(data['plylst_title'])
        dataframe_dict['tags'].append(data['tags'])
        dataframe_dict['songs'].append(data['songs'])
        dataframe_dict['like_cnt'].append(data['like_cnt'])
        dataframe_dict['updt_date'].append(data['updt_date'])
    
    dataframe = pd.DataFrame(dataframe_dict)
    dataframe['updt_date'] = pd.to_datetime(dataframe.updt_date)

    return dataframe


In [8]:
train_df = json_to_dataframe(train_json)
train_df

100%|██████████| 92056/92056 [00:00<00:00, 613830.18it/s]


Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
0,147668,To. 힘들고 지친 분들에게,"[힐링, 휴식, 밤, 새벽]","[663185, 649626, 6855, 188486, 348451, 169945,...",12,2016-06-23 10:06:27
1,50422,130807-7,[팝],"[627035, 256438, 603324, 200889, 441319, 21689...",0,2013-08-15 13:17:11
2,116432,숙면을 위한 슬픈 마음을 달래 줄 피아노,[뉴에이지],"[129204, 369497, 649743, 344619, 110281, 63266...",23,2015-09-03 16:51:50
3,55076,당신을 하얗게 불태울 곡들,"[하드락, 록스피릿, 댄스]","[677591, 420396, 104934, 119279, 251988, 58850...",1,2017-01-09 15:41:25
4,125064,[스피커 필수 / HIPHOP] 듣고 있음 꿀렁꿀렁이고 싶은 힙합음악!,"[힐링, 휴식, 기분전환]","[704455, 694036, 508043, 154933, 57614, 645195...",715,2016-02-22 12:32:50
...,...,...,...,...,...,...
92051,149690,옛날노래 * 좋은노래 8090년생 노래 모음,"[90년생, 회상, 추억, 좋은노래, 80년생, 옛날노래]","[292099, 513963, 174225, 287212, 140444, 62469...",155,2020-01-15 15:15:45
92052,35004,LOVE 1,[팝],"[62596, 359718, 596004, 668790, 291212, 148977...",8,2010-03-23 00:03:00
92053,59765,추억의 2004년 발라드 베스트,"[여행, 발라드, 기분전환, 사랑]","[214372, 145150, 407082, 160552, 102445, 50845...",3,2019-05-15 13:26:07
92054,9867,All Music Guide 선정 90s R&B: 1997,"[소울, 알앤비]","[561958, 397574, 250915, 110345, 426772, 10698...",51,2013-12-24 14:40:01


In [9]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92056 entries, 0 to 92055
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            92056 non-null  int64         
 1   plylst_title  92056 non-null  object        
 2   tags          92056 non-null  object        
 3   songs         92056 non-null  object        
 4   like_cnt      92056 non-null  int64         
 5   updt_date     92056 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 4.2+ MB


In [10]:
train_df[train_df.duplicated('id')]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [11]:
val_df = json_to_dataframe(val_json)
val_df

100%|██████████| 23015/23015 [00:00<00:00, 724811.96it/s]


Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
0,56869,기분 좋게 숙면할 수 있는 좋은 음악,[휴식],"[408873, 517147, 319820, 594592, 440160, 66390...",34,2015-07-18 07:32:46
1,20098,크리스마스337,[사랑],"[405485, 381653, 566122, 686976, 368705]",0,2019-11-19 14:27:07
2,129099,HIP-HOP-비트천재 PART.1 Prima Vista 프로듀싱곡!,"[thequiett, 더콰이엇, 천재]","[54194, 86544, 135756, 17682, 453910, 206601, ...",34,2018-09-09 19:26:02
3,88153,보사노바의 아버지 Antonio Carlos Jobim,[],[],195,2014-11-19 10:17:59
4,114895,이 노래만 들으면 연애하고 싶어라~ (잡지 코스모폴리탄 추천+@),[사랑하고플때],"[25703, 316947, 549178, 625875, 671830, 11657,...",108,2017-07-25 16:14:07
...,...,...,...,...,...,...
23010,87427,21c LOVE SONGS .01,"[락, 가요]","[235955, 317540, 306109, 332887, 549463, 273586]",2,2020-03-27 04:10:40
23011,119647,신나고 즐겁게,[매장음악],"[342798, 513407, 451198, 137516, 465381, 17451...",9,2017-08-16 13:57:23
23012,96443,센치한 가을 그리고 이별,"[이별, 감성]","[375777, 448698, 140011, 257277, 474751, 25470...",2,2018-09-29 19:12:46
23013,49961,잔잔한 음악으로 감성 더하기:),[],"[36623, 78180, 434839, 32906, 576186, 448320, ...",2,2018-08-13 07:16:22


In [12]:
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23015 entries, 0 to 23014
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   id            23015 non-null  int64         
 1   plylst_title  23015 non-null  object        
 2   tags          23015 non-null  object        
 3   songs         23015 non-null  object        
 4   like_cnt      23015 non-null  int64         
 5   updt_date     23015 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 1.1+ MB


In [13]:
val_df[val_df.id.isin(train_df.id.unique())]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [14]:
val_df[val_df.plylst_title.isin(train_df.plylst_title.unique())]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
12,30366,비가 오는 날엔,[],"[371171, 131262, 233805, 472902, 119979, 65941...",44,2011-07-11 19:37:22
25,54673,나만 알고싶은 노래들,[숨은명곡],"[113618, 422482, 79141, 707548, 49772, 111865,...",7,2016-07-15 10:20:17
30,2865,드라이브 하기 좋은 노래,[],"[493762, 362966, 324990, 412680, 609652]",5,2020-03-27 13:35:50
46,78327,한국 힙합의 시작에서 정착까지 76,[],"[458420, 178163, 648801, 205472, 58964, 662222...",0,2019-03-27 15:28:12
67,80982,감성힙합,[랩],"[664497, 676030, 63810, 528599, 508960, 176292...",20,2016-10-21 13:31:52
...,...,...,...,...,...,...
22964,71113,하루의 피로를 씻겨주는 음악 389,[저녁],"[69419, 199965, 108386, 702452, 578170, 577639...",0,2019-05-09 13:51:18
22965,127506,이별후..,[슬픔],"[473560, 260100, 635224, 512434, 352979, 11607...",1,2014-09-15 18:01:13
22976,69269,그렇게 너를 사랑해,[],"[210132, 263857, 279590, 108757, 657837]",1,2005-02-13 11:58:30
22982,139039,나만 알고싶은 노래,[],"[420396, 63786, 644806, 426812, 203189, 92534,...",4,2016-01-03 20:21:07


In [15]:
train_df[train_df.plylst_title == 'chill']

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
56020,58081,chill,[Chill],"[160901, 70505, 646171, 364758, 427835]",43,2017-12-20 02:41:26
61366,142273,chill,"[힙합, 랩]","[3312, 356590, 388570, 208711, 491476, 606786,...",246,2015-06-06 01:42:25
86248,41496,chill,"[감성, vlog]","[507383, 275673, 406073, 289777, 38513, 363125...",61,2019-10-06 20:24:55


In [16]:
val_df[val_df.plylst_title == 'chill']

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
17488,76459,chill,[],"[260973, 283130, 594936, 604699, 507183, 18483...",6,2018-09-29 00:31:46


In [18]:
def get_unique_value(dataframe, column, list_type=True):
    unique_values = set()
    if list_type:
        for c in tqdm(dataframe[column]):
            unique_values |= set(c)
    else:
        unique_values = set(dataframe[column].unique())
    
    return unique_values

In [19]:
unique_tags_train = get_unique_value(train_df, 'tags')
len(unique_tags_train)

100%|██████████| 92056/92056 [00:00<00:00, 673958.54it/s]


25480

In [20]:
train_df[train_df.tags.apply(len) == 0]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date


In [21]:
train_df[train_df.tags.apply(lambda x: '' in x)]

Unnamed: 0,id,plylst_title,tags,songs,like_cnt,updt_date
88542,95032,어떻게 떴냐고? 답은 바로 TikTok!,"[힙합, , 흑인음악, 힙합엘이, 힙합추천, 틱톡, 외힙, HIPHOPLE]","[253222, 270262, 301533, 136007, 238347, 13384...",7,2020-01-08 11:19:11


In [22]:
unique_tags_val = get_unique_value(val_df, 'tags')
len(unique_tags_val)

100%|██████████| 23015/23015 [00:00<00:00, 765870.95it/s]


4845

In [23]:
unique_tags = unique_tags_train | unique_tags_val
len(unique_tags)

26631

In [24]:
unique_songs_train = get_unique_value(train_df, 'songs')
len(unique_songs_train)

100%|██████████| 92056/92056 [00:00<00:00, 118379.90it/s]


549729

In [25]:
unique_songs_val = get_unique_value(val_df, 'songs')
len(unique_songs_val)

100%|██████████| 23015/23015 [00:00<00:00, 238839.27it/s]


147933

In [26]:
unique_songs = unique_songs_train | unique_songs_val
len(unique_songs)

576209

In [27]:
unique_playlists_train = get_unique_value(train_df, 'plylst_title')
len(unique_playlists_train)

100%|██████████| 92056/92056 [00:00<00:00, 357974.75it/s]


2502

In [28]:
unique_playlists_val = get_unique_value(val_df, 'plylst_title')
len(unique_playlists_val)

100%|██████████| 23015/23015 [00:00<00:00, 328941.76it/s]


1842

In [29]:
unique_playlists = unique_playlists_train | unique_playlists_val
len(unique_playlists)

2626

In [30]:
def make_item_index_dictionary(items):
    item2idx = {item:idx for idx, item in enumerate(items)}
    idx2item = {idx:item for idx, item in enumerate(items)}
    return item2idx, idx2item

In [31]:
tag2idx, idx2tag = make_item_index_dictionary(unique_tags)
song2idx, idx2song = make_item_index_dictionary(unique_songs)

In [32]:
len(song2idx)

576209

In [35]:
def _dataframe_to_user_item_matrix(dataframe, item, playlist2idx, item2idx):
    assert item in ['tags', 'songs']

    matrix_shape = (len(playlist2idx), len(item2idx))

    rows = list()
    cols = list()
    data = list()

    for idx, item_list in tqdm(enumerate(dataframe[item])):
        for i in item_list:
            rows.append(playlist2idx[dataframe.loc[idx, 'id']])
            cols.append(item2idx[i])   
            data.append(1)  
    
    rows = np.array(rows)
    cols = np.array(cols)
    data = np.array(data)

    user_item_matrix = csr_matrix((data, (rows, cols)), shape=matrix_shape)

    return user_item_matrix

In [38]:
def dataframe_to_user_item_matrix(dataframe_main, dataframe_secondary=None, item='tags', playlist2idx=None, item2idx=None):
    assert item in ['tags', 'songs']

    # Only For Training Data
    if dataframe_secondary is not None:
        unique_items_main = get_unique_value(dataframe_main, item)
        unique_items_secondary = get_unique_value(dataframe_secondary, item)
        unique_items = unique_items_main | unique_items_secondary

        item2idx, idx2item = make_item_index_dictionary(unique_items)

    if not playlist2idx:
        unique_playlists = get_unique_value(dataframe_main, 'id', False)
        playlist2idx, idx2playlist = make_item_index_dictionary(unique_playlists)
    else:
        idx2playlist = None

    user_item_matrix = _dataframe_to_user_item_matrix(dataframe_main, item, playlist2idx, item2idx)

    return (playlist2idx, idx2playlist), (item2idx, idx2item), user_item_matrix


In [39]:
(playlist2idx_train, idx2playlist_train), (tag2idx, idx2tag), playlist_tag_matrix_train = dataframe_to_user_item_matrix(
    train_df, 
    val_df, 
    item='tags'
)

(_, _), (song2idx, idx2song), playlist_song_matrix_train = dataframe_to_user_item_matrix(
    train_df, 
    val_df, 
    item='songs', 
    playlist2idx=playlist2idx_train
)

100%|██████████| 92056/92056 [00:00<00:00, 744746.51it/s]
100%|██████████| 23015/23015 [00:00<00:00, 1119522.03it/s]
92056it [00:03, 24736.14it/s]
100%|██████████| 92056/92056 [00:00<00:00, 106486.05it/s]
100%|██████████| 23015/23015 [00:00<00:00, 223872.73it/s]
92056it [00:40, 2282.24it/s]


In [40]:
(len(tag2idx), len(song2idx))

(26631, 576209)

In [41]:
playlist_tag_matrix_train

<92056x26631 sparse matrix of type '<class 'numpy.longlong'>'
	with 380794 stored elements in Compressed Sparse Row format>

In [42]:
playlist_song_matrix_train

<92056x576209 sparse matrix of type '<class 'numpy.longlong'>'
	with 4239978 stored elements in Compressed Sparse Row format>

In [126]:
transformer_tag = TfidfTransformer(smooth_idf=True)
playlist_tag_matrix_tfidf_train = transformer_tag.fit_transform(playlist_tag_matrix_train)

0.7156742027184873

In [127]:
transformer_song = TfidfTransformer(smooth_idf=True)
playlist_song_matrix_tfidf_train = transformer_song.fit_transform(playlist_song_matrix_train)

In [129]:
temp = np.array([[1, 0, 0, 0, 1, 1, 1, 0], 
                [1, 0, 0, 1, 1, 1, 1, 0], 
                [1, 0, 1, 1, 0, 1, 1, 0], 
                [1, 0, 0, 0, 1, 0, 0, 0], 
                [1, 1, 0, 0, 1, 0, 1, 0]])
transformer = TfidfTransformer(smooth_idf=True)
transformer.fit(temp)

TfidfTransformer()

In [131]:
transformer.transform(temp).toarray()

array([[0.41626575, 0.        , 0.        , 0.        , 0.49215996,
        0.58504698, 0.49215996, 0.        ],
       [0.34024928, 0.        , 0.        , 0.57609212, 0.40228406,
        0.4782085 , 0.40228406, 0.        ],
       [0.29305311, 0.        , 0.61500487, 0.49618205, 0.        ,
        0.41187593, 0.34648301, 0.        ],
       [0.64578193, 0.        , 0.        , 0.        , 0.7635219 ,
        0.        , 0.        , 0.        ],
       [0.34921638, 0.73286979, 0.        , 0.        , 0.41288606,
        0.        , 0.41288606, 0.        ]])

In [132]:
transformer.transform(temp[0:3, :]).toarray()

array([[0.41626575, 0.        , 0.        , 0.        , 0.49215996,
        0.58504698, 0.49215996, 0.        ],
       [0.34024928, 0.        , 0.        , 0.57609212, 0.40228406,
        0.4782085 , 0.40228406, 0.        ],
       [0.29305311, 0.        , 0.61500487, 0.49618205, 0.        ,
        0.41187593, 0.34648301, 0.        ]])

In [134]:
transformer.transform(temp[3:, :]).toarray()

array([[0.64578193, 0.        , 0.        , 0.        , 0.7635219 ,
        0.        , 0.        , 0.        ],
       [0.34921638, 0.73286979, 0.        , 0.        , 0.41288606,
        0.        , 0.41288606, 0.        ]])

In [247]:
(playlist2idx_val, idx2playlist_val), (_, _), playlist_tag_matrix_val = dataframe_to_user_item_matrix(
    val_df, 
    item='tags', 
    item2idx=tag2idx
)

(_, _), (_, _), playlist_song_matrix_val = dataframe_to_user_item_matrix(
    val_df, 
    item='songs', 
    playlist2idx=playlist2idx_val, 
    item2idx=song2idx
)

23015it [00:00, 23653.18it/s]
23015it [00:10, 2252.56it/s]


In [248]:
(len(tag2idx), len(song2idx))

(29160, 615142)

In [249]:
playlist_tag_matrix_val

<23015x29160 sparse matrix of type '<class 'numpy.longlong'>'
	with 95537 stored elements in Compressed Sparse Row format>

In [250]:
playlist_song_matrix_val

<23015x615142 sparse matrix of type '<class 'numpy.longlong'>'
	with 1045893 stored elements in Compressed Sparse Row format>

In [251]:
def matrix_factorization(user_item_matrix, n_components=100):
    model = NMF(n_components=n_components, init='random', verbose=True, tol=5e-2, max_iter=100, random_state=2020, shuffle=True)
    model.fit(user_item_matrix)
    W = model.transform(user_item_matrix)
    H = model.components_

    return model, W, H

In [252]:
model_tag, W_tag, H_tag = matrix_factorization(playlist_tag_matrix_train)

violation: 1.0
violation: 10.031219999727462
violation: 5.032633123240272
violation: 2.8606228865707415
violation: 1.6938233208402944
violation: 1.0858468999396413
violation: 0.716443114992063
violation: 0.5437725627962213
violation: 0.40790838284373365
violation: 0.31043250916154463
violation: 0.23624660844762002
violation: 0.1954251238372654
violation: 0.20152163718599497
violation: 0.2041245655965717
violation: 0.20306654000778487
violation: 0.18096686496449652
violation: 0.13602163037228346
violation: 0.10729017552135867
violation: 0.10736056600160906
violation: 0.10056312679932365
violation: 0.11121065578247828
violation: 0.1231458809807175
violation: 0.1595481385942812
violation: 0.19544652703972082
violation: 0.1433736581997101
violation: 0.0905547414501437
violation: 0.08779578021394215
violation: 0.10127077057019854
violation: 0.10141061530167898
violation: 0.11532596334873058
violation: 0.11349834666322749
violation: 0.11053876034435081
violation: 0.11382956726666008
violatio

In [253]:
with open('checkpoints/model-v1/model-v1-tag-nmf.pkl', 'wb') as f:
    pickle.dump(model_tag, f)

In [254]:
model_song, W_song, H_song = matrix_factorization(playlist_song_matrix_train)

violation: 1.0
violation: 6.923231816431041
violation: 3.3249560160491276
violation: 1.836157124128675
violation: 1.192462362749963
violation: 0.8870859938251491
violation: 0.7160244072257216
violation: 0.5674657692555136
violation: 0.4685138372836856
violation: 0.38442684241115394
violation: 0.3072268213123712
violation: 0.2339058114900291
violation: 0.18140244528733387
violation: 0.14508710410606468
violation: 0.12660973133082834
violation: 0.10956747778808056
violation: 0.09671586499308156
violation: 0.08992692162203479
violation: 0.08293266889109506
violation: 0.07543562868468079
violation: 0.06925456725729934
violation: 0.06148600280308796
violation: 0.057046846547261165
violation: 0.052241434203767645
violation: 0.04947411992956252
Converged at iteration 26
violation: 1.0
violation: 0.4546062627449705
violation: 0.0927027897338968
violation: 0.023141694374950592
Converged at iteration 5


In [255]:
with open('checkpoints/model-v1/model-v1-song-nmf.pkl', 'wb') as f:
    pickle.dump(model_song, f)

In [256]:
with open('checkpoints/model-v1/model-v1-tag-nmf.pkl', 'rb') as f:
    model_tag = pickle.load(f)

In [257]:
with open('checkpoints/model-v1/model-v1-song-nmf.pkl', 'rb') as f:
    model_song = pickle.load(f)

In [258]:
W_tag = model_tag.transform(playlist_tag_matrix_train)
H_tag = model_tag.components_

W_song = model_song.transform(playlist_song_matrix_train)
H_song = model_song.components_

violation: 1.0
violation: 0.04823401859836165
Converged at iteration 3
violation: 1.0
violation: 0.4546062627449705
violation: 0.0927027897338968
violation: 0.023141694374950592
Converged at iteration 5


In [43]:
def calculate_r_precision(user_item_matrix, W, H):
    nonzero_r = user_item_matrix.nonzero()[0]
    nonzero_c = user_item_matrix.nonzero()[1]

    scores = list()
    for r in tqdm(range(user_item_matrix.shape[0])):
        nonzeros = user_item_matrix[r, :].nonzero()[1]
        n = len(nonzeros)
        top_n = np.argsort(np.dot(W[r, :], H))[::-1][:n]

        counter = len(set(nonzeros) & set(top_n))
        
        score = counter / n if n != 0 else 0
        scores.append(score)
    
    scoes = np.array(scores)
    return np.mean(scores)

In [44]:
calculate_r_precision(playlist_tag_matrix, W_tag, H_tag)

100%|██████████| 92056/92056 [02:54<00:00, 526.46it/s]


0.7194591662537806

In [79]:
nonzero_r = playlist_tag_matrix.nonzero()[0]
nonzero_c = playlist_tag_matrix.nonzero()[1]

random_idx = np.random.choice(len(nonzero_r), size=10)

print('(Original, Imputation)')
for idx in random_idx:
    r = nonzero_r[idx]
    c = nonzero_c[idx]
    print('({}, {})\t{}\t{}'.format(r, c, playlist_tag_matrix[r, c], np.dot(W_tag[r, :], H_tag[:, c])))

    print(playlist_tag_matrix[r, :].nonzero()[1])
    top10 = np.argsort(np.dot(W_tag[r, :], H_tag))[::-1][:10]
    print(top10)
    print(np.dot(W_tag[r, :], H_tag[:, top10]))


(Original, Imputation)
(63658, 11325)	1	1.51312672300729e-07
[ 9083  9188 11325 13133 18418]
[12329 23059 24122  2051 28211 27644 26988  4981 15674 21939]
[3.17208519e-04 2.47513216e-04 2.14783588e-04 1.95777539e-04
 8.88423381e-05 8.86667415e-05 8.45644842e-05 5.96200557e-05
 3.83380207e-05 2.83520515e-05]
(35462, 9267)	1	0.0
[ 1269  1546  4177  6472  9267  9348 17441 17742 18418 21530]
[28211 10208  6820 16003 25366 13754  5386 23817 28349 17662]
[0.00095426 0.00092932 0.00085408 0.00082481 0.00067106 0.00066365
 0.0006533  0.00060422 0.00057475 0.00044031]
(57972, 20439)	1	1.7953526191052652e-06
[20439]
[27612 10785 19640  9958  9838 14327  3781 11598 19083  7158]
[0.00070063 0.00068209 0.00044556 0.00037935 0.00036877 0.00028476
 0.00021908 0.00021876 0.00017476 0.00017113]
(60408, 12587)	1	0.0
[ 2834  4513  5132  9497 10953 10966 12587 17557]
[25366  9270 23059 27387  2988  6623 23869  9958 26791 10208]
[0.00270246 0.00206454 0.00171561 0.00149258 0.00130031 0.00119891
 0.00108964

In [80]:
nonzero_r = playlist_song_matrix.nonzero()[0]
nonzero_c = playlist_song_matrix.nonzero()[1]

random_idx = np.random.choice(len(nonzero_r), size=10)

print('(Original, Imputation)')
for idx in random_idx:
    r = nonzero_r[idx]
    c = nonzero_c[idx]
    print('({}, {})\t{}\t{}'.format(r, c, playlist_song_matrix[r, c], np.dot(W_song[r, :], H_song[:, c])))

    print(playlist_song_matrix[r, :].nonzero()[1])
    top10 = np.argsort(np.dot(W_song[r, :], H_song))[::-1][:10]
    print(top10)
    print(np.dot(W_song[r, :], H_song[:, top10]))


(Original, Imputation)
(72400, 280260)	1	0.5078473044400594
[ 11461  13019  33702  37350  40655  58139  68491 100083 102168 122626
 152045 161496 161847 170670 174380 185143 187241 195258 201278 204863
 230191 230691 237231 242489 263221 280260 293499 310516 312846 315498
 320843 323143 334446 336086 338911 353799 366022 368297 385228 394292
 394609 398371 402573 461974 470492 480326 501633 501822 502925 542180
 547194 550113 585743 591131 601442]
[187241 334446 591131   1309  68491 185143 280260 204863 353799 589759]
[0.52363754 0.51353657 0.51072696 0.51050572 0.50933978 0.50843665
 0.5078473  0.4423804  0.44071618 0.43988013]
(34717, 457214)	1	0.12847192488276127
[   216   1068   4424   6770   7565   7583   9675  10175  10194  11347
  12225  15922  17353  18013  18146  18444  19022  20240  20789  22427
  23078  24030  24753  30315  30887  30890  31401  31404  34631  35986
  36101  38347  38781  39001  39092  39439  42117  42332  43124  43650
  45314  49721  51218  51376  51530  5261

In [259]:
def calculate_cosine_similarity(A, B):
    similarity = cosine_similarity(A, B, dense_output=False)
    return similarity

In [262]:
def find_neigbors(user_item_matrix_src, user_item_matrix_dst, k=10):
    similarity_matrix = calculate_cosine_similarity(user_item_matrix_src, user_item_matrix_dst)
    neighbors = list()
    for r in tqdm(range(similarity_matrix.shape[0])):
        neighbors.append(np.argsort(similarity_matrix[r, :].toarray()[0])[::-1][:k])
    
    neighbors = np.array(neighbors)
    return similarity_matrix, neighbors

In [263]:
_, neighbors_tag = find_neigbors(playlist_tag_matrix_val, playlist_tag_matrix_train)
_, neighbors_song = find_neigbors(playlist_song_matrix_val, playlist_song_matrix_train)

100%|██████████| 23015/23015 [01:18<00:00, 291.98it/s]
100%|██████████| 23015/23015 [01:03<00:00, 360.47it/s]


In [299]:
neighbors_tag.shape

(23015, 10)

In [300]:
neighbors_song.shape

(23015, 10)

In [297]:
def recommend_item(neighbors, user_item_matrix_src, user_item_matrix_dst, n):
    recommendations = list()
    for idx, neighbor in tqdm(enumerate(neighbors)):
        ratings = np.sum(user_item_matrix_dst[neighbor, :].toarray(), axis=0)
        top_rating = np.argsort(ratings)[::-1][:n]
        recommendations.append(np.array(top_rating))
    recommendations = np.array(recommendations)
    return recommendations

In [298]:
recommendations_tag = recommend_item(neighbors_tag, playlist_tag_matrix_val, playlist_tag_matrix_train, 100)
recommendations_song = recommend_item(neighbors_song, playlist_song_matrix_val, playlist_song_matrix_train, 200)

23015it [00:16, 1400.85it/s]
23015it [08:03, 47.57it/s]


In [301]:
recommendations_tag.shape

(23015, 100)

In [302]:
recommendations_song.shape

(23015, 200)

In [235]:
for idx in range(playlist_tag_matrix_val.shape[0]):
    neighbor = np.mean(playlist_tag_matrix_train[n[idx], :], axis=0)[0]
    top = np.argsort(np.array(neighbor))[0][::-1][:100]
        print(top)
        print(neighbor[0, top])



[28500 22462 29159  9722  9712  9713  9714  9715  9716  9717]
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
[16361  4049 27691 18937  4267 14529 10751  6954 14440 15860]
[[0.8 0.7 0.3 0.2 0.2 0.2 0.1 0.1 0.1 0.1]]
[23213 29159  9721  9710  9711  9712  9713  9714  9715  9716]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[22281  4758 28820  4583  6538 29159  9712  9713  9714  9715]
[[1.  1.  1.  0.1 0.1 0.  0.  0.  0.  0. ]]
[19591  5115 19727  5019 26040 17718 19346 11218 18686  9723]
[[1.  1.  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0. ]]
[23078 29159  9722  9711  9712  9713  9714  9715  9716  9717]
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[22997 11862  2838  9607  8780  6104  3127 15988  7824  2041]
[[1.  1.  1.  0.8 0.8 0.5 0.5 0.2 0.2 0.2]]
[  692  5115   887 19346   651   855  9716  9717  9718  9719]
[[0.8 0.8 0.1 0.1 0.1 0.1 0.  0.  0.  0. ]]
[ 3127 11862  2041 22997  3372  2838 27924 22281 17362 13696]
[[1.  1.  1.  0.9 0.2 0.2 0.1 0.1 0.1 0.1]]
[ 2838 10660 29159  9724  9713  9714  9715  9716  9717  9718]
[[1. 1. 0. 0. 

In [223]:
playlist_tag_matrix_train.shape

(92056, 29160)

In [227]:
n.shape

(23015, 10)

In [325]:
idx2playlist_val = {idx:playlist for playlist, idx in playlist2idx_val.items()}
idx2tag = {idx:tag for tag, idx in tag2idx.items()}
idx2song = {idx:song for song, idx in song2idx.items()}

In [329]:
def restore_item(idx, idx2item, list_type=True):
    if list_type:
        return list(map(lambda x: [idx2item[i] for i in x], idx))
    else:
        return list(map(lambda x: idx2item[x], idx))

In [332]:
def write_answer(recommendations_tag, recommendations_song, idx2playlist, idx2song, idx2tag):
    
    n_data = recommendations_tag.shape[0]

    playlists = restore_item(range(n_data), idx2playlist, False)
    tags = restore_item(recommendations_tag, idx2tag)
    songs = restore_item(recommendations_song, idx2song)

    answer = list()

    for i, t, s in tqdm(zip(playlists, tags, songs)):
        answer.append({'id': i, 'tags': t, 'songs': s})

    return answer

In [333]:
answer = write_answer(recommendations_tag, recommendations_song, idx2playlist_val, idx2song, idx2tag)

23015it [00:00, 569734.98it/s]


In [335]:
write_json(answer, "results/results.json")