In [1]:
# -*- coding: utf-8 -*-
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np


def write_json(data, fname):
    def _conv(o):
        if isinstance(o, (np.int64, np.int32)):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./arena_data/" + parent)
    with io.open("./arena_data/" + fname, "w", encoding="utf-8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding="utf-8") as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))

In [2]:
from tqdm import tqdm

import pandas as pd
import numpy as np
import scipy.sparse as sp

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

def json_to_dataframe(json_data):
    dataframe_dict = {'id': [], 'plylst_title': [], 'tags': [], 'songs': [], 'like_cnt': [], 'updt_date': []}

    for data in tqdm(json_data):
        dataframe_dict['id'].append(data['id'])
        dataframe_dict['plylst_title'].append(data['plylst_title'])
        dataframe_dict['tags'].append(data['tags'])
        dataframe_dict['songs'].append(data['songs'])
        dataframe_dict['like_cnt'].append(data['like_cnt'])
        dataframe_dict['updt_date'].append(data['updt_date'])
    
    dataframe = pd.DataFrame(dataframe_dict)
    dataframe['updt_date'] = pd.to_datetime(dataframe.updt_date)

    return dataframe


def get_unique_items(dataframe, column, list_type=True):
    unique_items = set()
    if list_type:
        for c in tqdm(dataframe[column]):
            unique_items |= set(c)
    else:
        assert len(dataframe[column].unique()) == len(dataframe[column])
        unique_items = dataframe[column].unique()
    
    return unique_items


def make_item_index_dictionary(items):
    item2idx = {item:idx for idx, item in enumerate(items)}
    idx2item = {idx:item for item, idx in item2idx.items()}
    return item2idx, idx2item


def dataframe_to_matrix(dataframe, item='tags', playlist2idx=None, item2idx=None):
    assert item in ['tags', 'songs']

    matrix_shape = (len(playlist2idx), len(item2idx))

    if 'plylst_id' not in dataframe.columns:
        dataframe['plylst_id'] = dataframe.id.map(playlist2idx)

    column_name = '{}_id'.format(item)
    if column_name not in dataframe.columns:
        dataframe[column_name] = dataframe[item].apply(lambda items: [item2idx[item] for item in items])

    rows = list()
    cols = list()
    data = list()

    for r, cs in tqdm(zip(dataframe.plylst_id, dataframe[column_name])):
        for c in cs:
            rows.append(r)
            cols.append(c)
    
    rows = np.array(rows)
    cols = np.array(cols)
    data = np.ones(rows.shape[0])

    return sp.csr_matrix((data, (rows, cols)), shape=matrix_shape)

def transform_sparse_matrix_tfidf(train_test, train, test):
    transformer = TfidfTransformer(smooth_idf=True)
    transformer.fit(train_test)
    tfidf_train = transformer.transform(train)
    tfidf_test = transformer.transform(test)

    return transformer, tfidf_train, tfidf_test


def _calculate_cosine_similarity(A, B):
    return cosine_similarity(A, B, dense_output=False)

### fint neighbors of test from train
def find_neigbors(tag_train, tag_test, song_train, song_test, k=5):
    train = sp.hstack([tag_train * 0.15, song_train * 0.85])
    test = sp.hstack([tag_test * 0.15, song_test * 0.85])

    similarity = _calculate_cosine_similarity(test, train)
    neighbors = list()
    for rid in tqdm(range(similarity.shape[0])):
        neighbors.append(np.argsort(similarity[rid, :].toarray()[0])[::-1][:k])
    
    neighbors = np.array(neighbors)
    return similarity, neighbors

def recommend_items(id, train, test, neighbors, idx2item, n):
    counter = 0
    items = list()
    for item_id in train[neighbors, :].toarray().sum(axis=0).argsort()[::-1]:
        if item_id not in test[id, :].nonzero()[1]:
            item = idx2item[item_id]
            items.append(item)
            counter += 1
        if counter == n:
            break
    return items

In [3]:
class KnnModel:
    def __init__(self, k=10):
        # Model Parameters
        self.k = k

        # Item Dictionaries
        self.idx2tag = None
        self.idx2song = None
        self.idx2playlist = None


    def _preprocess_data(self, train, test, tfidf=True):
        unique_tags = get_unique_items(pd.concat([train, test], ignore_index=True, copy=False), 'tags', list_type=True)
        unique_songs = get_unique_items(pd.concat([train, test], ignore_index=True, copy=False), 'songs', list_type=True)
        unique_playlists_train = get_unique_items(train, 'id', list_type=False)
        unique_playlists_test = get_unique_items(test, 'id', list_type=False)

        tag2idx, idx2tag = make_item_index_dictionary(unique_tags)
        song2idx, idx2song = make_item_index_dictionary(unique_songs)

        playlist2idx_train, idx2playlist_train = make_item_index_dictionary(unique_playlists_train)
        playlist2idx_test, idx2playlist_test = make_item_index_dictionary(unique_playlists_test)

        N_TRAIN = len(playlist2idx_train)

        playlist2idx = {playlist:idx for playlist, idx in playlist2idx_train.items()}
        for playlist, idx in playlist2idx_test.items():
            playlist2idx[playlist] = (idx + N_TRAIN)
        idx2playlist = {idx:playlist for playlist, idx in playlist2idx.items()}

        assert len(playlist2idx_train) + len(playlist2idx_test) == len(playlist2idx)

        self.idx2tag = idx2tag
        self.idx2song = idx2song
        self.idx2playlist = (idx2playlist_train,  idx2playlist_test)

        PT_train = dataframe_to_matrix(train, item='tags', playlist2idx=playlist2idx_train, item2idx=tag2idx)
        PT_test = dataframe_to_matrix(test, item='tags', playlist2idx=playlist2idx_test, item2idx=tag2idx)
        PT = dataframe_to_matrix(pd.concat([train, test], ignore_index=True, copy=False), item='tags', playlist2idx=playlist2idx, item2idx=tag2idx)

        PS_train = dataframe_to_matrix(train, item='songs', playlist2idx=playlist2idx_train, item2idx=song2idx)
        PS_test = dataframe_to_matrix(test, item='songs', playlist2idx=playlist2idx_test, item2idx=song2idx)
        PS = dataframe_to_matrix(pd.concat([train, test], ignore_index=True, copy=False), item='songs', playlist2idx=playlist2idx, item2idx=song2idx)

        if tfidf:
            _, PT_tfidf_train, PT_tfidf_test = transform_sparse_matrix_tfidf(PT, PT_train, PT_test)
            _, PS_tfidf_train, PS_tfidf_test = transform_sparse_matrix_tfidf(PS, PS_train, PS_test)

            return PT_tfidf_train, PT_tfidf_test, PS_tfidf_train, PS_tfidf_test

        return PT_train, PT_test, PS_train, PS_test


    def recommend(self, train, test):
        print("Preprocessing data... > to sparse matrix")
        PT_train, PT_test, PS_train, PS_test = self._preprocess_data(train, test)
        
        print("Prepare for recommendations... > find {} neighbors".format(self.k))
        _, neighbors = find_neigbors(PT_train, PT_test, PS_train, PS_test, self.k)

        print("Recommend items...")
        recommendations = list()

        assert PT_test.shape[0] == PS_test.shape[0]
        N_TEST = PT_test.shape[0]

        for rid in tqdm(range(N_TEST)):
            playlist = self.idx2playlist[1][rid]
            rid_neighbors = neighbors[rid, :]

            tags = recommend_items(rid, PT_train, PT_test, rid_neighbors, self.idx2tag, 10)
            songs = recommend_items(rid, PS_train, PS_test, rid_neighbors, self.idx2song, 100)

            recommendations.append({
                "id": playlist,
                "songs": songs,
                "tags": tags,
            })

        return recommendations

In [4]:
class PlaylistContinuation:
    def __init__(self):
        self.model = None

    def _generate_answers(self, train, test):
        print("Preprocessing data... > to dataframe")
        train = json_to_dataframe(train)
        test = json_to_dataframe(test)

        self.model = KnnModel(k=1000)
        return self.model.recommend(train, test)


    def run(self, train_fname, question_fname):
        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        test_data = load_json(question_fname)

        answers = self._generate_answers(train_data, test_data)
    
        print("Writing answers...")
        write_json(answers, "results/results.json")

In [5]:
class ModelKNNTFIDF:
    def _generate_answers(self, train, questions):
        print("Preprocessing data... > to dataframe")
        train = json_to_dataframe(train)
        test = json_to_dataframe(questions)

        N_TRAIN = len(train)
        N_TEST = len(test)

        print("Preprocessing data... > to sparse matrix")
        unique_tags = get_unique_items(pd.concat([train, test], ignore_index=True, copy=False), 'tags', list_type=True)
        unique_songs = get_unique_items(pd.concat([train, test], ignore_index=True, copy=False), 'songs', list_type=True)
        unique_playlists_train = get_unique_items(train, 'id', list_type=False)
        unique_playlists_test = get_unique_items(test, 'id', list_type=False)

        tag2idx, idx2tag = make_item_index_dictionary(unique_tags)
        song2idx, idx2song = make_item_index_dictionary(unique_songs)

        playlist2idx_train, idx2playlist_train = make_item_index_dictionary(unique_playlists_train)
        playlist2idx_test, idx2playlist_test = make_item_index_dictionary(unique_playlists_test)

        playlist2idx = {playlist:idx for playlist, idx in playlist2idx_train.items()}
        for playlist, idx in playlist2idx_test.items():
            playlist2idx[playlist] = (idx + N_TRAIN)
        idx2playlist = {idx:playlist for playlist, idx in playlist2idx.items()}

        assert len(playlist2idx_train) + len(playlist2idx_test) == len(playlist2idx)

        PT_train = dataframe_to_matrix(train, item='tags', playlist2idx=playlist2idx_train, item2idx=tag2idx)
        PT_test = dataframe_to_matrix(test, item='tags', playlist2idx=playlist2idx_test, item2idx=tag2idx)
        PT = dataframe_to_matrix(pd.concat([train, test], ignore_index=True, copy=False), item='tags', playlist2idx=playlist2idx, item2idx=tag2idx)

        PS_train = dataframe_to_matrix(train, item='songs', playlist2idx=playlist2idx_train, item2idx=song2idx)
        PS_test = dataframe_to_matrix(test, item='songs', playlist2idx=playlist2idx_test, item2idx=song2idx)
        PS = dataframe_to_matrix(pd.concat([train, test], ignore_index=True, copy=False), item='songs', playlist2idx=playlist2idx, item2idx=song2idx)

        _, PT_tfidf_train, PT_tfidf_test = transform_sparse_matrix_tfidf(PT, PT_train, PT_test)
        _, PS_tfidf_train, PS_tfidf_test = transform_sparse_matrix_tfidf(PS, PS_train, PS_test)
        
        k = 100
        print("Prepare for recommendations... > find {} neighbors".format(k))
        _, neighbors = find_neigbors(PT_tfidf_train, PT_tfidf_test, PS_tfidf_train, PS_tfidf_test, k)

        answers = list()
        for rid in tqdm(range(N_TEST)):
            playlist = idx2playlist_test[rid]
            rid_neighbors = neighbors[rid, :]

            tags = recommend_items(rid, PT_tfidf_train, PT_tfidf_test, rid_neighbors, idx2tag, 10)
            songs = recommend_items(rid, PS_tfidf_train, PS_tfidf_test, rid_neighbors, idx2song, 100)

            answers.append({
                "id": playlist,
                "songs": songs,
                "tags": tags,
            })

        return answers

    def run(self, train_fname, question_fname):
        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        test_data = load_json(question_fname)

        answers = self._generate_answers(train_data, test_data)

        print("Writing answers...")
        write_json(answers, "results/results.json")

In [6]:
# -*- coding: utf-8 -*-
import fire
import numpy as np

class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

In [7]:
# # k-neighbor = 10

# model = ModelKNNTFIDF()
# model.run(train_fname='../arena_data/orig/train.json', question_fname='../arena_data/questions/val.json')

# evaluator = ArenaEvaluator()
# evaluator.evaluate(gt_fname='../arena_data/answers/val.json', rec_fname='./arena_data/results/results.json')

Music nDCG: 0.10744
Tag nDCG: 0.323821
Score: 0.139897

In [8]:
# # k-neighbor = 100

# model = ModelKNNTFIDF()
# model.run(train_fname='../arena_data/orig/train.json', question_fname='../arena_data/questions/val.json')

# evaluator = ArenaEvaluator()
# evaluator.evaluate(gt_fname='../arena_data/answers/val.json', rec_fname='./arena_data/results/results.json')

Music nDCG: 0.111104
Tag nDCG: 0.39274
Score: 0.15335

In [9]:
# k-neighbor = 100, Similarity Ration = (0.15, 0.85)

model = ModelKNNTFIDF()
model.run(train_fname='../arena_data/orig/train.json', question_fname='../arena_data/questions/val.json')

evaluator = ArenaEvaluator()
evaluator.evaluate(gt_fname='../arena_data/answers/val.json', rec_fname='./arena_data/results/results.json')

Loading train file...
100%|██████████| 9205/9205 [00:00<00:00, 605349.23it/s]
100%|██████████| 2302/2302 [00:00<00:00, 761998.88it/s]
100%|██████████| 11507/11507 [00:00<00:00, 771715.45it/s]
100%|██████████| 11507/11507 [00:00<00:00, 123103.55it/s]Loading question file...
Preprocessing data... > to dataframe
Preprocessing data... > to sparse matrix

9205it [00:00, 718192.05it/s]
2302it [00:00, 1037422.13it/s]
11507it [00:00, 788470.50it/s]
9205it [00:00, 85200.79it/s]
2302it [00:00, 182075.62it/s]
11507it [00:00, 104690.65it/s]
 14%|█▍        | 322/2302 [00:00<00:00, 3219.01it/s]Prepare for recommendations... > find 100 neighbors
100%|██████████| 2302/2302 [00:00<00:00, 3221.05it/s]
100%|██████████| 2302/2302 [02:02<00:00, 18.82it/s]
Writing answers...
Music nDCG: 0.112757
Tag nDCG: 0.362646
Score: 0.150241


Music nDCG: 0.13871
Tag nDCG: 0.395219
Score: 0.177186

In [10]:
# # k-neighbor = 100, Similarity Ration = (0.1, 0.9)

# model = ModelKNNTFIDF()
# model.run(train_fname='../arena_data/orig/train.json', question_fname='../arena_data/questions/val.json')

# evaluator = ArenaEvaluator()
# evaluator.evaluate(gt_fname='../arena_data/answers/val.json', rec_fname='./arena_data/results/results.json')

Music nDCG: 0.13866
Tag nDCG: 0.386885
Score: 0.175894

In [11]:
# # k-neighbor = 100, Similarity Ration = (0.15, 0.85)

# playlist_continuation = PlaylistContinuation()
# playlist_continuation.run(train_fname='../res/train.json', question_fname='../res/val.json')