In [1]:
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

pandarallel.initialize(nb_workers=9)

INFO: Pandarallel will run on 9 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from itertools import islice
import re

from khaiii import KhaiiiApi
from itertools import chain

api = KhaiiiApi()
def tokenize(text):
    if len(text.replace(' ', '')) == 0:
        return list()
    morphs = list(chain(*[w.morphs for w in api.analyze(text)]))
    return [(m.lex, m.tag) for m in morphs]

def cleanse_text(text):
    text = text.replace('_', ' ')
    return re.sub('[^0-9a-zA-Zㄱ-힗 ]', '', text)

def window(seq, n=2):
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result
        
def tag_filter(tag):
    if tag.startswith('N'):
        return True
    elif tag.startswith('V'):
        return True
    elif tag.startswith('S'):
        return True
    elif tag == 'XR':
        return True
    elif tag.startswith('M'):
        return True
    else:
        return False

def preprocess_tags(tags):
    t_tags = list(chain(*[tokenize(cleanse_text(t)) for t in tags]))
    t_tags = [t[0].lower() for t in t_tags if tag_filter(t[1])]
    return list(set(t_tags + tags))

def preprocess_title(title):
    title = cleanse_text(title)
    tokens = tokenize(title)
    
    return list(set([t[0].lower() for t in tokens if tag_filter(t[1])] + title.split(' ')))

def preprocess_text(df):
    df['p_title'] = df['plylst_title'].parallel_map(lambda x: preprocess_title(x))
    df['p_tags'] = df['tags'].parallel_map(lambda x: preprocess_tags(x))

In [3]:
df = pd.read_json('./data/train.json')
test_offset = len(df) - int(len(df) / 10)
train_df, test_df = df[:test_offset], df[test_offset:]
train_df.head(5)
train_df = df

val_df = pd.read_json('./data/val.json')
val_df.head()

preprocess_text(train_df)
preprocess_text(val_df)

In [4]:
from tqdm import tqdm
import math

def similarity(a, b):
    intersection = set(a) & set(b)
    return  len(intersection) / (len(a) + len(b) - len(intersection))

def cos_similarity(a, b):
    intersection = set(a) & set(b)
    return len(intersection) / (math.sqrt(len(a)) * math.sqrt(len(b)))
    
def predict(row):
    p_title = row['p_title']
    p_tags = row['p_tags']
    songs = row['songs']
    tags = row['tags']
    if len(songs) != 0:
        sim = train_df['songs'].map(lambda x: cos_similarity(x, songs)).sort_values()
    elif len(tags) != 0:
        sim = train_df['p_tags'].map(lambda x: cos_similarity(x, p_tags)).sort_values()
    else:
        sim = train_df['p_title'].map(lambda x: cos_similarity(x, p_title)).sort_values()
    
    topk = 100
    while True:
        pred_tags = dict()
        
        topk_sim = sim[-topk:]
        for i, v in zip(topk_sim.index, topk_sim.values):
            for tag in train_df['tags'].iat[i]:
                pred_tags[tag] = pred_tags.get(tag, 0) + v

        pred_tags = [k for k, v in sorted(pred_tags.items(), key=lambda x: x[1], reverse=True) if k not in tags][:10]        
        if len(pred_tags) < 10:
            topk += 100
        else:
            break
    
    topk = 25
    while True:
        pred_songs = dict()
        
        topk_sim = sim[-topk:]
        for i, v in zip(topk_sim.index, topk_sim.values):
            for song in train_df['songs'].iat[i]:
                pred_songs[song] = pred_songs.get(song, 0) + v

        pred_songs = [k for k, v in sorted(pred_songs.items(), key=lambda x: x[1], reverse=True) if k not in songs][:100]
        
        if len(pred_songs) < 100:
            topk += 25
            continue

        return row['id'], pred_tags, pred_songs

import time
start = time.time()
result = val_df.parallel_apply(predict, axis=1)
time.time() - start

1468.7355222702026

In [7]:
result_df = pd.DataFrame(list(result.values), columns=['id', 'tags', 'songs'])

In [8]:
result_df.to_json('results.json', orient='records', force_ascii=False)

In [38]:
val_df.sample(20)

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,p_title,p_tags
647,[],104169,,"[207908, 145369, 162231, 144983, 313261, 21213...",10,2017-08-21 04:02:29.000,[],[]
18623,[가을],97761,,"[674134, 160759, 678215, 171150, 68921, 517689...",1,2015-09-09 15:14:38.000,[],[가을]
10010,"[힙합, 최신힙합, 스포츠, 다이어트, 국힙]",85407,,"[638149, 689319, 350303, 409609, 65266, 188388...",5,2020-02-14 16:59:14.000,[],"[최신힙합, 합, 스포츠, 국, 국힙, 신, 다이어트, 힙합]"
20987,[],36266,★★기분전환 클럽음악★★,[],36,2014-01-08 18:07:33.000,"[클럽음악, 기분전환, 전환, 기분]",[]
2009,[알앤비],1062,,"[316564, 358493, 84946, 87179, 317585, 59932, ...",14,2015-10-25 11:05:47.000,[],[알앤비]
13162,[],93519,,"[699365, 104945, 300027, 463488, 583525, 18783...",0,2020-04-23 21:42:58.000,[],[]
14627,[],21261,,"[399737, 647679, 440727, 419548, 284683, 56483...",0,2019-03-27 15:28:23.000,[],[]
5651,[],18781,★★ 상쾌한 POP ★★,[],0,2006-01-12 22:53:24.000,"[, POP, 상쾌한, pop, 상쾌]",[]
4724,[아이돌],131988,,"[477151, 569401, 152266, 154927, 489136, 27233...",3,2019-12-04 14:06:53.000,[],[아이돌]
9860,[],112661,,"[480158, 263906, 623507, 203642, 559052, 39848...",17,2013-06-07 15:23:19.000,[],[]
