In [1]:
import pandas as pd
from pandarallel import pandarallel
# Configure pandarallel before any DataFrame.parallel_apply call is made
# (the log output of this cell reports the number of workers it will use).
pandarallel.initialize()

# NOTE(review): not referenced in any of the visible cells; presumably the size
# of the song-id space -- confirm against the dataset or remove.
SONG_MAX = 707989

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from itertools import islice
import re

def cleanse_text(text):
    """Remove every character that is not an ASCII digit, ASCII letter or Hangul.

    The last class range runs from U+3131 (first Hangul compatibility jamo) to
    U+D7A3 (last precomposed Hangul syllable).  It is a plain code-point range,
    so anything that lies between those two code points is kept as well.

    The upper bound used to be U+D797, which silently dropped the final twelve
    syllables (U+D798..U+D7A3).  That cut characters out of common words, e.g.
    the first syllable of the Korean word for "hip-hop".

    Args:
        text: String to clean.

    Returns:
        ``text`` with all disallowed characters (whitespace, punctuation,
        symbols, ...) removed.
    """
    return re.sub('[^0-9a-zA-Zㄱ-힣]', '', text)

def window(seq, n=2):
    """Yield consecutive overlapping ``n``-tuples taken from ``seq``.

    ``window('abcd', 2)`` produces ``('a', 'b'), ('b', 'c'), ('c', 'd')``.
    Nothing is yielded when ``seq`` holds fewer than ``n`` items.
    """
    source = iter(seq)
    frame = tuple(islice(source, n))
    # Emit the first frame only if the input was long enough to fill it.
    if len(frame) == n:
        yield frame
    # Slide by one: drop the oldest item, append the next one.
    for item in source:
        frame = (*frame[1:], item)
        yield frame

def preprocess_tags(tags):
    """Expand a list of tag strings into a de-duplicated list of match tokens.

    Each tag is cleaned with ``cleanse_text``.  A cleaned tag shorter than two
    characters is kept as-is (this includes the empty string); longer ones are
    broken into their overlapping character bigrams via ``window``.  The
    original, uncleaned tags are added to the result too.

    Args:
        tags: List of strings.

    Returns:
        List of unique tokens; the order is whatever ``set`` iteration gives.
    """
    pieces = []
    for cleaned in map(cleanse_text, tags):
        if len(cleaned) >= 2:
            pieces.extend(''.join(pair) for pair in window(cleaned))
        else:
            pieces.append(cleaned)
    return list(set(pieces + tags))

def preprocess_title(title):
    """Turn a playlist title into match tokens by treating its words as tags.

    The title is split with ``str.split(' ')`` -- i.e. on single spaces, so
    consecutive spaces produce empty-string words -- and the words are passed
    to ``preprocess_tags``, whose result is returned unchanged.

    Args:
        title: Playlist title string.

    Returns:
        The token list built by ``preprocess_tags`` from the title's words.
    """
    words = title.split(' ')
    return preprocess_tags(words)

def preprocess_text(df):
    """Add the ``p_title`` and ``p_tags`` token columns to ``df`` in place.

    ``p_title`` is ``preprocess_title`` applied to every ``plylst_title`` value
    and ``p_tags`` is ``preprocess_tags`` applied to every ``tags`` value.
    The frame is modified directly; nothing is returned.
    """
    titles = df['plylst_title']
    df['p_title'] = titles.map(preprocess_title)
    tag_lists = df['tags']
    df['p_tags'] = tag_lists.map(preprocess_tags)

In [3]:
df = pd.read_json('./data/train.json')
val_df = pd.read_json('./data/val.json')

# The last tenth of the training file is kept as a hold-out slice.  It is cut
# before the token columns are added, so test_df does not carry p_title/p_tags.
test_offset = len(df) - len(df) // 10
test_df = df[test_offset:]

# The neighbour search uses the complete training file.  train_df is the same
# object as df, so the preprocessing below adds its columns to df as well.
train_df = df

preprocess_text(train_df)
preprocess_text(val_df)

In [4]:
from tqdm import tqdm

def similarity(a, b):
    """Jaccard similarity between two collections of hashable items.

    Both inputs are reduced to sets first, so repeated items inside ``a`` or
    ``b`` do not distort the score (the denominator is the size of the set
    union rather than a sum of raw lengths).

    Args:
        a: Iterable of hashable items.
        b: Iterable of hashable items.

    Returns:
        ``|a & b| / |a | b|`` as a float in ``[0, 1]``; ``0.0`` when both
        inputs are empty instead of raising ``ZeroDivisionError``.
    """
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Two empty collections share nothing that could count as evidence.
        return 0.0
    return len(set_a & set_b) / union_size

def _predict_from_neighbours(row, column, query, n_tags=10, n_songs=100, step=100):
    """Recommend tags and songs by similarity-weighted voting over ``train_df``.

    Every playlist in ``train_df`` is scored with ``similarity`` between its
    ``column`` value and ``query``.  The ``topk`` most similar playlists vote
    for their own tags and songs with weight ``1 / (1.00001 - similarity)``.
    Candidates the row already contains are dropped and the best ``n_tags``
    tags / ``n_songs`` songs are kept.  While either list is short, ``topk``
    grows by ``step`` and the vote is repeated.

    The search stops once every training playlist has voted, so the function
    always terminates; in that case the lists may be shorter than requested.

    Args:
        row: Mapping/Series with at least ``id``, ``tags`` and ``songs``.
        column: Name of the ``train_df`` column to compare against.
        query: The row's value to compare with ``train_df[column]``.
        n_tags: Number of tags to return.
        n_songs: Number of songs to return.
        step: Initial neighbourhood size and its increment.

    Returns:
        Tuple ``(row['id'], pred_tags, pred_songs)``.
    """
    # Sets make the "already in the playlist" checks constant-time.
    known_tags = set(row['tags'])
    known_songs = set(row['songs'])
    tags_col = train_df['tags']
    songs_col = train_df['songs']

    # Ascending sort: the most similar playlists end up at the tail.
    sim = train_df[column].map(lambda x: similarity(x, query)).sort_values()

    topk = step
    while True:
        tag_votes = dict()
        song_votes = dict()

        # sim's index holds train_df labels, so look rows up by label (.at)
        # rather than by position; assumes those labels are unique.
        for label, score in sim.iloc[-topk:].items():
            # The 1.00001 offset keeps the weight finite for an exact match.
            weight = 1 / (1.00001 - score)
            for tag in tags_col.at[label]:
                tag_votes[tag] = tag_votes.get(tag, 0) + weight
            for song in songs_col.at[label]:
                song_votes[song] = song_votes.get(song, 0) + weight

        ranked_tags = sorted(tag_votes.items(), key=lambda x: x[1], reverse=True)
        ranked_songs = sorted(song_votes.items(), key=lambda x: x[1], reverse=True)
        pred_tags = [k for k, _ in ranked_tags if k not in known_tags][:n_tags]
        pred_songs = [k for k, _ in ranked_songs if k not in known_songs][:n_songs]

        filled = len(pred_tags) >= n_tags and len(pred_songs) >= n_songs
        # topk >= len(sim) means the whole training set has already voted;
        # widening further cannot add candidates, so return what we have.
        if filled or topk >= len(sim):
            return row['id'], pred_tags, pred_songs
        topk += step


def predict_with_songs(row):
    """Predict from training playlists whose ``songs`` overlap the row's songs.

    Returns:
        Tuple ``(row['id'], pred_tags, pred_songs)``.
    """
    return _predict_from_neighbours(row, 'songs', row['songs'])


def predict_with_tags(row):
    """Predict from training playlists whose ``p_tags`` overlap the row's ``p_tags``.

    Returns:
        Tuple ``(row['id'], pred_tags, pred_songs)``.
    """
    return _predict_from_neighbours(row, 'p_tags', row['p_tags'])


def predict_with_title(row):
    """Predict from training playlists whose ``p_title`` overlaps the row's ``p_title``.

    Returns:
        Tuple ``(row['id'], pred_tags, pred_songs)``.
    """
    return _predict_from_neighbours(row, 'p_title', row['p_title'])
    
def predict(row):
    """Dispatch a playlist row to the predictor matching the data it has.

    Rows with songs go to ``predict_with_songs``; rows without songs but with
    tags go to ``predict_with_tags``; rows with neither fall back to
    ``predict_with_title``.  The chosen predictor's result is returned as-is.
    """
    songs, tags = row['songs'], row['tags']
    if len(songs) > 0:
        chosen = predict_with_songs
    elif len(tags) > 0:
        chosen = predict_with_tags
    else:
        chosen = predict_with_title
    return chosen(row)

# Run predict on every validation row (axis=1) using pandarallel's workers.
# predict returns an (id, tags, songs) tuple per row.
result = val_df.parallel_apply(predict, axis=1)

In [7]:
# Spread the per-row (id, tags, songs) tuples into a three-column frame.
result_df = pd.DataFrame(list(result.values), columns=['id', 'tags', 'songs'])

In [8]:
# Write one JSON object per row (orient='records'); force_ascii=False keeps
# non-ASCII text such as Korean tags unescaped in the output file.
result_df.to_json('results.json', orient='records', force_ascii=False)

In [13]:
# Validation playlists with neither tags nor songs -- the rows that predict
# hands to predict_with_title.
val_df[val_df['tags'].map(len).eq(0) & val_df['songs'].map(len).eq(0)]

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,p_title,p_tags
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000,"[앨리, 리스, 스테, 테이, 이블]",[]
9,[],142007,기분 좋은 재즈와 함께 만드는 달달한 하루,[],0,2015-06-22 09:11:02.000,"[기분, 분좋, 좋은, 은재, 재즈, 즈와, 와함, 함께, 께만, 만드, 드는, 는...",[]
35,[],65114,"■■■■ 사랑,그리고이별 ■■■■",[],6,2010-10-27 10:34:34.000,"[사랑, 랑그, 그리, 리고, 고이, 이별]",[]
57,[],87700,마쉬멜로우같은 멜로우한 음악,[],6,2016-01-14 10:19:30.000,"[마쉬, 쉬멜, 멜로, 로우, 우같, 같은, 은멜, 멜로, 로우, 우한, 한음, 음악]",[]
71,[],35271,공부와 독서를 위한 #Newage,[],10,2020-01-17 15:46:20.000,"[공부, 부와, 와독, 독서, 서를, 를위, 위한, 한N, Ne, ew, wa, a...",[]
...,...,...,...,...,...,...,...,...
22903,[],140513,10년이 지나 들어도 좋은 감성 Ballad,[],405,2016-01-11 10:58:05.000,"[10, 0년, 년이, 이지, 지나, 나들, 들어, 어도, 도좋, 좋은, 은감, 감...",[]
22920,[],124704,가사의 의미와 뜻은모른다!! 오직 멜로디로만 선곡한 팝송!!,[],27,2016-02-05 12:31:59.000,"[가사, 사의, 의의, 의미, 미와, 와뜻, 뜻은, 은모, 모른, 른다, 다오, 오...",[]
22981,[],13045,＊카페 느낌 샹송♭,[],38,2011-07-12 00:58:39.000,"[카페, 페느, 느낌, 낌샹, 샹송]",[]
22991,[],32537,컨트리 황제 조니 캐시가 선 레코드 시절 발표한 초기 대표작,[],28,2019-06-17 14:22:48.000,"[컨트, 트리, 리황, 황제, 제조, 조니, 니캐, 캐시, 시가, 가선, 선레, 레...",[]
