In [1]:
import pandas as pd
from pandarallel import pandarallel
# Configure pandarallel with 8 worker processes; this is what makes
# `parallel_apply` available on pandas objects later in the notebook.
pandarallel.initialize(nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from itertools import islice
import re

def cleanse_text(text):
    """Return ``text`` with every disallowed character removed.

    A character is kept only if it is an ASCII digit, an ASCII letter, or
    falls in the code-point range ``ㄱ``-``힗``; everything else (spaces,
    punctuation, symbols, ...) is deleted.

    NOTE(review): the upper bound ``힗`` looks like it stops slightly short of
    the last Hangul syllable ``힣`` - confirm whether that is intentional.
    """
    disallowed = re.compile('[^0-9a-zA-Zㄱ-힗]')
    return disallowed.sub('', text)

def window(seq, n=2):
    """Yield successive overlapping ``n``-tuples taken from ``seq``.

    ``window('abc')`` yields ``('a', 'b')`` then ``('b', 'c')``.  When ``seq``
    holds fewer than ``n`` items nothing is yielded.
    """
    source = iter(seq)
    current = tuple(islice(source, n))
    if len(current) != n:
        # Not enough items to fill even the first window.
        return
    yield current
    for item in source:
        # Slide by one: drop the oldest element, append the newest.
        current = current[1:] + (item,)
        yield current

def preprocess_tags(tags):
    """Expand a list of tags into a de-duplicated list of matching tokens.

    Each tag is cleansed with ``cleanse_text``.  A cleansed tag shorter than
    two characters is kept as-is (this includes the empty string); longer ones
    are replaced by their character bigrams.  The original, uncleansed tags
    are added as tokens too.

    Args:
        tags: list of tag strings (must be a ``list``: it is concatenated
            with the generated token list).

    Returns:
        A list of unique tokens.  The order comes from iterating a ``set``
        and is therefore unspecified.
    """
    tokens = []
    for cleaned in map(cleanse_text, tags):
        if len(cleaned) < 2:
            tokens.append(cleaned)
            continue
        tokens.extend(''.join(pair) for pair in window(cleaned))
    return list(set(tokens + tags))

def preprocess_title(title):
    """Tokenise a playlist title.

    The title is split on single space characters and the resulting words are
    handed to ``preprocess_tags``, so a title is tokenised exactly like a list
    of tags.  Because the split is on ``' '`` specifically, consecutive spaces
    produce empty-string words.

    Args:
        title: the playlist title string.

    Returns:
        Whatever ``preprocess_tags`` returns for the title's words.
    """
    words = title.split(' ')
    return preprocess_tags(words)

def preprocess_text(df):
    """Add token columns to ``df`` in place.

    Writes two new columns:
      * ``p_title`` - ``preprocess_title`` applied to ``plylst_title``
      * ``p_tags``  - ``preprocess_tags`` applied to ``tags``

    The frame is mutated; nothing is returned.
    """
    titles = df['plylst_title']
    df['p_title'] = titles.map(preprocess_title)
    raw_tags = df['tags']
    df['p_tags'] = raw_tags.map(preprocess_tags)

In [3]:
# Load the training playlists.
df = pd.read_json('./data/train.json')
# Carve off the last ~10% of rows as a hold-out split.
test_offset = len(df) - int(len(df) / 10)
train_df, test_df = df[:test_offset], df[test_offset:]
# Not the last expression of the cell, so this preview is not displayed.
train_df.head(5)
# NOTE(review): this rebinds train_df to the FULL frame, discarding the
# hold-out split made above (test_df is still defined but overlaps train_df).
train_df = df

# Load the validation playlists that predictions are generated for.
val_df = pd.read_json('./data/val.json')
# Not the last expression of the cell either, so nothing is displayed.
val_df.head()

# Add the p_title / p_tags token columns to both frames (in place).
preprocess_text(train_df)
preprocess_text(val_df)

In [6]:
from tqdm import tqdm
import math

def similarity(a, b):
    """Jaccard similarity between two collections, treated as sets.

    Computes ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets of
    distinct items in ``a`` and ``b``.  Duplicates in either input are
    ignored, so the score is always within ``[0, 1]``.

    Args:
        a: iterable of hashable items.
        b: iterable of hashable items.

    Returns:
        The similarity as a float; ``0.0`` when both inputs are empty
        (instead of raising ``ZeroDivisionError``).
    """
    set_a, set_b = set(a), set(b)
    union_size = len(set_a | set_b)
    if union_size == 0:
        # Both inputs empty: nothing in common, and nothing to divide by.
        return 0.0
    return len(set_a & set_b) / union_size

def cos_similarity(a, b):
    """Set-based cosine similarity between two collections.

    Computes ``|A & B| / (sqrt(|A|) * sqrt(|B|))`` where ``A`` and ``B`` are
    the sets of distinct items in ``a`` and ``b``.  Sizes are taken from the
    de-duplicated sets so they are consistent with the intersection count.

    Args:
        a: iterable of hashable items.
        b: iterable of hashable items.

    Returns:
        The similarity as a float; ``0.0`` when either input is empty
        (instead of raising ``ZeroDivisionError``).
    """
    set_a, set_b = set(a), set(b)
    if not set_a or not set_b:
        # An empty side shares nothing with anything; avoid dividing by zero.
        return 0.0
    return len(set_a & set_b) / (math.sqrt(len(set_a)) * math.sqrt(len(set_b)))

def _top_unseen(scores, seen, limit):
    """Return up to ``limit`` keys of ``scores`` not in ``seen``, best score first.

    Ties keep the insertion order of ``scores`` (the sort is stable).
    """
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [key for key, _ in ranked if key not in seen][:limit]


def _rank_from_neighbours(row, query_column, train=None, similarity_fn=None,
                          n_tags=10, n_songs=100, step=25):
    """Recommend tags and songs for ``row`` from its most similar training playlists.

    Every training playlist is scored against ``row[query_column]``.  The
    ``step`` most similar playlists vote for their own tags and songs, each
    vote weighted by the playlist's similarity.  If that does not produce
    ``n_tags`` new tags and ``n_songs`` new songs, the neighbourhood is
    widened by another ``step`` playlists and the vote is recomputed.

    The search stops once the whole training frame has been used, returning
    whatever was found (possibly fewer items than requested) rather than
    looping forever.

    Args:
        row: mapping/Series with ``id``, ``tags``, ``songs`` and
            ``query_column``.
        query_column: column compared between ``row`` and ``train``.
        train: training frame with ``tags``, ``songs`` and ``query_column``
            columns.  Defaults to the notebook-level ``train_df``.
        similarity_fn: ``f(train_items, query_items) -> number``.  Defaults
            to the notebook-level ``cos_similarity``.
        n_tags: number of tags wanted.
        n_songs: number of songs wanted.
        step: initial neighbourhood size and growth increment.

    Returns:
        ``(row['id'], predicted_tags, predicted_songs)``; the predictions
        exclude tags/songs already present in ``row``.
    """
    if train is None:
        train = train_df
    if similarity_fn is None:
        similarity_fn = cos_similarity

    query = row[query_column]
    # Sets make the "already in the playlist" checks O(1) per candidate.
    known_tags = set(row['tags'])
    known_songs = set(row['songs'])

    train_tags = train['tags']
    train_songs = train['songs']

    # Re-index positionally so the labels of `sim` are valid arguments for
    # `.iat`, whatever index `train` carries.
    sim = (
        train[query_column]
        .map(lambda items: similarity_fn(items, query))
        .reset_index(drop=True)
        .sort_values()
    )

    topk = step
    while True:
        tag_scores = {}
        song_scores = {}

        # `sim` is ascending, so the tail holds the most similar playlists.
        neighbours = sim.iloc[-topk:]
        for pos, weight in zip(neighbours.index, neighbours.values):
            for tag in train_tags.iat[pos]:
                tag_scores[tag] = tag_scores.get(tag, 0) + weight
            for song in train_songs.iat[pos]:
                song_scores[song] = song_scores.get(song, 0) + weight

        pred_tags = _top_unseen(tag_scores, known_tags, n_tags)
        pred_songs = _top_unseen(song_scores, known_songs, n_songs)

        enough = len(pred_tags) >= n_tags and len(pred_songs) >= n_songs
        # Also stop when every training playlist has already voted:
        # widening further cannot add candidates.
        if enough or topk >= len(sim):
            return row['id'], pred_tags, pred_songs
        topk += step


def predict_with_songs(row, train=None, similarity_fn=None):
    """Predict tags and songs by comparing the playlist's ``songs``.

    See ``_rank_from_neighbours`` for the algorithm and the meaning of the
    optional ``train`` / ``similarity_fn`` overrides.

    Returns:
        ``(row['id'], predicted_tags, predicted_songs)``.
    """
    return _rank_from_neighbours(row, 'songs', train, similarity_fn)


def predict_with_tags(row, train=None, similarity_fn=None):
    """Predict tags and songs by comparing the playlist's ``p_tags`` tokens.

    See ``_rank_from_neighbours`` for the algorithm and the meaning of the
    optional ``train`` / ``similarity_fn`` overrides.

    Returns:
        ``(row['id'], predicted_tags, predicted_songs)``.
    """
    return _rank_from_neighbours(row, 'p_tags', train, similarity_fn)


def predict_with_title(row, train=None, similarity_fn=None):
    """Predict tags and songs by comparing the playlist's ``p_title`` tokens.

    See ``_rank_from_neighbours`` for the algorithm and the meaning of the
    optional ``train`` / ``similarity_fn`` overrides.

    Returns:
        ``(row['id'], predicted_tags, predicted_songs)``.
    """
    return _rank_from_neighbours(row, 'p_title', train, similarity_fn)
    
def predict(row):
    """Dispatch ``row`` to the predictor matching the information it carries.

    Preference order: known songs, then known tags, then the title alone.

    Returns:
        The ``(id, tags, songs)`` tuple produced by the chosen predictor.
    """
    songs, tags = row['songs'], row['tags']
    if len(songs) > 0:
        return predict_with_songs(row)
    if len(tags) > 0:
        return predict_with_tags(row)
    # Neither songs nor tags are known: fall back to the title tokens.
    return predict_with_title(row)

import time
# Time a prediction run over the first 10 validation playlists only.
start = time.time()
# One `predict` call per row, via pandarallel's parallel_apply.
result = val_df[:10].parallel_apply(predict, axis=1)
# Last expression of the cell: elapsed wall-clock seconds, shown as the output.
time.time() - start

1.3323004245758057

In [5]:
# Each element of `result` is the (id, tags, songs) tuple returned by
# `predict`; unpack them into a three-column frame.
result_df = pd.DataFrame(list(result.values), columns=['id', 'tags', 'songs'])

In [6]:
# Write the predictions as a JSON array of records; force_ascii=False keeps
# non-ASCII characters (e.g. Korean tags) unescaped in the output file.
result_df.to_json('results.json', orient='records', force_ascii=False)