In [1]:
import pandas as pd
from pandarallel import pandarallel
from tqdm import tqdm

# Configure pandarallel with a fixed pool of 9 workers (confirmed by the INFO
# lines printed below). NOTE(review): the `parallel_map` calls later in this
# notebook presumably rely on this initialization having run first -- confirm
# before moving or removing this cell.
pandarallel.initialize(nb_workers=9)

INFO: Pandarallel will run on 9 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from itertools import islice
import re

from khaiii import KhaiiiApi
from itertools import chain

# Single module-level analyzer instance; `tokenize` below calls `api.analyze`
# on it, so it is created once here rather than on every call.
api = KhaiiiApi()
def tokenize(text):
    """Turn ``text`` into ``(lex, tag)`` pairs using the module-level ``api``.

    Args:
        text: The string to analyze.

    Returns:
        A list of ``(lex, tag)`` tuples, one per morph, in the order the
        analyzer reports them (word by word, morph by morph). If ``text`` is
        empty or consists solely of space characters, an empty list is
        returned and the analyzer is never called.
    """
    # Guard: only the plain space character is ignored here, not other
    # whitespace such as tabs or newlines.
    if not text.replace(' ', ''):
        return []
    return [
        (morph.lex, morph.tag)
        for word in api.analyze(text)
        for morph in word.morphs
    ]

def cleanse_text(text):
    """Replace underscores with spaces, then drop every disallowed character.

    Characters that survive are ASCII digits, ASCII letters, the space
    character, and every code point from U+3131 (the first Hangul
    compatibility jamo) through U+D7A3 (the last precomposed Hangul
    syllable). As before, that range also spans the code points in between
    (for example CJK ideographs).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; everything outside the allowed set is removed
        (not replaced), so the result may be shorter than the input.
    """
    text = text.replace('_', ' ')
    # The upper bound must be U+D7A3. The previous bound, U+D797, silently
    # removed the final twelve Hangul syllables (U+D798..U+D7A3), which turned
    # e.g. "힙합" into "합". Escapes are used so the bounds are unambiguous.
    return re.sub('[^0-9a-zA-Z\u3131-\ud7a3 ]', '', text)

def window(seq, n=2):
    """Yield consecutive overlapping ``n``-tuples taken from ``seq``.

    Args:
        seq: Any iterable.
        n: Window length (default 2).

    Yields:
        Tuples of ``n`` adjacent items, advancing one item at a time. When
        ``seq`` holds fewer than ``n`` items nothing is yielded.
    """
    stream = iter(seq)
    frame = tuple(islice(stream, n))
    if len(frame) != n:
        # The initial fill came up short, so the stream is already exhausted
        # and there is nothing further to slide over.
        return
    yield frame
    for item in stream:
        # Drop the oldest item and append the newest one.
        frame = (*frame[1:], item)
        yield frame
        
def tag_filter(tag):
    """Report whether ``tag`` belongs to one of the accepted tag families.

    A tag is accepted when it starts with ``N``, ``V``, ``S`` or ``M``, or
    when it is exactly ``XR``.

    Args:
        tag: Tag string to test.

    Returns:
        ``True`` for an accepted tag, ``False`` otherwise.
    """
    return tag.startswith(('N', 'V', 'S', 'M')) or tag == 'XR'

def preprocess_tags(tags):
    """Remove duplicate tags while keeping first-seen order.

    Args:
        tags: Iterable of hashable tag values.

    Returns:
        A new list holding each distinct tag once, in the order in which it
        first appears in ``tags``.
    """
    # ``list(set(tags))`` produced an order that depends on string hash
    # randomization, so results changed between kernel restarts. A dict keeps
    # insertion order, which makes the output deterministic.
    return list(dict.fromkeys(tags))

def preprocess_title(title):
    """Split a title into its whitespace-separated tokens.

    Args:
        title: The title string.

    Returns:
        A list of non-empty tokens. An empty or whitespace-only title yields
        an empty list.
    """
    # ``split(' ')`` emitted '' tokens for repeated, leading or trailing
    # spaces (and [''] for an empty title). ``split()`` never produces empty
    # tokens; it also treats tabs and newlines as separators.
    return title.split()

def preprocess_text(df):
    """Add the ``p_title``, ``p_tags`` and ``p_all`` columns to ``df`` in place.

    ``p_title`` is ``preprocess_title`` applied to ``plylst_title`` and
    ``p_tags`` is ``preprocess_tags`` applied to ``tags``, both through
    ``parallel_map``. ``p_all`` is the row-wise ``+`` of ``songs``,
    ``p_tags`` and ``p_title``, in that order.

    Args:
        df: DataFrame providing the ``plylst_title``, ``tags`` and ``songs``
            columns. It is modified in place.

    Returns:
        None.
    """
    title_tokens = df['plylst_title'].parallel_map(preprocess_title)
    df['p_title'] = title_tokens

    unique_tags = df['tags'].parallel_map(preprocess_tags)
    df['p_tags'] = unique_tags

    # assumes each cell holds a list, so `+` concatenates row by row -- TODO confirm
    df['p_all'] = df['songs'] + unique_tags + title_tokens

In [3]:
df = pd.read_json('./data/train.json')

# Reuse the helper defined earlier instead of repeating its three assignments
# here; it adds the `p_title`, `p_tags` and `p_all` columns to `df` in place.
preprocess_text(df)

In [5]:
# Show the combined `p_all` entry for the row labelled 0.
df.loc[0, 'p_all']

[525514,
 129701,
 383374,
 562083,
 297861,
 139541,
 351214,
 650298,
 531057,
 205238,
 706183,
 127099,
 660493,
 461973,
 121455,
 72552,
 223955,
 324992,
 50104,
 '락',
 '여행같은',
 '음악']