# 180414_soynlp

In [1]:
import soynlp

In [2]:
import os
import pickle
import sys
import warnings

import tensorflow as tf
import pandas as pd

  from ._conv import register_converters as _register_converters


In [3]:
# Load the reviews and their labels: one review per line in movie.txt and the
# integer score for that review on the same line number of movie_label.txt.
# NOTE(review): the encoding is pinned to UTF-8 so the load does not depend on
# the platform's locale default — confirm both files are actually UTF-8.
with open("../data/movie.txt", encoding="utf-8") as f:
    data = pd.DataFrame({'comment': [line.strip() for line in f]})

with open("../data/movie_label.txt", encoding="utf-8") as f:
    score = f.readlines()

# Fail loudly on a line-count mismatch. Attaching the labels through index
# alignment would silently drop extra labels, or leave NaN scores that only
# blow up later during the int conversion.
if len(score) != len(data):
    raise ValueError(
        "movie.txt has {} lines but movie_label.txt has {}".format(len(data), len(score))
    )
data['score'] = [int(label.strip()) for label in score]

In [4]:
from soynlp.word import WordExtractor

In [5]:
# Pull the review texts out of the frame as a plain array of strings.
comment_column = data['comment']
sentences = comment_column.values

In [6]:
# Train a WordExtractor on the reviews with min_count=100 and collect the
# per-word statistics it extracts.
extractor_options = {
    'min_count': 100,
    'min_cohesion_forward': 0.05,
    'min_right_branching_entropy': 0.0,
}
word_extractor = WordExtractor(**extractor_options)
word_extractor.train(sentences)  # accepts a list of str or similar
words = word_extractor.extract()

training was done. used memory 1.072 Gb
all cohesion probabilities was computed. # words = 9228
all branching entropies was computed # words = 152694
all accessor variety was computed # words = 152694


In [40]:
# Same extraction as the min_count=100 run, but with min_count lowered to 10.
# This rebinds both `word_extractor` and `words`.
word_extractor = WordExtractor(
    min_count=10,
    min_cohesion_forward=0.05,
    min_right_branching_entropy=0.0,
)
word_extractor.train(sentences)  # accepts a list of str or similar
words = word_extractor.extract()

training was done. used memory 1.037 Gb
all cohesion probabilities was computed. # words = 75041
all branching entropies was computed # words = 157381
all accessor variety was computed # words = 157381


In [30]:
def _combined_score(stats):
    """Return (backward + forward cohesion) * (left + right branching entropy)."""
    cohesion_total = stats.cohesion_backward + stats.cohesion_forward
    entropy_total = stats.left_branching_entropy + stats.right_branching_entropy
    return cohesion_total * entropy_total


# List of (word, combined score) pairs, in the iteration order of `words`.
words_sorted = [(token, _combined_score(stats)) for token, stats in words.items()]

In [34]:
# Keep only words whose combined score is non-zero, as a {word: score} dict.
# dict(...) accepts both the list of (word, score) pairs and the dict this cell
# itself produces, so re-running the cell is safe instead of unpacking dict keys.
words_sorted = {word: score for word, score in dict(words_sorted).items() if score != 0}

In [36]:
# Show how many words survived the non-zero filter; displaying the whole dict
# would flood the output with thousands of entries.
len(words_sorted)

6232

In [37]:
from soynlp.tokenizer import MaxScoreTokenizer

In [38]:
# Build a MaxScoreTokenizer that uses the {word: score} dict as its score table.
tokenizer = MaxScoreTokenizer(scores=words_sorted)

In [39]:
# Quick check: tokenize one short sample string and display the resulting tokens.
tokenizer.tokenize("안녕하세요")

['안녕하', '세요']

In [44]:
from collections import Counter

In [46]:
import re

In [62]:
# Collect every "mv<digits>" / "ac<digits>" token found in the reviews
# (presumably anonymised movie and actor IDs — confirm against the dataset docs).
# `+` rather than `*` requires at least one digit, so a bare "mv" or "ac"
# inside an ordinary word is no longer collected as a token.
re_movie_actor = re.compile("(?:mv|ac)[0-9]+")
ls = []
for review in sentences:
    # findall returns [] when nothing matches, so no emptiness guard is needed.
    ls.extend(re_movie_actor.findall(review))

In [71]:
# Count the collected tokens and keep those seen more than 30 times.
# The counts are rebuilt from `ls` on every run, so the cell no longer depends
# on a `c` left over from an earlier (now missing) cell and can be re-run safely.
c = {name: freq for name, freq in Counter(ls).items() if freq > 30}

In [72]:
# Display the token -> frequency dict that remains after the freq > 30 filter.
c

{'mv00036133': 41,
 'mv00501003': 182,
 'ac00000559': 76,
 'mv00457204': 467,
 'mv00000082': 152,
 'ac00920758': 1182,
 'mv00335020': 68,
 'ac00001858': 51,
 'mv00230755': 107,
 'mv00416725': 114,
 'mv00326143': 243,
 'mv00185524': 73,
 'ac00887011': 409,
 'ac01074895': 428,
 'ac01266023': 1011,
 'ac00005578': 113,
 'ac01233717': 344,
 'ac01318447': 1138,
 'mv00217576': 2283,
 'mv00401824': 60,
 'ac00477430': 6147,
 'mv00355558': 364,
 'ac00004657': 64,
 'mv00247312': 384,
 'mv00008680': 75,
 'ac00004612': 68,
 'mv00165583': 724,
 'ac01312395': 370,
 'mv00069433': 965,
 'ac00002131': 108,
 'ac00003175': 34,
 'ac01269983': 115,
 'mv00277210': 188,
 'ac00001054': 48,
 'ac00782011': 178,
 'ac01436025': 89,
 'ac00005938': 542,
 'ac00432043': 134,
 'ac00005548': 92,
 'ac00024886': 167,
 'ac01233441': 175,
 'mv00433675': 63,
 'ac00001624': 176,
 'ac01174141': 72,
 'ac01175227': 273,
 'mv00361969': 283,
 'ac00141505': 64,
 'mv00377748': 489,
 'mv00329092': 45,
 'ac01317645': 253,
 'ac00916045

In [78]:
# Scratch list holding 1, 2, 3.
l = list(range(1, 4))

In [86]:
# Scratch check of str.replace: the call's return value is discarded, and the
# final expression displays `s`, which is still the original string.
s = 'abc'
s.replace('a','1')
s

'abc'

In [83]:
# Manually give '박' a score far above every extracted word's score.
# NOTE(review): this mutates words_sorted in place — confirm whether the
# tokenizer built from it is meant to see the change.
words_sorted['박'] = 1000000000

In [84]:
# Rank the (word, score) pairs from highest to lowest score and display them.
ranked_words = sorted(
    words_sorted.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
ranked_words

[('박', 1000000000),
 ('ㅋㅋ', 11.833009045021644),
 ('^^', 11.614648293062581),
 ('..', 11.20730539347787),
 ('영화', 10.539577630926946),
 (';;', 10.285064070244998),
 ('ㅎㅎ', 10.257629874887717),
 ('ㄷㄷ', 10.146554013414802),
 ('...', 10.097578211472127),
 ('!!', 10.0515185772582),
 ('ㅉㅉ', 9.996445058728973),
 ('ㅠㅠ', 9.753494636490824),
 ('ㅜㅜ', 9.057708695347726),
 ('-_-', 8.879960942113595),
 ('너무', 8.839562694700145),
 ('ㅋㅋㅋ', 8.561386835767749),
 ('ㅡㅡ', 8.359992621073385),
 ('액션', 8.315829319484239),
 ('솔직히', 7.762938655300572),
 ('정말', 7.572054663039987),
 ('진짜', 7.488597638409546),
 ('주인공', 7.473647422471361),
 ('드라마', 7.316875109417867),
 ('ㄷㄷㄷ', 7.311064172542559),
 ('ㅋㅋㅋㅋ', 7.287575454654342),
 ('+_+', 7.227417011630054),
 ('쓰레기', 7.142920822711188),
 ('....', 7.04769353981828),
 ('!!!', 7.035643633609468),
 ('ㅋㅋㅋㅋㅋ', 6.9073159808439035),
 ('생각', 6.8357974778285495),
 ('때문에', 6.770327354812694),
 ('작품', 6.708542295920357),
 ('마지막', 6.653103426930213),
 ('코미디', 6.581834076419692),
 

In [88]:
from tokenizers import SoyNLPTokenizer

In [90]:
class Config:
    """Empty placeholder configuration handed to SoyNLPTokenizer."""
    pass
# NOTE(review): the Config class itself, not an instance, is passed here —
# confirm SoyNLPTokenizer only needs attribute access on it.
# This rebinds `tokenizer`, which previously held the MaxScoreTokenizer.
tokenizer = SoyNLPTokenizer(Config)

In [94]:
# Fit the project tokenizer on the loaded data.
# NOTE(review): data.values is the 2-D array of every DataFrame column
# (both 'comment' and 'score'), not just the review texts — confirm this is
# the input fit() expects.
tokenizer.fit(data.values)

training was done. used memory 1.087 Gb
all cohesion probabilities was computed. # words = 9111
all branching entropies was computed # words = 149119
all accessor variety was computed # words = 149119


{'용을': 0.04636470985760159,
 '사를': 0.1357367254717549,
 '리가': 0.6404853498169146,
 '식의': 0.014849956992525157,
 '구가': 0.03386347024866579,
 '속의': 0.04044185234932494,
 '1은': 0.020329896671908546,
 '치곤': 2.825411940610314,
 '왔다': 0.025980611051999124,
 '기에': 0.3474013766885957,
 '고라': 0.04658602286493049,
 '아서': 0.40727212404451557,
 '많은': 0.053824691032857884,
 '기면': 0.03356831067223783,
 '적임': 0.5545520451077437,
 '성에': 0.037400969602778146,
 '년이': 0.017478956108650554,
 '울한': 0.014864179222169831,
 '부가': 0.02219130467109449,
 '복한': 0.020457722691713538,
 '도면': 0.4357322073775858,
 '예요': 0.1017508222810159,
 '라운': 0.1718740554343142,
 '해서': 1.355876784867556,
 '38': 0.3974469692489844,
 '자를': 0.07316465680871197,
 '들을': 0.23333119041053202,
 '4만': 0.046648218918368865,
 '의를': 0.03113580415049703,
 '느낌': 3.192953444101691,
 '번을': 0.07420697330881716,
 '녀의': 0.028745163200879848,
 '래된': 0.11551074631107137,
 '만든': 1.6039188395034372,
 '식이': 0.014224208761283849,
 '시간': 1.484239996810556