# 180414_soynlp

In [1]:
import soynlp

In [2]:
import os
import pickle
import sys
import warnings

import tensorflow as tf
import pandas as pd

  from ._conv import register_converters as _register_converters


In [3]:
with open("../data/movie.txt") as f:
    data = f.readlines()
    data = [str(t.strip()) for t in data]
    data = pd.DataFrame(data, columns=['comment'])
    
with open("../data/movie_label.txt") as f:
    score = f.readlines()
    data['score'] = pd.DataFrame(score)
    data['score'] = data['score'].map(lambda x: int(x.strip()))

In [4]:
from soynlp.word import WordExtractor

In [105]:
len(data)

520343

In [5]:
sentences = data['comment'].values

In [6]:
word_extractor = WordExtractor(min_count=100,
    min_cohesion_forward=0.05, 
    min_right_branching_entropy=0.0)
word_extractor.train(sentences) # list of str or like
words = word_extractor.extract()

training was done. used memory 1.072 Gb
all cohesion probabilities was computed. # words = 9228
all branching entropies was computed # words = 152694
all accessor variety was computed # words = 152694


In [40]:
word_extractor = WordExtractor(min_count=10,
    min_cohesion_forward=0.05, 
    min_right_branching_entropy=0.0)
word_extractor.train(sentences) # list of str or like
words = word_extractor.extract()

training was done. used memory 1.037 Gb
all cohesion probabilities was computed. # words = 75041
all branching entropies was computed # words = 157381
all accessor variety was computed # words = 157381


In [30]:
words_sorted = [(word,(score.cohesion_backward + \
                      score.cohesion_forward)*(score.left_branching_entropy+score.right_branching_entropy)) 
                for word, score in words.items()]

In [34]:
words_sorted = {word:score for word, score in words_sorted if score!=0}

In [36]:
words_sorted

6232

In [37]:
from soynlp.tokenizer import MaxScoreTokenizer

In [38]:
tokenizer = MaxScoreTokenizer(scores=words_sorted)

In [39]:
tokenizer.tokenize("안녕하세요")

['안녕하', '세요']

In [44]:
from collections import Counter

In [46]:
import re

In [62]:
re_movie_actor = re.compile("mv[0-9]*|ac[0-9]*")
ls = []
for review in sentences:
    movie_actor_token = re_movie_actor.findall(review)
    if movie_actor_token:
        ls += movie_actor_token

In [71]:
c = {name:freq for name, freq in c.items() if freq>30}

In [78]:
l = [1,2,3]

In [86]:
s = 'abc'
s.replace('a','1')
s

'abc'

In [83]:
words_sorted.update({'박':1000000000})

In [88]:
from tokenizers import SoyNLPTokenizer

In [90]:
class Config:
    pass
tokenizer = SoyNLPTokenizer(Config)

In [94]:
tokenizer.fit(data.values)

training was done. used memory 1.087 Gb
all cohesion probabilities was computed. # words = 9111
all branching entropies was computed # words = 149119
all accessor variety was computed # words = 149119


In [101]:
tokens_set = set()
for review in sentences:
    tokens = tokenizer.tokenize(review)
    tokens_set.update(set(tokens))

In [104]:
len(tokens_set)

196319

In [109]:
import seaborn as sns

In [117]:
c = Counter(ls)
print("총 언급된 영화/배우 수  : {}".format(len(c)))
print("1번 언급된 영화/배우 수 : {}".format(len([x for x in c.values() if x==1])))
print("2번 언급된 영화/배우 수 : {}".format(len([x for x in c.values() if x==2])))
print("3번 언급된 영화/배우 수 : {}".format(len([x for x in c.values() if x==3])))
print("4번 언급된 영화/배우 수 : {}".format(len([x for x in c.values() if x==4])))
print("5번 언급된 영화/배우 수 : {}".format(len([x for x in c.values() if x==5])))

총 언급된 영화/배우 수  : 8888
1번 언급된 영화/배우 수 : 4159
2번 언급된 영화/배우 수 : 1338
3번 언급된 영화/배우 수 : 627
4번 언급된 영화/배우 수 : 416
5번 언급된 영화/배우 수 : 310


In [121]:
from gensim.models import FastText
from konlpy.tag import Twitter

In [123]:
twitter = Twitter()

In [125]:
tokenized_reviews = [tokenizer.tokenize(review) for review in sentences]

In [127]:
tokenized_reviews[0]

['아', '련한', '향', '수를', '떠올', '리게', '만', '드는', '추', '억의', '영화']

In [129]:
fasttext = FastText(sentences=tokenized_reviews, sg=1, size=100, window=5, negative=10, min_n=1, max_n=4, iter=20)

In [131]:
fasttext.most_similar("쓰레기",topn=30)

  """Entry point for launching an IPython kernel.


[('개쓰래기', 0.83719801902771),
 ('쓰래기', 0.8296200633049011),
 ('개망작', 0.7729056477546692),
 ('개허접', 0.7694747447967529),
 ('개졸작', 0.7684503793716431),
 ('개쓰래', 0.7640445828437805),
 ('개막장', 0.7610095143318176),
 ('저질', 0.7569889426231384),
 ('졸작', 0.7537485361099243),
 ('쓰렉', 0.7430212497711182),
 ('선동용', 0.741576611995697),
 ('ㅆㄹㄱ', 0.7334646582603455),
 ('개똥', 0.7254996299743652),
 ('쓰뤠기', 0.7234551906585693),
 ('쓰랙', 0.72145676612854),
 ('레기', 0.7201757431030273),
 ('갖다버려', 0.718366265296936),
 ('쓰렉이', 0.7159989476203918),
 ('국뽕', 0.7091118693351746),
 ('재활', 0.709067165851593),
 ('하급', 0.7087844014167786),
 ('개망', 0.7077004313468933),
 ('이꼴', 0.7057012319564819),
 ('개한', 0.7045454382896423),
 ('개떡', 0.7035484910011292),
 ('최악임', 0.7012590169906616),
 ('쓰래', 0.700785219669342),
 ('3류', 0.7007443904876709),
 ('한쿡', 0.7006980776786804),
 ('개거품', 0.6983471512794495)]

In [132]:
fasttext.most_similar("병신",topn=30)

  """Entry point for launching an IPython kernel.


[('븅신', 0.8223216533660889),
 ('빙신', 0.8182974457740784),
 ('병신들', 0.8076996803283691),
 ('병1신', 0.796699583530426),
 ('무뇌충', 0.7643649578094482),
 ('ㅄ', 0.7523525357246399),
 ('제정신', 0.748960554599762),
 ('중2병', 0.7420171499252319),
 ('또라이', 0.7412704229354858),
 ('바보들', 0.7355386018753052),
 ('미친것', 0.7331432104110718),
 ('어린것', 0.7294114828109741),
 ('이꼴', 0.7250562310218811),
 ('미친놈', 0.723762571811676),
 ('정신병자', 0.7205844521522522),
 ('내참', 0.7205693125724792),
 ('무뇌', 0.7200883030891418),
 ('일본놈', 0.7199276089668274),
 ('등신', 0.7197530269622803),
 ('개독', 0.7193326354026794),
 ('다보네', 0.7182186841964722),
 ('병', 0.7159683108329773),
 ('놈', 0.7156092524528503),
 ('낚시꾼', 0.715136706829071),
 ('무뇌아', 0.7143422961235046),
 ('새끼', 0.7137678265571594),
 ('새끼들', 0.7124447226524353),
 ('기자', 0.7118920683860779),
 ('알바생', 0.711039125919342),
 ('쿠', 0.7103320956230164)]

In [136]:
fasttext.most_similar("존나")

  """Entry point for launching an IPython kernel.


[('졸라', 0.9146998524665833),
 ('조낸', 0.8310585618019104),
 ('존내', 0.8295345902442932),
 ('존니', 0.8209677338600159),
 ('존나재', 0.8044064044952393),
 ('ㅈㄴ', 0.8017397522926331),
 ('진짜', 0.7920728325843811),
 ('겁나', 0.788737952709198),
 ('존나잼', 0.7864083647727966),
 ('존나웃', 0.7783069014549255)]

In [138]:
fasttext.most_similar("허접")

  """Entry point for launching an IPython kernel.


[('개허접', 0.8225301504135132),
 ('유치', 0.7779809236526489),
 ('엉성', 0.7752565741539001),
 ('허접해', 0.7681140303611755),
 ('어색', 0.7557356953620911),
 ('시시', 0.7530478835105896),
 ('허접함', 0.7498729228973389),
 ('부실', 0.7300997972488403),
 ('조잡', 0.7283927202224731),
 ('허약', 0.7272347807884216)]

In [139]:
fasttext.most_similar("시발",topn=30)

  """Entry point for launching an IPython kernel.


[('ㄲㅈ', 0.7385265231132507),
 ('ㅆㅃ', 0.7364158034324646),
 ('ㅆㄹㄱ', 0.7333856821060181),
 ('ㅆㅂ', 0.7262677550315857),
 ('🐱아', 0.7224671244621277),
 ('팔아', 0.718827486038208),
 ('신발', 0.7162114381790161),
 ('ㅁㅊ', 0.7144342064857483),
 ('씨발', 0.707223117351532),
 ('죵나', 0.7027004957199097),
 ('절대아', 0.7014756798744202),
 ('ㅂㅂ', 0.7007321119308472),
 ('아싸', 0.7003445029258728),
 ('알바아', 0.6995455026626587),
 ('괜히봄', 0.6956772804260254),
 ('재용아', 0.6947632431983948),
 ('장난치냐', 0.6945180296897888),
 ('뒤져라', 0.6944048404693604),
 ('시팔', 0.6903988122940063),
 ('절름발', 0.689546525478363),
 ('경규형', 0.6884580254554749),
 ('개안습', 0.68743896484375),
 ('나원참', 0.6874103546142578),
 ('ㅅㅂ', 0.686514139175415),
 ('시1발', 0.6861288547515869),
 ('개쓰래', 0.6856763362884521),
 ('선동렬', 0.6851179003715515),
 ('내놔라', 0.6849446892738342),
 ('개~', 0.6831190586090088),
 ('개굿', 0.6822612881660461)]

In [140]:
import re

In [141]:
r = re.compile("쓰레기|ㅆㄹㄱ|ㅆㄹㄱ")

In [142]:
r.findall("쓰레기 ㅆㄹㄱ 같은 ㅈ같은 영화")

['쓰레기', 'ㅆㄹㄱ']

In [159]:
r = re.compile("[A-Za-z]+")

In [160]:
r.findall("이런 개쓰레기 같은 movie")

['mo', 'vie']

In [172]:
english_words = []
for sentence in sentences:
    english = r.findall(sentence)
    english = list(set(english) - set(['mv', 'ac']))
    if english:
        english_words += english

In [175]:
Counter(english_words).most_common(100)

[('good', 1705),
 ('CG', 1077),
 ('SF', 717),
 ('gt', 592),
 ('lt', 554),
 ('b', 469),
 ('ost', 440),
 ('D', 424),
 ('cg', 411),
 ('B', 361),
 ('OST', 321),
 ('DVD', 289),
 ('TV', 281),
 ('sf', 233),
 ('GOOD', 206),
 ('very', 189),
 ('d', 179),
 ('T', 176),
 ('a', 175),
 ('tv', 168),
 ('best', 163),
 ('of', 160),
 ('X', 146),
 ('the', 146),
 ('I', 140),
 ('dvd', 126),
 ('bad', 125),
 ('x', 124),
 ('is', 124),
 ('s', 119),
 ('so', 111),
 ('A', 103),
 ('movie', 96),
 ('ocn', 83),
 ('my', 82),
 ('n', 77),
 ('you', 76),
 ('OCN', 74),
 ('but', 66),
 ('not', 65),
 ('it', 62),
 ('The', 60),
 ('bb', 59),
 ('love', 59),
 ('up', 58),
 ('CF', 56),
 ('vs', 56),
 ('Very', 55),
 ('no', 54),
 ('BEST', 52),
 ('i', 52),
 ('It', 46),
 ('to', 46),
 ('C', 46),
 ('No', 45),
 ('cgv', 45),
 ('l', 45),
 ('m', 44),
 ('o', 44),
 ('zz', 42),
 ('me', 42),
 ('O', 41),
 ('v', 41),
 ('great', 40),
 ('wow', 40),
 ('EBS', 40),
 ('CGV', 40),
 ('in', 37),
 ('t', 36),
 ('Oh', 36),
 ('oh', 36),
 ('zzz', 36),
 ('z', 35),
 

In [179]:
"안녕하세요aa".upper()

'안녕하세요AA'