In [1]:
# Directory prefix for model artifacts: the "<name>-vocab.txt" files are read from
# here and "compound_dict.pickle" is written/read here (prefix is concatenated as-is).
modelpath = "../../model/"
# Data directory prefix (same location the per-category CSV files live in).
filepath = "../../data/"
# A single category label; '정치' is one of the category names used later in the notebook.
category = "정치"

In [None]:
import pickle
from konlpy.tag import Mecab
import pandas as pd
import unicodedata

def generate_dict_from_WPM(modelpath, modelname) :
    """Build compound-word candidates from a WordPiece vocabulary file.

    Reads ``{modelpath}{modelname}-vocab.txt`` (one entry per line), strips the
    ``##`` continuation marker, NFC-normalizes every entry and analyses it with
    Mecab. Entries that Mecab splits into more than one token and that pass
    the filters below are collected as compound-word candidates.

    Args:
        modelpath: Directory prefix of the vocabulary file. It is concatenated
            as-is, so it has to end with a path separator.
        modelname: Vocabulary name; the file read is ``<modelname>-vocab.txt``.

    Returns:
        A list of ``((word, pos), tokens)`` pairs in vocabulary order. ``word``
        is the concatenation of the token surfaces, ``pos`` is always
        ``"NNP"`` and ``tokens`` is the Mecab ``(surface, tag)`` list the word
        was built from. Every ``word`` appears at most once.
    """
    def combine_tokens(tokens, pos = "NNP") :
        # Glue the token surfaces back together into one word.
        return ''.join([t[0] for t in tokens]), pos

    filename = "{}{}-vocab.txt".format(modelpath, modelname)
    # The context manager closes the handle; the encoding is explicit so that
    # Korean text does not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as vocab_file :
        vocabs = vocab_file.read().split('\n')
    vocabs = [unicodedata.normalize('NFC', vocab.replace("##", "")) for vocab in vocabs]

    mecab = Mecab()
    seen_words = set()  # set membership keeps de-duplication O(1) per entry
    dic = []

    for vocab in vocabs :
        tokens = mecab.pos(vocab)

        # One token: already a single entry in the Mecab dictionary.
        # Zero tokens: nothing to analyse (e.g. an empty line).
        if len(tokens) <= 1 :
            continue

        tags = [t[1] for t in tokens]
        pos_list = [tag[0] for tag in tags]  # first letter of each tag

        if ('SY' in tags) or ('UNKNOWN' in tags) :
            # Symbols / unknown tokens are never treated as compounds.
            continue
        elif vocab.endswith("의") :
            # Entries ending in "의": analyse the stem instead.
            if vocab[-2:] in ["들의", "과의", "와의"] :
                tokens = mecab.pos(vocab[:-2])
            elif vocab[-2:] == "회의" :
                # "...회의" keeps its original tokens unchanged.
                pass
            else :
                # NOTE(review): the original also tested `vocab[-2:] == "주주의"`.
                # A two-character slice can never equal that three-character
                # string, so the branch never ran and such entries have always
                # been handled here. If a dedicated rule is wanted, match with
                # vocab.endswith("주주의").
                tokens = mecab.pos(vocab[:-1])
                if len(tokens) == 1 :
                    # The stem is already a single Mecab dictionary entry.
                    continue
        elif set(pos_list) == {"N"} :
            # Every token is a noun-type tag: treat as a compound noun.
            pass
        elif tokens[-1][1] in ["NNG", "NNP"] :
            # The last token is a noun: treat as a compound noun.
            pass
        else :
            continue

        word, pos = combine_tokens(tokens)
        if word and word not in seen_words :
            seen_words.add(word)
            dic.append(((word, pos), tokens))
    return dic

In [None]:
import collections
from itertools import takewhile
from collections import defaultdict

def generate_dict_by_category(modelpath, categories=['정치', '경제', '사회', 'IT과학', '생활문화']) :
    """Build one compound-word dictionary per category and pickle the result.

    For every category the dictionary is produced by
    ``generate_dict_from_WPM(modelpath, category)``. Entries whose
    ``(word, pos)`` key occurs in every category are treated as stopwords and
    removed from all of them. The result is written to
    ``{modelpath}compound_dict.pickle``.

    Args:
        modelpath: Directory prefix used both for the vocabulary files and for
            the output pickle (concatenated as-is).
        categories: Category names; each one is used as a vocabulary name.

    Returns:
        The mapping ``category -> list of ((word, pos), tokens)`` that was
        pickled (a ``defaultdict`` without a default factory).
    """
    compound_dict = defaultdict()
    for cate in categories :
        compound_dict[cate] = generate_dict_from_WPM(modelpath, cate)

    # Count in how many category dictionaries each (word, pos) key occurs.
    counter = collections.Counter(
        entry[0] for entries in compound_dict.values() for entry in entries
    )
    # Keys present in every category carry no category-specific signal.
    stopwords = {key for key, count in counter.items() if count == len(compound_dict)}

    for cate in categories :
        compound_dict[cate] = [w for w in compound_dict[cate] if w[0] not in stopwords]

    # The context manager closes the file even if pickling fails.
    with open("{}compound_dict.pickle".format(modelpath), "wb") as pickle_out :
        pickle.dump(compound_dict, pickle_out)

    # The original returned an undefined name (`vocabs`), which raised
    # NameError; the caller binds the result to `compound_dict`.
    return compound_dict

# Build the per-category compound dictionaries for the default categories
# (this also writes compound_dict.pickle under modelpath) and keep the return value.
compound_dict = generate_dict_by_category(modelpath)

In [2]:
import pickle

# Load the per-category compound dictionaries written by generate_dict_by_category.
# NOTE: pickle.load can execute arbitrary code, so only load files produced by this notebook.
# The context manager closes the file handle once loading is done.
with open("{}compound_dict.pickle".format(modelpath), "rb") as pickle_in :
    compound_dict = pickle.load(pickle_in)
compound_dict

defaultdict(None,
            {'정치': [(('한국당', 'NNP'), [('한국', 'NNP'), ('당', 'NNG')]),
              (('바른미래', 'NNP'), [('바른', 'VA+ETM'), ('미래', 'NNG')]),
              (('사일', 'NNP'), [('사', 'NR'), ('일', 'NNBC')]),
              (('최고위원', 'NNP'), [('최고', 'NNG'), ('위원', 'NNG')]),
              (('패스트트랙', 'NNP'), [('패스트', 'NNP'), ('트랙', 'NNG')]),
              (('미더뉴스', 'NNP'), [('미', 'NNG'), ('더', 'MAG'), ('뉴스', 'NNG')]),
              (('공수처', 'NNP'), [('공', 'NNG'), ('수처', 'NNG')]),
              (('인사청', 'NNP'), [('인사', 'NNG'), ('청', 'NNG')]),
              (('민주평화', 'NNP'), [('민주', 'NNG'), ('평화', 'NNG')]),
              (('한국당은', 'NNP'), [('한국', 'NNP'), ('당은', 'NNP+JX')]),
              (('컷뉴스', 'NNP'), [('컷', 'NNG'), ('뉴스', 'NNG')]),
              (('지소미아', 'NNP'), [('지소', 'NNG'), ('미아', 'NNG')]),
              (('주가시', 'NNP'), [('주가', 'NNG'), ('시', 'NNG')]),
              (('모두발언', 'NNP'), [('모두', 'MAG'), ('발언', 'NNG')]),
              (('검찰개혁', 'NNP'), [('검찰', 'NNG'), ('개혁', 'NNG

In [3]:
from konlpy.tag import Mecab

def tokenize(content, compound_dict, mecab=None) :
    """POS-tag ``content`` with Mecab and merge known compound words.

    The tagged tokens are rendered as the text of a Python list literal; for
    every pattern the rendered token sequence ``pattern[1]`` is replaced by the
    rendered ``pattern[0]``. Patterns are applied in order, so a later pattern
    can match tokens produced by an earlier one. The text is then parsed back
    into a list.

    Args:
        content: Text to tokenize.
        compound_dict: Iterable of patterns whose first element is the
            replacement ``(word, pos)`` tuple and whose second element is the
            token sequence to replace.
        mecab: Optional tagger exposing ``pos(text)``. When omitted a new
            ``Mecab()`` is created for this call; pass one in to reuse it
            across many calls.

    Returns:
        A list of ``(surface, tag)`` tuples.
    """
    import ast  # local import: only needed for parsing the literal below

    if mecab is None :
        mecab = Mecab()

    pos_str = "[" + ", ".join([str(t) for t in mecab.pos(content)]) + "]"
    for pattern in compound_dict :
        if not pattern[1] :
            # An empty token sequence renders as "", and str.replace("", x)
            # would insert x between every character.
            continue
        src = ", ".join([str(i) for i in pattern[1]])
        tgt = str(pattern[0])
        pos_str = pos_str.replace(src, tgt)

    # The text only ever contains a list of string tuples, so parse it as a
    # literal instead of evaluating arbitrary code derived from article text.
    return ast.literal_eval(pos_str)

In [30]:
import pickle
import collections
from itertools import takewhile
from nltk import ngrams
from konlpy.tag import Mecab
from collections import defaultdict

def extract_compound(contents, compound_dict, category, ngram_n=7, cutoff = 2, group_by_pos = ['NNP', 'NNG', 'SN', 'SH', 'SL', 'NNBC']) :
    """Extract frequent token n-grams as compound-word candidates.

    Each text is tokenized with ``tokenize`` using ``compound_dict[category]``
    and cut into runs of consecutive tokens whose tag is in ``group_by_pos``.
    All n-grams with ``2 <= n <= ngram_n`` are counted over those runs, and
    n-grams seen at least ``cutoff`` times are kept in descending frequency
    order. An n-gram is dropped when its token set is a subset or superset of
    an n-gram that was already kept.

    Args:
        contents: Iterable of texts.
        compound_dict: Mapping from category to the pattern list passed to
            ``tokenize``.
        category: Key selecting the pattern list in ``compound_dict``.
        ngram_n: Largest n-gram size.
        cutoff: Minimum frequency for an n-gram to be kept.
        group_by_pos: Tags that may be part of a run.

    Returns:
        A list of ``((word, 'NNP'), tokens, count)`` where ``word`` is the
        token surfaces joined by a space, ``tokens`` is the n-gram as a list
        and ``count`` is its frequency.
    """
    def split_by_pos(content, compound_dict, group_by_pos) :
        # Collect runs (length > 1) of consecutive tokens with a grouped tag.
        run = []
        runs = []
        for t in tokenize(content, compound_dict) :
            if t[1] in group_by_pos :
                run.append(t)
                continue
            if len(run) > 1 :
                runs.append(run)
            run = []
        # Flush the last run: a text that ends on a grouped tag would
        # otherwise lose its final run.
        if len(run) > 1 :
            runs.append(run)
        return runs

    ngrams_li = []
    for content in contents :
        for split in split_by_pos(content, compound_dict[category], group_by_pos) :
            # Longest n-grams first, down to bigrams.
            for n in range(min(ngram_n, len(split)), 1, -1) :
                ngrams_li.extend(ngrams(split, n))

    counter = collections.Counter(ngrams_li)  # frequency of every n-gram
    # most_common() is sorted by descending count, so takewhile keeps exactly
    # the n-grams with count >= cutoff.
    ngram_counter = dict(takewhile(lambda i: i[1] >= cutoff, counter.most_common()))

    # Higher frequency wins: skip an n-gram whose token set contains, or is
    # contained in, the token set of an n-gram that was already accepted.
    selected = {}
    for item, count in ngram_counter.items() :
        item_tokens = set(item)
        overlaps = any(
            item_tokens.issubset(key) or set(key).issubset(item)
            for key in selected
        )
        if not overlaps :
            selected[item] = count

    return [
        ((' '.join([w[0] for w in k]), 'NNP'), list(k), count)
        for k, count in selected.items()
    ]

In [None]:
import pandas as pd
import numpy as np
import multiprocessing as mp
from functools import partial

categories=['정치', '경제', '사회', 'IT과학', '생활문화']

# For every category: read its CSV, split the documents into chunks, run
# extract_compound on the chunks in parallel and pickle the concatenated results.
for c in categories :
    print(c)
    df = pd.read_csv("{}{}.csv".format(filepath, c))
    contents = df['content'].tolist()

    # Leave three cores free, but never go below one worker.
    # (num_cores has to be assigned before it is printed; the original order
    # raised NameError on a fresh kernel.)
    num_cores = max(1, mp.cpu_count() - 3)
    print(num_cores)

    # Floor division keeps the original chunk size; max(1, ...) avoids a zero
    # step in range() when there are fewer documents than workers.
    row_split_count = max(1, len(contents) // num_cores)
    doc_split = [contents[i:i + row_split_count] for i in range(0, len(contents), row_split_count)]

    pool = mp.Pool(processes = num_cores)
    try :
        # cutoff is applied inside each chunk, not on the merged counts.
        compounds = pool.map(partial(extract_compound, compound_dict = compound_dict, category = c, cutoff = 100), doc_split)
    finally :
        # Always release the worker processes, even if a chunk fails.
        pool.close()
        pool.join()

    # Flatten the per-chunk result lists into one list.
    compounds = [comp for l in compounds for comp in l]

    with open('tmp_ngram_{}.pickle'.format(c), "wb") as pickle_out :
        pickle.dump(compounds, pickle_out)

정치
13
경제
13
사회
13


In [24]:
import pickle

# Load the n-gram candidates pickled for the '정치' category.
# NOTE: pickle.load can execute arbitrary code, so only load files produced by this notebook.
# The context manager closes the file handle once loading is done.
with open('tmp_ngram_{}.pickle'.format('정치'), "rb") as pickle_in :
    ngram = pickle.load(pickle_in)
ngram

[(('flash 오류', 'NNP'), [('flash', 'SL'), ('오류', 'NNG')], 769),
 (('함수 추가 function', 'NNP'),
  [('함수', 'NNG'), ('추가', 'NNG'), ('function', 'SL')],
  769),
 (('3 일', 'NNP'), [('3', 'SN'), ('일', 'NNBC')], 688),
 (('2 차', 'NNP'), [('2', 'SN'), ('차', 'NNBC')], 601),
 (('김 위원장', 'NNP'), [('김', 'NNP'), ('위원장', 'NNP')], 565),
 (('무단 전재', 'NNP'), [('무단', 'NNG'), ('전재', 'NNG')], 517),
 (('배포 금지', 'NNP'), [('배포', 'NNG'), ('금지', 'NNG')], 455),
 (('트럼프 대통령', 'NNP'), [('트럼프', 'NNG'), ('대통령', 'NNP')], 363),
 (('독수리 훈련', 'NNP'), [('독수리', 'NNG'), ('훈련', 'NNG')], 325),
 (('차 북미정상회담', 'NNP'), [('차', 'NNBC'), ('북미정상회담', 'NNP')], 314),
 (('4 일', 'NNP'), [('4', 'SN'), ('일', 'NNBC')], 289),
 (('채널 구독', 'NNP'), [('채널', 'NNG'), ('구독', 'NNG')], 286),
 (('기자 회견', 'NNP'), [('기자', 'NNP'), ('회견', 'NNG')], 279),
 (('개학 연기', 'NNP'), [('개학', 'NNG'), ('연기', 'NNG')], 278),
 (('정상 회담', 'NNP'), [('정상', 'NNG'), ('회담', 'NNG')], 276),
 (('2 일', 'NNP'), [('2', 'SN'), ('일', 'NNBC')], 270),
 (('일 오후', 'NNP'), [('일', 'NNBC'), ('