# 형태소 분석
- 품질 좋은 임베딩을 만들기 위해서는 문장이나 단어의 경계를 컴퓨터에 알려줘야 한다.
- 형태소를 잘 나누지 않으면 단어 집합의 크기가 커지고, 연산이 비효율적으로 된다.
- 한국어는 조사와 어미가 발달한 교착어이므로 형태소 분석이 중요하다.



In [None]:
# Base directory that every data, model and user-dictionary path below is built from
# (presumably a mounted Google Drive folder on Colab — adjust for your environment).
root_path = '/content/drive/MyDrive/2021-1/AI데이터활용교재개발/code'

## 지도학습 기반 형태소 분석
 - konlpy 사용하기
 - 사용자 기반 사전

### konlpy와 mecab을 사용하기 위한 패키지 설치

In [None]:
! pip install konlpy

# Also run the shell script that makes mecab easy to use on Google Colab.
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# ! cd ./Mecab-ko-for-Google-Colab
# ! bash ./Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab190912.sh
! bash ./Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab_light_210108.sh

# Shell script source: https://somjang.tistory.com/entry/Google-Colab에서-Mecab-koMecab-ko-dic-쉽게-사용하기 [솜씨좋은장씨]

### konlpy 를 사용한 품사 태깅 예시 코드

In [None]:
# If "NameError: name 'Tagger' is not defined" is raised, restart the runtime (ctrl + M).

from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma 

# How to choose a tokenizer: uncomment one of the alternatives below.
# tokenizer = Komoran()
# tokenizer = Okt()
# tokenizer = Hannanum()
# tokenizer = Kkma()
from konlpy.tag import Mecab
tokenizer = Mecab()


# Print the morphemes, then the (morpheme, tag) pairs, of an unspaced sample sentence.
print(tokenizer.morphs("아버지가방에들어가신다"))
print(tokenizer.pos("아버지가방에들어가신다"))


['아버지', '가', '방', '에', '들어가', '신다']
[('아버지', 'NNG'), ('가', 'JKS'), ('방', 'NNG'), ('에', 'JKB'), ('들어가', 'VV'), ('신다', 'EP+EC')]


In [None]:
import re
def tokenize(corpus_fname, output_fname):
    """Morpheme-tokenize a corpus file line by line with Mecab.

    Every line of ``corpus_fname`` is stripped, split into morphemes,
    passed through ``post_processing`` and written to ``output_fname``
    as a single space-joined line.
    """
    mecab = Mecab()

    with open(corpus_fname, 'r', encoding='utf-8') as src:
        with open(output_fname, 'w', encoding='utf-8') as dst:
            for raw_line in src:
                text = raw_line.replace('\n', '').strip()
                morphemes = post_processing(mecab.morphs(text))
                dst.write(' '.join(morphemes) + '\n')

def post_processing(tokens):
    """Split every digit inside the tokens into its own token.

    Each digit is padded with spaces, the token is split on single
    spaces, and the empty pieces are dropped. Tokens without digits
    pass through unchanged.
    """
    spaced_out = []
    for morpheme in tokens:
        # Surround each digit with spaces so it separates from its neighbours.
        padded = re.sub(r"(\d)", r" \1 ", morpheme)
        spaced_out += [piece for piece in padded.split(" ") if piece]
    return spaced_out


In [None]:
# Mecab-tokenize each preprocessed corpus: (printed label, input file, output file).
mecab_jobs = [
    ("wiki",
     root_path + '/data/processed/processed_wiki_ko.txt',
     root_path + '/data/processed/tokenized/wiki_ko_mecab.txt'),
    ("naver",
     root_path + '/data/processed/processed_ratings.txt',
     root_path + '/data/processed/tokenized/ratings_mecab.txt'),
    ("koquard",
     root_path + '/data/processed/processed_korquad.txt',
     root_path + '/data/processed/tokenized/korquad_mecab.txt'),
]

for label, corpus_fname, output_fname in mecab_jobs:
    print(label)
    tokenize(corpus_fname, output_fname)

명사만 형태소로 저장하는 파일 생성

In [None]:
def noun_tokenize(corpus_fname, output_fname):
    """Write the nouns of each corpus line to ``output_fname``.

    Nouns are extracted with Mecab; nouns of length 1 are discarded.
    Lines with no remaining nouns produce no output line. The kept
    nouns go through ``post_processing`` and are joined with spaces.
    """
    mecab = Mecab()

    with open(corpus_fname, 'r', encoding='utf-8') as src, \
            open(output_fname, 'w', encoding='utf-8') as dst:
        for raw_line in src:
            text = raw_line.replace('\n', '').strip()
            kept = [noun for noun in mecab.nouns(text) if len(noun) > 1]
            if not kept:
                continue
            dst.write(' '.join(post_processing(kept)) + '\n')

In [None]:
# Noun-only tokenization jobs: (printed label, input file, output file).
noun_jobs = [
    ("naver",
     root_path + '/data/processed/processed_ratings.txt',
     root_path + '/data/processed/tokenized/ratings_mecab_noun.txt'),
    ("koquard",
     root_path + '/data/processed/processed_korquad.txt',
     root_path + '/data/processed/tokenized/korquad_mecab_noun.txt'),
    # The wiki corpus is left disabled; uncomment to process it as well.
    # ("wiki",
    #  root_path + '/data/processed/processed_wiki_ko.txt',
    #  root_path + '/data/processed/tokenized/wiki_ko_mecab_noun.txt'),
]

for label, corpus_fname, output_fname in noun_jobs:
    print(label)
    noun_tokenize(corpus_fname, output_fname)

naver
koquard


### 사용자 기반 사전 추가(Komoran)


In [None]:
# Create a Komoran tagger that uses the user dictionary file under root_path,
# then print the tagging result for a sample sentence.
tokenizer = Komoran(userdic = root_path + '/user_dic.txt')
print(tokenizer.pos("바람과 함께 사라지다는 정말 명작이야"))

## 비지도학습 기반 형태소 분석
- soynlp
- bert(bpe)

In [None]:

# ! pip install soynlp
from soynlp.word import WordExtractor
import math
from soynlp.tokenizer import LTokenizer
from soynlp.normalizer import *

In [None]:
def compute_soy_word_score(corpus_fname, model_fname):
    """Train a soynlp ``WordExtractor`` on a corpus and save the result.

    Args:
        corpus_fname: Path of a UTF-8 text file. Each line is stripped
            and used as one training sentence.
        model_fname: Path the trained extractor is saved to.
    """
    # Read the corpus through a context manager so the file handle is
    # closed as soon as the sentences are loaded (the bare
    # open(...).readlines() left it open).
    with open(corpus_fname, 'r', encoding='UTF8') as corpus:
        sentences = [sent.strip() for sent in corpus]

    # Training parameters.
    word_extractor = WordExtractor(min_frequency=100,
                                   min_cohesion_forward=0.05,
                                   min_right_branching_entropy=0.0
                                   )
    # Train.
    word_extractor.train(sentences)

    # Save the trained model.
    word_extractor.save(model_fname)

    
# Train the soynlp word extractor on the processed KorQuAD text and save it as soyword.model.
corpus_fname = root_path + "/data/processed/processed_korquad.txt"
model_fname = root_path + "/data/processed/soyword.model"

compute_soy_word_score(corpus_fname, model_fname)

In [None]:

def soy_tokenize(corpus_fname, model_fname):
    """Load saved soynlp word scores and print an ``LTokenizer`` demo.

    ``model_fname`` is the file the ``WordExtractor`` is loaded from.
    ``corpus_fname`` is accepted but not used inside this function.
    """
    # Build the extractor with the same settings that were used for training.
    extractor = WordExtractor(
        min_frequency=100,
        min_cohesion_forward=0.05,
        min_right_branching_entropy=0.0,
    )

    # Restore the trained model from disk.
    extractor.load(model_fname)

    # Combine the stored statistics into one score per word:
    # forward cohesion * exp(right branching entropy).
    combined = {}
    for word, stat in extractor.word_scores().items():
        combined[word] = stat.cohesion_forward * math.exp(stat.right_branching_entropy)
    l_tokenizer = LTokenizer(scores=combined)

    # Quick check on a sample sentence.
    print(l_tokenizer.tokenize('패턴을 스스로 학습한다'))

# Run the tokenizer demo using the current corpus_fname / model_fname values.
soy_tokenize(corpus_fname, model_fname)

soynlp(soyspacing)를 이용한 띄어쓰기 교정

In [None]:
!pip install soyspacing
from soyspacing.countbase import CountSpace

# Train the spacing-correction model on the processed ratings corpus and save it.
corpus_fname =  root_path + '/data/processed/processed_ratings.txt'
model_fname = root_path + '/data/processed/space-correct.model'

model = CountSpace()
model.train(corpus_fname)
model.save_model(model_fname, json_format = False)

# Load the model. It was just trained above, so loading is not strictly needed here;
# the call is kept as a reference for when the saved model is reused later.
model.load_model(model_fname, json_format = False)
print(model.correct("어릴때보고다시봐도재미있다"))


### sentence piece + bert

In [None]:
! pip install sentencepiece
import sentencepiece as spm

SentencePiece 학습 후, 결과 vocab을 BERT 모델에 넣을 수 있는 형태로 변형

In [None]:
# Input corpus for SentencePiece training and output path of the BERT-format vocab.
# The input path must not end with a space: the stray trailing blank would
# become part of the file name / training argument string.
in_f = root_path + '/data/processed/processed_wiki_ko.txt'
vocab_f = root_path + '/data/processed/bert.vocab'

def make_bert_vocab(input_fname, output_fname):
    """Train a BPE SentencePiece model and rewrite its vocab in BERT form.

    SentencePiece is trained on ``input_fname`` with the model prefix
    ``sentpiece``; the resulting ``sentpiece.vocab`` is then read back
    and written to ``output_fname``:

    * the BERT special tokens come first,
    * the bare ``▁`` piece and the SentencePiece control tokens are dropped,
    * pieces starting with ``▁`` lose every ``▁`` character,
    * all other pieces get a ``##`` prefix.
    """
    # Train SentencePiece.
    spm_args = (
        '--input=' + input_fname
        + ' --model_prefix=sentpiece --vocab_size=5000 --model_type=bpe --character_coverage=0.9995'
    )
    spm.SentencePieceTrainer.Train(spm_args)

    skipped = ("▁", "<unk>", "<s>", "</s>")

    # Prepend the tokens BERT expects ([PAD], [UNK], ...) and convert each piece.
    with open('sentpiece.vocab', 'r', encoding='utf-8') as pieces, \
            open(output_fname, 'w', encoding='utf-8') as vocab:
        vocab.write("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for row in pieces:
            # The piece is the first tab-separated column of the vocab row.
            piece = row.replace('\n', '').split('\t')[0]
            if piece == '' or piece in skipped:
                continue
            if piece.startswith('▁'):
                entry = piece.replace('▁', '')
            else:
                entry = '##' + piece
            vocab.write(entry + "\n")

# Build the BERT vocab file from the corpus path in in_f and write it to vocab_f.
make_bert_vocab(in_f, vocab_f)


bert 모델 test

In [None]:
# !pip install transformers
import transformers
# Load the generated vocab into a BertTokenizer (lower-casing disabled)
# and print the tokenization of a sample sentence.
vocab_fname = root_path + "/data/processed/bert.vocab"
tokenizer = transformers.BertTokenizer(vocab_file=vocab_fname, do_lower_case=False)

print(tokenizer.tokenize("동해물과 백두산이 마르고 닳도록 하느님이 보우하사 우리나라 만세"))
