In [49]:
import re, collections
from IPython.display import display, Markdown, Latex
import pickle
from tensorflow.keras.models import load_model
from gensim.models import FastText

# Load the pickled lookup tables and trained models used by the cells below.
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so these files must only ever come from a trusted source.

# Merge table consulted by BPE_encode(): pairs are ranked with
# bpe_codes.get(pair, inf) and the lowest-valued pair is merged first.
with open('Emotional_BQE_Dict.pickle', 'rb') as fr:
    bpe_codes = pickle.load(fr)
    
# Token -> integer index table handed to the emotion pipeline as its vocab.
with open('Emotional_WordToIndex.pickle', 'rb') as fr:
    Emo_WordToIndex = pickle.load(fr)
    
# Token -> integer index table handed to the NER pipeline as its vocab.
with open('NER_WordToIndex.pickle', 'rb') as fr:
    NER_WordToIndex = pickle.load(fr)
    
# Predicted label index -> NER tag, read by NERClassification().
with open('Index_To_NER.pickle', 'rb') as fr:
    index_to_NER= pickle.load(fr)

# Classifiers (queried through .predict) and the FastText models whose
# wv.most_similar lookup substitutes a known word for out-of-vocabulary tokens.
Emo_model = load_model('Emo_class_model.h5')
Emo_FT = FastText.load('Emo_FT.model')
NER_model = load_model('NER_class_model.h5')
NER_FT = FastText.load('NER_FT.model')

In [50]:
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: Sliceable sequence of symbols, e.g. a tuple of variable-length
            strings or a plain string.

    Returns:
        set: One ``(left, right)`` tuple per distinct pair of neighbouring
        symbols. A word with fewer than two symbols (including an empty one)
        has no pairs, so an empty set is returned.
    """
    # Zipping the word with itself shifted by one position enumerates every
    # neighbouring pair and, unlike indexing word[0], is safe on empty input.
    return set(zip(word, word[1:]))


def BPE_encode(orig, merges=None):
    """Split a word into sub-word units by applying BPE merges in priority order.

    The word is broken into its symbols, an end-of-word marker ``'</w>'`` is
    appended, and the adjacent pair with the lowest value in the merge table
    is merged repeatedly until no remaining pair appears in the table.

    Args:
        orig: The word to encode (a string, or any iterable of string symbols).
        merges: Optional mapping ``(left, right) -> priority`` where a lower
            value is merged earlier. Defaults to the module-level ``bpe_codes``.

    Returns:
        tuple: The resulting sub-word symbols with the end-of-word marker
        removed. If ``orig`` is empty it is returned unchanged.
    """
    if merges is None:
        merges = bpe_codes

    end_token = '</w>'
    word = tuple(orig) + (end_token,)

    pairs = get_pairs(word)
    if not pairs:
        # Only the end marker is present, i.e. the input was empty.
        return orig

    while True:
        # Pairs missing from the table rank as +inf, so they are picked only
        # when nothing mergeable is left - which is the stop condition.
        bigram = min(pairs, key=lambda pair: merges.get(pair, float('inf')))
        if bigram not in merges:
            break

        first, second = bigram
        merged = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # No further occurrence of `first`: copy the tail unchanged.
                merged.extend(word[i:])
                break

            # Copy everything before the occurrence, then decide whether the
            # occurrence starts the pair that is being merged.
            merged.extend(word[i:j])
            i = j
            if i < len(word) - 1 and word[i + 1] == second:
                merged.append(first + second)
                i += 2
            else:
                merged.append(word[i])
                i += 1

        word = tuple(merged)
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    # The special end-of-word token is not part of the output. Only the
    # trailing marker is removed, so a literal '</w>' that came from the
    # input text itself is left intact.
    if word[-1] == end_token:
        word = word[:-1]
    elif word[-1].endswith(end_token):
        word = word[:-1] + (word[-1][:-len(end_token)],)

    return word

In [63]:
from konlpy.tag import Okt
from konlpy.tag import Kkma
from eunjeon import Mecab
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Single Mecab instance, created once here and reused by tokenizing() for every input.
mecab = Mecab()

def tokenizing(inputText):
    """Split a sentence into tokens using the shared ``mecab`` tagger.

    Args:
        inputText: Raw sentence to analyse.

    Returns:
        list: The first element of every pair produced by
        ``mecab.pos(inputText)``, in the order the tagger returned them.
    """
    # Each item from mecab.pos is a 2-item pair (presumably surface form and
    # part-of-speech tag); only the first item is kept.
    return [surface for surface, _ in mecab.pos(inputText)]
    
def Int_encode(inputList, FT_model, vocab, similarity_threshold=0.8):
    """Convert tokens to integer indices, with fallbacks for unknown tokens.

    A token found in ``vocab`` maps straight to its index. For any other token
    the nearest FastText neighbour is looked up:

    * if its similarity reaches ``similarity_threshold`` and the neighbour is
      in ``vocab``, the neighbour's index is used;
    * otherwise the token is split into sub-words with ``BPE_encode`` and the
      index of every sub-word present in ``vocab`` is emitted. Sub-words that
      are not in ``vocab`` are dropped, so the result may be shorter or longer
      than ``inputList``.

    Args:
        inputList: Iterable of string tokens.
        FT_model: Object exposing ``wv.most_similar(token)`` that returns
            ``(word, similarity)`` pairs, best match first.
        vocab: Mapping from token to integer index.
        similarity_threshold: Minimum similarity for accepting the nearest
            neighbour as a substitute. Defaults to 0.8.

    Returns:
        list: Integer indices for the encoded tokens.
    """
    encoded = []

    for token in inputList:
        if token in vocab:
            encoded.append(vocab[token])
            continue

        neighbours = FT_model.wv.most_similar(token)
        # An empty neighbour list is treated like a hopeless match.
        nearest, similarity = neighbours[0] if neighbours else (None, float('-inf'))

        if similarity < similarity_threshold or nearest not in vocab:
            # Similarity is too low (or the neighbour has no index of its
            # own): split the token into BPE sub-words and encode those.
            for piece in BPE_encode(token):
                if piece in vocab:
                    encoded.append(vocab[piece])
        else:
            encoded.append(vocab[nearest])

    return encoded

def PreProcessing(InputText, FT_model, vocab):
    """Turn a raw sentence into the integer sequence fed to the emotion model.

    Args:
        InputText: Raw sentence.
        FT_model: FastText model forwarded to ``Int_encode``.
        vocab: Token -> index mapping forwarded to ``Int_encode``.

    Returns:
        list: Output of ``Int_encode`` applied to the tokens of ``InputText``.
    """
    # Tokenise first, then encode the tokens in a single pass.
    return Int_encode(tokenizing(InputText), FT_model, vocab)

def Predict(Processed_Text, model):
    """Run ``model`` on one encoded sentence and return its output as a list.

    Args:
        Processed_Text: Encoded sentence (sequence of integer indices).
        model: Object exposing ``predict(batch)``.

    Returns:
        list: The first (and only) row of the model output, converted to a
        Python list.
    """
    # Wrap the sentence in an outer list so the model receives a batch of one.
    batch = np.array([Processed_Text])
    first_row = model.predict(batch)[0]
    return list(first_row)
    
def EmotionClassification(InputText, model, FT_model, vocab):
    """Classify the sentiment of a sentence.

    Args:
        InputText: Raw sentence.
        model: Classifier passed to ``Predict``.
        FT_model: FastText model passed to ``PreProcessing``.
        vocab: Token -> index mapping passed to ``PreProcessing``.

    Returns:
        str: "부정" (negative) when the first score is strictly greater than
        the second, otherwise "긍정" (positive); a tie counts as positive.
    """
    scores = Predict(PreProcessing(InputText, FT_model, vocab), model)
    first_score, second_score = scores[0], scores[1]
    return "부정" if first_score > second_score else "긍정"

def NERClassification(InputText, model, FT_model, vocab, index_to_label=None, skip_index=1):
    """Tag the named entities found in a sentence.

    Args:
        InputText: Raw sentence.
        model: Sequence-labelling model passed to ``Predict``.
        FT_model: FastText model passed to ``temp_int_encode``.
        vocab: Token -> index mapping passed to ``temp_int_encode``.
        index_to_label: Optional lookup from predicted label index to tag
            name. Defaults to the module-level ``index_to_NER``.
        skip_index: Label index that is left out of the result (presumably
            the non-entity tag - confirm against the label table).
            Defaults to 1.

    Returns:
        list: ``(token, tag)`` tuples for every token whose predicted label
        index differs from ``skip_index``, in sentence order.
    """
    if index_to_label is None:
        index_to_label = index_to_NER

    tokens = tokenizing(InputText)
    encoded = temp_int_encode(tokens, FT_model, vocab)
    scores = Predict(encoded, model)
    # Collapse each per-token score vector into the index of its best label.
    label_ids = np.argmax(scores, axis=-1)

    # zip pairs tokens with predictions positionally and stops at the shorter
    # of the two, so surplus model rows cannot index past the token list.
    return [
        (token, index_to_label[label_id])
        for token, label_id in zip(tokens, label_ids)
        if label_id != skip_index
    ]
    
def temp_int_encode(inputList, FT_model, vocab):
    """Convert tokens to integer indices, emitting exactly one index per token.

    A token found in ``vocab`` maps to its own index. Any other token is
    replaced by the best-ranked FastText neighbour that is present in
    ``vocab``, which keeps the output aligned one-to-one with ``inputList``.

    Args:
        inputList: Iterable of string tokens.
        FT_model: Object exposing ``wv.most_similar(token)`` that returns
            ``(word, similarity)`` pairs, best match first.
        vocab: Mapping from token to integer index.

    Returns:
        list: One integer index per input token, in order.

    Raises:
        KeyError: If a token is not in ``vocab`` and none of its neighbours
            is either.
    """
    encoded = []

    for token in inputList:
        if token in vocab:
            encoded.append(vocab[token])
            continue

        # Walk the neighbours in rank order: the top match is used whenever
        # it has an index, and lower-ranked matches only serve as a fallback.
        for neighbour, _similarity in FT_model.wv.most_similar(token):
            if neighbour in vocab:
                encoded.append(vocab[neighbour])
                break
        else:
            raise KeyError(
                "no in-vocabulary neighbour found for token {!r}".format(token)
            )

    return encoded

def CreateDataStruct(name, InputText, Emo_model, Emo_FT, NER_model, NER_FT, Emo_vocab, NER_vocab):
    """Run both classifiers on a sentence and print a combined report.

    Args:
        name: Counselee name shown in the report.
        InputText: Sentence to analyse.
        Emo_model, Emo_FT, Emo_vocab: Model, FastText model and vocabulary
            forwarded to ``EmotionClassification``.
        NER_model, NER_FT, NER_vocab: Model, FastText model and vocabulary
            forwarded to ``NERClassification``.

    Returns:
        None. The report is written to standard output only.
    """
    emotion = EmotionClassification(InputText, Emo_model, Emo_FT, Emo_vocab)
    entities = NERClassification(InputText, NER_model, NER_FT, NER_vocab)

    # Report fields in order: counselee, input sentence, emotion, named entities.
    report_template = "\n내담자 : {0}\n입력문장 : {1}\n[\n  감정정보 : {2}\n  개체명 인식 : {3}\n]"
    print(report_template.format(name, InputText, emotion, entities))

In [64]:
# Smoke test: run both classifiers on one sample sentence and print the combined report.
testText = "지난 해에 범죄로 확정된 건은 15건이고 미해결된게 1700여건이면 문제 심각한거같은데"
counselee = "김수진"
# Uncomment to inspect the return value of each classifier on its own:
# print(EmotionClassification(testText, Emo_model, Emo_FT, Emo_WordToIndex))
# print(NERClassification(testText, NER_model, NER_FT, NER_WordToIndex))

CreateDataStruct(counselee, testText, Emo_model, Emo_FT, NER_model, NER_FT, Emo_WordToIndex, NER_WordToIndex)


내담자 : 김수진
입력문장 : 지난 해에 범죄로 확정된 건은 15건이고 미해결된게 1700여건이면 문제 심각한거같은데
[
  감정정보 : 부정
  개체명 인식 : [('지난', 'DAT_B'), ('해', 'DAT_I'), ('15', 'NUM_B'), ('1700', 'NUM_B')]
]
