## 실무 테스트 - 임베딩

**데이터 전처리

In [16]:
import pandas as pd
import re

df = pd.read_csv("my_data/review_tags.csv")

print("null 확인",df.isnull().sum(),"\n")
print(df.dtypes)

null 확인 Unnamed: 0                0
brand_code                0
id                        0
sentiment                 0
review_id                 0
review_tag_type_id        0
keywords                 64
message                   0
product_id                0
product_name          15408
created_at                0
dtype: int64 

Unnamed: 0             int64
brand_code            object
id                     int64
sentiment              int64
review_id              int64
review_tag_type_id     int64
keywords              object
message               object
product_id             int64
product_name          object
created_at            object
dtype: object


In [17]:
df = df.dropna().reset_index(drop=True)

In [18]:
df['bnum'] = df['keywords'].apply(lambda x : x.split(",")[0])
df['enum'] =  df['keywords'].apply(lambda x : x.split(",")[1])

df['bnum'] = [re.sub('[^a-zA-Z0-9가-힣\s]', '', df['bnum'][x]) for x in range(0,len(df)) ]
df['enum'] = [re.sub('[^a-zA-Z0-9가-힣\s]', '', df['enum'][x]) for x in range(0,len(df)) ]

df['highlight'] = [df['message'][x][int(df['bnum'][x]):int(df['enum'][x])] for x in range(0,len(df)) ]


In [None]:
df.head()

**하이라이트 테이블

In [20]:
df_message = df.drop_duplicates(['highlight'], keep='first').dropna().reset_index(drop=True)

In [21]:
print("null 확인",df_message.isnull().sum(),"\n")
print("길이",len(df_message))

null 확인 Unnamed: 0            0
brand_code            0
id                    0
sentiment             0
review_id             0
review_tag_type_id    0
keywords              0
message               0
product_id            0
product_name          0
created_at            0
bnum                  0
enum                  0
highlight             0
dtype: int64 

길이 585301


In [22]:
ms_list = df_message['highlight'].tolist()
brand_list = df_message['brand_code'].tolist()
id_list = df_message['review_id'].tolist()
high_list = df_message['highlight'].tolist()

In [23]:
print(len(ms_list)==len(brand_list)==len(id_list)==len(df_message))

True


**리뷰 전처리

In [24]:
WIKI_REMOVE_CHARS = re.compile("'+|(=+.{2,30}=+)|__TOC__|(ファイル:).+|:(en|de|it|fr|es|kr|zh|no|fi):|\n", re.UNICODE)
WIKI_SPACE_CHARS = re.compile("(\\s|゙|゚|　)+", re.UNICODE)
EMAIL_PATTERN = re.compile("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", re.UNICODE)
MULTIPLE_SPACES = re.compile(' +', re.UNICODE)
# URL_PATTERN = re.compile("(ftp|http|https)?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", re.UNICODE)
# WIKI_REMOVE_TOKEN_CHARS = re.compile("(\\*$|:$|^파일:.+|^;)", re.UNICODE)

In [25]:
ms_list = [re.sub(EMAIL_PATTERN, ' ', x) for x in ms_list]  # remove email pattern
ms_list = [re.sub(WIKI_REMOVE_CHARS, ' ', x) for x in ms_list]  
ms_list = [re.sub(WIKI_SPACE_CHARS, ' ', x) for x in ms_list]     
ms_list = [re.sub(MULTIPLE_SPACES, ' ', x) for x in ms_list]    
# ms_list = re.sub(URL_PATTERN, ' ', ms_list) # remove url pattern
# tokens = ms_list.replace(", )", "").split(" ")

In [26]:
# 학습데이터&토큰나이저 입력 단위 길이
length = 50000

text = "\n".join(ms_list[0:length])

In [27]:
text.split("\n")[0]

'사이즈가 색별로 조금식 다른건지 물건마다 다른건지 좀 다르네'

**지도토크나이저 mecab

In [28]:
from konlpy.tag import Mecab
tokenizer = Mecab()

In [14]:
# 학습 데이터 생성
path = "my_data/mecab_list.txt"

with open(path, 'w') as f:
    for i in range(0,1000):
        item = " ".join(tokenizer.morphs(text.split("\n")[i]))
        f.write("%s\n" % item)

**리뷰 토큰 테이블 생성

In [29]:
final_df = pd.DataFrame()
for i in range(0,100):
    df_new = pd.DataFrame(tokenizer.pos(text.split("\n")[i]),columns = ['token','pos'])
    df_new['brand_code'] = brand_list[i]
    df_new['review_id'] = id_list[i]
    df_new['highlight'] = high_list[i]
    final_df = pd.concat([df_new,final_df])

In [30]:
final_df = final_df.reset_index(drop=True)

In [None]:
final_df.head()

In [32]:
final_df['nchar'] = final_df['token'].str.len()

In [33]:
anti_df = final_df[final_df['nchar']==1 & final_df['pos'].str.contains('^N') ]

In [34]:
final_df = pd.merge(final_df, anti_df, how='outer', indicator=True)


In [35]:
final_df = final_df.query("_merge=='left_only'")
final_df = final_df.drop(columns=['_merge','nchar'])

**동사 명사 형용사만 남기기

In [36]:
pos = "^NN|^VA|^VV|^VX|^XR|MAG"

final_df = final_df[final_df['pos'].str.contains(pos) ]

In [None]:
final_df.head()

In [None]:
def add_da(row):
    if row['pos'].startswith('V'):  
        return row['token'] + '다'
    return row['token']


final_df['new_token'] = final_df.apply(lambda row: add_da(row), axis=1)
final_df.head()

**리뷰 태그 아이디 붙이기

In [None]:
df_final = pd.merge(final_df, df[['review_tag_type_id','highlight','review_id','brand_code']], how='left', on=['brand_code','review_id','highlight'])
df_final.head()

**top k 집계

In [40]:
final_df['nchar'] = final_df['pos'].str.len()

gdf = final_df[final_df['nchar']< 4 ].groupby('new_token')['new_token'].count().reset_index(name='counts')
gdf.sort_values(by=['counts'], ascending=False, inplace=True)

In [41]:
gdf = gdf.reset_index(drop=True)

gdf['perc'] = gdf['counts']/sum(gdf['counts'])
gdf['running_total'] = gdf['perc'].cumsum()


gdf.head()

Unnamed: 0,new_token,counts,perc,running_total
0,좋다,42,0.069536,0.069536
1,입다,33,0.054636,0.124172
2,사이즈,22,0.036424,0.160596
3,있다,15,0.024834,0.18543
4,잘,13,0.021523,0.206954


In [42]:
if len(gdf[gdf['perc'] <= 0.9]) > 1000:
    top_df = gdf.iloc[:1000,:]
else:
    top_df = gdf[gdf['perc'] <= 0.9]
    
top_df.head()

Unnamed: 0,new_token,counts,perc,running_total
0,좋다,42,0.069536,0.069536
1,입다,33,0.054636,0.124172
2,사이즈,22,0.036424,0.160596
3,있다,15,0.024834,0.18543
4,잘,13,0.021523,0.206954


## 실무 테스트 - 임베딩 모델 적용하기

In [43]:
corpus_fname = "my_data/mecab_list.txt"
model_fname = "my_data/word2vec"

# from gensim.models import Word2Vec

# corpus = [sent.strip().split(" ") for sent in open(corpus_fname, 'r').readlines()]
# model = Word2Vec(corpus, size = 100, workers = 8, sg =0)
# model.save(model_fname)

In [219]:
# from models.word_eval import WordEmbeddingEvaluator
# model = WordEmbeddingEvaluator(model_fname, method = "word2vec", dim =100, tokenizer_name = "mecab")
# model.most_similar("사이즈",topn = 15)

**top p 데이터 유사토큰 점검

In [44]:
import sys
import numpy as np
import scipy.stats as st
from gensim.models import Word2Vec
from fasttext import load_model as load_ft_model
from sklearn.preprocessing import normalize

from soynlp.hangle import compose, character_is_korean
from preprocess import get_tokenizer, jamo_sentence

sys.path.append('models')
from visualize_utils import visualize_words, visualize_between_words
import sys, re, argparse
from khaiii import KhaiiiApi
from konlpy.tag import Okt, Komoran, Mecab, Hannanum, Kkma



def get_tokenizer(tokenizer_name):
    if tokenizer_name == "komoran":
        tokenizer = Komoran()
    elif tokenizer_name == "okt":
        tokenizer = Okt()
    elif tokenizer_name == "mecab":
        tokenizer = Mecab()
    elif tokenizer_name == "hannanum":
        tokenizer = Hannanum()
    elif tokenizer_name == "kkma":
        tokenizer = Kkma()
    elif tokenizer_name == "khaiii":
        tokenizer = KhaiiiApi()
    else:
        tokenizer = Mecab()
    return tokenizer



class WordEmbeddingEvaluator2:

    def __init__(self, vecs_txt_fname, vecs_bin_fname=None, method="word2vec", dim=100, tokenizer_name="mecab"):
        self.tokenizer = get_tokenizer(tokenizer_name)
        self.tokenizer_name = tokenizer_name
        self.dim = dim
        self.method = method
        self.dictionary, self.words, self.vecs = self.load_vectors(vecs_txt_fname, method)
        if "fasttext" in method:
            self.model = load_ft_model(vecs_bin_fname)
        

    def load_vectors(self, vecs_fname, method):
        if method == "word2vec":
            model = Word2Vec.load(vecs_fname)
            words = model.wv.index2word
            vecs = model.wv.vectors
        else:
            words, vecs = [], []
            with open(vecs_fname, 'r', encoding='utf-8') as f:
                if "fasttext" in method:
                    next(f)  # skip head line
                for line in f:
                    if method == "swivel":
                        splited_line = line.strip().split("\t")
                    else:
                        splited_line = line.strip().split(" ")
                    words.append(splited_line[0])
                    vec = [float(el) for el in splited_line[1:]]
                    vecs.append(vec)
        unit_vecs = normalize(vecs, norm='l2', axis=1)
        dictionary = {}
        for word, vec in zip(words, unit_vecs):
            dictionary[word] = vec
        return dictionary, words, unit_vecs

    def get_word_vector(self, word):
        if self.method == "fasttext-jamo":
            word = jamo_sentence(word)
        if self._is_in_vocabulary(word):
            vector = self.dictionary[word]
        else:
            if "fasttext" in self.method:
                vector = self.model.get_word_vector(word)
            else:
                vector = np.zeros(self.dim)
        return vector

    # token vector들을 lookup한 뒤 평균을 취한다
    def get_sentence_vector(self, sentence):
        if self.tokenizer_name == "khaiii":
            tokens = []
            for word in self.tokenizer.analyze(sentence):
                tokens.extend([str(m).split("/")[0] for m in word.morphs])
        else:
            tokens = self.tokenizer.morphs(sentence)
        token_vecs = []
        for token in tokens:
            token_vecs.append(self.get_word_vector(token))
        return np.mean(token_vecs, axis=0)

    def _is_in_vocabulary(self, word):
        if self.method == "fasttext-jamo":
            word = jamo_sentence(word)
        return word in self.dictionary.keys()

    def most_similar(self, query, topn=10):
        query_vec = self.get_sentence_vector(query)
        return self.most_similar_by_vector(query_vec, topn)

    def most_similar_by_vector(self, query_vec, topn=10):
        query_vec_norm = np.linalg.norm(query_vec)
        if query_vec_norm != 0:
            query_unit_vec = query_vec / query_vec_norm
        else:
            query_unit_vec = query_vec
        scores = np.dot(self.vecs, query_unit_vec)
        topn_candidates = sorted(zip(self.words, scores), key=lambda x: x[1], reverse=True)[1:topn+1]
        total_list = []
        for i in range(0,len(topn_candidates)):
            total_list.append(topn_candidates[i][0])
        return total_list

    def jamo_to_word(self, jamo):
        jamo_list, idx = [], 0
        while idx < len(jamo):
            if not character_is_korean(jamo[idx]):
                jamo_list.append(jamo[idx])
                idx += 1
            else:
                jamo_list.append(jamo[idx:idx + 3])
                idx += 3
        word = ""
        for jamo_char in jamo_list:
            if len(jamo_char) == 1:
                word += jamo_char
            elif jamo_char[2] == "-":
                word += compose(jamo_char[0], jamo_char[1], " ")
            else:
                word += compose(jamo_char[0], jamo_char[1], jamo_char[2])
        return word

    """
    Word similarity test
    Inspired by:
    https://github.com/dongjun-Lee/kor2vec/blob/master/test/similarity_test.py
    """
    def word_sim_test(self, test_fname):
        actual_sim_list, pred_sim_list = [], []
        missed = 0
        with open(test_fname, 'r') as pairs:
            for pair in pairs:
                w1, w2, actual_sim = pair.strip().split(",")
                try:
                    w1_vec = self.get_sentence_vector(w1)
                    w2_vec = self.get_sentence_vector(w2)
                    score = np.dot(w1_vec, w2_vec)
                    actual_sim_list.append(float(actual_sim))
                    pred_sim_list.append(score)
                except KeyError:
                    missed += 1
        spearman, _ = st.spearmanr(actual_sim_list, pred_sim_list)
        pearson, _ = st.pearsonr(actual_sim_list, pred_sim_list)
        print("spearman corr:", spearman, ", pearson corr:", pearson, ", # of errors:", missed)

    """
    Word Analogy test
    Inspired by:
    https://github.com/dongjun-Lee/kor2vec/blob/master/test/analogy_test.py
    """
    def word_analogy_test(self, test_fname, topn=30, verbose=False):
        correct, total, missed = 0, 0, 0
        with open(test_fname, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith("#") or len(line) <= 1:
                    continue
                words = line.strip().split(" ")
                query_vecs = self.get_analogy_vector(words[:-1])
                try:
                    word_with_scores = self.most_similar_by_vector(query_vecs, topn)
                    if verbose:
                        print(words[0] + " - " + words[1] + " + " + words[2])
                        print("correct answer:", words[3])
                        print("predicted answers:", word_with_scores)
                        print("")
                    similar_words = [el[0] for el in word_with_scores]
                    if words[-1] in similar_words:
                        correct += 1
                except:
                    missed += 1
                total += 1
        print("# of correct answer:", correct, ", # of data:", total, ", # of errors:", missed)

    def get_analogy_vector(self, words):
        if len(words) == 3:
            token_1 = self.get_sentence_vector(words[0])
            token_2 = self.get_sentence_vector(words[1])
            token_3 = self.get_sentence_vector(words[2])
            result = token_2 + token_3 - token_1
        else:
            result = np.zeros(self.dim)
        return result

    """
    Visualize word representions with T-SNE, Bokeh
    Inspired by:
    https://www.kaggle.com/yohanb/t-sne-bokeh
    https://bokeh.pydata.org
    """
    def visualize_words(self, words_fname, palette="Viridis256"):
        words = set()
        for line in open(words_fname, 'r', encoding='utf-8'):
            if not line.startswith("#"):
                for word in line.strip().split(" "):
                    if len(word) > 0:
                        words.add(word)
        vecs = np.array([self.get_sentence_vector(word) for word in words])
        visualize_words(words, vecs, palette)

    def visualize_between_words(self, words_fname, palette="Viridis256"):
        words = set()
        for line in open(words_fname, 'r'):
            if not line.startswith("#"):
                for word in line.strip().split(" "):
                    if len(word) > 0:
                        words.add(word)
        vecs = [self.get_sentence_vector(word) for word in words]
        visualize_between_words(words, vecs, palette)

In [45]:
model = WordEmbeddingEvaluator2(model_fname, method = "word2vec", dim =100, tokenizer_name = "mecab")


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [46]:
top_df['similar_list'] = [model.most_similar(top_df['new_token'][i],topn = 15) for i in range(0,len(top_df))]


In [47]:
top_df

Unnamed: 0,new_token,counts,perc,running_total,similar_list
0,좋다,42,0.069536,0.069536,"[다, 괜찮, 정말, 별, 마음, 맘, 너무너무, 추가, 참, 넘, 도, 너무나, ..."
1,입다,33,0.054636,0.124172,"[다, 입혀, 입히, 다네, 막상, 코디, 매치, 갖, 입힐, 밖, 받쳐입, 착용,..."
2,사이즈,22,0.036424,0.160596,"[즈, 95, 105, 치수, 싸이, 85, 크기, 100, 사이, 평소, 선택, ..."
3,있다,15,0.024834,0.185430,"[다, 잇, 없, 로운, 어쩔, 정치, 느낄, 처리, 없이, 롭, 벤딩, 모르, 야..."
4,잘,13,0.021523,0.206954,"[딱, 자주, 요긴, 편하, 으려고, 트랜드, 얼추, 수, 가볍, 정말, 이쁘, 올..."
5,맞다,13,0.021523,0.228477,"[맞, 맞음, 어울린다고, 이즈라, 85, 90, 정사, 95, 빠졌, 어요, 받,..."
6,같다,13,0.021523,0.250000,"[같, 애요, 습니다, 네요, 지요, 말, 올라옵니다, 곤, 어요, 습니, 듯, 였..."
7,좀,10,0.016556,0.266556,"[조금, 살짝, 약간, 다소, 쫌, 많이, 통, 다리, 어깨, 소매, 오히려, 나오..."
8,재질,9,0.014901,0.281457,"[원단, 촉감, 소재, 옷감, 감촉, 천, 질감, 착용감, 색감, 품질, 질, 안감..."
9,너무,9,0.014901,0.296358,"[넘, 너무너무, 아주, 엄청, 정말, 진짜, 도, 너무나, 흔하, 완전, 이뻐요,..."


In [48]:
top_df.explode('similar_list')

AttributeError: 'DataFrame' object has no attribute 'explode'