# 문장수준 임베딩

In [None]:
# Base directory of the project; the data and model paths used in this notebook are built from it.
# NOTE(review): the "/content/drive/MyDrive" prefix presumably refers to a mounted Google Drive — confirm.
root_path = "/content/drive/MyDrive/2021-1/AI데이터활용교재개발/code"
import pandas as pd


## LSA : 잠재의미분석
잠재의미분석(LSA : Latent semantic analysis)

단어-문서 행렬이나 TF-IDF 행렬에 특이값 분해로 차원축소를 시행하고 여기에서 해당하는 벡터를 취해 임베딩을 만드는 방법

In [None]:
# Install mecab (the konlpy package provides the Mecab wrapper imported later)
! pip install konlpy

# Also run the shell script that makes mecab easy to use on Google Colab.
! git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
! bash ./Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab_light_210108.sh

# Shell script source: https://somjang.tistory.com/entry/Google-Colab에서-Mecab-koMecab-ko-dic-쉽게-사용하기 [솜씨좋은장씨]

In [None]:
import konlpy
from konlpy.tag import Mecab
import pandas as pd

# File locations for the LSA pipeline, all resolved under root_path.
output_fname = f"{root_path}/data/processed/processed_movie2.txt"
# Alternative corpus (news articles):
# corpus_fname = f"{root_path}/data/raw/NewsResult.xlsx"
corpus_fname = f"{root_path}/data/processed/processed_review_movieid.txt"
model_path = f"{root_path}/model/lsa-tfidf2.vecs"

# Mecab instance shared by the preprocessing cells.
tokenizer = Mecab()


In [None]:
# Preprocess the movie reviews.
# Each corpus line is "<review text>\u241E<movie id>". Three parallel lists are built:
#   raw_corpus  - the review text
#   titles      - the movie id of that review
#   noun_corpus - the nouns of the review, joined by single spaces
noun_corpus = []
titles = []
raw_corpus = []
with open(corpus_fname, encoding='utf-8') as f:
    for line in f:
        fields = line.strip().split("\u241E")
        if len(fields) != 2:
            # Malformed line (missing or extra separator): skip it.
            continue
        sentence, movie_id = fields
        try:
            tokens = tokenizer.nouns(sentence)
        except Exception:
            # Best effort: drop reviews the tokenizer cannot handle. `Exception` (rather than a
            # bare except) keeps Ctrl-C / kernel interrupts working during this long loop.
            continue
        # Append to all three lists only after every step succeeded, so they stay index-aligned.
        raw_corpus.append(sentence)
        titles.append(movie_id)
        noun_corpus.append(' '.join(tokens))


In [None]:
# Alternative preprocessing for the news corpus (replaces titles / raw_corpus / noun_corpus).
# NOTE(review): `corpus` must already hold the news DataFrame; the loader is commented out here:
# corpus = pd.read_excel(corpus_fname)

# Newlines are flattened to spaces in both the title ('제목') and body ('본문') columns.
titles = corpus['제목'].str.replace("\n", " ")
raw_corpus = corpus['본문'].str.replace("\n", " ")

# One space-joined string of nouns per article body.
noun_corpus = [' '.join(tokenizer.nouns(article)) for article in raw_corpus]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Construct the TF-IDF matrix. noun_corpus already holds space-joined nouns, so the
# tokenizer only needs to split on whitespace.
vectorizer = TfidfVectorizer(
    min_df=1,
    ngram_range=(1, 1),
    lowercase=True,
    tokenizer=lambda x: x.split())
input_matrix = vectorizer.fit_transform(noun_corpus)

# Compute truncated SVD (Singular Value Decomposition): one 100-dimensional vector per document.
svd = TruncatedSVD(n_components=100)
vecs = svd.fit_transform(input_matrix)

# Save one line per document: "<title>\u241E<raw text>\u241E<space-separated vector>".
# The file is read back with encoding='utf-8', so it is written with the same encoding
# instead of relying on the platform default.
with open(model_path, 'w', encoding='utf-8') as f:
    for doc_idx, vec in enumerate(vecs):
        vec_text = ' '.join(str(el) for el in vec)
        f.write(titles[doc_idx] + "\u241E" + raw_corpus[doc_idx] + "\u241E" + vec_text + "\n")

In [None]:
# LSA수행
from sklearn.preprocessing import normalize

class LSAEvaluator:
    """Similarity lookup over document vectors stored in an LSA vector file.

    The file holds one document per line with three fields separated by U+241E:
    the title, the raw text (ignored here) and the space-separated vector components.
    """

    def __init__(self, model_fname="data/sentence-embeddings/lsa-tfidf/lsa-tfidf.vecs",
                 use_notebook=True):
        """Load titles and L2-normalized vectors from ``model_fname``.

        ``use_notebook`` is only stored on the instance; no method of this class reads it.
        """
        self.titles, self.vectors = self.load_model(model_fname)
        self.use_notebook = use_notebook

    def most_similar(self, doc_id, topn=10):
        """Return ``[query_title, [(title, score), ...]]`` for the ``topn`` best matches.

        Scores are dot products between every stored vector and the unit-length query
        vector. The query document itself is excluded by position, so documents with
        an identical score (duplicates, or an all-zero query) cannot push it into the
        result list.
        """
        # Imported here because no numpy import is visible ahead of this class definition.
        import numpy as np

        query_doc_vec = self.vectors[doc_id]
        query_vec_norm = np.linalg.norm(query_doc_vec)
        # An all-zero vector cannot be normalized; use it unchanged.
        if query_vec_norm != 0:
            query_unit_vec = query_doc_vec / query_vec_norm
        else:
            query_unit_vec = query_doc_vec
        scores = np.dot(self.vectors, query_unit_vec)

        # Support negative indices the same way list indexing does.
        query_idx = doc_id if doc_id >= 0 else doc_id + len(self.titles)
        candidates = [
            (title, score)
            for idx, (title, score) in enumerate(zip(self.titles, scores))
            if idx != query_idx
        ]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        return [self.titles[doc_id], candidates[:topn]]

    def load_model(self, model_fname):
        """Read ``model_fname`` and return ``(titles, row-wise L2-normalized vectors)``."""
        titles, vectors = [], []

        with open(model_fname, 'r', encoding='utf-8') as f:
            for line in f:
                # Fields: title, raw text (unused), vector components.
                title, _, str_vec = line.strip().split("\u241E")
                vector = [float(el) for el in str_vec.split()]
                titles.append(title)
                vectors.append(vector)
        return titles, normalize(vectors, axis=1, norm='l2')


# Load the vectors stored at model_path into an evaluator instance.
model = LSAEvaluator(model_path, use_notebook = True)


In [None]:
from konlpy.tag import Mecab
from collections import defaultdict
from sklearn.preprocessing import normalize
import numpy as np


# Read the saved LSA vectors back: one document per line, fields separated by U+241E
# (title, raw text, space-separated vector components). The raw text is not kept.
titles, vectors = [], []

with open(model_path, 'r', encoding='utf-8') as vec_file:
    for row in vec_file:
        doc_title, _raw_text, vec_text = row.strip().split("\u241E")
        titles.append(doc_title)
        vectors.append([float(component) for component in vec_text.split()])

# Scale every row to unit L2 norm.
vectors = normalize(vectors, axis=1, norm='l2')


In [None]:
# 벡터 내적을 통해 가장 비슷한 영화를 찾아주는 코드
import requests
from lxml import html
def most_similar(doc_id, vectors, titles, topn=10):
    """Find the documents most similar to document ``doc_id``.

    Args:
        doc_id: Row index of the query document in ``vectors`` / ``titles``.
        vectors: 2-D array with one document vector per row.
        titles: Sequence of document titles, aligned with the rows of ``vectors``.
        topn: Maximum number of similar documents to return.

    Returns:
        ``[query_title, [(title, score), ...]]`` sorted by descending score, where
        score is the dot product with the unit-length query vector. The query
        document is excluded by position, so ties in score (duplicate documents,
        or an all-zero query vector) cannot make it appear among its own neighbours.
    """
    query_doc_vec = vectors[doc_id]
    query_vec_norm = np.linalg.norm(query_doc_vec)
    # An all-zero vector cannot be normalized; use it unchanged.
    if query_vec_norm != 0:
        query_unit_vec = query_doc_vec / query_vec_norm
    else:
        query_unit_vec = query_doc_vec
    scores = np.dot(vectors, query_unit_vec)

    # Support negative indices the same way list indexing does.
    query_idx = doc_id if doc_id >= 0 else doc_id + len(titles)
    candidates = [
        (title, score)
        for idx, (title, score) in enumerate(zip(titles, scores))
        if idx != query_idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [titles[doc_id], candidates[:topn]]


def get_movie_title(movie_id):
    """Fetch the title of a movie from the Naver Movie review listing page.

    Only the part of ``movie_id`` before the first underscore is sent as the ``sword``
    query value. Returns the text of the first matching title link, or ``""`` when the
    page contains no such element. Errors raised by ``requests.get`` propagate.
    """
    url = 'http://movie.naver.com/movie/point/af/list.nhn?st=mcode&target=after&sword=%s' % movie_id.split("_")[0]
    resp = requests.get(url)
    root = html.fromstring(resp.text)
    # Check for an empty result explicitly instead of hiding every error behind a bare except.
    matches = root.xpath('//div[@class="choice_movie_info"]//h5//a/text()')
    return matches[0] if matches else ""

LSA 모델에 대해 타겟문서와 유사문서 테스트

In [None]:
# Query document 1 and print the title of the query followed by the titles of its neighbours.
movie_list = most_similar(doc_id=1, vectors=vectors, titles=titles, topn=10)
target_id, similar_docs = movie_list

print("타겟 문서 : " + get_movie_title(target_id))

print("")
for similar_id, _score in similar_docs:
    print("유사 문서 : " + get_movie_title(similar_id))

타겟 문서 : 은밀하게 위대하게

유사 문서 : 장미의 이름
유사 문서 : 카이지
유사 문서 : 싸이코
유사 문서 : 포세이돈 어드벤쳐
유사 문서 : 나는 나를 파괴할 권리가 있다
유사 문서 : 둠
유사 문서 : 각시탈 철면객
유사 문서 : 좋은 놈, 나쁜 놈, 이상한 놈
유사 문서 : 쏘우 5
유사 문서 : 크라잉 프리맨


## Doc2Vec
- w2v에 이어 구글팀이 개발한 문서 임베딩 기법
- 문서를 하나의 word처럼 취급해서 임베딩한다

In [None]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec, ldamulticore

def make_form(corpus_fname):
    """Build Doc2Vec training documents from the movie-review corpus.

    Each line of ``corpus_fname`` is expected to be ``"<review>\u241E<movie id>"``.
    The review is split into morphemes with Mecab and tagged ``MOVIE_<movie id>``,
    so all reviews of one movie share a tag.

    Returns:
        A list of ``TaggedDocument`` objects. Malformed lines and reviews the
        tokenizer cannot process are skipped.
    """
    tagged = []

    tokenizer = Mecab()
    with open(corpus_fname, encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split("\u241E")
            if len(fields) != 2:
                # Missing or extra separator: not a valid "<review>, <movie id>" pair.
                continue
            sentence, movie_id = fields
            try:
                tokens = tokenizer.morphs(sentence)
            except Exception:
                # Best effort, but unlike a bare except this does not swallow KeyboardInterrupt.
                continue
            tagged.append(TaggedDocument(words=tokens, tags=['MOVIE_%s' % movie_id]))
    return tagged




def make_form2(corpus_fname):
    """Build Doc2Vec training documents from the titles of the news spreadsheet.

    Reads the Excel file at ``corpus_fname``, replaces newlines in the '제목' (title)
    column with spaces, splits each title into morphemes with Mecab and tags it
    ``NEWS_<i>``, where ``i`` is the position of the title in the column.

    Returns:
        A list of ``TaggedDocument`` objects, one per title.
    """
    corpus = pd.read_excel(corpus_fname)
    # Only the titles are embedded; the body column was previously read and cleaned but never used.
    titles = corpus['제목'].str.replace("\n", " ")

    tokenizer = Mecab()
    return [
        TaggedDocument(words=tokenizer.morphs(title), tags=['NEWS_%s' % i])
        for i, title in enumerate(titles)
    ]


In [None]:
# Movie-review variant: where the Doc2Vec model will be saved and which corpus to read.
output_fname = f"{root_path}/data/processed/processed_movie_d2v.txt"
corpus_fname = f"{root_path}/data/processed/processed_review_movieid.txt"

# Tagged documents, one per review line.
corpus = make_form(corpus_fname)


In [None]:
# News variant: overrides corpus_fname / output_fname / corpus set for the movie reviews.
corpus_fname = f"{root_path}/data/raw/NewsResult.xlsx"

output_fname = f"{root_path}/model/lda_news"

# Tagged documents, one per news title.
corpus = make_form2(corpus_fname)

In [None]:
# Train a Doc2Vec model with 50-dimensional vectors on whichever `corpus` was built last,
# then save it to output_fname.
model = Doc2Vec(corpus, vector_size=50)
model.save(output_fname)

In [None]:
from lxml import html
import sys, requests, random

class Doc2VecEvaluator:
    """Explore a saved Doc2Vec model whose document tags look like ``MOVIE_<id>``.

    NOTE(review): ``model.docvecs`` / ``doctags`` look like the pre-4.0 gensim API —
    confirm against the installed gensim version.
    """

    def __init__(self, model_fname="data/doc2vec.vecs", use_notebook=False):
        """Load the model from ``model_fname`` and index its document tags by position."""
        self.model = Doc2Vec.load(model_fname)
        self.doc2idx = {el: idx for idx, el in enumerate(self.model.docvecs.doctags.keys())}
        self.use_notebook = use_notebook

    def most_similar(self, movie_id, topn=10):
        """Print the title and similarity score of the ``topn`` documents closest to ``MOVIE_<movie_id>``."""
        similar_movies = self.model.docvecs.most_similar('MOVIE_' + str(movie_id), topn=topn)
        # A separate loop name keeps the ``movie_id`` argument from being overwritten.
        for similar_tag, score in similar_movies:
            print(self.get_movie_title(similar_tag), score)

    def get_titles_in_corpus(self, n_sample=5):
        """Return ``{tag: title}`` for ``n_sample`` randomly chosen document tags."""
        # random.sample needs a sequence; passing the dict keys view directly raises
        # TypeError on Python 3.11+.
        movie_ids = random.sample(list(self.model.docvecs.doctags.keys()), n_sample)
        return {movie_id: self.get_movie_title(movie_id) for movie_id in movie_ids}

    def get_movie_title(self, movie_id):
        """Fetch a movie title from the Naver Movie review listing page.

        ``movie_id`` is a tag such as ``MOVIE_92575``; the part after the first underscore
        is sent as the ``sword`` query value. Returns ``""`` when the page has no title
        element. Errors raised by ``requests.get`` propagate.
        """
        url = 'http://movie.naver.com/movie/point/af/list.nhn?st=mcode&target=after&sword=%s' % movie_id.split("_")[1]
        resp = requests.get(url)
        root = html.fromstring(resp.text)
        # Check for an empty result explicitly instead of hiding every error behind a bare except.
        matches = root.xpath('//div[@class="choice_movie_info"]//h5//a/text()')
        return matches[0] if matches else ""

    def visualize_movies(self, n_sample=30, palette="Viridis256", type="between"):
        """Plot the vectors of ``n_sample`` random documents, labelled by movie title.

        ``type="between"`` calls ``visualize_between_words``; any other value calls
        ``visualize_words``.
        NOTE(review): neither helper is defined in the visible part of this notebook —
        confirm they are available before calling this method.
        """
        movie_ids = self.get_titles_in_corpus(n_sample=n_sample)
        movie_titles = list(movie_ids.values())
        movie_vecs = [self.model.docvecs[self.doc2idx[movie_id]] for movie_id in movie_ids]
        if type == "between":
            visualize_between_words(movie_titles, movie_vecs, palette, use_notebook=self.use_notebook)
        else:
            visualize_words(movie_titles, movie_vecs, palette, use_notebook=self.use_notebook)

In [None]:
# News
from lxml import html
import sys, requests, random

class Doc2VecEvaluator2:
    """Explore a saved Doc2Vec model whose document tags look like ``NEWS_<i>``.

    Titles are read from the '제목' column of the Excel file at ``corpus_fname`` and
    looked up by the number that follows the ``NEWS_`` prefix of a tag.
    """

    def __init__(self, corpus_fname, model_fname="data/doc2vec.vecs", use_notebook=False):
        """Load the news spreadsheet and the Doc2Vec model stored at ``model_fname``."""
        news_frame = pd.read_excel(corpus_fname)
        self.model = Doc2Vec.load(model_fname)
        self.doc2idx = {
            tag: position
            for position, tag in enumerate(self.model.docvecs.doctags.keys())
        }
        self.use_notebook = use_notebook
        # Newlines inside titles are replaced by spaces.
        self.titles = news_frame['제목'].str.replace("\n", " ")

    def most_similar(self, movie_id, topn=10):
        """Print the title of article ``movie_id`` and of its ``topn`` nearest neighbours with scores."""
        query_tag = 'NEWS_' + str(movie_id)
        neighbours = self.model.docvecs.most_similar(query_tag, topn=topn)

        print("타겟 문서 : " + self.titles[int(movie_id)])
        print("")
        for tag, similarity in neighbours:
            # Drop the 5-character "NEWS_" prefix to recover the article number.
            article_no = int(tag[5:])
            print("유사문서 : " + self.titles[article_no] + " " + str(similarity))


뉴스제목 비슷한 것 테스트

In [None]:
# Load the news evaluator from the model saved at output_fname and print the articles
# most similar to article 10.
model = Doc2VecEvaluator2(root_path + "/data/raw/NewsResult.xlsx",output_fname)
model.most_similar(10)

영화에 대해 유사한 영화 테스트

In [None]:
# Load the movie evaluator and look up the title of one randomly sampled document tag.
# NOTE(review): output_fname must point at the movie-review model here, not the news one — confirm cell order.
model = Doc2VecEvaluator(output_fname)
model.get_titles_in_corpus(n_sample = 1)

{'MOVIE_10851': '특경도룡'}

In [None]:
# Movie id 92575 is '은밀하게 위대하게'; print the 10 movies closest to it.
model.most_similar(92575, topn = 10)

고스톱 살인 0.7861570119857788
롤플레이 0.7839573621749878
이니셜 D - 극장판 0.7826319932937622
맥스 페인 0.7758529782295227
짐승 0.7750623822212219
칠검 0.7738490104675293
내일의 죠 0.7731249332427979
48 + 1 0.772041916847229
아이리스 - 극장판 0.7693309783935547
베를린 0.7651496529579163


In [None]:
# Show the document tag -> position mapping built in the evaluator's __init__.
model.doc2idx

## LDA : Latent Dirichlet Allocation

- 주어진 문서에 대하여 각 문서에 어떤 토픽들이 존재하는지에 대한 확률 모델
- 말뭉치 이면에 잠재된 토픽(주제)를 추출한다는 의미에서 토픽 모델링이라고 부르기도 한다.
- 토픽의 수를 미리 정해야 한다

In [None]:
from gensim.models import ldamulticore
from gensim import corpora
import pandas as pd
import pdb

# corpus_fname = root_path + "/data/raw/NewsResult.xlsx"
corpus_fname = root_path + "/data/processed/processed_review_movieid.txt"
output_fname = root_path + '/model/lda'

document, tokenized_corpus = [], []

with open(corpus_fname, 'r', encoding='utf-8') as f:
    for line in f:
        # This corpus file is parsed elsewhere in the notebook as "<review>\u241E<movie id>".
        # Keep only the review text: otherwise the id ends up in the stored document and
        # adds an extra \u241E-separated field to every line of the results file.
        sentence = line.strip().partition("\u241E")[0]
        # Each noun counts once per document; dict.fromkeys de-duplicates in a stable order.
        tokens = list(dict.fromkeys(tokenizer.nouns(sentence)))
        document.append(sentence)
        tokenized_corpus.append(tokens)

dictionary = corpora.Dictionary(tokenized_corpus)
corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
LDA = ldamulticore.LdaMulticore(corpus, id2word=dictionary,
                                num_topics=30,
                                minimum_probability=0.0,
                                workers=4)

# Only topics whose probability exceeds 0.5 are returned. Because the topic probabilities
# of a document sum to 1, such a topic is the most probable topic of that document.
all_topics = LDA.get_document_topics(corpus, minimum_probability=0.5, per_word_topics=False)

# Write "<review>\u241E<nouns>\u241E<topic id>\u241E<probability>" for every document with a
# dominant topic. The ".results" suffix and UTF-8 encoding match what LDAEvaluator reads
# (the suffix was previously misspelled ".resultss").
with open(output_fname + ".results", 'w', encoding='utf-8') as f:
    for doc_idx, topic in enumerate(all_topics):
        if len(topic) == 1:
            topic_id, prob = topic[0]
            f.write(document[doc_idx] + "\u241E" + ' '.join(tokenized_corpus[doc_idx]) + "\u241E" + str(topic_id) + "\u241E" + str(prob) + "\n")
LDA.save(output_fname + ".model")


In [None]:
from collections import defaultdict
from gensim.models import LdaModel
class LDAEvaluator:
    """Inspect a trained LDA model together with its saved per-document topic assignments."""

    def __init__(self, model_path, tokenizera):
        """Load ``<model_path>.results`` and ``<model_path>.model``.

        Args:
            model_path: Common path prefix of the results file and the saved model.
            tokenizera: Tokenizer object providing ``nouns(text)``. The parameter name is
                kept as it was so existing callers keep working.
        """
        # Use the tokenizer that was passed in; the argument used to be ignored in favour of a
        # module-level ``tokenizer`` variable.
        self.tokenizer = tokenizera
        self.all_topics = self.load_results(model_path + ".results")
        self.model = LdaModel.load(model_path + ".model")

    def load_results(self, results_fname):
        """Read the results file into ``{topic_id: [(sentence, prob), ...]}``.

        Each line is ``"<sentence>\u241E<tokens>\u241E<topic id>\u241E<prob>"``. The lists are
        sorted by descending probability.
        """
        topic_dict = defaultdict(list)
        with open(results_fname, 'r', encoding='utf-8') as f:
            for line in f:
                # Split from the right: the last three fields are fixed, so a separator
                # inside the sentence stays part of the sentence instead of breaking the unpack.
                sentence, _, topic_id, prob = line.strip().rsplit("\u241E", 3)
                topic_dict[int(topic_id)].append((sentence, float(prob)))
        for docs in topic_dict.values():
            docs.sort(key=lambda x: x[1], reverse=True)
        return topic_dict

    def show_topic_docs(self, topic_id, topn=10):
        """Return the ``topn`` highest-probability ``(sentence, prob)`` pairs of ``topic_id``."""
        return self.all_topics[topic_id][:topn]

    def show_topic_words(self, topic_id, topn=10):
        """Return the model's ``topn`` words for ``topic_id`` via ``show_topic``."""
        return self.model.show_topic(topic_id, topn=topn)

    def show_new_document_topic(self, documents):
        """Print the dominant topic (probability above 0.5) of each document, if there is one."""
        tokenized_documents = [self.tokenizer.nouns(document) for document in documents]
        curr_corpus = [self.model.id2word.doc2bow(tokenized_document) for tokenized_document in tokenized_documents]
        topics = self.model.get_document_topics(curr_corpus, minimum_probability=0.5, per_word_topics=False)
        for doc_idx, topic in enumerate(topics):
            # With a 0.5 threshold at most one topic can remain per document.
            if len(topic) == 1:
                topic_id, prob = topic[0]
                print(documents[doc_idx], ", topic id:", str(topic_id), ", prob:", str(prob))
            else:
                print(documents[doc_idx], ", there is no dominant topic")



In [None]:
# Load the saved LDA results and model; the second argument is the tokenizer to use.
model = LDAEvaluator(output_fname,tokenizer)

In [None]:
# Reviews assigned to topic 10: print each (review, probability) pair,
# then the nouns extracted from each of those reviews.
text = model.show_topic_docs(topic_id=10)
for entry in text:
    print(entry)
for review, _prob in text:
    print(tokenizer.nouns(review))

('10년전에도 10년이 지난 지금에도 못보겠는건 마찬가지...다들 그 풋풋했던 그 시절 느낌이 좋았다는데 난 그때 감성이 없었나봄..', 0.9731481)
('구라안치고 알바아니고 마블팬도아닌데 정말 나는 재밌게 봤음.8~9점주고싶지만 평점조절을 위해 10점', 0.9697917)
('4년만에 다시 보게된 킬미는 그때보다 깊이있고 더 좋다..강혜정의 대사 한마디 한마디가 어찌나 콕콕 박히는지..', 0.96666664)
('10점을 기준으로 평가했던 다른 영화들의 점수를 내릴 수가 없어서 이것만 12점 정도 주고싶은 느낌', 0.96666664)
('왜 별점이 이러냐 알바생들인가 현실을 뒤덮으려하지마라 4대강은 잘못된정책이다.', 0.9641975)
('다들 알바들인가? 왜이렇게 점수가후해.. 0점주고싶은데 안되서 1점줍니다~~', 0.9628205)
('남자배우 제레미 아이언스 인줄 알았음..노숙자가 단숨에 집도마련하고,미국이좆쿠나', 0.9597222)
('생각보다 볼만하고 평점이 너무 낮아 10점줌 냉정하게 7.8점 정도 영화임...', 0.9597222)
('평점이 너무 낮은거 아닌가... 나름 괜찬던데 긴장감도 있고 적어도 8점짜리 영화라고 생각함', 0.9597222)
('서 울고 알바생들이 단체로와서 10점 찍었네..ㅋㅋ 솔직히 3점도 아깝다.', 0.9597222)
['년', '전', '년', '지금', '건', '마찬가지', '다', '시절', '느낌', '데', '그때', '감성']
['구라', '바', '마블', '팬', '나', '점', '평점', '조절', '점']
['년', '만', '킬미', '그때', '강혜정', '대사', '한마디', '한마디']
['점', '기준', '평가', '영화', '점수', '수', '이것', '점', '정도', '느낌']
['별점', '바', '현실', '대강', '정책']
['바', '점수', '후', '점', '점']
['남자', '배우', '제레미', '아이언스', '인줄', '노숙', '집',

In [None]:
# Words of topic 10 with their weights (show_topic_words defaults to topn=10).
model.show_topic_words(topic_id = 10)

[('말', 0.100373015),
 ('액션', 0.09122683),
 ('편', 0.06642185),
 ('영화', 0.04693405),
 ('필요', 0.046399146),
 ('스릴러', 0.029394941),
 ('수작', 0.02696253),
 ('물', 0.024509482),
 ('걸작', 0.01568005),
 ('맛', 0.014427378)]

In [None]:
# Infer and print the dominant topic of a new, unseen review.
model.show_new_document_topic(["말이 필요없는 액션 영화. 스릴러물"])

말이 필요없는 액션 영화. 스릴러물 , topic id: 10 , prob: 0.6424094
