In [1]:
# Mount Google Drive into the Colab runtime; an existing mount is reused
# rather than remounted (force_remount=False).
from google.colab import drive

mount_point = '/content/gdrive'
drive.mount(mount_point, force_remount=False)

Mounted at /content/gdrive


In [2]:
import nltk

# Tokenizer models used by sent_tokenize / word_tokenize.
# Older NLTK releases load 'punkt'; newer releases look up 'punkt_tab' instead,
# so both are requested. nltk.download() reports an unavailable package with a
# message and a False return value rather than raising, so the extra request is
# harmless on an older installation.
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import urllib.request
import zipfile
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize

### Data Load & Preprocessing

In [4]:
# Load the perfume data set from Google Drive into a DataFrame.
import numpy as np
import pandas as pd

dataset_path = '/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/sample_data.csv'
dataset = pd.read_csv(dataset_path)

In [5]:
# Build one corpus string containing every review, separated by single spaces.
# A single str.join over the column replaces the row-by-row re-join, which copied
# the whole accumulated string on every iteration. It also no longer relies on
# the DataFrame index being exactly 0..n-1 (dataset.loc[i, ...] did).
# Missing reviews (NaN) are skipped and remaining values are coerced to str,
# because str.join raises TypeError on non-string items.
corpus = ' '.join(dataset['review'].dropna().astype(str))

In [6]:
# Split the corpus into sentences with NLTK.
sent_text = sent_tokenize(corpus)

# Normalize every sentence: lowercase it, then replace each run of characters
# other than a-z / 0-9 (punctuation, whitespace, symbols) with a single space.
non_alnum_run = re.compile(r"[^a-z0-9]+")
normalized_text = [non_alnum_run.sub(" ", sentence.lower()) for sentence in sent_text]

In [7]:
# Sentence splitting and normalization (`sent_text`, `normalized_text`) are done
# once in the preceding cell; the identical pass that was repeated here has been
# dropped, so this cell only loads the stop words and word-tokenizes.

# Load the stop-word list, one entry per line.
# The `with` block closes the file even if reading fails part-way.
with open("/content/gdrive/MyDrive/Colab Notebooks/deeplearning_NLP/perfume/stopword_sample.txt", "r") as f:
    stripped_lines = [line.strip() for line in f]

# NOTE(review): the original filter is kept unchanged -- it skips entries that
# are exactly one character long but keeps the empty strings produced by blank
# lines. Confirm whether `len(line) != 0` was what was intended.
stop_words = [entry for entry in stripped_lines if len(entry) != 1]

# Word-tokenize every normalized sentence with NLTK.
result = [word_tokenize(sentence) for sentence in normalized_text]

# Optional perfume-specific stop-word removal (left disabled).
# result = [[word for word in review if word not in stop_words] for review in result]

In [8]:
# Report how many tokenized samples (sentences) were produced.
sample_count = len(result)
print('총 샘플의 개수 : {}'.format(sample_count))

총 샘플의 개수 : 13161


In [9]:
# Print only the first three samples (fewer if there are not that many).
for sample_idx in range(min(3, len(result))):
    print(result[sample_idx])

['this', 'has', 'been', 'my', 'staple', 'since', '1994', 'when', 'i', 'was', 'in', 'high', 'school', 'still', 'love', 'this', 'today']
['the', 'green', 'tea', 'base', 'note', 'is', 'amazing']
['u', 'already', 'know']


## GloVe in Perfume Data

### Training GloVe

In [12]:
! pip install glove-python-binary

Collecting glove-python-binary
[?25l  Downloading https://files.pythonhosted.org/packages/cc/11/d8510a80110f736822856db566341dd2e1e7c3af536f77e409a6c09e0c22/glove_python_binary-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (948kB)
[K     |▍                               | 10kB 17.2MB/s eta 0:00:01[K     |▊                               | 20kB 20.9MB/s eta 0:00:01[K     |█                               | 30kB 24.4MB/s eta 0:00:01[K     |█▍                              | 40kB 28.1MB/s eta 0:00:01[K     |█▊                              | 51kB 30.1MB/s eta 0:00:01[K     |██                              | 61kB 25.2MB/s eta 0:00:01[K     |██▍                             | 71kB 24.9MB/s eta 0:00:01[K     |██▊                             | 81kB 25.0MB/s eta 0:00:01[K     |███                             | 92kB 24.4MB/s eta 0:00:01[K     |███▌                            | 102kB 25.4MB/s eta 0:00:01[K     |███▉                            | 112kB 25.4MB/s eta 0:00:01[K     |████

In [13]:
from glove import Corpus, Glove

# Build the co-occurrence matrix that GloVe trains on, using a context window of 5.
# The object is named `glove_corpus` so it does not overwrite `corpus`, the review
# text string that the preprocessing cells pass to sent_tokenize; re-running those
# cells after this one therefore keeps working.
glove_corpus = Corpus()
glove_corpus.fit(result, window=5)

# Train with 100 components for 20 epochs on 4 threads.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(glove_corpus.matrix, epochs=20, no_threads=4, verbose=True)

# Attach the corpus dictionary so the model can be queried by word.
glove.add_dictionary(glove_corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [28]:
# glove.most_similar() returns a list of (word, similarity) pairs for the words
# closest to the query word.
query_word = "patchouli"
model_result1 = glove.most_similar(query_word)
print(model_result1)

[('coconut', 0.9777404853748229), ('vanilla', 0.9607000615640804), ('amber', 0.9598875188035558), ('citrus', 0.959758655991852)]


## FastText in Perfume Data

In [29]:
from gensim.models import FastText

# Train FastText on the tokenized sentences.
# NOTE(review): `size=` is the gensim 3.x keyword for the vector dimension;
# gensim >= 4 spells it `vector_size` -- confirm against the installed version.
model = FastText(
    result,
    size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [30]:
# Query with a deliberately misspelled word ("patchoulii"): FastText still
# computes and returns similar words for it.
misspelled_query = "patchoulii"
model.wv.most_similar(misspelled_query)

[('patchouli', 0.9973238110542297),
 ('benzoin', 0.960613489151001),
 ('patch', 0.9547777771949768),
 ('chocolate', 0.9479886293411255),
 ('ambergris', 0.9479672312736511),
 ('combo', 0.9436361789703369),
 ('undertone', 0.9431796073913574),
 ('combines', 0.9403115510940552),
 ('tonka', 0.9400050640106201),
 ('amber', 0.9394311308860779)]