In [1]:
import konlpy
from konlpy.tag import Hannanum, Kkma, Komoran

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk

import re
import sys
import os
import time
import random
import pandas as pd
import numpy as np

In [2]:
#감정분석용 데이터 불러오기
train_data = pd.read_csv('data/english_train.csv',encoding='utf-8', engine='python', index_col=0)
test_data = pd.read_csv('data/english_test.csv', encoding='utf-8', engine='python',index_col=0)

In [3]:
# 전처리를 위해 데이터 합치기
whole_data = pd.concat([train_data, test_data]).reset_index(drop=True)

In [4]:
docs_tr = [re.sub(r"\%\$ ?\([^)]+\)-", "", x) for x in whole_data.reviews.values.tolist()]
docs_tr = [re.sub("\'", " ", x) for x in docs_tr]
docs_tr = [re.sub('\S*@\S*\s?', '', x) for x in docs_tr]
docs_tr = [re.sub('\s+', ' ', x) for x in docs_tr]
docs_tr = [re.sub('[^\w\s]','',x) for x in docs_tr]

In [5]:
whole_data.reviews = docs_tr

In [6]:
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Cho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
#토큰화
docs_tr = [word_tokenize(review) for review in whole_data.reviews.values.tolist()]

In [8]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/Cho/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# 표제어 추출
lemmatizer = WordNetLemmatizer()
lemm_docs_tr = [[lemmatizer.lemmatize(word) for word in doc] for doc in docs_tr]

In [10]:
nltk.download('stopwords'
)

[nltk_data] Downloading package stopwords to /Users/Cho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
#불용어 제거
stop_words = stopwords.words('english')
filtered_docs = [[w for w in doc if not w in stop_words] for doc in lemm_docs_tr]

In [12]:
whole_data.reviews =filtered_docs

In [13]:
tokens = [tok for sublist in filtered_docs for tok in sublist]

In [14]:
# nltk 객체생성
text=nltk.Text(tokens)

In [15]:
print(len(set(text.tokens)))

41166


In [16]:
print(text.vocab().most_common(10))

[('film', 10963), ('movie', 6854), ('one', 5756), ('wa', 4935), ('ha', 4747), ('character', 3853), ('like', 3651), ('time', 2849), ('get', 2785), ('scene', 2638)]


In [17]:
# TF-IDF 생성
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [18]:
train_docs = whole_data.reviews.values.tolist()[:len(train_data)]
test_docs = whole_data.reviews.values.tolist()[-len(test_data):]

In [19]:
train_docs = [' '.join(doc) for doc in train_docs]

In [20]:
test_docs = [' '.join(doc) for doc in test_docs]

In [21]:
train_x = vectorizer.fit_transform(train_docs)

In [22]:
test_x = vectorizer.transform(test_docs)

In [23]:
train_y = whole_data.sentiment.values.tolist()[:len(train_data)]
test_y = whole_data.sentiment.values.tolist()[-len(test_data):]

In [24]:
# SVM Linear 생성 성능이 제일 좋았음
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(train_x, train_y)
prediction_linear = classifier_linear.predict(test_x)

In [25]:
print(accuracy_score(test_y, prediction_linear))

0.905


# 모델 생성 완료 후 리뷰데이터 가져와서 학습된 모델로 감정분석해보기

In [26]:
# 리뷰 파일 가져오기

i7_csv = pd.read_csv('[Textmining]review_galaxy_iphone_use/GSMArena/data_from_GSMArena_i7_test.csv')

S8_csv = pd.read_csv('[Textmining]review_galaxy_iphone_use/GSMArena/data_from_GSMArena_S8_test.csv')

In [27]:
# 리뷰만 다 읽어와
# i[1:] 한 이유는 맨앞에 b를제거하기 위해서 엑셀에 보면 맨앞에 b가 붙어있음 byte데이터라서

i7_reviews = [ i[1:] for i in i7_csv['Text']]
S8_reviews = [ i[1:] for i in S8_csv['Text']]

In [28]:
# 전처리 특문 다 빼고 흔한단어 iphone, galaxy 같은거 뺌

i7_reviews_data = [re.sub(r"\%\$ ?\([^)]+\)-", "", x) for x in i7_reviews]
i7_reviews_data = [re.sub("\'", " ", x) for x in i7_reviews_data]
i7_reviews_data = [re.sub('\S*@\S*\s?', '', x) for x in i7_reviews_data]
i7_reviews_data = [re.sub('\s+', ' ', x) for x in i7_reviews_data]
i7_reviews_data = [re.sub('[^\w\s]','',x) for x in i7_reviews_data]
i7_reviews_data = [x.lower() for x in i7_reviews_data]
i7_reviews_data = [re.sub('(iphone|phone|apple|galaxy|android|samsung|7)','',x) for x in i7_reviews_data]

S8_reviews_data = [re.sub(r"\%\$ ?\([^)]+\)-", "", x) for x in S8_reviews]
S8_reviews_data = [re.sub("\'", " ", x) for x in S8_reviews_data]
S8_reviews_data = [re.sub('\S*@\S*\s?', '', x) for x in S8_reviews_data]
S8_reviews_data = [re.sub('\s+', ' ', x) for x in S8_reviews_data]
S8_reviews_data = [re.sub('[^\w\s]','',x) for x in S8_reviews_data]
S8_reviews_data = [x.lower() for x in S8_reviews_data]
S8_reviews_data = [re.sub('(iphone|phone|apple|galaxy|android|samsung|7)','',x) for x in S8_reviews_data]

In [29]:
# tokenize 띄어쓰기로 나눈다

i7_tokens = [word_tokenize(review) for review in i7_reviews_data]
S8_tokens = [word_tokenize(review) for review in S8_reviews_data]

In [30]:
# lemmatizer(표제어추출)

lemmatizer = WordNetLemmatizer()
i7_lemm = [[lemmatizer.lemmatize(word) for word in doc] for doc in i7_tokens]
S8_lemm = [[lemmatizer.lemmatize(word) for word in doc] for doc in S8_tokens]

In [31]:
# stopword 제거 불용어 제거하기 비동사나 관사가 빠진다

stop_words = stopwords.words('english')
filtered_i7 = [[w for w in doc if not w in stop_words] for doc in i7_lemm]
filtered_S8 = [[w for w in doc if not w in stop_words] for doc in S8_lemm]

In [32]:
# 감정분석에 사용했던 TF-IDF vectorizer를 가져와서 적용한다

i7_joined = [' '.join(doc) for doc in filtered_i7]
tfidf_i7 = vectorizer.transform(i7_joined)
S8_joined = [' '.join(doc) for doc in filtered_S8]
tfidf_S8 = vectorizer.transform(S8_joined)

In [33]:
i7_predict = classifier_linear.predict(tfidf_i7)
S8_predict = classifier_linear.predict(tfidf_S8)

In [34]:
i7_labeled = []
S8_labeled = []

for i in range(len(i7_reviews_data)):
    i7_labeled.append([i7_reviews[i], i7_predict[i]])
for i in range(len(S8_reviews_data)):
    S8_labeled.append([S8_reviews[i], S8_predict[i]])

In [35]:
# 감정분석 데이터 보기

i7_labeled = pd.DataFrame(i7_labeled, columns=['Review','Sentimental'])
S8_labeled = pd.DataFrame(S8_labeled, columns=['Review','Sentimental'])

In [36]:
i7_labeled

Unnamed: 0,Review,Sentimental
0,'Which is a better option 6s plus or iPhone 7D...,-1
1,'Which is a better option 6s plus or iPhone 77+',-1
2,'I want to buy 7 32 gb.. \r\nIs it ok guys... ...,1
3,'Only true on poor countries. When people are ...,-1
4,'Which is a better option 6s plus or iPhone 7',-1
...,...,...
3175,"""It can't be a hexacore. Highest will be a qua...",-1
3176,"'The size of this phone can\'t be 4.7"". We sho...",-1
3177,'I buy lumia 640 for $20 clearance runs like ...,-1
3178,'720p display in 2016 .. Hahahhaha i am writin...,-1


In [37]:
S8_labeled[S8_labeled.Sentimental==1]

Unnamed: 0,Review,Sentimental
7,"""iPhones are known for future proof n long las...",1
11,"""True S8 is the best looking phone with all th...",1
14,"""Bro, even the fingerprint scanner can be hack...",1
22,'using this device about 7 days onword. no iss...,1
24,'Samsung should also work in compact mini phon...,1
...,...,...
2874,"""jaja plase don't be so stupid first of all 24...",1
2875,'2 days after buying the s7 now they planned t...,1
2877,'after oneplus 3 lunch samsung wants to sale s...,1
2880,"""Still no signs of IR Blaster! What's wrong wi...",1


In [38]:
# Convert the dataframe to an XlsxWriter Excel object.
i7_labeled.to_excel(r'./Review_senti_i7.xlsx')
# Convert the dataframe to an XlsxWriter Excel object.
S8_labeled.to_excel(r'./Review_senti_S8.xlsx')