In [1]:
# statsmodels : 통계 추정용 모델 패키지
# seaborn : matplotlib 기반, 시각화 패키지
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import scipy as sp
import matplotlib as mpl

In [3]:
!pip install wget
import wget

# wget : 파일 다운로드용 패키지
# 데이터 : 네이버 영화 리뷰 모음 txt 
url1 = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt'
wget.download(url1)

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=e429bd3cbe3e8aa49dadde1d3958f51253ae1237a945e5132af3d2097b4107a9
  Stored in directory: /home/ehdals5744/.cache/pip/wheels/01/46/3b/e29ffbe4ebe614ff224bad40fc6a5773a67a163251585a13a9
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


'ratings_train.txt'

In [6]:
url2 = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt'
wget.download(url2)

'ratings_test (2).txt'

In [7]:
import codecs

# 파일 열어서 \t으로 구분 -> 한 줄씩 읽는 형식으로 dataframe 저장
with codecs.open("ratings_train.txt", encoding='utf-8') as f:
    data = [line.split('\t') for line in f.read().splitlines()]
    data = data[1:]   # header 제외

In [None]:
# data 형식 확인
# 72	['5679106', '졸작', '0'] -> 한 row에 id, 리뷰, label 저장됨
data

In [10]:
from pprint import pprint
pprint(data[72])

['5679106', '졸작', '0']


In [11]:
# zip(*data)를 통해 데이터 분리후 X, y에 저장
X = list(zip(*data))[1] # 리뷰
y = np.array(list(zip(*data))[2], dtype=int) # label을 int형태로 저장

In [None]:
# 데이터를 다항 나이브 베이즈 모형으로 학습

In [12]:
# CountVectorizer : 단어 등장 빈도를 벡터화
# MultinominalNB : 단어 등장 빈도 -> 다항 Naive Bayes로 학습
# Pipeline : 모델 전처리 및 설정을 한번에
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

model1 = Pipeline([
    ('vect', CountVectorizer()),
    ('mb', MultinomialNB()),
])

In [13]:
# 생성했던 model 인스턴스 학습
model1.fit(X, y)

In [14]:
import codecs

# test 데이터셋 생성
with codecs.open("ratings_test (2).txt", encoding='utf-8') as f:
    data_test = [line.split('\t') for line in f.read().splitlines()]
    data_test = data_test[1:]   # header 제외

In [16]:
X_test = list(zip(*data_test))[1]
y_test = np.array(list(zip(*data_test))[2], dtype=int)

print(classification_report(y_test, model1.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.82     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [None]:
# Tfidf 방법을 사용했을 때와 비교

In [17]:
# 단순 Count -> TF-IDF 형식으로 변경
from sklearn.feature_extraction.text import TfidfVectorizer

model2 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('mb', MultinomialNB()),
])

In [18]:
model2.fit(X, y)

In [21]:
# accuracy 동일
print(classification_report(y_test, model2.predict(X_test)))

              precision    recall  f1-score   support

           0       0.81      0.84      0.83     24827
           1       0.84      0.81      0.83     25173

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [None]:
# 형태소 분석기를 사용한 결과와 비교

In [22]:
# konlpy 설치
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading JPype1-1.5.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.9/488.9 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [23]:
# 3번째 방법으로는 konlpy의 Okt(형태소 분석 tool) -> countvectorizer
from konlpy.tag import Okt
pos_tagger = Okt()

def tokenize_pos(doc):
    return ['/'.join(t) for t in pos_tagger.pos(doc)]

In [24]:
model3 = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('mb', MultinomialNB()),
])

In [25]:
# 학습 진행 -> 8분 소요
model3.fit(X, y)



In [26]:
# 1, 2번째 방법보다 accuracy 상승
print(classification_report(y_test, model3.predict(X_test)))

              precision    recall  f1-score   support

           0       0.85      0.86      0.85     24827
           1       0.86      0.85      0.85     25173

    accuracy                           0.85     50000
   macro avg       0.85      0.85      0.85     50000
weighted avg       0.85      0.85      0.85     50000



In [None]:
# (1,2)-gram 을 사용하면 성능이 더 개선

In [27]:
# n-gram 개념 도입 -> 연속적인 text의 흐름 반영
model4 = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=tokenize_pos, ngram_range=(1, 2))),
    ('mb', MultinomialNB()),
])

In [28]:
# 학습 진행 -> 8분 30초 소요 (2-gram 까지는 학습 소요 시간이 비슷한 것으로 추정)
model4.fit(X, y)



In [29]:
# accuracy 더욱 상승
print(classification_report(y_test, model4.predict(X_test)))

              precision    recall  f1-score   support

           0       0.86      0.87      0.87     24827
           1       0.87      0.86      0.87     25173

    accuracy                           0.87     50000
   macro avg       0.87      0.87      0.87     50000
weighted avg       0.87      0.87      0.87     50000

