In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

In [21]:
# Load the tab-separated file ./data/ratings_test.txt into a DataFrame
# and preview its first rows.
test_df = pd.read_csv('./data/ratings_test.txt', sep="\t")
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


In [22]:
# Show column dtypes and non-null counts to spot missing values.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        50000 non-null  int64 
 1   document  49997 non-null  object
 2   label     50000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.1+ MB


In [23]:
# Drop every row that has a missing value in any column (rebinds test_df).
test_df = test_df.dropna()

In [24]:
# Re-check non-null counts after dropping rows with missing values.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49997 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        49997 non-null  int64 
 1   document  49997 non-null  object
 2   label     49997 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.5+ MB


In [6]:
import re
from konlpy.tag import Mecab

In [31]:
# Notebook-level Mecab tagger instance; tokenizer() calls mecab.morphs on it.
mecab = Mecab()

In [7]:
def text_clean(x):
    """Keep only Hangul syllables, ASCII letters and digits in ``x``.

    Every character outside ``가-힣``, ``a-z``, ``A-Z`` and ``0-9`` (this
    includes punctuation, whitespace and standalone jamo such as ``ㅋ``) is
    replaced by a space; runs of spaces are then collapsed to a single space
    and leading/trailing spaces are removed.

    Args:
        x: The raw text to clean (a ``str``).

    Returns:
        The cleaned string, with the kept pieces separated by exactly one
        space. Empty if ``x`` contains no character that is kept.
    """
    # Keep only Hangul syllables, ASCII letters and digits; blank out the rest.
    cleaned = re.sub(r'[^가-힣a-zA-Z0-9]', " ", x)
    # split()/join collapses space runs of ANY length and trims both ends.
    # Applying replace("  ", " ") twice left a double space whenever five or
    # more consecutive characters were removed (e.g. ".... ").
    return " ".join(cleaned.split())

In [33]:
def tokenizer(text):
    """Tokenize ``text`` with the notebook-level ``mecab`` tagger.

    Args:
        text: The string handed to ``mecab.morphs``.

    Returns:
        Whatever ``mecab.morphs(text)`` returns.
    """
    return mecab.morphs(text)

# document 컬럼 전처리(특수문자 제거)

In [25]:
# Run text_clean on every entry of the 'document' column and write the
# result back into that column.
test_df.loc[:, 'document'] = test_df['document'].apply(text_clean)

In [26]:
# Preview the first rows after cleaning the 'document' column.
test_df.head()

Unnamed: 0,id,document,label
0,6270596,굳,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임 돈주고 보기에는,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데 왜 3D로 나와서 제 심기를 불편하게 하죠,0


In [27]:
# X: the 'document' column fed to the vectorizers below;
# y: the 'label' column used as ground truth in the reports.
X = test_df['document']
y = test_df['label']

In [4]:
import joblib

In [9]:
# Load the saved object from ./data/cv_mecab.joblib (used below via .transform).
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a
# trusted source. If the saved object was built with this notebook's
# tokenizer() -- TODO confirm -- that function must be defined before loading.
cv_mecab = joblib.load('./data/cv_mecab.joblib')

In [29]:
# Load the saved object from ./data/tfidf_mecab.joblib (used below via .transform).
tfidf_mecab = joblib.load('./data/tfidf_mecab.joblib')

In [32]:
# Turn the cleaned documents into features with each loaded vectorizer.
# Only transform() is called here; nothing is fitted in this notebook.
cv_mecab_X = cv_mecab.transform(X)
tfidf_mecab_X = tfidf_mecab.transform(X)

# CountVectorizer로 훈련한 모델

In [34]:
from sklearn.metrics import classification_report

In [10]:
# Load the saved classifier that is evaluated on the cv_mecab features below.
mnb1 = joblib.load('./data/mnb1_model.joblib')

In [35]:
# Predict labels from the cv_mecab features, then print the classification
# report comparing the true labels y with the predictions.
pred1 = mnb1.predict(cv_mecab_X)
print(classification_report(y, pred1))

              precision    recall  f1-score   support

           0       0.84      0.86      0.85     24826
           1       0.86      0.84      0.85     25171

    accuracy                           0.85     49997
   macro avg       0.85      0.85      0.85     49997
weighted avg       0.85      0.85      0.85     49997



# TfidfVectorizer로 훈련한 모델

In [37]:
# Load the saved classifier that is evaluated on the tfidf_mecab features below.
mnb2 = joblib.load('./data/mnb2_model.joblib')

In [38]:
# Predict labels from the tfidf_mecab features, then print the classification
# report comparing the true labels y with the predictions.
pred2 = mnb2.predict(tfidf_mecab_X)
print(classification_report(y, pred2))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86     24826
           1       0.87      0.84      0.86     25171

    accuracy                           0.86     49997
   macro avg       0.86      0.86      0.86     49997
weighted avg       0.86      0.86      0.86     49997

