# 4.2 텍스트분류
###### 로지스틱회기, 랜덤포레스트를 통한 텍스트 분류
- 네이버 영화 리뷰 데이터 활용


In [None]:
import os

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import json

In [None]:

DATA_IN_PATH ='/content/drive/MyDrive/2021-1/AI데이터활용교재개발/자연어처리(텐서플로,머신러닝)/code/data/'
DATA_OUT_PATH = '/content/drive/MyDrive/2021-1/AI데이터활용교재개발/자연어처리(텐서플로,머신러닝)/code/data/'

INPUT_DATA = 'clean_nsmc.csv'

RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [None]:
data = pd.read_csv(DATA_IN_PATH + INPUT_DATA,index_col = False)


### TF-IDF + 로지스틱 회기

In [None]:
# 단어를 TF-IDF 벡터라이즈
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer=lambda x:x, sublinear_tf=True, ngram_range=(1,3), max_features=5000) 

X = vectorizer.fit_transform(data['review'])
sentiments = list(data['sentiment'])

print(X.shape)

(200000, 2667)


In [None]:
# 학습데이터, validation데이터 나눔
from sklearn.model_selection import train_test_split
import numpy as np

RANDOM_SEED = 42
TEST_SPLIT = 0.2

y = np.array(sentiments)
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [None]:
# 로지스틱 회기 모듈 import, 학습실행
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight = 'balanced', max_iter=300)
lgs.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=300, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# 성능 확인
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score

precision = precision_score( y_eval,lgs.predict(X_eval))
recall = recall_score(y_eval,lgs.predict(X_eval))
print(precision)
print(recall)

print(classification_report(y_eval, lgs.predict(X_eval)))

0.7916520694659926
0.7841951316246096
              precision    recall  f1-score   support

           0       0.78      0.79      0.79     19829
           1       0.79      0.78      0.79     20171

    accuracy                           0.79     40000
   macro avg       0.79      0.79      0.79     40000
weighted avg       0.79      0.79      0.79     40000



In [None]:
pd.DataFrame({"X_eval" : X_eval, "predict" : lgs.predict(X_eval), "answer" : y_eval})

Unnamed: 0,X_eval,predict,answer
0,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",1,1
1,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",1,1
2,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",1,1
3,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",0,0
4,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",0,0
...,...,...,...
39995,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",0,0
39996,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",1,1
39997,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",1,1
39998,"(0, 1328)\t0.6399530946535764\n (0, 1717)\t...",0,0


### Word2Vec + Logistic 회기

In [None]:
reviews = list(map(eval,data['review']))
sentence_reviews = []
for review in reviews:
    sentence_reviews.append(' '.join(review))

In [None]:
num_features = 100 
min_word_count =100   
num_workers = 4       
context = 10          
downsampling = 1e-3

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
   level=logging.INFO)

In [None]:
# 워드 투벡터 모델 학습
from gensim.models import word2vec


model = word2vec.Word2Vec(reviews, workers=num_workers, \
           size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

In [None]:
model_name = "100minwords"
model.save(INPUT_DATA + model_name)

2021-04-29 09:06:44,076 : INFO : saving Word2Vec object under clean_nsmc.csv100minwords, separately None
2021-04-29 09:06:44,081 : INFO : not storing attribute vectors_norm
2021-04-29 09:06:44,084 : INFO : not storing attribute cum_table
2021-04-29 09:06:44,123 : INFO : saved clean_nsmc.csv100minwords


In [None]:
def get_features(words, model, num_features):
    feature_vector = np.zeros((num_features),dtype=np.float32)
    # 0으로 나누지 않게 하기 위해 처음부터 1로 초기화
    num_words = 1
    # 어휘 사전 준비
    index2word_set = set(model.wv.index2word)

    for w in words:
        if w in index2word_set:
            num_words += 1
            # 사전에 해당하는 단어에 대해 단어 벡터를 더함
            feature_vector = np.add(feature_vector, model[w])

    # 문장의 단어 수만큼 나누어 단어 벡터의 평균값을 문장 벡터로 함
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [None]:
# 모든 리뷰 데이터에 대해 문장의 벡터를 구하는 코드
def get_dataset(reviews, model, num_features):
    dataset = list()

    for s in reviews:
        dataset.append(get_features(s, model, num_features))

    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

In [None]:
train_data_vecs = get_dataset(reviews, model, num_features)

  if sys.path[0] == '':


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

X = train_data_vecs
y = np.array(sentiments)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [None]:
# W2V + 로지스틱 회기모델 학습
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced', max_iter = 300)
lgs.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=300, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# 성능 확인
print(classification_report(y_test, lgs.predict(X_test)))

              precision    recall  f1-score   support

           0       0.79      0.80      0.79     19829
           1       0.80      0.79      0.80     20171

    accuracy                           0.80     40000
   macro avg       0.80      0.80      0.80     40000
weighted avg       0.80      0.80      0.80     40000



### 랜덤포레스트 분류모델

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word", max_features = 5000) 
data_features = vectorizer.fit_transform(sentence_reviews)

In [None]:
TEST_SIZE = 0.2
RANDOM_SEED = 42

In [None]:
train_input, eval_input, train_label, eval_label = train_test_split(data_features, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 랜덤 포레스트 분류기에  100개 의사 결정 트리를 사용한다.
forest = RandomForestClassifier(n_estimators = 100) 
# 단어 묶음을 벡터화한 데이터와 정답 데이터를 가지고 학습을 시작한다.
forest.fit( train_input, train_label )

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
print("Accuracy: %f" % forest.score(eval_input, eval_label))  # 검증함수로 정확도 측정

Accuracy: 0.775850
