# 네이버 영화 리뷰를 활용한 감정 분석

> 1) KoNLPy, 2) TF-IDF를 활용한 임베딩

- 가능하면 해당 데이터를 기반으로 간단한 웹 페이지도 만들어보자
- NLP 주요 라이브러리 설치
    - `pip install konlpy`
    - `pip install joblib`

In [1]:
import numpy as np
import pandas as pd
import re

from konlpy.tag import Okt

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

## 데이터 불러오기

In [2]:
# Load the train/test review files; both are parsed as tab-delimited text.
train_df = pd.read_csv("data/ratings_train.txt", sep="\t")
test_df = pd.read_csv("data/ratings_test.txt", sep="\t")

## 전처리

In [3]:
# Replace every missing value with a single space so later string
# processing never receives NaN.
train_df, test_df = (frame.fillna(" ") for frame in (train_df, test_df))

In [4]:
# Matches one or more consecutive digit characters.
_DIGIT_RUN = re.compile(r"\d+")


def _blank_out_digits(review):
    """Return *review* with each run of digits replaced by a single space."""
    return _DIGIT_RUN.sub(" ", review)


# Apply the same digit removal to the review text of both splits.
train_df["document"] = train_df["document"].apply(_blank_out_digits)
test_df["document"] = test_df["document"].apply(_blank_out_digits)

## 임베딩

In [5]:
# Single shared Okt (KoNLPy) tagger instance; tw_tokenzier calls its morphs() method.
okt = Okt()

def tw_tokenzier(text):
    """Tokenize *text* with the module-level Okt tagger.

    Returns whatever ``okt.morphs(text)`` produces; this function is handed
    to ``TfidfVectorizer`` as its ``tokenizer`` callable.
    """
    return okt.morphs(text)

In [6]:
# Unigram + bigram TF-IDF features over Okt tokens.
# min_df=3 drops terms that appear in fewer than three documents;
# max_df=0.9 drops terms that appear in more than 90% of documents.
tfidf_vect = TfidfVectorizer(tokenizer=tw_tokenzier, ngram_range=(1, 2), min_df=3, max_df=0.9)
# fit_transform learns the vocabulary/IDF weights and vectorizes in one pass,
# so each training document goes through the tokenizer once instead of twice
# (a separate fit() followed by transform() tokenizes the whole corpus again).
# tfidf_vect is left fitted and can still transform other data afterwards.
tfidf_matrix_train = tfidf_vect.fit_transform(train_df["document"])



## 학습

In [None]:
# Fit a logistic-regression classifier on the training TF-IDF matrix,
# using the "label" column as the target.
lr = LogisticRegression(random_state=42, C=3.5)
lr.fit(X=tfidf_matrix_train, y=train_df["label"])

## 예측

In [None]:
# Only transform here: the vectorizer was fitted on the training documents,
# so the test documents are projected onto that same vocabulary.
tfidf_matrix_test = tfidf_vect.transform(raw_documents=test_df["document"])

In [None]:
# Predict a label for every test row, then compare against the true labels.
# The score is left as the final bare expression so the cell displays it.
preds = lr.predict(X=tfidf_matrix_test)
accuracy_score(y_true=test_df["label"], y_pred=preds)