# **Sentiment Analysis(TF-IDF)**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
# fasttext.util.download_model('ko', if_exists='ignore')   # FastText 모델 사용 시에만 필요
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Read Data**

In [2]:
stop_words = pd.read_csv("https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt")

In [3]:
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']

df_list = []
for i in range(len(corp_list)):
    globals()[corp_list[i]] = pd.read_csv('../../../../Final Data/{}. {}_score.csv'.format(i+1, corp_list[i]))
    df_list.append(globals()[corp_list[i]])

total = pd.concat(df_list, axis=0, ignore_index=True).reset_index(drop=True)

In [4]:
total.tail(1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,...,Close,Volume,Change,UpDown,Tokenization,Positive_Score,Negative_Score,Ratio,Pred,NSI
70556,셀트리온,68270,아시아경제,2021092617,"외국인, 2주 연속 '사자'…삼전 가장 많이 사들여",https://view.asiae.co.kr/article/2021092617353...,[아시아경제 송화정 기자]외국인이 국내 증시에서 2주 연속 매수세를 나타냈다. 2주...,2021-09-27,17,272000,...,267500,512408,-0.025501,-1,아시아 경제 송화 기자 외국인 국내 증시 연속 연속 전자 성전 코스피 증권 정보 현...,7,5,0.583333,1,116.666667


In [5]:
total.isna().sum()

st_n              0
st_cd             0
news              0
datetime          0
title             0
url               0
text              0
date              0
time              0
Open              0
High              0
Low               0
Close             0
Volume            0
Change            0
UpDown            0
Tokenization      0
Positive_Score    0
Negative_Score    0
Ratio             0
Pred              0
NSI               0
dtype: int64

## **TF-IDF Vetorizing & Logistic Regression**

In [6]:
total.shape

(70557, 22)

In [7]:
train = total[(total['date'].str[:4]=='2018') | (total['date'].str[:4]=='2019') | (total['date'].str[:4]=='2020')].reset_index(drop=True)
test = total[total['date'].str[:4]=='2021'].reset_index(drop=True)

In [8]:
# Training-Validation Set 분리
X_train, X_test, y_train, y_test = train_test_split(train['Tokenization'], train['Pred'], test_size=0.3, random_state=0)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((39231,), (39231,), (16814,), (16814,))

In [9]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words=stop_words)),
    ('lr_clf', LogisticRegression())
])

# Pipeline에 기술된 각각의 객체 변수에 언더바(_) 2개를 연달아 붙여 GridSearchCV에 사용될 파라미터/하이퍼파라미터 이름과 값을 설정
params = {
    'tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf_vect__max_df': [100, 300, 700],
    'lr_clf__C': [1, 5, 10]
}

# GridSearchCV의 생성자에 Estimator가 아닌 Pipeline 객체 입력
grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)   # scoring='accuracy'
grid_cv_pipe.fit(X_train, y_train)
print(' <1> parameters :', grid_cv_pipe.best_params_, '\n', '<2> best score :', grid_cv_pipe.best_score_)

pred = grid_cv_pipe.predict(X_test)
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# 모델 저장
joblib.dump(grid_cv_pipe, '../../../../Final Data/TF-IDF(Pos-Neg).h5')

In [None]:
# 모델 로드
grid_cv_pipe = joblib.load('../../../../Final Data/TF-IDF(Pos-Neg).h5')
pred = grid_cv_pipe.predict(test['Tokenization'])

# Score 확인
print('Accuracy  : {0:.3f}'.format(accuracy_score(test['Pred'], pred)))
print('Precision : {0:.3f}'.format(precision_score(test['Pred'], pred)))
print('Recall    : {0:.3f}'.format(recall_score(test['Pred'], pred)))
print('F1-Score  : {0:.3f}'.format(f1_score(test['Pred'], pred)))

In [None]:
pred

In [None]:
Coundter(pred)

In [None]:
y_test['Pred'].value_counts()