# **TF-IDF & Logistic**

In [1]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# DB 연결
# !pip install pymysql
import pymysql

# 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

# 텍스트 분석
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score

# 모델 저장 및 로드
import joblib

# 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Read Data**

In [2]:
# Load the two news CSVs (one frame per file, in the order listed below).
news_18to20, news_2021 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../../../Code/Data/news_18to20.csv',
        '../../../../Code/Data/news_2021.csv',
    )
)

# Show (rows, columns) of both frames as the cell output.
news_18to20.shape, news_2021.shape

((52490, 18), (12668, 18))

In [3]:
# Korean stop-word list, one entry per line. Source:
# https://gist.github.com/spikeekips/40eea22ef4a89f629abd87eed535ac6a#file-stopwords-ko-txt
with open('stopwords-ko.txt', encoding='utf-8') as stopword_file:
    # Iterate the file line by line and drop the newline from each entry.
    stop_words = [line.replace('\n', '') for line in stopword_file]

## **Sentiment Analysis**

### **TF-IDF 기반 벡터화 및 모델 학습**

In [4]:
# (rows, columns) of the news_18to20 frame before any cleaning.
news_18to20.shape

(52490, 18)

In [5]:
# Number of missing values in each column.
news_18to20.isna().sum()

st_n                 0
st_cd                0
news                 0
datetime             0
title                0
url                  0
text                 6
date                 0
time                 0
score                0
Open                 0
High                 0
Low                  0
Close                0
Volume               0
Change               0
UpDown               0
Extremely_Changed    0
dtype: int64

In [6]:
# Per-date count of rows whose 'Change' value is missing (an empty result means none are).
change_missing = news_18to20['Change'].isna()
news_18to20.loc[change_missing, 'date'].value_counts()

Series([], Name: date, dtype: int64)

In [7]:
# Display the rows whose article body ('text') is missing.
text_missing = news_18to20['text'].isna()
news_18to20.loc[text_missing]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,score,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed
22853,현대차,5380,매일경제,2019021914,"이시한, 취업계 2년 연속 트렌드 예측 경향성 소름 돋는 적중, ‘문어시한’이 떴다...",http://edu.mk.co.kr/edunews/news_view.php?sc=5...,,2019-02-19,14,0,119500,120500,119000,119500,470357,-0.008299,-1,0
22860,현대차,5380,매일경제,2019022011,"이시한이 말하는, 100:1이 일반화된 채용시장에서 살아남는 방법 - 인터뷰 #2",http://edu.mk.co.kr/edunews/news_view.php?sc=5...,,2019-02-20,11,0,119500,121500,119000,119000,735087,-0.004184,-1,0
26962,sk하이닉스,660,매일경제,2019112017,이시한의 2020년 취업트렌드코리아 #2 : Implement AI,http://edu.mk.co.kr/edunews/news_view.php?sc=5...,,2019-11-21,17,0,81700,82500,80500,80900,3346746,-0.021765,-1,0
50391,LG화학,51910,아시아경제,2020052013,"[속보]구광모 LG 회장, LG화학 대산공장 긴급 방문",https://view.asiae.co.kr/article/2020052013340...,,2020-05-20,13,0,352500,362500,351500,362500,358098,0.015406,1,0
50449,LG화학,51910,아시아경제,2020062309,"코로나 재확산 우려에 국내기업 비상 대응 ""할 수 있는 건 다한다""",https://view.asiae.co.kr/article/2020062211370...,,2020-06-23,9,0,512000,518000,500000,504000,516968,-0.003953,-1,0
52175,sk하이닉스,660,아시아경제,2020102008,"[속보]SK하이닉스, 인텔 낸드 사업 10조3000억원에 인수",https://view.asiae.co.kr/article/2020102008583...,,2020-10-20,8,0,85300,90900,83700,85200,6622637,-0.017301,-1,0


In [8]:
# Drop every row that contains at least one missing value.
news_18to20 = news_18to20.dropna(axis=0)

In [9]:
# Split into train and test sets: 20% held out, fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    news_18to20['text'],
    news_18to20['UpDown'],
    test_size=0.2,
    random_state=0,
)

# Shapes in the order: X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((41987,), (41987,), (10497,), (10497,))

In [None]:
# Chain TF-IDF vectorization and logistic regression so both are tuned in one search.
tfidf_vect = TfidfVectorizer(stop_words=stop_words)
lr_clf = LogisticRegression()
pipeline = Pipeline(steps=[('tfidf_vect', tfidf_vect), ('lr_clf', lr_clf)])

# Grid keys follow the "<step name>__<parameter>" convention (double underscore)
# so GridSearchCV can route each value to the right pipeline step.
# 3 x 3 x 3 = 27 candidate combinations.
params = {
    'tfidf_vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf_vect__max_df': [100, 300, 700],
    'lr_clf__C': [1, 5, 10]
}

# The whole Pipeline (not a bare estimator) is handed to GridSearchCV;
# 3-fold cross-validation, candidates ranked by accuracy.
grid_cv_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
grid_cv_pipe.fit(X_train, y_train)
print(' <1> parameters :', grid_cv_pipe.best_params_, '\n', '<2> best score :', grid_cv_pipe.best_score_)

# Score the refit best pipeline on the held-out test split.
pred = grid_cv_pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(test_accuracy))

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [None]:
# NOTE: the grid search above ran for about 5 hours.

In [None]:
# Save the fitted GridSearchCV object to disk with joblib.
# The same object is written twice, under a '.pkl' and a '.h5' file name.
# NOTE(review): the '.h5' file is presumably still a joblib pickle, not HDF5 — confirm.
joblib.dump(grid_cv_pipe, '../../../../Code/Model/TF-IDF(18to20).pkl')
joblib.dump(grid_cv_pipe, '../../../../Code/Model/TF-IDF(18to20).h5')

In [None]:
# Drop every row of the 2021 frame that contains at least one missing value.
news_2021 = news_2021.dropna(axis=0)

In [None]:
# Load the fitted grid search back from disk.
# Fix: the model is saved under 'Code/Model/', so it must be loaded from that
# same directory (the previous path omitted 'Model/' and pointed at a file
# that is never written).
# NOTE: joblib.load unpickles arbitrary objects — only load files you created or trust.
grid_cv_pipe = joblib.load('../../../../Code/Model/TF-IDF(18to20).h5')
pred = grid_cv_pipe.predict(news_2021['text'])

# Accuracy of the loaded model on the 2021 articles.
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(accuracy_score(news_2021['UpDown'], pred)))

In [None]:
# Display the predicted labels for the 2021 articles.
pred

In [None]:
# Tally how many times each label was predicted.
Counter(pred)