# **News Media**

In [None]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# DB 연결
# !pip install pymysql
import pymysql

# 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

# 텍스트 분석
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score

# 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Read Data**

### **① KOSELF 감성 어휘 사전**

In [None]:
def _load_word_list(path):
    """Return the lines of the UTF-8 text file at *path*, newline characters removed."""
    with open(path, encoding='utf-8') as handle:
        return [line.replace('\n', '') for line in handle]


# Basic Korean positive / negative word lists taken from a blog
positive_blog = _load_word_list('positive_words_self.txt')
negative_blog = _load_word_list('negative_words_self.txt')

# KOSELF sentiment lexicon
positive = _load_word_list('KOSELF_pos.txt')
negative = _load_word_list('KOSELF_neg.txt')

### **② News Data from DB**

In [None]:
# db = pymysql.connect(user='root',
#                      passwd='1234',
#                      host='3.35.70.166',
#                      db='proj',
#                      charset='utf8')

# cursor = db.cursor(pymysql.cursors.DictCursor)

# # 전체 종목 뉴스 데이터
# corp_list = ['samsung', 'hyundai', 'lg', 'sk']
# stock_num_list = ['005930', '005380', '051910', '000660']
# for i in range(len(corp_list)):
#     sql = "select * from maeil_news_craw_{} where length(date)=10".format(stock_num_list[i])
#     cursor.execute(sql)
#     result = cursor.fetchall()
    
#     # DataFrame으로 변경
#     globals()[corp_list[i]] = pd.DataFrame(result)

In [None]:
# Fallback for when the AWS server cannot be used: read the crawled news from local CSV files.
corp_list = ['samsung', 'hyundai', 'lg', 'sk']
stock_num_list = ['005930', '005380', '051910', '000660']

for corp_name, stock_code in zip(corp_list, stock_num_list):
    news = pd.read_csv('../../../../Code/Data/news_craw_{}.csv'.format(stock_code))
    # Keep the timestamp as text; later cells slice it by character position.
    news['date'] = news['date'].apply(str)
    # Publish the frame as a module-level variable named after the company.
    globals()[corp_name] = news

In [None]:
for corp_name in corp_list:
    news = globals()[corp_name]

    # The raw timestamp string moves to 'datetime'; 'date' is rebuilt below.
    news.rename(columns={'date': 'datetime'}, inplace=True)

    # Add a 'date' column (YYYY-MM-DD from the first eight characters) so all frames share one layout
    stamp = news['datetime']
    news['date'] = pd.to_datetime(stamp.str[0:4] + '-' + stamp.str[4:6] + '-' + stamp.str[6:8])

    # Drop rows with missing values (mostly redundant: already handled when the data was loaded)
    news = news.dropna()

    # Sort chronologically and renumber the rows from zero
    news.sort_values('datetime', inplace=True)
    news.reset_index(inplace=True, drop=True)

    globals()[corp_name] = news

In [None]:
# The samsung, hyundai, lg and sk DataFrames are now loaded

### **③ FinanceDataReader**

In [None]:
# Price-history window: from 2018-01-01 up to the day this cell is run.
start_date = '20180101'
end_date = datetime.datetime.now().strftime("%Y%m%d")

for corp_name, stock_code in zip(corp_list, stock_num_list):
    prices = fdr.DataReader(stock_code, start=start_date, end=end_date)
    # Turn the reader's index into an ordinary column and publish the frame as stock_<name>.
    globals()['stock_' + corp_name] = prices.reset_index()

In [None]:
# Largest and smallest value of the 'Change' column for each stock.
# The section headers previously read "Maximun" / "Minimun"; the spelling is fixed.
stock_frames = [
    ('SAMSUNG', stock_samsung),
    ('HYUNDAI', stock_hyundai),
    ('LG', stock_lg),
    ('SK', stock_sk),
]

print('####### Maximum #######')
for label, frame in stock_frames:
    # Pad the label to eight characters so the colons line up.
    print(label.ljust(8) + ':', frame['Change'].max())
print('\n')
print('####### Minimum #######')
for label, frame in stock_frames:
    print(label.ljust(8) + ':', frame['Change'].min())

In [None]:
# Closing-price history per stock, drawn side by side in a single figure.
# The figure is created once, before the loop: calling plt.figure() on every
# iteration opens a fresh figure each time, which leaves one subplot per
# figure instead of a 1 x N grid.
fig, axes = plt.subplots(1, len(corp_list), figsize=(60, 3), squeeze=False)

for ax, corp_name in zip(axes[0], corp_list):
    sns.lineplot(data=globals()['stock_' + corp_name], x='Date', y='Close', ax=ax)
    ax.set_title(corp_name + ' stock price')
    # Rotate the date labels so they do not overlap.
    ax.tick_params(axis='x', rotation=45)

plt.show()

### **④ Holidays**

In [None]:
# Load the holiday calendar from a local CSV file. Later cells read its
# 'date' and 'holiday' columns.
holidays = pd.read_csv('holidays.csv')

# Display the table for inspection.
holidays

In [None]:
# Parse the 'date' column into pandas datetimes.
holidays['date'] = pd.to_datetime(holidays['date'])

### **⑤ Stop Words**

In [None]:
# Korean stop-word list, one entry per line. Source:
# https://gist.github.com/spikeekips/40eea22ef4a89f629abd87eed535ac6a#file-stopwords-ko-txt
with open('stopwords-ko.txt', encoding='utf-8') as stopword_file:
    stop_words = [line.replace('\n', '') for line in stopword_file]

## **Preprocessing**

In [None]:
# 감성 어휘 사전 : negative / positive
# 뉴스 데이터 : samsung / hyundai / lg / sk
# 주식 데이터 : stock_samsung / stock_hyundai / stock_lg / stock_sk
# 공휴일 데이터 : holidays

### **① 뉴스 데이터 날짜 조정**
- <span style="color:blue">***전일 15시 ~ 금일 15시***</span>

In [None]:
# Add an upload-time column: the last two characters of the 'datetime' string
for corp_name in corp_list:
    news = globals()[corp_name]
    news['time'] = news['datetime'].str.slice(start=-2)

In [None]:
# Upload hours (two-character strings) that count as after the 15:00 cut-off.
after_market = ['15', '16', '17', '18', '19', '20', '21', '22', '23']

for corp_name in corp_list:
    news = globals()[corp_name]
    # Articles uploaded in one of the after-market hours are attributed to the
    # following day. A single boolean-mask assignment through .loc replaces
    # the row-by-row `news['date'][j] += ...` chained assignment, which pandas
    # does not guarantee to write back to the frame and which is slow.
    late = news['time'].isin(after_market)
    news.loc[late, 'date'] = news.loc[late, 'date'] + datetime.timedelta(days=1)

In [None]:
# Text clean-up: remove newline, tab and carriage-return characters from the
# article text.
# - regex=True is passed explicitly: since pandas 2.0 Series.str.replace
#   treats the pattern as a literal string by default, so the character class
#   would never match anything.
# - The '|' characters were dropped from the class: inside [...] they are
#   literal pipes, not alternation.
for corp_name in corp_list:
    news = globals()[corp_name]
    news['text'] = news['text'].str.replace(r'[\n\t\r]', '', regex=True)

### **② 주말 및 공휴일 제외**

In [None]:
# Keep the calendar rows whose 'holiday' column is "O", renumbered from zero.
is_closed = holidays['holiday'].eq("O")
market_closed = holidays.loc[is_closed].reset_index(drop=True)

market_closed

In [None]:
# Plain Python list of the market-closed dates
market_closed_list = market_closed['date'].tolist()

In [None]:
# Move dates that fall on market-closed days (weekends / holidays) forward
def stock_market_closed(df, closed_dates=None):
    """Roll every market-closed date in ``df['date']`` forward to an open day.

    Each value of ``df['date']`` that appears in *closed_dates* is advanced one
    day at a time until it no longer does. The work is done with boolean masks
    in a loop, so no recursion (and no raised recursion limit) is needed, and
    the frame's index does not have to be 0..n-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``'date'`` column. It is modified in place.
    closed_dates : iterable of datetime-like, optional
        Dates on which the market is closed. Defaults to the module-level
        ``market_closed_list``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    """
    if closed_dates is None:
        closed_dates = market_closed_list
    closed = pd.to_datetime(list(closed_dates))

    on_closed_day = df['date'].isin(closed)
    # The closed-date list is finite, so every date eventually moves past it
    # and the loop terminates.
    while on_closed_day.any():
        df.loc[on_closed_day, 'date'] = df.loc[on_closed_day, 'date'] + pd.Timedelta(days=1)
        on_closed_day = df['date'].isin(closed)
    return df

In [None]:
# Raise the interpreter's maximum recursion depth to 15000. The setting is
# process-wide and stays in effect for every cell executed afterwards.
limit_number = 15000
sys.setrecursionlimit(limit_number)

In [None]:
# Apply the market-closed date adjustment to each company's news frame (modified in place)
for corp_name in corp_list:
    stock_market_closed(globals()[corp_name])

In [None]:
# Spot check: show any Samsung rows whose 'date' is still 2021-08-29.
samsung[samsung['date']=='2021-08-29']

In [None]:
# Spot check: show any Hyundai rows whose 'date' is still 2021-08-29.
hyundai[hyundai['date']=='2021-08-29']

In [None]:
# Spot check: show any LG rows whose 'date' is still 2021-08-29.
lg[lg['date']=='2021-08-29']

In [None]:
# Spot check: show any SK rows whose 'date' is still 2021-08-29.
sk[sk['date']=='2021-08-29']

In [None]:
# Display the Samsung news frame for inspection.
samsung

In [None]:
# # 연습용 데이터 저장
# samsung.to_csv('../../../../Code/Data/test_samsung.csv', index=False)
# hyundai.to_csv('../../../../Code/Data/test_hyundai.csv', index=False)
# lg.to_csv('../../../../Code/Data/test_lg.csv', index=False)
# sk.to_csv('../../../../Code/Data/test_sk.csv', index=False)

## **Sentiment Analysis**

### **① 종목별 긍부정 Score 계산**

In [None]:
# Keep an independent copy of each news frame under the name <company>_test
for corp_name in corp_list:
    source = globals()[corp_name]
    globals()[corp_name + '_test'] = source.copy()

In [None]:
# Score every article with the KOSELF sentiment lexicon:
# +1 for each noun found in `positive`, -1 for each noun found in `negative`
# (a noun present in both lists counts as positive, as before).
#
# Changes from the row-by-row version:
# - the tokenizer is built once instead of once per company;
# - the lexicons are converted to sets so each lookup is O(1);
# - the whole 'score' column is assigned at once instead of through the
#   chained assignment `df['score'][x] = ...`, which pandas does not
#   guarantee to write back to the frame.
tokenizer = Okt()
positive_set = set(positive)
negative_set = set(negative)

for corp_name in corp_list:
    news = globals()[corp_name]
    scores = []
    for text in news['text']:
        nouns = tokenizer.nouns(text)
        scores.append(sum(
            1 if noun in positive_set else -1 if noun in negative_set else 0
            for noun in nouns
        ))
    news['score'] = scores

In [None]:
# Score every article in the <company>_test frames with the blog word lists:
# +1 for each noun found in `positive_blog`, -1 for each noun found in
# `negative_blog` (a noun present in both lists counts as positive, as before).
#
# Changes from the row-by-row version:
# - the tokenizer is built once instead of once per company;
# - the word lists are converted to sets so each lookup is O(1);
# - the whole 'score' column is assigned at once instead of through the
#   chained assignment `df['score'][x] = ...`, which pandas does not
#   guarantee to write back to the frame.
tokenizer = Okt()
positive_blog_set = set(positive_blog)
negative_blog_set = set(negative_blog)

for corp_name in corp_list:
    news_test = globals()[corp_name + '_test']
    scores = []
    for text in news_test['text']:
        nouns = tokenizer.nouns(text)
        scores.append(sum(
            1 if noun in positive_blog_set else -1 if noun in negative_blog_set else 0
            for noun in nouns
        ))
    news_test['score'] = scores

### **② 주식가격 데이터와 결합**

In [None]:
for corp_name in corp_list:
    # Left-join each article with the stock row whose 'Date' equals the
    # article's 'date'; the duplicate key column is dropped afterwards.
    labeled = pd.merge(
        globals()[corp_name],
        globals()['stock_' + corp_name],
        how='left',
        left_on='date',
        right_on='Date',
    )
    labeled.drop('Date', axis=1, inplace=True)

    change = labeled['Change']

    # UpDown: 1 when Change is positive, -1 when negative, 0 otherwise
    # (rows without a matching stock row have a missing Change and get 0).
    labeled['UpDown'] = np.where(change > 0, 1, np.where(change < 0, -1, 0))

    # Extremely_Changed: 1 for a positive Change above the 95th percentile,
    # -1 for a negative Change below the 5th percentile, 0 otherwise.
    # Percentiles are taken over the merged (per-article) rows.
    upper_cut = change.quantile(.95)
    lower_cut = change.quantile(.05)
    extreme_rise = (change > upper_cut) & (change > 0)
    extreme_fall = (change < lower_cut) & (change < 0)
    labeled['Extremely_Changed'] = np.where(extreme_rise, 1, np.where(extreme_fall, -1, 0))

    globals()[corp_name + '_label'] = labeled

In [None]:
# samsung_label, hyundai_label, lg_label, sk_label 생성

In [None]:
# Display the last three rows of the labelled Samsung frame.
samsung_label.tail(3)

In [None]:
# 5th-percentile value of 'Change' in the labelled Samsung frame.
samsung_label['Change'].quantile(.05)

In [None]:
# Shape of the rows lying above the 95th / below the 5th percentile of 'Change', per company.
label_frames = (
    ('SAMSUNG :', samsung_label),
    ('HYUNDAI :', hyundai_label),
    ('LG      :', lg_label),
    ('SK      :', sk_label),
)

print('### Change 상위 5% ###')
for tag, frame in label_frames:
    upper_cut = frame['Change'].quantile(.95)
    print(tag, frame[frame['Change'] > upper_cut].shape)
print('\n')
print('### Change 하위 5% ###')
for tag, frame in label_frames:
    lower_cut = frame['Change'].quantile(.05)
    print(tag, frame[frame['Change'] < lower_cut].shape)

In [None]:
# Class balance of the UpDown label for each company.
for banner, frame in (
    ('##### SAMSUNG #####\n', samsung_label),
    ('##### HYUNDAI #####\n', hyundai_label),
    ('#####   LG    #####\n', lg_label),
    ('#####   SK    #####\n', sk_label),
):
    print(banner, frame['UpDown'].value_counts())

### **③ 전체 종목 DataFrame 통합**

In [None]:
# Stack the four labelled frames row-wise into one DataFrame (each frame keeps its own row index).
total_label = pd.concat(
    objs=[samsung_label, hyundai_label, lg_label, sk_label],
    axis='index',
)

total_label.columns

In [None]:
# Split into train and test sets: 30% held out, fixed seed for reproducibility
features = total_label['text']
target = total_label['UpDown']
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

### **④ TF-IDF 기반 벡터화 및 모델 학습**

In [None]:
# Text-classification pipeline: TF-IDF features (using the stop-word list) fed into logistic regression
steps = [
    ('tfidf_vect', TfidfVectorizer(stop_words=stop_words)),
    ('lr_clf', LogisticRegression()),
]
pipeline = Pipeline(steps)

# Grid keys follow "<pipeline step name>__<parameter name>" (two underscores),
# which is how GridSearchCV addresses parameters of objects inside a Pipeline.
params = dict(
    tfidf_vect__ngram_range=[(1, 1), (1, 2), (1, 3)],
    tfidf_vect__max_df=[100, 300, 700],
    lr_clf__C=[1, 5, 10],
)

# The Pipeline object itself is handed to GridSearchCV as the estimator
grid_cv_pipe = GridSearchCV(estimator=pipeline, param_grid=params, scoring='accuracy', cv=3, verbose=1)
grid_cv_pipe.fit(X_train, y_train)
best_params = grid_cv_pipe.best_params_
best_score = grid_cv_pipe.best_score_
print(' <1> parameters :', best_params, '\n', '<2> best score :', best_score)

# Accuracy of the best pipeline found by the search on the held-out test set
pred = grid_cv_pipe.predict(X_test)
test_accuracy = accuracy_score(y_test, pred)
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(test_accuracy))