# **Preprocessing Program**

In [7]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# DB 연결
# !pip install pymysql
import pymysql

# DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

# 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

# 텍스트 분석
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

# 모델 저장 및 로드
import joblib

# 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Develop Preprocessing Program**

In [2]:
def arguments():
    """Interactively collect the media source and the date range.

    Prompts (via ``input``) for a media name and for a start and an end
    date, re-prompting until each answer is valid.  Dates may be typed with
    ``-``, ``/`` or ``.`` as separator and are normalised to ``YYYY-MM-DD``.
    If the start date is later than the end date the two are swapped.

    The three results are also published as the module-level globals
    ``craw_media``, ``start_date`` and ``end_date``, because later notebook
    cells read them by name.

    Returns:
        tuple: ``(craw_media, start_date, end_date)`` where ``craw_media`` is
        the table-name prefix for the chosen media and both dates are
        ``YYYY-MM-DD`` strings.
    """
    # Media display name -> prefix of the crawl tables for that media.
    media_tables = {
        '매일경제': 'maeil_news_craw',
        '아시아경제': 'asia_news_craw',
        '삼프로TV': 'youtube_sampro',
        '슈카월드': 'youtube_suka',
        '한국경제TV': 'youtube_hk',
    }

    def _normalize_date(raw):
        """Return *raw* as a ``YYYY-MM-DD`` string, or None if it is not a date."""
        candidate = raw.strip().replace('/', '-').replace('.', '-')
        # The length check keeps zero-padding mandatory; strptime alone would
        # also accept e.g. '2019-1-1'.
        if len(candidate) != 10:
            return None
        try:
            datetime.datetime.strptime(candidate, '%Y-%m-%d')
        except ValueError:
            return None
        return candidate

    def _ask_date(first_prompt, retry_prompt):
        """Prompt until a valid date is entered and return it normalised."""
        value = _normalize_date(input(first_prompt))
        while value is None:
            # Store every retry in the variable the loop tests, so a
            # corrected entry actually ends the loop.
            value = _normalize_date(input(retry_prompt))
        return value

    ### 1) Choose the media
    media_name = input('***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) :').strip()
    while media_name not in media_tables:
        media_name = input('***매체명 다시 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) :').strip()
    chosen_media = media_tables[media_name]

    ### 2) Choose the date range
    first_day = _ask_date('***시작 날짜(YYYY-MM-DD) :',
                          '***시작 날짜 다시 입력(YYYY-MM-DD) :')
    last_day = _ask_date('***종료 날짜(YYYY-MM-DD) :',
                         '***종료 날짜 다시 입력(YYYY-MM-DD) :')

    # ISO-formatted dates order correctly as plain strings.
    if first_day > last_day:
        first_day, last_day = last_day, first_day

    # Later notebook cells read these names directly, so keep publishing them.
    globals()['craw_media'] = chosen_media
    globals()['start_date'] = first_day
    globals()['end_date'] = last_day

    return chosen_media, first_day, last_day

In [3]:
#*** 아직 YouTube 채널 크롤링 데이터는 별도의 전처리 코드 작성 필요 ***#
def media_stock_prediction(craw_media, start_date, end_date):
    """Load news and stock data for four companies and build labelled frames.

    Steps, in order:

    1. Read the KOSELF positive/negative word lists and the stop-word list
       from text files in the working directory.
    2. Read the news rows of each company from the MySQL table
       ``{craw_media}_{stock code}`` for the requested period, and the
       ``holidays`` table.
    3. Fetch daily prices for each stock code with FinanceDataReader, from
       ``start_date`` up to today (``end_date`` only limits the news query).
    4. Move news published at hour 15-23 to the next day, then roll every
       news date forward until it is not a day flagged ``holiday == "O"``.
    5. Score each article: +1 per noun found in the positive list, -1 per
       noun found in the negative list (nouns extracted with ``Okt``).
    6. Left-join the news with the prices on the date and add the
       ``UpDown`` and ``Extremely_Changed`` columns.

    Nothing is returned.  The results are published as module-level globals
    because the following notebook cells read them by name: ``samsung``,
    ``hyundai``, ``lg``, ``sk``, ``stock_<corp>``, ``<corp>_label``,
    ``holidays``, ``stop_words`` and ``total_label``.

    NOTE: the YouTube crawl tables still need their own preprocessing code
    (carried over from the original author's note above this function).

    Args:
        craw_media: Prefix of the crawl tables, e.g. ``'maeil_news_craw'``.
        start_date: First day of the news period, ``YYYY-MM-DD``.
        end_date: Last day of the news period, ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``craw_media`` is not a plain identifier, or if a date
            does not reduce to digits once the dashes are removed.
    """
    corp_list = ['samsung', 'hyundai', 'lg', 'sk']
    stock_num_list = ['005930', '005380', '051910', '000660']
    one_day = pd.Timedelta(days=1)

    def _read_lines(path):
        """Return the lines of a UTF-8 text file with newlines removed."""
        with open(path, encoding='utf-8') as handle:
            return [line.replace('\n', '') for line in handle]

    # The table name cannot be sent as a bound parameter, so make sure it is
    # a plain identifier before it is formatted into the SQL text.
    if not str(craw_media).replace('_', '').isalnum():
        raise ValueError('craw_media must be a plain table-name prefix: {0!r}'.format(craw_media))

    #### 1. Read Data

    ### 1) KOSELF sentiment lexicon
    # TODO: read the lexicon from the DB instead of local files.
    # Sets give constant-time membership tests in the scoring loop.
    positive = set(_read_lines('KOSELF_pos.txt'))
    negative = set(_read_lines('KOSELF_neg.txt'))

    ### 2) News data and holidays from the DB
    # The DB `date` column is a 10-character YYYYMMDDHH string, so the bounds
    # are built in the same shape.  int() rejects anything that is not a
    # number and keeps the bounds numeric literals in the rendered query.
    start_date = start_date.replace('-', '')
    end_date = end_date.replace('-', '')
    lower_bound = int(start_date + '00')
    upper_bound = int(end_date + '23')

    # NOTE(review): credentials and host are hard-coded (root account); they
    # should come from configuration or the environment.
    db = pymysql.connect(user='root',
                         passwd='1234',
                         host='3.35.70.166',
                         db='proj',
                         charset='utf8')
    try:
        cursor = db.cursor(pymysql.cursors.DictCursor)

        news = {}
        for corp, stock_num in zip(corp_list, stock_num_list):
            sql = ("select * from {0}_{1} where (length(date)=10) "
                   "and (date between %s and %s)").format(craw_media, stock_num)
            cursor.execute(sql, (lower_bound, upper_bound))
            news[corp] = pd.DataFrame(cursor.fetchall())

        cursor.execute("select * from holidays")
        holidays_df = pd.DataFrame(cursor.fetchall())
    finally:
        # Always release the connection, including when a query fails.
        db.close()

    for corp in corp_list:
        # Drop incomplete rows for every company, not only the last one.
        frame = news[corp].dropna(axis=0)

        # Keep the raw YYYYMMDDHH value as `datetime`; `date` becomes a real date.
        frame = frame.rename(columns={'date': 'datetime'})
        stamp = frame['datetime']
        frame['date'] = pd.to_datetime(stamp.str[0:4] + '-' + stamp.str[4:6] + '-' + stamp.str[6:8])

        # Chronological order with a clean 0..n-1 index.
        frame = frame.dropna().sort_values('datetime').reset_index(drop=True)
        news[corp] = frame

    ### 3) FinanceDataReader
    # Prices are fetched up to the current day, not up to the requested end date.
    today = datetime.datetime.now().strftime("%Y%m%d")
    stocks = {
        corp: fdr.DataReader(stock_num, start=start_date, end=today).reset_index()
        for corp, stock_num in zip(corp_list, stock_num_list)
    }

    ### 4) Holidays
    holidays_df['date'] = pd.to_datetime(holidays_df['date'])

    ### 5) Stop words
    # TODO: read the stop words from the DB instead of a local file.
    stop_word_list = _read_lines('stopwords-ko.txt')

    #### 2. Preprocessing

    ### 1) Adjust the news dates
    # Hours (last two characters of `datetime`) that move to the next day.
    after_market = ['15', '16', '17', '18', '19', '20', '21', '22', '23']

    ### 2) Days flagged as closed in the holidays table
    market_closed = holidays_df[holidays_df['holiday'] == "O"].reset_index(drop=True)
    closed_days = market_closed['date']

    for corp in corp_list:
        frame = news[corp]

        ## 1-1) Upload hour
        frame['time'] = frame['datetime'].str[-2:]

        ## 1-2) Hour 15 and later is attributed to the following day
        late = frame['time'].isin(after_market)
        frame.loc[late, 'date'] = frame.loc[late, 'date'] + one_day

        ## 1-3) Text clean-up
        # The pattern is a character class, so besides newline, tab and
        # carriage return it also removes literal '|' characters.
        frame['text'] = frame['text'].str.replace('[\n|\t|\r]', '', regex=True)

        ## 2) Roll dates forward until none falls on a closed day.
        # Each pass advances every still-closed date by one day.
        closed = frame['date'].isin(closed_days)
        while closed.any():
            frame.loc[closed, 'date'] = frame.loc[closed, 'date'] + one_day
            closed = frame['date'].isin(closed_days)

    #### 3. Sentiment Analysis

    ### 1) Positive/negative score per article
    tokenizer = Okt()
    for corp in corp_list:
        frame = news[corp]
        scores = []
        for text in frame['text']:
            score = 0
            for noun in tokenizer.nouns(text):
                if noun in positive:
                    score += 1
                elif noun in negative:
                    score -= 1
            scores.append(score)
        frame['score'] = pd.Series(scores, index=frame.index, dtype='int64')

    ### 2) Join with the stock prices
    corp_label_list = []
    labelled = {}
    for corp in corp_list:
        ## 2-1) Left join keeps every article, even without a price row
        label = pd.merge(news[corp], stocks[corp], how='left', left_on='date', right_on='Date')
        label = label.drop('Date', axis=1)

        close = label['Close']
        change = label['Change']

        ## 2-2) UpDown and Extremely_Changed (top/bottom 5% of Change)
        # NOTE(review): both branches compare against +0.1% of Close, so a
        # change below +0.1% is labelled -1 and 0 is only produced for an
        # exact tie or a missing price.  A symmetric band would need
        # `< -(close * 0.001)`.  Left unchanged here because it alters the
        # labels the persisted model is scored against; confirm the intent.
        label['UpDown'] = np.where(close * change > close * 0.001, 1,
                                   np.where(close * change < close * 0.001, -1, 0))

        upper = change.quantile(.95)
        lower = change.quantile(.05)
        label['Extremely_Changed'] = np.where((change > upper) & (change > 0), 1,
                                              np.where((change < lower) & (change < 0), -1, 0))

        ## 2-3) Collect
        labelled[corp + '_label'] = label
        corp_label_list.append(label)

    ### 3) All companies in one DataFrame
    total = pd.concat(corp_label_list, axis=0)

    # Publish the results under the names the following notebook cells use.
    published = globals()
    for corp in corp_list:
        published[corp] = news[corp]
        published['stock_' + corp] = stocks[corp]
    published.update(labelled)
    published['holidays'] = holidays_df
    published['stop_words'] = stop_word_list
    published['total_label'] = total

In [11]:
# Prompt for the media and the date range; arguments() also stores its three
# results in the globals craw_media, start_date and end_date.
arguments()

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 매일경제
***시작 날짜(YYYY-MM-DD) : 2019-01-01
***종료 날짜(YYYY-MM-DD) : 2019-04-30


('maeil_news_craw', '2019-01-01', '2019-04-30')

In [12]:
# Display the globals that arguments() set.
craw_media, start_date, end_date

('maeil_news_craw', '2019-01-01', '2019-04-30')

In [13]:
# Build the labelled data set; the function returns nothing and publishes its
# results (e.g. holidays, total_label) as globals.
media_stock_prediction(craw_media, start_date, end_date)

In [8]:
# Display the holidays table loaded by media_stock_prediction().
holidays

Unnamed: 0,date,year,month,day,dd,explanation,weekend,holiday
0,2018-01-01,2018,1,1,Mon,새해,,O
1,2018-01-02,2018,1,2,Tue,,,
2,2018-01-03,2018,1,3,Wed,,,
3,2018-01-04,2018,1,4,Thu,,,
4,2018-01-05,2018,1,5,Fri,,,
...,...,...,...,...,...,...,...,...
1821,2022-12-27,2022,12,27,Tue,,,
1822,2022-12-28,2022,12,28,Wed,,,
1823,2022-12-29,2022,12,29,Thu,,,
1824,2022-12-30,2022,12,30,Fri,,,


In [14]:
# Load the persisted pipeline with joblib and predict a label for every
# article text in total_label.
# NOTE(review): the file has an .h5 extension but is read with joblib — confirm
# it was written with joblib.dump.
grid_cv_pipe = joblib.load('../../../../Code/TF-IDF.h5') 
pred = grid_cv_pipe.predict(total_label['text'])

# Check accuracy of the predictions against the UpDown label.
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(accuracy_score(total_label['UpDown'], pred)))

Pipeline을 통한 Logistic Regression의 예측 정확도 : 0.480


In [15]:
# Load the persisted pipeline (the .pkl copy under Code/Model) with joblib and
# predict a label for every article text in total_label.
grid_cv_pipe = joblib.load('../../../../Code/Model/TF-IDF.pkl') 
pred = grid_cv_pipe.predict(total_label['text'])

# Check accuracy of the predictions against the UpDown label.
print('Pipeline을 통한 Logistic Regression의 예측 정확도 : {0:.3f}'.format(accuracy_score(total_label['UpDown'], pred)))

Pipeline을 통한 Logistic Regression의 예측 정확도 : 0.480


In [16]:
# Display the predicted labels.
pred

array([-1, -1, -1, ..., -1, -1,  1])

In [18]:
# Count how many predictions fall into each class.
Counter(pred)

Counter({-1: 1515, 1: 1655})