# **Sentiment Analysis(Preprocessing)**

In [2]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
# fasttext.util.download_model('ko', if_exists='ignore')   # FastText 모델 사용 시에만 필요
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Develop Full Step Program**

### **① Preprocessing**

In [2]:
def arguments():
    """Ask which media outlet to process and return its crawl-table prefix and a date window.

    Prompts on stdin until one of the supported media names is entered;
    leading/trailing whitespace in the answer is ignored.

    Returns:
        tuple: ``(craw_media, start_date, end_date)`` where ``craw_media`` is the
        table-name prefix for the chosen outlet and the two dates are
        ``'YYYY-MM-DD'`` strings for ten days ago and today.
    """
    ### 1) Choose the media outlet
    # One mapping is both the list of valid answers and the name -> table-prefix lookup.
    media_tables = {
        '매일경제': 'maeil_news_craw',
        '아시아경제': 'asia_news_craw',
        '삼프로TV': 'youtube_sampro',
        '슈카월드': 'youtube_suka',
        '한국경제TV': 'youtube_hk',
    }
    media_name = str(input('***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) :')).strip()
    while media_name not in media_tables:
        media_name = str(input('***매체명 다시 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) :')).strip()
    craw_media = media_tables[media_name]

    ### 2) Date window
    # Read the clock once so both ends of the window come from the same instant.
    now = datetime.datetime.now()

    ## 2-1) Start date: ten days before today
    start_date = (now - datetime.timedelta(10)).strftime("%Y-%m-%d")

    ## 2-2) End date: today
    end_date = now.strftime("%Y-%m-%d")

    return craw_media, start_date, end_date

In [3]:
def _prepare_news(raw_news, market_closed_dates):
    """Normalise one crawled news table and move every article onto a trading day.

    Args:
        raw_news: DataFrame fetched from a crawl table. This code reads its string
            'date' column (first 8 characters are used as YYYYMMDD, the last 2 as
            the upload hour) and its 'text' column.
        market_closed_dates: list of dates on which the market is closed.

    Returns:
        A new DataFrame sorted by 'datetime' with 'date' and 'time' columns added.
    """
    news = raw_news.rename(columns={'date': 'datetime'})
    news['date'] = pd.to_datetime(news['datetime'].str[0:8], format='%Y%m%d')
    news = news.dropna().sort_values('datetime').reset_index(drop=True)

    # Upload hour, taken from the last two characters of the timestamp string.
    news['time'] = news['datetime'].str[-2:]

    # Articles uploaded from 15:00 onwards are counted towards the next day,
    # so each 'date' covers 15:00 of the previous day up to 15:00 of that day.
    after_market = ['15', '16', '17', '18', '19', '20', '21', '22', '23']
    uploaded_after_close = news['time'].isin(after_market)
    news.loc[uploaded_after_close, 'date'] += datetime.timedelta(days=1)

    # Strip newlines, tabs and carriage returns. A regex replace is required:
    # plain str.replace would look for the literal pattern text and change nothing.
    news['text'] = news['text'].str.replace(r'[\n\t\r]', '', regex=True)

    # Keep pushing dates forward one day at a time until none falls on a closed day.
    on_closed_day = news['date'].isin(market_closed_dates)
    while on_closed_day.any():
        news.loc[on_closed_day, 'date'] += datetime.timedelta(days=1)
        on_closed_day = news['date'].isin(market_closed_dates)

    return news


def _load_stopwords():
    """Return the stop-word set: the downloaded Korean list plus the media-outlet names."""
    # header=None keeps the first line as data instead of turning it into a column name.
    # NOTE(review): presumably the file is one word per line with no header row - confirm.
    stopword_table = pd.read_csv("https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt", header=None)
    stop_words = set(stopword_table.iloc[:, 0].dropna().astype(str).str.strip())

    # The outlets' own names carry no signal, so treat them as stop words too.
    except_media_list = ['매일경제', '매일', '경제', 'maeil', 'MK', 'mk',
                         '아시아경제', '아시아', 'Asia', 'ASIA', 'asia',
                         '삼프로TV', '삼프로', 'TV',
                         '슈카월드', '슈카', '월드',
                         '한국경제TV', '한국']
    stop_words.update(except_media_list)
    return stop_words


def _extract_nouns(text, tagger, non_hangul, stop_words):
    """Return the nouns of ``text`` (longer than one character, not stop words) joined by spaces."""
    hangul_only = non_hangul.sub('', text)   # keep only Hangul and spaces
    keywords = [noun for noun in tagger.nouns(hangul_only)
                if len(noun) > 1 and noun not in stop_words]
    return " ".join(keywords)


#*** YouTube channel crawl data still needs its own preprocessing code ***#
def media_stock_prediction(craw_media, start_date, end_date):
    """Build labelled news/stock frames for five companies and publish them as globals.

    Steps: read each company's news table and the holiday table from MySQL, shift
    article dates onto trading days, download prices with FinanceDataReader, merge
    news with prices, add an 'UpDown' label and a 'Tokenization' column.

    Args:
        craw_media: table-name prefix of the crawl tables (e.g. 'maeil_news_craw');
            the tables queried are named '<craw_media>_<stock code>'.
        start_date: first day to load; dashes are removed before querying.
        end_date: last day of news to load; dashes are removed before querying.
            Stock prices are always fetched up to the current day.

    Returns:
        None. Results are stored in the module globals, as later cells expect:
        '<corp>' (prepared news), 'stock_<corp>' (prices), '<corp>_label'
        (merged and labelled), 'holidays' and 'total_label' (all companies stacked),
        for corp in samsung / hyundai / lg / sk / celltrion.
    """
    corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
    stock_num_list = ['005930', '005380', '051910', '000660', '068270']

    # Match the format of the DB 'date' column (digits only).
    start_key = start_date.replace('-', '')
    end_key = end_date.replace('-', '')


    #### 1. Read Data
    # The KOSELF sentiment lexicon is not loaded and the recursion limit is not raised:
    # the lexicon scoring step and the recursive holiday helper are both disabled.

    ### 1) News and holiday tables from the DB
    # NOTE(review): credentials are hard-coded; move them to environment variables
    # or a config file before sharing this notebook.
    db = pymysql.connect(user='root',
                         passwd='0808',
                         host='127.0.0.1',
                         db='proj',
                         charset='utf8')
    try:
        cursor = db.cursor(pymysql.cursors.DictCursor)

        raw_news_by_corp = {}
        for corp, stock_num in zip(corp_list, stock_num_list):
            sql = "select * from {0}_{1} where (length(date)=10) and (date between {2}00 and {3}23)".format(craw_media, stock_num, start_key, end_key)
            cursor.execute(sql)
            raw_news_by_corp[corp] = pd.DataFrame(cursor.fetchall())

        # Weekends and public holidays
        cursor.execute("select * from holidays")
        holiday_table = pd.DataFrame(cursor.fetchall())
    finally:
        db.close()   # release the connection even when a query fails

    holiday_table['date'] = pd.to_datetime(holiday_table['date'])
    globals()['holidays'] = holiday_table

    # Rows flagged "O" are the days on which the market is closed.
    market_closed = holiday_table[holiday_table['holiday'] == "O"].reset_index(drop=True)
    market_closed_list = list(market_closed['date'])

    ### 2) Stock prices (FinanceDataReader)
    # The end date is always "now", independent of the news window.
    stock_end = datetime.datetime.now().strftime("%Y%m%d")
    stock_by_corp = {}
    for corp, stock_num in zip(corp_list, stock_num_list):
        stock_by_corp[corp] = fdr.DataReader(stock_num, start=start_key, end=stock_end).reset_index()
        globals()['stock_' + corp] = stock_by_corp[corp]

    ### 3) Stop words
    stop_words = _load_stopwords()


    #### 2. Preprocessing
    news_by_corp = {}
    for corp in corp_list:
        news_by_corp[corp] = _prepare_news(raw_news_by_corp[corp], market_closed_list)
        globals()[corp] = news_by_corp[corp]


    #### 3. Labelling and tokenization
    # Build the tagger and the regex once; creating them per row is needlessly slow.
    okt = Okt()
    hangeul = re.compile('[^ ㄱ-ㅣ 가-힣]')

    corp_label_list = []
    for corp in corp_list:
        ## 3-1) Join each article with the price row of its (shifted) date
        labelled = pd.merge(news_by_corp[corp], stock_by_corp[corp],
                            how='left', left_on='date', right_on='Date')
        labelled = labelled.drop('Date', axis=1)

        ## 3-2) UpDown: -1 / 0 / 1 by the sign of the 'Change' column
        labelled['UpDown'] = np.where(labelled['Change'] < 0, -1,
                                      np.where(labelled['Change'] > 0, 1, 0))

        ## 3-3) Drop rows with any missing value (e.g. no price row matched the date).
        # Reset the index so the rows are numbered 0..n-1 again.
        labelled = labelled.dropna(axis=0).reset_index(drop=True)

        ## 3-4) Tokenization: space-joined nouns of the Hangul-only text
        labelled['Tokenization'] = [_extract_nouns(text, okt, hangeul, stop_words)
                                    for text in labelled['text']]

        globals()[corp + '_label'] = labelled
        corp_label_list.append(labelled)

    ### 4) Stack all companies into one DataFrame
    globals()['total_label'] = pd.concat(corp_label_list, axis=0)

### **② Sentiment Analysis**

## **매일경제(2018)**

In [4]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2018 run instead of the rolling 10-day default.
start_date, end_date = '2018-01-01', '2018-12-31'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 매일경제
***시작 날짜(YYYY-MM-DD) : 2018-01-01
***종료 날짜(YYYY-MM-DD) : 2018-12-31


('maeil_news_craw', '2018-01-01', '2018-12-31')

In [5]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [6]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2018_maeil_{0}.csv'.format(corp_name), index=False)

## **아시아경제(2018)**

In [7]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2018 run instead of the rolling 10-day default.
start_date, end_date = '2018-01-01', '2018-12-31'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 아시아경제
***시작 날짜(YYYY-MM-DD) : 2018-01-01
***종료 날짜(YYYY-MM-DD) : 2018-12-31


('asia_news_craw', '2018-01-01', '2018-12-31')

In [8]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [9]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2018_asia_{0}.csv'.format(corp_name), index=False)

## **매일경제(2019)**

In [10]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2019 run instead of the rolling 10-day default.
start_date, end_date = '2019-01-01', '2019-12-31'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 매일경제
***시작 날짜(YYYY-MM-DD) : 2019-01-01
***종료 날짜(YYYY-MM-DD) : 2019-12-31


('maeil_news_craw', '2019-01-01', '2019-12-31')

In [11]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [12]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2019_maeil_{0}.csv'.format(corp_name), index=False)

## **아시아경제(2019)**

In [13]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2019 run instead of the rolling 10-day default.
start_date, end_date = '2019-01-01', '2019-12-31'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 아시아경제
***시작 날짜(YYYY-MM-DD) : 2019-01-01
***종료 날짜(YYYY-MM-DD) : 2019-12-31


('asia_news_craw', '2019-01-01', '2019-12-31')

In [14]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [15]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2019_asia_{0}.csv'.format(corp_name), index=False)

## **매일경제(2020)**

In [16]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2020 run instead of the rolling 10-day default.
start_date, end_date = '2020-01-01', '2020-12-31'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 매일경제
***시작 날짜(YYYY-MM-DD) : 2020-01-01
***종료 날짜(YYYY-MM-DD) : 2020-12-31


('maeil_news_craw', '2020-01-01', '2020-12-31')

In [17]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [18]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2020_maeil_{0}.csv'.format(corp_name), index=False)

## **아시아경제(2020)**

In [19]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2020 run instead of the rolling 10-day default.
start_date, end_date = '2020-01-01', '2020-12-31'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 아시아경제
***시작 날짜(YYYY-MM-DD) : 2020-01-01
***종료 날짜(YYYY-MM-DD) : 2020-12-31


('asia_news_craw', '2020-01-01', '2020-12-31')

In [20]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [21]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2020_asia_{0}.csv'.format(corp_name), index=False)

## **매일경제(2021)**

In [22]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2021 run instead of the rolling 10-day default.
start_date, end_date = '2021-01-01', '2021-09-28'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 매일경제
***시작 날짜(YYYY-MM-DD) : 2021-01-01
***종료 날짜(YYYY-MM-DD) : 2021-09-28


('maeil_news_craw', '2021-01-01', '2021-09-28')

In [23]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [24]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2021_maeil_{0}.csv'.format(corp_name), index=False)

## **아시아경제(2021)**

In [25]:
# arguments() only returns its values, so bind them to the names the next cell reads.
craw_media = arguments()[0]
# Pin the window to the range recorded for this 2021 run instead of the rolling 10-day default.
start_date, end_date = '2021-01-01', '2021-09-28'
craw_media, start_date, end_date

***매체명 입력(매일경제/아시아경제/삼프로TV/슈카월드/한국경제TV) : 아시아경제
***시작 날짜(YYYY-MM-DD) : 2021-01-01
***종료 날짜(YYYY-MM-DD) : 2021-09-28


('asia_news_craw', '2021-01-01', '2021-09-28')

In [26]:
# Run the preprocessing pipeline; it returns nothing and stores its results in notebook globals.
media_stock_prediction(craw_media=craw_media, start_date=start_date, end_date=end_date)

In [27]:
# Save each company's labelled frame as its own CSV, without the index column.
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
for corp_name in corp_list:
    labelled_frame = globals()[corp_name + '_label']
    labelled_frame.to_csv('../../../../Final Data/2021_asia_{0}.csv'.format(corp_name), index=False)