# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
# fasttext.util.download_model('ko', if_exists='ignore')   # FastText 모델 사용 시에만 필요
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Read Data**

### **① Media News**

In [2]:
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
years = ['2018', '2019', '2020', '2021']

for i in range(len(corp_list)):
    # 종목별 리스트 생성
    globals()[corp_list[i] + '_years'] = []
    
    # 연도별 뉴스 기사
    for j in range(len(years)):
        globals()[corp_list[i] + '_' + years[j]] = pd.read_csv('../../../../Code/Data/Test/Stock-Year/{}_{}.csv'.format(years[j], corp_list[i]))
        globals()[corp_list[i] + '_years'].append(globals()[corp_list[i] + '_' + years[j]])
    
    # 전체 기간 통합
    globals()[corp_list[i] + '_news'] = pd.concat(globals()[corp_list[i] + '_years'], axis=0).reset_index(drop=True)

### **② Giro Dict**

In [3]:
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
similar = ['50', '60', '65', '70']

for i in range(len(corp_list)):
    for j in range(len(similar)):
        
        # 긍정어 사전
        with open('../../../../Code/Lexicon/lexicon_{}_pos_{}.txt'.format(corp_list[i], similar[j])) as positive:
            globals()['lexicon_' + corp_list[i] + '_pos_' + similar[j]] = positive.readlines()
        globals()['lexicon_' + corp_list[i] + '_pos_' + similar[j]] = [positive.replace('\n', '') for positive in globals()['lexicon_' + corp_list[i] + '_pos_' + similar[j]]]
        
        # 부정어 사전
        with open('../../../../Code/Lexicon/lexicon_{}_neg_{}.txt'.format(corp_list[i], similar[j])) as negative:
            globals()['lexicon_' + corp_list[i] + '_neg_' + similar[j]] = negative.readlines()
        globals()['lexicon_' + corp_list[i] + '_neg_' + similar[j]] = [negative.replace('\n', '') for negative in globals()['lexicon_' + corp_list[i] + '_neg_' + similar[j]]]

## **Sentiment Score**

### **① Samsung**

In [4]:
samsung_news.head(1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization
0,삼성전자,5930,매일경제,2018010100,스마트베타ETF 고공행진 새해도 이어질까,http://news.mk.co.kr/newsRead.php?year=2018&no=29,\n\n\n 수수료가 싼 상장지수펀드(ETF)에 펀드매니저가 종목을 고르는 액티브 ...,2018-01-02,0,51380,51400,50780,51020,169485,0.001177,1,0,수수료 상장 지수 펀드 펀드매니저 종목 액티브 펀드 특성 가미 스마트 베타 대한 목...


In [5]:
samsung_news.isna().sum()

st_n                 0
st_cd                0
news                 0
datetime             0
title                0
url                  0
text                 0
date                 0
time                 0
Open                 0
High                 0
Low                  0
Close                0
Volume               0
Change               0
UpDown               0
Extremely_Changed    0
Tokenization         2
dtype: int64

In [6]:
samsung_news[samsung_news['Tokenization'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization
4144,삼성전자,5930,매일경제,2018121204,[2018 대한민국 지식재산대전] 일상의 불편함 없앤 발명품 한자리에,http://news.mk.co.kr/newsRead.php?year=2018&no...,\n\n\nДыХыЗЩЛѓРЛ МіЛѓЧб КЅУГБтОї ОЦГЏЗЮБзЧУЗЏН...,2018-12-12,4,40250,40700,40150,40450,12024279,0.004969,1,0,
25699,삼성전자,5930,매일경제,2021040516,"QLED·OLED·미니LED… 복잡한 TV 제품, 삼성·LG TV 제품별 기능 차이...",http://news.mk.co.kr/newsRead.php?no=325040&ye...,¿ÃÇØ¿¡µµ ´Ù½Ã ÇÑ ¹ø »ï¼ºÀüÀÚ¿Í LGÀüÀÚÀÇ ¡®TV À...,2021-04-06,16,86200,86200,85100,86000,19042023,0.007026,1,0,


In [7]:
# NaN 제거
for i in range(len(corp_list)):
    globals()[corp_list[i] + '_news'].dropna(inplace=True)
    globals()[corp_list[i] + '_news'] = globals()[corp_list[i] + '_news'].reset_index(drop=True)

In [8]:
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
similar = ['50', '60', '65', '70']

# iteration limit 조정
limit_number = 15000
sys.setrecursionlimit(limit_number)

for i in range(len(corp_list)):
    globals()[corp_list[i] + '_news']['Positive_Score'] = 0
    globals()[corp_list[i] + '_news']['Negative_Score'] = 0
    globals()[corp_list[i] + '_news']['Ratio'] = 0.1
    globals()[corp_list[i] + '_news']['Pred'] = 0
    
    for j in range(len(similar)):
        globals()[corp_list[i] + '_news_' + similar[j]] = globals()[corp_list[i] + '_news']
    
        for k in range(len(globals()[corp_list[i] + '_news_' + similar[j]]['Tokenization'])):
            pos_score = 0 ; neg_score = 0
            
            for l in range(len(globals()[corp_list[i] + '_news_' + similar[j]]['Tokenization'][k].split())):
                if globals()[corp_list[i] + '_news_' + similar[j]]['Tokenization'][k].split()[l] in globals()['lexicon_' + corp_list[i] + '_pos_' + similar[j]]:
                    pos_score += 1
                elif globals()[corp_list[i] + '_news_' + similar[j]]['Tokenization'][k].split()[l] in globals()['lexicon_' + corp_list[i] + '_neg_' + similar[j]]:
                    neg_score += 1
                else:
                    pass

            globals()[corp_list[i] + '_news_' + similar[j]]['Positive_Score'][k] = pos_score
            globals()[corp_list[i] + '_news_' + similar[j]]['Negative_Score'][k] = neg_score
            
            # 긍정과 부정의 비율
            if (pos_score==0) or (neg_score==0):
                globals()[corp_list[i] + '_news_' + similar[j]]['Ratio'][k] = 0.5   # 사전에 포함되는 단어가 없으면 주가가 상승할 확률을 0.5로 가정
            else:
                globals()[corp_list[i] + '_news_' + similar[j]]['Ratio'][k] = pos_score / (pos_score + neg_score)
            
            # 예측 결과
            if globals()[corp_list[i] + '_news_' + similar[j]]['Ratio'][k]>=0.5:
                globals()[corp_list[i] + '_news_' + similar[j]]['Pred'][k] = 1
            else:
                globals()[corp_list[i] + '_news_' + similar[j]]['Pred'][k] = -1
            
        # CSV로 저장
        globals()[corp_list[i] + '_news_' + similar[j]].to_csv('../../../../Code/Lexicon/{}_news_{}.csv'.format(corp_list[i], similar[j]), index=False)
        
        print('... {}번째 종목의 {}/4 완료 ...'.format(i+1, j+1))
    
    print('===== {}번째 종목 완료! =====\n'.format(i+1))

print('####### 전부 완료-! #######')

... 1번째 종목의 1/4 완료 ...
... 1번째 종목의 2/4 완료 ...
... 1번째 종목의 3/4 완료 ...
... 1번째 종목의 4/4 완료 ...
===== 1번째 종목 완료! =====

... 2번째 종목의 1/4 완료 ...
... 2번째 종목의 2/4 완료 ...
... 2번째 종목의 3/4 완료 ...
... 2번째 종목의 4/4 완료 ...
===== 2번째 종목 완료! =====

... 3번째 종목의 1/4 완료 ...
... 3번째 종목의 2/4 완료 ...
... 3번째 종목의 3/4 완료 ...
... 3번째 종목의 4/4 완료 ...
===== 3번째 종목 완료! =====

... 4번째 종목의 1/4 완료 ...
... 4번째 종목의 2/4 완료 ...
... 4번째 종목의 3/4 완료 ...
... 4번째 종목의 4/4 완료 ...
===== 4번째 종목 완료! =====

... 5번째 종목의 1/4 완료 ...
... 5번째 종목의 2/4 완료 ...
... 5번째 종목의 3/4 완료 ...
... 5번째 종목의 4/4 완료 ...
===== 5번째 종목 완료! =====

####### 전부 완료-! #######


In [9]:
# # NaN 제거
# for i in range(len(corp_list)):
#     for j in range(len(similar)):
#         globals()[corp_list[i] + '_news_' + similar[j]].dropna(inplace=True)
#         globals()[corp_list[i] + '_news_' + similar[j]] = globals()[corp_list[i] + '_news_' + similar[j]].reset_index(drop=True)

In [10]:
# Accuracy 확인
for i in range(len(corp_list)):
    if corp_list[i] == 'samsung':
        corp = 'SAMSUNG'
    elif corp_list[i] == 'hyundai':
        corp = 'HYUNDAI'
    elif corp_list[i] == 'lg':
        corp = 'LG'
    elif corp_list[i] == 'sk':
        corp = 'SK'
    else:
        corp = 'CELLTRION'
        
    for j in range(len(similar)):
        print("Lexicon of {}({}%)'s Accuracy : {}".format(corp, similar[j], (len(globals()[corp_list[i] + '_news_' + similar[j]][(globals()[corp_list[i] + '_news_' + similar[j]]['UpDown']>=0) & (globals()[corp_list[i] + '_news_' + similar[j]]['Pred']>=0)]) + len(globals()[corp_list[i] + '_news_' + similar[j]][(globals()[corp_list[i] + '_news_' + similar[j]]['UpDown']<0) & (globals()[corp_list[i] + '_news_' + similar[j]]['Pred']<0)])) / len(globals()[corp_list[i] + '_news_' + similar[j]]['Pred'])))
    
    print('\n==================================================\n')

Lexicon of SAMSUNG(50%)'s Accuracy : 0.4989092566665581
Lexicon of SAMSUNG(60%)'s Accuracy : 0.4989092566665581
Lexicon of SAMSUNG(65%)'s Accuracy : 0.4989092566665581
Lexicon of SAMSUNG(70%)'s Accuracy : 0.4989092566665581


Lexicon of HYUNDAI(50%)'s Accuracy : 0.4645886889460154
Lexicon of HYUNDAI(60%)'s Accuracy : 0.4645886889460154
Lexicon of HYUNDAI(65%)'s Accuracy : 0.4645886889460154
Lexicon of HYUNDAI(70%)'s Accuracy : 0.4645886889460154


Lexicon of LG(50%)'s Accuracy : 0.5047686832740214
Lexicon of LG(60%)'s Accuracy : 0.5047686832740214
Lexicon of LG(65%)'s Accuracy : 0.5047686832740214
Lexicon of LG(70%)'s Accuracy : 0.5047686832740214


Lexicon of SK(50%)'s Accuracy : 0.49576540334533137
Lexicon of SK(60%)'s Accuracy : 0.49576540334533137
Lexicon of SK(65%)'s Accuracy : 0.49576540334533137
Lexicon of SK(70%)'s Accuracy : 0.49576540334533137


Lexicon of CELLTRION(50%)'s Accuracy : 0.4993118634737132
Lexicon of CELLTRION(60%)'s Accuracy : 0.4993118634737132
Lexicon of CELLT

In [11]:
# # 부호 확인
# np.sign(0)

In [12]:
samsung_news_50[samsung_news_50['Positive_Score'] != samsung_news_70['Positive_Score']]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,...,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization,Positive_Score,Negative_Score,Ratio,Pred


In [14]:
samsung_news_50.shape, samsung_news_70.shape

((30713, 22), (30713, 22))

In [19]:
samsung_news_50.head(1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,...,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization,Positive_Score,Negative_Score,Ratio,Pred
0,삼성전자,5930,매일경제,2018010100,스마트베타ETF 고공행진 새해도 이어질까,http://news.mk.co.kr/newsRead.php?year=2018&no=29,\n\n\n 수수료가 싼 상장지수펀드(ETF)에 펀드매니저가 종목을 고르는 액티브 ...,2018-01-02,0,51380,...,51020,169485,0.001177,1,0,수수료 상장 지수 펀드 펀드매니저 종목 액티브 펀드 특성 가미 스마트 베타 대한 목...,2,0,0.5,1


In [18]:
samsung_news_70.head(1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,...,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization,Positive_Score,Negative_Score,Ratio,Pred
0,삼성전자,5930,매일경제,2018010100,스마트베타ETF 고공행진 새해도 이어질까,http://news.mk.co.kr/newsRead.php?year=2018&no=29,\n\n\n 수수료가 싼 상장지수펀드(ETF)에 펀드매니저가 종목을 고르는 액티브 ...,2018-01-02,0,51380,...,51020,169485,0.001177,1,0,수수료 상장 지수 펀드 펀드매니저 종목 액티브 펀드 특성 가미 스마트 베타 대한 목...,2,0,0.5,1
