# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
fasttext.util.download_model('ko', if_exists='ignore')
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf

print('GPU', '사용 가능' if tf.config.experimental.list_physical_devices('GPU') else '사용 불가능')

GPU 사용 불가능


In [3]:
!nvidia-smi

Fri Sep 17 05:43:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 452.56       Driver Version: 452.56       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX450      WDDM  | 00000000:2D:00.0 Off |                  N/A |
| N/A   66C    P8    N/A /  N/A |    119MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Read KOSELF & Test Data**

### **① KOSELF 감성 어휘 사전**

In [4]:
# 블로그에서 가져온 기본적인 한국어 긍부정 텍스트 목록
with open('positive_words_self.txt', encoding='utf-8') as pos_blog:
    positive_blog = pos_blog.readlines()
positive_blog = [pos_blog.replace('\n', '') for pos_blog in positive_blog]
with open('negative_words_self.txt', encoding='utf-8') as neg_blog:
    negative_blog = neg_blog.readlines()
negative_blog = [neg_blog.replace('\n', '') for neg_blog in negative_blog]

# KOSELF 감성 어휘 사전
with open('KOSELF_pos.txt', encoding='utf-8') as pos:
    positive = pos.readlines()
positive = [pos.replace('\n', '') for pos in positive]
with open('KOSELF_neg.txt', encoding='utf-8') as neg:
    negative = neg.readlines()
negative = [neg.replace('\n', '') for neg in negative]

### **② 2019년 Samsung Data**

In [5]:
lexicon_2019 = pd.read_csv('../../../../Code/Data/Test/Stock-Year/2019_samsung.csv')

lexicon_2019.head(1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization
0,삼성전자,5930,매일경제,2019010100,"""韓기업 파괴적 혁신해야 성장…전담조직 회사 밖에 만들라""",http://news.mk.co.kr/newsRead.php?year=2019&no=70,◆ 2019신년기획 인터뷰 ◆\r\n\n\n 위 그래프는 크리스텐슨 교수가 매일경제...,2019-01-02,0,39400,39400,38550,38750,7847664,0.001292,1,0,신년 기획 인터뷰 그래프 크리스 텐슨 교수 매일경제 직접 그림 그래프 의미 과거 한...


In [6]:
lexicon_2019[lexicon_2019['text'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization


In [7]:
lexicon_2019.dropna(axis=0, inplace=True)
lexicon_2019 = lexicon_2019.reset_index(drop=True)
lexicon_2019.isna().sum()

st_n                 0
st_cd                0
news                 0
datetime             0
title                0
url                  0
text                 0
date                 0
time                 0
Open                 0
High                 0
Low                  0
Close                0
Volume               0
Change               0
UpDown               0
Extremely_Changed    0
Tokenization         0
dtype: int64

## **Calculate Cosine Similarity**

In [8]:
# 이미 학습된 FastText 내장 한국어 모델
try:
    print(ko_model, '모델 로드 불필요')
except:
    ko_model = models.fasttext.load_facebook_model('cc.ko.300.bin')

In [9]:
limit_number = 15000
sys.setrecursionlimit(limit_number)

In [None]:
neg_lexicon_2019 = pd.DataFrame(columns=['date', 'news_num', 'KOSELF_neg_word', 'news_word', 'cosine_similarity', 'frequency'])   # 'frequency' 구현할 방법은 추후 생각...ㅠㅠ

# KOSELF_neg와의 Cosine Similarity 계산
for x in range(len(lexicon_2019['Tokenization'])):
    news_num = x+1
    for y in range(len(negative)):
        for z in range(len(list(set(lexicon_2019['Tokenization'][x].split())))):
            if (ko_model.wv.similarity(negative[y], list(set(lexicon_2019['Tokenization'][x].split()))[z]) >= 0.5) and (ko_model.wv.similarity(negative[y], list(set(lexicon_2019['Tokenization'][x].split()))[z]) != 1.0):
                freq = 0
                for w in range(len(lexicon_2019['Tokenization'][x].split())):
                    if lexicon_2019['Tokenization'][x].split()[w] == list(set(lexicon_2019['Tokenization'][x].split()))[z]:
                        freq += 1
                data = {
                    'date': lexicon_2019['date'][x],
                    'news_num': news_num,
                    'KOSELF_neg_word': negative[y],
                    'news_word': list(set(lexicon_2019['Tokenization'][x].split()))[z],
                    'cosine_similarity': ko_model.wv.similarity(negative[y], list(set(lexicon_2019['Tokenization'][x].split()))[z]),
                    'frequency': freq
                    }
                neg_lexicon_2019 = neg_lexicon_2019.append(data, ignore_index=True)
                print("***{0} Cosine Similarity between <{1}> & <{2}> : {3}".format(lexicon_2019['date'][x], negative[y], list(set(lexicon_2019['Tokenization'][x].split()))[z], ko_model.wv.similarity(negative[y], list(set(lexicon_2019['Tokenization'][x].split()))[z])))

***2019-01-02 Cosine Similarity between <악화> & <둔화> : 0.5065980553627014
***2019-01-02 Cosine Similarity between <의혹> & <논란> : 0.5190056562423706
***2019-01-02 Cosine Similarity between <침체> & <경기> : 0.5360627174377441
***2019-01-02 Cosine Similarity between <침체> & <둔화> : 0.5701860189437866
***2019-01-02 Cosine Similarity between <하락> & <둔화> : 0.5260781049728394
***2019-01-02 Cosine Similarity between <악화> & <둔화> : 0.5065980553627014
***2019-01-02 Cosine Similarity between <침체> & <둔화> : 0.5701860189437866
***2019-01-02 Cosine Similarity between <침체> & <경기> : 0.5360627174377441
***2019-01-02 Cosine Similarity between <하락> & <둔화> : 0.5260781049728394
***2019-01-02 Cosine Similarity between <둔화> & <감소> : 0.5349085330963135
***2019-01-02 Cosine Similarity between <둔화> & <증가> : 0.5041859149932861
***2019-01-02 Cosine Similarity between <배상> & <보상> : 0.5126209259033203
***2019-01-02 Cosine Similarity between <실패> & <실패> : 0.9999999403953552
***2019-01-02 Cosine Similarity between <약화> & <감소>

In [10]:
neg_lexicon_2019

Unnamed: 0,date,news_num,KOSELF_neg_word,news_word,cosine_similarity,frequency
0,2019-01-02,1,악화,둔화,0.506598,1
1,2019-01-02,1,의혹,논란,0.519006,43
2,2019-01-02,1,침체,경기,0.536063,1
3,2019-01-02,1,침체,둔화,0.570186,1
4,2019-01-02,1,하락,둔화,0.526078,1
...,...,...,...,...,...,...
25398,2020-01-02,8456,하락,급감,0.519622,6
25399,2020-01-02,8456,하락,감소,0.575989,4
25400,2020-01-02,8456,하락,증가,0.553474,21
25401,2020-01-02,8456,하락,급락,0.687849,1


In [11]:
neg_lexicon_2019.to_csv('../../../../Code/Data/Test/Stock-Year/neg_samsung_2019.csv', index=False)

In [12]:
a = list(set(list(neg_lexicon_2019['news_word'])))

a

['보상',
 '불황',
 '폭로',
 '끝내',
 '참패',
 '급감',
 '화물',
 '위축',
 '금수',
 '어센던트',
 '운항',
 '특혜',
 '외압',
 '둔화',
 '약세',
 '성공',
 '호황',
 '주춤',
 '경기',
 '부도',
 '견조',
 '급유',
 '플러스',
 '침체',
 '위기',
 '쿨런트펌프',
 '엔드픽처',
 '청탁',
 '내통',
 '악화',
 '결항',
 '하락',
 '학연',
 '갈란츠',
 '감소',
 '차량',
 '불안',
 '선박',
 '금값',
 '의심',
 '급등',
 '누전',
 '쥴랩스',
 '마이너스제로',
 '당혹',
 '강화',
 '뎁스비전',
 '약화',
 '증가',
 '비리',
 '쇠퇴',
 '조처',
 '추징',
 '합선',
 '좌절',
 '엄벌',
 '강점',
 '무마',
 '손해배상',
 '유발',
 '등락',
 '경색',
 '우발',
 '논란',
 '상승',
 '텍드림텍',
 '호전',
 '작태',
 '보합',
 '스캔들',
 '머락비',
 '갓길',
 '저하',
 '역풍',
 '악재',
 '폭등',
 '체르빈',
 '리밸린싱',
 '피슬러',
 '업황',
 '급락',
 '손배',
 '절상',
 '르쿨르트',
 '폐업',
 '폭락',
 '실패',
 '응우옌부뚜',
 '텅빈채',
 '혼돈',
 '병세',
 '반등',
 '연루',
 '질책']

In [13]:
b = list(set(list(neg_lexicon_2019[neg_lexicon_2019['cosine_similarity']>=0.7]['news_word'])))

b

['성공', '의심', '침체', '스캔들', '약화', '상승', '실패']

In [14]:
c = list(set(list(neg_lexicon_2019[neg_lexicon_2019['cosine_similarity']>=0.65]['news_word'])))

c

['성공', '조처', '의심', '침체', '급락', '약화', '스캔들', '상승', '실패']