# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Download Facebook's Korean embedding model → only needs to be done once
# (if_exists='ignore' is passed so an existing download is not overwritten — TODO confirm against fasttext docs)
fasttext.util.download_model('ko', if_exists='ignore')
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see, then report availability.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
gpu_status = '사용 가능' if gpu_devices else '사용 불가능'
print('GPU', gpu_status)

GPU 사용 불가능


In [3]:
!nvidia-smi

Sun Sep 19 21:40:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 457.49       Driver Version: 457.49       CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 1650   WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   44C    P8     1W /  N/A |    372MiB /  4096MiB |      6%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Read KOSELF & Test Data**

### **① KOSELF 감성 어휘 사전**

In [4]:
def _read_word_list(path):
    """Read a UTF-8 text file holding one word per line.

    Parameters
    ----------
    path : str
        Location of the word-list file.

    Returns
    -------
    list of str
        The lines of the file with surrounding whitespace (including the
        trailing newline) removed. Blank lines are skipped so that no
        empty-string "word" ends up in the lexicon.
    """
    with open(path, encoding='utf-8') as word_file:
        stripped_lines = (line.strip() for line in word_file)
        return [word for word in stripped_lines if word]


# Basic Korean positive / negative word lists taken from a blog
positive_blog = _read_word_list('positive_words_self.txt')
negative_blog = _read_word_list('negative_words_self.txt')

# KOSELF sentiment lexicon
positive = _read_word_list('KOSELF_pos.txt')
negative = _read_word_list('KOSELF_neg.txt')

### **② 2020년 Celltrion Data**

In [5]:
# Load the 2020 Celltrion news data set and preview its first row.
lexicon_2020 = pd.read_csv(
    filepath_or_buffer='../../../../Code/Data/Test/Stock-Year/2020_celltrion.csv'
)

lexicon_2020.head(n=1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization
0,셀트리온,68270,매일경제,2020010210,"새해 문연 코스피, 개인 매수에 상승 출발 후 하락 전환",http://news.mk.co.kr/newsRead.php?no=3317&year...,\n\n\n 새해 첫 개장한 코스피가 개인의 매수에 힘입어 상승 출발한 후 하락 전...,2020-01-02,10,177742,179705,172832,176761,621826,-0.005525,-1,0,새해 개장 코스피 개인 매수 상승 출발 하락 전환 오전 현재 코스피 전일 대비 포인...


In [6]:
# Show the rows whose article text is missing.
lexicon_2020.loc[lexicon_2020['text'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization
1331,셀트리온,68270,아시아경제,2020060215,"[속보] 당국 ""셀트리온 항체치료제, 유럽서 임상 준비중…혈장치료제 우선 확보 목표""",https://view.asiae.co.kr/article/2020060215162...,,2020-06-03,15,227335,230772,220461,223407,1813508,0.013363,1,0,


In [7]:
# Drop every row that has a missing value in any column, renumber the index
# from 0, and confirm that no nulls remain.
lexicon_2020 = lexicon_2020.dropna(axis='index').reset_index(drop=True)
lexicon_2020.isnull().sum()

st_n                 0
st_cd                0
news                 0
datetime             0
title                0
url                  0
text                 0
date                 0
time                 0
Open                 0
High                 0
Low                  0
Close                0
Volume               0
Change               0
UpDown               0
Extremely_Changed    0
Tokenization         0
dtype: int64

## **Calculate Cosine Similarity**

In [8]:
# Pre-trained Korean FastText model (cc.ko.300.bin).
# Reuse the instance if `ko_model` is already bound in the kernel; load it otherwise.
try:
    print(ko_model, '모델 로드 불필요')
except NameError:
    # Only "name not defined yet" triggers a load. Any other failure
    # (e.g. an interrupt, or an error while printing the model) now surfaces
    # instead of silently causing a reload.
    ko_model = models.fasttext.load_facebook_model('cc.ko.300.bin')

In [9]:
# Raise the interpreter's recursion limit.
# NOTE(review): presumably needed by a later step that recurses deeply — TODO confirm.
limit_number = 15_000
sys.setrecursionlimit(limit_number)

In [10]:
def build_similarity_table(news, lexicon_words, model, lexicon_column, threshold=0.5):
    """Pair every lexicon word with the news tokens it is similar to.

    For each article (row of ``news``), each lexicon word and each distinct
    token of that article, one record is kept when the cosine similarity
    returned by ``model.wv.similarity`` is at least ``threshold`` and the two
    words are not the same word.

    Parameters
    ----------
    news : pandas.DataFrame
        Must provide a 'date' column and a 'Tokenization' column whose values
        are whitespace-separated token strings.
    lexicon_words : list of str
        Sentiment-lexicon words to compare against the news tokens.
    model : object
        Anything exposing ``wv.similarity(word_a, word_b)``.
    lexicon_column : str
        Name of the output column that stores the lexicon word.
    threshold : float, default 0.5
        Minimum cosine similarity for a pair to be recorded.

    Returns
    -------
    pandas.DataFrame
        Columns 'date', 'news_num' (1-based position of the article in
        ``news``), ``lexicon_column``, 'news_word', 'cosine_similarity' and
        'frequency' (how many times 'news_word' occurs in the article).
    """
    columns = ['date', 'news_num', lexicon_column, 'news_word', 'cosine_similarity', 'frequency']
    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer exists
    # in pandas >= 2.0.
    records = []
    articles = zip(news['date'], news['Tokenization'])
    for news_num, (date, tokenized) in enumerate(articles, start=1):
        # One pass gives both the distinct tokens and their frequencies.
        # Counter keeps first-appearance order, so the row order is the same
        # on every run (set order changes with string-hash randomisation).
        token_counts = Counter(tokenized.split())
        for lexicon_word in lexicon_words:
            for news_word, frequency in token_counts.items():
                # Skip identical words explicitly; relying only on a float
                # comparison with 1.0 can miss them because of rounding.
                if news_word == lexicon_word:
                    continue
                similarity = model.wv.similarity(lexicon_word, news_word)
                # `!= 1.0` is retained from the original filter.
                if similarity >= threshold and similarity != 1.0:
                    records.append({
                        'date': date,
                        'news_num': news_num,
                        lexicon_column: lexicon_word,
                        'news_word': news_word,
                        'cosine_similarity': similarity,
                        'frequency': frequency,
                    })
                    print("***{0} Cosine Similarity between <{1}> & <{2}> : {3}".format(date, lexicon_word, news_word, similarity))
    return pd.DataFrame(records, columns=columns)


# Cosine similarity between the KOSELF positive words and the news tokens
pos_lexicon_2020 = build_similarity_table(lexicon_2020, positive, ko_model, lexicon_column='KOSELF_pos_word')

***2020-01-02 Cosine Similarity between <강세> & <약세> : 0.7028064727783203
***2020-01-02 Cosine Similarity between <경신> & <기록> : 0.522990882396698
***2020-01-02 Cosine Similarity between <실현> & <차익> : 0.531851053237915
***2020-01-02 Cosine Similarity between <우세> & <강세> : 0.5120405554771423
***2020-01-02 Cosine Similarity between <우호적> & <우호> : 0.5347263216972351
***2020-01-02 Cosine Similarity between <강세> & <약세> : 0.7028064727783203
***2020-01-02 Cosine Similarity between <경신> & <기록> : 0.522990882396698
***2020-01-02 Cosine Similarity between <실현> & <차익> : 0.531851053237915
***2020-01-02 Cosine Similarity between <우세> & <강세> : 0.5120405554771423
***2020-01-02 Cosine Similarity between <우호적> & <우호> : 0.5347263216972351
***2020-01-02 Cosine Similarity between <추진> & <사업> : 0.5213737487792969
***2020-01-03 Cosine Similarity between <경신> & <기록> : 0.522990882396698
***2020-01-03 Cosine Similarity between <중요한> & <가장> : 0.5371253490447998
***2020-01-03 Cosine Similarity between <경신> & <기록> :

In [11]:
# Display the collected (lexicon word, news word) matches.
pos_lexicon_2020

Unnamed: 0,date,news_num,KOSELF_pos_word,news_word,cosine_similarity,frequency
0,2020-01-02,1,강세,약세,0.702806,2
1,2020-01-02,1,경신,기록,0.522991,2
2,2020-01-02,1,실현,차익,0.531851,2
3,2020-01-02,1,우세,강세,0.512041,1
4,2020-01-02,1,우호적,우호,0.534726,1
...,...,...,...,...,...,...
9709,2021-01-04,2029,적극적으로,적극,0.663953,1
9710,2021-01-04,2029,중요한,가장,0.537125,1
9711,2021-01-04,2029,추진,계획,0.520543,1
9712,2021-01-04,2029,추진,사업,0.521374,4


In [12]:
# Persist the positive-lexicon matches as CSV, without the index column.
pos_lexicon_2020.to_csv(
    path_or_buf='../../../../Code/Data/Test/Stock-Year/pos_celltrion_2020.csv',
    index=False,
)

In [13]:
# Distinct news tokens that matched any positive lexicon word.
a = list(set(pos_lexicon_2020['news_word'].tolist()))

a

['구현',
 '박기효',
 '확보',
 '실패',
 '펨페르트',
 '취합돼',
 '항바러스',
 '리뉴어블',
 '유나이티드헬스케어',
 '마그비',
 '스토니브룩대',
 '재슬러',
 '과학비즈니스벨트',
 '풍림파마텍',
 '리레이팅',
 '프론텍',
 '레그단비맙',
 '감소',
 '활황',
 '강력',
 '로슈그룹',
 '듀피젠트',
 '요구',
 '증진',
 '재원임',
 '코세스',
 '업황',
 '증가',
 '확충',
 '보통신',
 '가장',
 '중인',
 '엠에스',
 '산화칼륨',
 '재택명령',
 '홍석균',
 '칸진티',
 '인플렉트',
 '인플릭시맵',
 '프로그램매매',
 '충족',
 '조정시기',
 '문제',
 '머니무브',
 '스제약위더스',
 '박제완',
 '장단점',
 '중요',
 '훈풍',
 '우호',
 '들보',
 '하나벤처스',
 '토픽스지수',
 '펜데믹',
 '스지누스',
 '송경란',
 '방법',
 '스크리닝하',
 '피엠밸류로',
 '확립',
 '개월안',
 '의미',
 '적극',
 '횡보세',
 '니케이지수',
 '진행',
 '센티먼트',
 '이점',
 '흐트러지',
 '급증',
 '윤철진',
 '침체',
 '맥쿼리인프라',
 '권평오',
 '아달리무맙',
 '우세',
 '차익',
 '엔텔스엔텔스',
 '국립기관',
 '웃돈',
 '시너지',
 '해체설',
 '웃돈다',
 '견조할',
 '트윈데믹',
 '임일순',
 '공유진',
 '조보람',
 '윤제용',
 '계획',
 '푸나왈',
 '입관식',
 '엠앤티삼강엠',
 '증대',
 '난펑그룹',
 '스포츠투나잇',
 '평년작',
 '텍드림텍',
 '얼머스인베트먼트',
 '젬백스젬백스',
 '최선',
 '박승영',
 '자공시',
 '엔투텍엔투텍',
 '추진',
 '언택트주',
 '저하',
 '개선',
 '극복',
 '정본부장',
 '특징',
 '해법',
 '유데나필',
 '펀인터렉티브',
 '나정환',
 '장점',
 '거버넌',
 '밸트로핀',
 '황재홍',
 '

In [14]:
# Distinct news tokens whose cosine similarity is at least 0.7.
b = list(set(
    pos_lexicon_2020.loc[pos_lexicon_2020['cosine_similarity'] >= 0.7, 'news_word'].tolist()
))

b

['불황',
 '얼머스인베트먼트',
 '최선',
 '실패',
 '단점',
 '개선',
 '쎄미켐동진쎄미켐',
 '충족',
 '밑돈',
 '추진',
 '약세',
 '장점']

In [15]:
# Distinct news tokens whose cosine similarity is at least 0.65.
c = list(set(
    pos_lexicon_2020.loc[pos_lexicon_2020['cosine_similarity'] >= 0.65, 'news_word'].tolist()
))

c

['얼머스인베트먼트',
 '최선',
 '실패',
 '것이므',
 '추진',
 '개선',
 '적극',
 '불황',
 '장점',
 '감소',
 '쎄미켐동진쎄미켐',
 '증가',
 '밑돈',
 '재택명령',
 '트윈데믹',
 '웃돌',
 '단점',
 '충족',
 '약세']