# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
fasttext.util.download_model('ko', if_exists='ignore')
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf

# Ask TensorFlow which physical GPU devices it can see; an empty list means none.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
gpu_status = '사용 가능' if gpu_devices else '사용 불가능'
print('GPU', gpu_status)

GPU 사용 불가능


In [3]:
# Shell out to nvidia-smi to show the installed NVIDIA driver / CUDA version and current GPU usage.
!nvidia-smi

Sat Sep 18 07:13:04 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 452.56       Driver Version: 452.56       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX450      WDDM  | 00000000:2D:00.0 Off |                  N/A |
| N/A   62C    P8    N/A /  N/A |    119MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Read KOSELF & Test Data**

### **① KOSELF 감성 어휘 사전**

In [4]:
def _read_word_list(path):
    """Read a UTF-8 text file holding one lexicon entry per line.

    The trailing newline is removed from every line. Lines that are empty or
    contain only whitespace are skipped so that no empty string ends up in the
    lexicon (an empty entry would otherwise be passed to the similarity lookup
    later on). Entries themselves are kept unchanged, including multi-word
    phrases.

    Parameters
    ----------
    path : str
        Path of the word-list file.

    Returns
    -------
    list of str
        The entries in file order.
    """
    with open(path, encoding='utf-8') as word_file:
        entries = (line.rstrip('\n') for line in word_file)
        return [entry for entry in entries if entry.strip()]


# Basic Korean positive / negative word lists taken from a blog
positive_blog = _read_word_list('positive_words_self.txt')
negative_blog = _read_word_list('negative_words_self.txt')

# KOSELF sentiment lexicon
positive = _read_word_list('KOSELF_pos.txt')
negative = _read_word_list('KOSELF_neg.txt')

### **② 2018년 Samsung Data**

In [5]:
# Load the 2018 Samsung news / stock-price dataset and peek at its first row.
samsung_2018_csv = '../../../../Code/Data/Test/Stock-Year/2018_samsung.csv'
lexicon_2018 = pd.read_csv(samsung_2018_csv)

lexicon_2018.head(1)

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization
0,삼성전자,5930,매일경제,2018010100,스마트베타ETF 고공행진 새해도 이어질까,http://news.mk.co.kr/newsRead.php?year=2018&no=29,\n\n\n 수수료가 싼 상장지수펀드(ETF)에 펀드매니저가 종목을 고르는 액티브 ...,2018-01-02,0,51380,51400,50780,51020,169485,0.001177,1,0,수수료 상장 지수 펀드 펀드매니저 종목 액티브 펀드 특성 가미 스마트 베타 대한 목...


In [6]:
# Show the rows whose article body ('text') is missing.
missing_text_mask = lexicon_2018['text'].isna()
lexicon_2018.loc[missing_text_mask]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,Tokenization


In [7]:
# Discard every row that has a missing value, renumber the index from 0,
# then confirm that no column has any NaN left.
lexicon_2018 = lexicon_2018.dropna(axis=0).reset_index(drop=True)
lexicon_2018.isna().sum()

st_n                 0
st_cd                0
news                 0
datetime             0
title                0
url                  0
text                 0
date                 0
time                 0
Open                 0
High                 0
Low                  0
Close                0
Volume               0
Change               0
UpDown               0
Extremely_Changed    0
Tokenization         0
dtype: int64

## **Calculate Cosine Similarity**

In [8]:
# Pre-trained Korean FastText model (Facebook cc.ko.300).
# Reuse the model if this kernel already holds it; otherwise load it from disk.
# Only NameError ("ko_model is not defined yet") triggers a load, so unrelated
# failures and keyboard interrupts are no longer swallowed by a bare except.
try:
    ko_model
except NameError:
    ko_model = models.fasttext.load_facebook_model('cc.ko.300.bin')
else:
    print(ko_model, '모델 로드 불필요')

In [9]:
# Raise the interpreter's maximum recursion depth for the rest of the session.
limit_number = 15_000
sys.setrecursionlimit(limit_number)

In [None]:
# Build the table of (KOSELF positive word, news token) pairs whose FastText
# cosine similarity is at least SIMILARITY_THRESHOLD.
#
# Columns of the result:
#   date              - value of the article's 'date' column
#   news_num          - 1-based position of the article in lexicon_2018
#   KOSELF_pos_word   - entry of the KOSELF positive lexicon
#   news_word         - distinct token of the article's 'Tokenization' text
#   cosine_similarity - ko_model.wv.similarity(KOSELF_pos_word, news_word)
#   frequency         - number of times news_word occurs in that article
#
# Changes compared with the previous version of this cell:
#   * rows are collected in a list and the DataFrame is built once
#     (DataFrame.append copied the whole frame on every row and no longer
#     exists in pandas 2.x);
#   * each article is tokenized once and each similarity is computed once;
#   * token frequencies come from a Counter instead of rescanning the article;
#   * distinct tokens are visited in first-appearance order, so the row order
#     no longer depends on string-hash ordering of a set;
#   * a token identical to the lexicon word is skipped by string comparison,
#     because a float32 self-similarity is not guaranteed to be exactly 1.0
#     (the original `!= 1.0` check is kept as well).
SIMILARITY_THRESHOLD = 0.5

pos_lexicon_columns = ['date', 'news_num', 'KOSELF_pos_word', 'news_word', 'cosine_similarity', 'frequency']
pos_records = []

for news_num, (news_date, tokenized_text) in enumerate(
        zip(lexicon_2018['date'], lexicon_2018['Tokenization']), start=1):
    # token -> occurrences within this article, keyed in first-appearance order
    token_counts = Counter(tokenized_text.split())
    for pos_word in positive:
        for news_word, freq in token_counts.items():
            if news_word == pos_word:
                continue
            similarity = ko_model.wv.similarity(pos_word, news_word)
            # Positive form of the test so that a NaN similarity is rejected.
            if similarity >= SIMILARITY_THRESHOLD and similarity != 1.0:
                pos_records.append({
                    'date': news_date,
                    'news_num': news_num,
                    'KOSELF_pos_word': pos_word,
                    'news_word': news_word,
                    'cosine_similarity': similarity,
                    'frequency': freq,
                })
                print("***{0} Cosine Similarity between <{1}> & <{2}> : {3}".format(news_date, pos_word, news_word, similarity))

# Passing the column list keeps the expected schema even when nothing matched.
pos_lexicon_2018 = pd.DataFrame(pos_records, columns=pos_lexicon_columns)

***2018-01-02 Cosine Similarity between <경신> & <기록> : 0.522990882396698
***2018-01-02 Cosine Similarity between <장점> & <특성> : 0.5244326591491699
***2018-01-02 Cosine Similarity between <경신> & <기록> : 0.522990882396698
***2018-01-02 Cosine Similarity between <의심의 여지가 없는> & <자공시> : 0.5188294053077698
***2018-01-02 Cosine Similarity between <증대> & <증가> : 0.6946426630020142
***2018-01-02 Cosine Similarity between <추진> & <중인> : 0.539746880531311
***2018-01-02 Cosine Similarity between <호황> & <훈풍> : 0.5522076487541199
***2018-01-02 Cosine Similarity between <추진> & <사업> : 0.5213737487792969
***2018-01-02 Cosine Similarity between <추진> & <계획> : 0.5205424427986145
***2018-01-02 Cosine Similarity between <강세> & <약세> : 0.7028064727783203
***2018-01-02 Cosine Similarity between <개선> & <방안> : 0.5635414123535156
***2018-01-02 Cosine Similarity between <실현> & <차익> : 0.531851053237915
***2018-01-02 Cosine Similarity between <우세> & <강세> : 0.5120405554771423
***2018-01-02 Cosine Similarity between <중요한> 

In [16]:
# Preview the collected positive-lexicon matches (pandas abbreviates the display of long frames).
pos_lexicon_2018

Unnamed: 0,date,news_num,KOSELF_pos_word,news_word,cosine_similarity,frequency
0,2018-01-02,1,경신,기록,0.522991,3
1,2018-01-02,1,장점,특성,0.524433,1
2,2018-01-02,2,경신,기록,0.522991,1
3,2018-01-02,2,의심의 여지가 없는,자공시,0.518829,1
4,2018-01-02,2,증대,증가,0.694643,3
...,...,...,...,...,...,...
37699,2019-01-02,9043,중요한,가장,0.537125,1
37700,2019-01-02,9043,증대,증가,0.694643,1
37701,2019-01-02,9043,추진,계획,0.520542,2
37702,2019-01-02,9043,호황,업황,0.554454,1


In [17]:
# Persist the positive-lexicon matches as CSV, leaving out the DataFrame index.
pos_output_csv = '../../../../Code/Data/Test/Stock-Year/pos_samsung_2018.csv'
pos_lexicon_2018.to_csv(pos_output_csv, index=False)

In [18]:
# Distinct news tokens that matched any KOSELF positive word.
matched_news_words = pos_lexicon_2018['news_word']
a = list(set(matched_news_words))

a

['설진훈',
 '알루텍비',
 '중소협렵체',
 '아펨스테크놀러지',
 '배민아',
 '포켓린트',
 '헤이덤엘파딜',
 '알체릭',
 '강력',
 '므그자브래비',
 '파워스트립',
 '거트너',
 '더샵퍼스트월드',
 '스크레처블',
 '엮어주',
 '윤여삼',
 '궈칭제',
 '애딕트',
 '전장시',
 '신윤근',
 '쎄미켐동진쎄미켐',
 '벽체',
 '샹들리',
 '국봉환',
 '가맹업',
 '윤제용',
 '약세',
 '투티엠',
 '브르나이',
 '옴니아폰',
 '송동삼송',
 '카투홈',
 '라디언트',
 '응우옌티낌응언',
 '경량패딩',
 '최남채',
 '리밸런싱했다',
 '최봉길',
 '진행',
 '반지원',
 '보안컨퍼러스',
 '리둥성',
 '우트르몽시',
 '조리갯값',
 '인멸했다',
 '기가와트',
 '감소',
 '강점',
 '팰리월',
 '잭라봇랩스',
 '중앙디앤엠',
 '송출되',
 '용퇴해',
 '추로스',
 '박학규',
 '우뜨흐몽',
 '주요한',
 '판티치',
 '리바운딩',
 '로컬테인먼트',
 '월곶판교선',
 '카티린',
 '토픽스지수',
 '덴티움덴티움',
 '던츠컵',
 '관악고',
 '알카텔루슨트',
 '완샹그룹',
 '닛케이지수',
 '펑황왕',
 '호앙쭝타',
 '박봉흠',
 '아람휴비스',
 '웨강아오',
 '비난가능성',
 '소재효',
 '탈루액',
 '차익',
 '가치나',
 '폭락',
 '권평오',
 '강병윤',
 '암페어시',
 '조호윤',
 '허페이시',
 '공주의료원',
 '리비젼',
 '엄웅렬',
 '앤텍컴',
 '공유진',
 '빈둥거리',
 '방송업',
 '기술러',
 '최고점',
 '뒤집혔',
 '아라마리',
 '드래콘',
 '합종연횡해',
 '매경테스트',
 '강관류',
 '랙티브',
 '씨쓰루팀',
 '민정웅',
 '특성',
 '양규석',
 '스냅드래건',
 '임채환',
 '곽수윤',
 '롱텀펀드',
 '에임트',
 '로컬디밍',
 '쁘랏트엉쑥씨',
 '상생컨설팅팀',
 '문제해결',

In [19]:
# Distinct news tokens whose similarity to a KOSELF positive word is at least 0.7.
similarity_070_mask = pos_lexicon_2018['cosine_similarity'] >= 0.7
b = list(set(pos_lexicon_2018.loc[similarity_070_mask, 'news_word']))

b

['셀루메드셀루메드', '밑돈', '쎄미켐동진쎄미켐', '단점', '불황', '추진', '실패', '최고점', '우세', '약세', '추천']

In [20]:
# Distinct news tokens whose similarity to a KOSELF positive word is at least 0.65.
similarity_065_mask = pos_lexicon_2018['cosine_similarity'] >= 0.65
c = list(set(pos_lexicon_2018.loc[similarity_065_mask, 'news_word']))

c

['김민현',
 '것이므',
 '룩시드랩스',
 '밑돈',
 '탈루액',
 '실패',
 '교제비',
 '증가',
 '추천',
 '숭요처',
 '셀루메드셀루메드',
 '단점',
 '진르터우탸오',
 '맞소송',
 '감소',
 '중앙디앤엠',
 '웃돌',
 '송출되',
 '리비젼',
 '갈승훈',
 '맹주들',
 '쎄미켐동진쎄미켐',
 '불황',
 '추진',
 '최고점',
 '국봉환',
 '약세',
 '적극',
 '우세']