# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
fasttext.util.download_model('ko', if_exists='ignore')
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see and report availability
# (the printed labels mean "available" / "not available").
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
gpu_status = '사용 가능' if gpu_devices else '사용 불가능'
print('GPU', gpu_status)

GPU 사용 불가능


In [3]:
# Shell escape: print the NVIDIA driver / GPU status table for this machine.
!nvidia-smi

Wed Sep 15 16:00:12 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 452.56       Driver Version: 452.56       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX450      WDDM  | 00000000:2D:00.0 Off |                  N/A |
| N/A   65C    P8    N/A /  N/A |    119MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Read KOSELF & Test Data**

### **① KOSELF 감성 어휘 사전**

In [4]:
# Load the four sentiment word lists (one word per line, UTF-8).
# Lines are whitespace-stripped and blank lines are skipped, so the lists never
# contain '' entries or stray '\r' characters that would later be fed to the
# similarity computation as if they were lexicon words.

# Basic Korean positive/negative word lists collected from a blog
with open('positive_words_self.txt', encoding='utf-8') as blog_pos_file:
    positive_blog = [line.strip() for line in blog_pos_file if line.strip()]
with open('negative_words_self.txt', encoding='utf-8') as blog_neg_file:
    negative_blog = [line.strip() for line in blog_neg_file if line.strip()]

# KOSELF sentiment lexicon
with open('KOSELF_pos.txt', encoding='utf-8') as koself_pos_file:
    positive = [line.strip() for line in koself_pos_file if line.strip()]
with open('KOSELF_neg.txt', encoding='utf-8') as koself_neg_file:
    negative = [line.strip() for line in koself_neg_file if line.strip()]

### **② 2018~2020년 Data**

In [5]:
# Read the 2018–2020 news test set and preview its size and first row.
news_18to20_path = '../../../../Code/Data/Test/news_18to20.csv'
news_18to20 = pd.read_csv(news_18to20_path)

print(news_18to20.shape)
news_18to20.head(1)

(52484, 19)


Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,score,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed,except_stopwords
0,삼성전자,5930,매일경제,2018010100,스마트베타ETF 고공행진 새해도 이어질까,http://news.mk.co.kr/newsRead.php?year=2018&no=29,수수료가 싼 상장지수펀드(ETF)에 펀드매니저가 종목을 고르는 액티브 펀드 특성을...,2018-01-02,0,2,51380,51400,50780,51020,169485,0.001177,1,0,수수료 상장 지수 펀드 펀드매니저 종목 액티브 펀드 특성 가미 스마트 베타 대한 목...


In [6]:
# Drop every row containing a missing value, then renumber the index from 0
# so that positional and label-based access agree in later cells.
news_18to20 = news_18to20.dropna().reset_index(drop=True)

## **Calculate Cosine Similarity**

In [7]:
# Pre-trained Korean FastText model (cc.ko.300.bin).
# Re-running this cell reuses ko_model when it already exists in the kernel.
try:
    print(ko_model, '모델 로드 불필요')
except NameError:
    # ko_model is not defined yet in this kernel -> load it from disk.
    # Only NameError is caught so that real failures (and Ctrl-C) still surface.
    ko_model = models.fasttext.load_facebook_model('cc.ko.300.bin')

In [8]:
# Cosine similarity against the KOSELF *positive* lexicon (KOSELF_pos).
# Result: the distinct news tokens whose similarity to at least one positive
# lexicon word is greater than 0.5.
#
# Each distinct token is scored once: the documents are split a single time
# and de-duplicated up front, and scoring stops at the first lexicon word that
# clears the threshold. The resulting set of tokens is the same as with the
# exhaustive triple loop.
news_tokens = set()
for document in news_18to20['except_stopwords']:
    news_tokens.update(document.split())

pos_cosine = [
    token
    for token in news_tokens
    if any(ko_model.wv.similarity(pos_word, token) > 0.5 for pos_word in positive)
]

In [9]:
# Display the tokens collected as similar to the positive lexicon.
pos_cosine

['엠에스',
 '자오쟝',
 '랙하우징',
 '박기효',
 '의미',
 '박순환',
 '드래콘',
 '덴티움덴티움',
 '순안국제공항',
 '컨테이선',
 '김좌일',
 '우트르몽시',
 '드라마센타',
 '셀루메드셀루메드',
 '텔코웨어텔',
 '거버넌',
 '최선',
 '유심슬롯',
 '톈치그룹',
 '몰타기사단',
 '펜데믹',
 '민혜련',
 '그린메일',
 '딴마음',
 '월곶판교선',
 '텍메드바디텍메드',
 '얌샘김밥',
 '용산세무서',
 '홀릭레이쥔',
 '애블린',
 '노정희',
 '도대사',
 '준감위',
 '흐릿했다',
 '알루텍비',
 '커넥티드된',
 '고자세',
 '우세',
 '설진훈',
 '진행',
 '강창진',
 '삼척대',
 '우호',
 '리장시',
 '빠확인할',
 '강인길',
 '알라배마',
 '훈풍',
 '커넥티드화',
 '메켄지',
 '더싼딜',
 '헬그스트란트',
 '벳이글벳',
 '박제완',
 '여의도고',
 '김금안',
 '어쎔블록',
 '노점온도',
 '엠피대산엠',
 '어쏘시에이츠',
 '브랄로바',
 '중향후',
 '심병창',
 '트랜시페어런시',
 '라인웍스',
 '펫슈어런스',
 '노수혁',
 '숏패딩',
 '연천군',
 '선셋론',
 '양미영',
 '진압대장',
 '베른스테',
 '삼성고',
 '자이앤트',
 '취합시킬',
 '리밸린싱',
 '루어위',
 '세릍리온',
 '딜소싱',
 '블러디메리',
 '그리니치표준시',
 '한림대의료원',
 '디지털신호',
 '대림오토바이',
 '뉘른부르크링',
 '룽강구',
 '손효숙씨',
 '펀더빔',
 '수츠윈',
 '적격비용',
 '다이요유덴',
 '물량폭탄',
 '특감반',
 '곽수윤',
 '휴톡스',
 '스니더',
 '임홍택',
 '딥글린트간',
 '트랜션',
 '납픔단',
 '샤오양',
 '응우옌찌중',
 '운동장역',
 '랭크됐으',
 '진르터우탸오',
 '헛말이었',
 '범퍼하단',
 '구해오',
 '매렸다',
 '본격',
 '삼미의스텐',
 '멤브레용

In [1]:
# Number of entries in pos_cosine.
# NOTE(review): the recorded output is a NameError from a fresh kernel (In [1]);
# the cell that builds pos_cosine must be run before this one.
len(pos_cosine)

NameError: name 'pos_cosine' is not defined

In [None]:
# Cosine similarity against the KOSELF *negative* lexicon (KOSELF_neg).
# Result: the distinct news tokens whose similarity to at least one negative
# lexicon word is greater than 0.5.
#
# Matches are stored in neg_cosine; pos_cosine is left untouched.
# Each distinct token is scored once, and scoring stops at the first lexicon
# word that clears the threshold.
news_tokens = set()
for document in news_18to20['except_stopwords']:
    news_tokens.update(document.split())

neg_cosine = [
    token
    for token in news_tokens
    if any(ko_model.wv.similarity(neg_word, token) > 0.5 for neg_word in negative)
]

In [None]:
# Display the tokens collected as similar to the negative lexicon.
neg_cosine

## **Read Data**

### **① KOSELF 감성 어휘 사전**

In [None]:
# Load the four sentiment word lists (one word per line, UTF-8).
# Lines are whitespace-stripped and blank lines are skipped, so the lists never
# contain '' entries or stray '\r' characters.

# Basic Korean positive/negative word lists collected from a blog
with open('positive_words_self.txt', encoding='utf-8') as blog_pos_file:
    positive_blog = [line.strip() for line in blog_pos_file if line.strip()]
with open('negative_words_self.txt', encoding='utf-8') as blog_neg_file:
    negative_blog = [line.strip() for line in blog_neg_file if line.strip()]

# KOSELF sentiment lexicon
with open('KOSELF_pos.txt', encoding='utf-8') as koself_pos_file:
    positive = [line.strip() for line in koself_pos_file if line.strip()]
with open('KOSELF_neg.txt', encoding='utf-8') as koself_neg_file:
    negative = [line.strip() for line in koself_neg_file if line.strip()]

### **② 연도별 News Data**

In [None]:
# Read one CSV per year, drop rows with missing values, and publish each
# frame as a module-level variable named news_<year> (news_2018, ...).
years = ['2018', '2019', '2020', '2021']
for year in years:
    yearly_news = pd.read_csv('../../../../Code/Data/news_{}.csv'.format(year))
    yearly_news.dropna(axis=0, inplace=True)
    globals()['news_' + year] = yearly_news

In [None]:
# Stack the 2018, 2019 and 2020 frames row-wise (2021 is not included).
# The per-year index labels are kept as they are.
frames_18to20 = [news_2018, news_2019, news_2020]
test = pd.concat(frames_18to20)

test.shape

In [None]:
# Sanity check: rows whose 'text' is missing. The yearly frames were already
# passed through dropna, so this is expected to come back empty.
missing_text_mask = test['text'].isna()
test[missing_text_mask]

### **③ Stop Words**

In [None]:
# Korean stop-word list, one word per line. Source:
# https://gist.github.com/spikeekips/40eea22ef4a89f629abd87eed535ac6a#file-stopwords-ko-txt
with open('stopwords-ko.txt', encoding='utf-8') as stopword_file:
    raw_lines = stopword_file.readlines()
# Remove the newline character from every entry
stop_words = [line.replace('\n', '') for line in raw_lines]

## **Sentiment Analysis**

### **① 연도별 Lexicon**

#### **(1) Pos Dict**

In [None]:
# Build pos_<year>: every noun (stop words removed, duplicates kept) extracted
# from the articles of that year whose 'Extremely_Changed' value is 1.
tokenizer = Okt()

# Set membership is O(1) per token; the original list lookup scanned the whole
# stop-word list for every noun.
stop_word_set = set(stop_words)

for year in years:
    yearly_news = globals()['news_' + year]
    positive_text = yearly_news[yearly_news['Extremely_Changed'] == 1]['text']

    # Extract nouns from each article and keep those that are not stop words
    globals()['pos_' + year] = [
        noun
        for article in positive_text
        for noun in tokenizer.nouns(article)
        if noun not in stop_word_set
    ]

#### **(2) Neg Dict**

In [None]:
# Build neg_<year>: every noun (stop words removed, duplicates kept) extracted
# from the selected articles of that year.
#
# NOTE(review): the row filter below is `Extremely_Changed == 1`, exactly the
# same as in the positive cell, so neg_<year> ends up identical to pos_<year>.
# This looks like a copy-paste slip; the value that marks "negative" articles
# is not visible here — confirm the intended label and update the filter.
tokenizer = Okt()

# Set membership is O(1) per token; the original list lookup scanned the whole
# stop-word list for every noun.
stop_word_set = set(stop_words)

for year in years:
    yearly_news = globals()['news_' + year]
    negative_text = yearly_news[yearly_news['Extremely_Changed'] == 1]['text']

    # Extract nouns from each article and keep those that are not stop words
    globals()['neg_' + year] = [
        noun
        for article in negative_text
        for noun in tokenizer.nouns(article)
        if noun not in stop_word_set
    ]

#### **(3) Giro Dict**

In [None]:
# Occurrence count of the most frequent token in pos_2018
top_word, top_count = Counter(pos_2018).most_common(20)[0]
top_count

In [None]:
# (number of distinct tokens, total number of tokens) in pos_2018
pos_2018_counts = Counter(pos_2018)
(len(pos_2018_counts), len(pos_2018))

In [None]:
# For each polarity (positive first, then negative) and each year, turn the
# token list <polarity>_<year> into a frequency table sorted by descending
# count and publish it as lexicon_<polarity>_<year>.
for polarity in ('pos', 'neg'):
    for year in years:
        token_counts = Counter(globals()[polarity + '_' + year]).most_common()
        globals()['lexicon_' + polarity + '_' + year] = pd.DataFrame(
            token_counts, columns=['word', 'frequency']
        )

In [None]:
# Display the 2018 positive word/frequency table.
lexicon_pos_2018