# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# DB 연결
# !pip install pymysql
import pymysql

# 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

# 텍스트 분석
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score

# 모델 저장 및 로드
import joblib

# 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see; an empty list means none is usable.
# `tf.config.list_physical_devices` is the stable name of the former
# `tf.config.experimental` alias.
gpu_devices = tf.config.list_physical_devices('GPU')

# Prints "GPU" followed by the Korean text for "available" / "unavailable".
print('GPU', '사용 가능' if gpu_devices else '사용 불가능')

GPU 사용 불가능


In [3]:
# IPython shell escape: run `nvidia-smi` and show its GPU/driver status table.
!nvidia-smi

Sat Sep 11 09:12:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 452.56       Driver Version: 452.56       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce MX450      WDDM  | 00000000:2D:00.0 Off |                  N/A |
| N/A   61C    P8    N/A /  N/A |    119MiB /  2048MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## **Read Data**

### **① KOSELF 감성 어휘 사전**

In [4]:
def _read_word_list(path):
    """Read a UTF-8 text file and return its lines without newline characters.

    Parameters
    ----------
    path : str
        Path of the word-list file to read.

    Returns
    -------
    list of str
        One entry per line, in file order. Only the newline character is
        removed; other whitespace and empty lines are kept unchanged.
    """
    with open(path, encoding='utf-8') as word_file:
        return [line.replace('\n', '') for line in word_file]


# Basic Korean positive/negative word lists taken from a blog
positive_blog = _read_word_list('positive_words_self.txt')
negative_blog = _read_word_list('negative_words_self.txt')

# KOSELF sentiment lexicon
positive = _read_word_list('KOSELF_pos.txt')
negative = _read_word_list('KOSELF_neg.txt')

### **② 연도별 News Data**

In [5]:
# Load one news CSV per year and publish it as a module-level `news_<year>` DataFrame.
years = ['2018', '2019', '2020', '2021']
for year in years:
    frame_name = 'news_' + year
    csv_path = '../../../../Code/Data/news_{}.csv'.format(year)
    globals()[frame_name] = pd.read_csv(csv_path)
    # Drop, in place, every row that has a missing value in any column.
    globals()[frame_name].dropna(axis=0, inplace=True)

In [6]:
# Stack the 2018, 2019 and 2020 frames row-wise; news_2021 is not part of `test`.
# Original index labels are kept (no re-indexing).
test = pd.concat(objs=[news_2018, news_2019, news_2020], axis=0)

# (rows, columns) of the combined frame
test.shape

(52484, 18)

In [7]:
# Sanity check: list any rows of `test` whose 'text' value is missing.
test.loc[test['text'].isnull()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,score,Open,High,Low,Close,Volume,Change,UpDown,Extremely_Changed


### **③ Stop Words**

In [8]:
# Stop-word list, one entry per line of the file. Source:
# https://gist.github.com/spikeekips/40eea22ef4a89f629abd87eed535ac6a#file-stopwords-ko-txt
with open('stopwords-ko.txt', encoding='utf-8') as sw:
    # Remove the newline character from every line while reading.
    stop_words = [line.replace('\n', '') for line in sw]

## **Sentiment Analysis**

### **① 연도별 Lexicon**

#### **(1) Pos Dict**

In [9]:
tokenizer = Okt()

# Membership tests run once per extracted noun, so use a set for O(1) lookups.
# `stop_words` itself is left untouched.
stop_word_set = set(stop_words)

for year in years:
    yearly_news = globals()['news_' + year]
    # Articles with Extremely_Changed == 1 supply the positive word list.
    positive_text = yearly_news[yearly_news['Extremely_Changed'] == 1]['text']

    # Extract nouns from each article, drop stop words, and publish the
    # resulting token list as the module-level `pos_<year>`.
    globals()['pos_' + year] = [
        noun
        for article in positive_text
        for noun in tokenizer.nouns(article)
        if noun not in stop_word_set
    ]

#### **(2) Neg Dict**

In [10]:
tokenizer = Okt()

# Membership tests run once per extracted noun, so use a set for O(1) lookups.
# `stop_words` itself is left untouched.
stop_word_set = set(stop_words)

for year in years:
    yearly_news = globals()['news_' + year]
    # NOTE(review): this filter (Extremely_Changed == 1) is the same one used to
    # build the pos_<year> lists, so neg_<year> will contain the same tokens as
    # pos_<year>. This looks like a copy-paste slip. Confirm which
    # Extremely_Changed value marks the negative articles and use it here.
    # The value is left unchanged because the label encoding is not visible.
    negative_text = yearly_news[yearly_news['Extremely_Changed'] == 1]['text']

    # Extract nouns from each article, drop stop words, and publish the
    # resulting token list as the module-level `neg_<year>`.
    globals()['neg_' + year] = [
        noun
        for article in negative_text
        for noun in tokenizer.nouns(article)
        if noun not in stop_word_set
    ]

#### **(3) Word Frequency Dict**

In [11]:
# Occurrence count of the single most frequent token in the 2018 positive list.
Counter(pos_2018).most_common(1)[0][1]

2671

In [12]:
# (number of distinct tokens, total number of tokens) in the 2018 positive list
len(set(pos_2018)), len(pos_2018)

(11682, 238464)

In [13]:
# Turn every yearly token list into a (word, frequency) table, most frequent first.
# Each pair is (name prefix of the resulting frame, name prefix of the source list):
# positive words first, then negative words.
for lexicon_prefix, tokens_prefix in (('lexicon_pos_', 'pos_'), ('lexicon_neg_', 'neg_')):
    for year in years:
        word_counts = Counter(globals()[tokens_prefix + year]).most_common()
        globals()[lexicon_prefix + year] = pd.DataFrame(word_counts, columns=['word', 'frequency'])

In [14]:
# Display the 2018 positive-word frequency table (columns: word, frequency).
lexicon_pos_2018

Unnamed: 0,word,frequency
0,코스피,2671
1,전일,2319
2,거래,2090
3,증권,1778
4,대비,1748
...,...,...
11677,태동,1
11678,차트,1
11679,지선,1
11680,김광현,1
