# **Lexicon**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
# fasttext.util.download_model('ko', if_exists='ignore')   # FastText 모델 사용 시에만 필요
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Read Data**

### **① KOSELF 감성 어휘 사전**

In [2]:
def _load_word_list(path):
    """Return every line of the UTF-8 text file at `path` with '\\n' removed."""
    with open(path, encoding='utf-8') as handle:
        return [line.replace('\n', '') for line in handle]


# Basic Korean positive / negative word lists taken from a blog
positive_blog = _load_word_list('positive_words_self.txt')
negative_blog = _load_word_list('negative_words_self.txt')

# KOSELF sentiment lexicon
positive = _load_word_list('KOSELF_pos.txt')
negative = _load_word_list('KOSELF_neg.txt')

### **② Celltrion Lexicon**

#### **1) Positive**

In [3]:
# Load the yearly Celltrion positive-word CSVs (2018 through 2021).
pos_2018, pos_2019, pos_2020, pos_2021 = (
    pd.read_csv('../../../../Code/Data/Test/Stock-Year/pos_celltrion_{}.csv'.format(year))
    for year in (2018, 2019, 2020, 2021)
)

# Print each year's (rows, columns) on a single line.
print(*(yearly.shape for yearly in (pos_2018, pos_2019, pos_2020, pos_2021)))

# Preview the first three rows of the 2018 table.
pos_2018.head(3)

(9952, 6) (7587, 6) (9714, 6) (7047, 6)


Unnamed: 0,date,news_num,KOSELF_pos_word,news_word,cosine_similarity,frequency
0,2018-01-02,1,강세,약세,0.702806,2
1,2018-01-02,1,개선,개선,1.0,1
2,2018-01-02,1,개선,방안,0.563541,3


In [4]:
# Combine all years into one table, then renumber the rows from 0.
yearly_pos_frames = [pos_2018, pos_2019, pos_2020, pos_2021]
lexicon_pos = pd.concat(yearly_pos_frames, axis=0)
lexicon_pos = lexicon_pos.reset_index(drop=True)

# Total number of rows across the four years.
len(lexicon_pos)

34300

In [5]:
# Build one list of distinct news words per lower cosine-similarity cut-off
# (0.50 / 0.60 / 0.65 / 0.70). Rows with similarity >= 0.99 are excluded in
# every list — presumably these are (near-)exact matches of the KOSELF seed
# word itself; TODO confirm.
#
# sorted(...unique()) is used instead of list(set(...)): string hashing is
# randomized per interpreter run, so set iteration order (and therefore the
# line order of any file written from these lists) was not reproducible.
# dropna() keeps missing news_word values (NaN floats) out of the word lists,
# where they would break later string concatenation.
pos_similarity = lexicon_pos['cosine_similarity']
pos_below_upper = pos_similarity < 0.99

lexicon_pos_50 = sorted(lexicon_pos.loc[(pos_similarity > 0.50) & pos_below_upper, 'news_word'].dropna().unique())
lexicon_pos_60 = sorted(lexicon_pos.loc[(pos_similarity > 0.60) & pos_below_upper, 'news_word'].dropna().unique())
lexicon_pos_65 = sorted(lexicon_pos.loc[(pos_similarity > 0.65) & pos_below_upper, 'news_word'].dropna().unique())
lexicon_pos_70 = sorted(lexicon_pos.loc[(pos_similarity > 0.70) & pos_below_upper, 'news_word'].dropna().unique())

# Number of distinct words kept at each threshold.
len(lexicon_pos_50), len(lexicon_pos_60), len(lexicon_pos_65), len(lexicon_pos_70)

(323, 77, 23, 9)

#### **2) Negative**

In [6]:
# Load the yearly Celltrion negative-word CSVs (2018 through 2021).
neg_2018, neg_2019, neg_2020, neg_2021 = (
    pd.read_csv('../../../../Code/Data/Test/Stock-Year/neg_celltrion_{}.csv'.format(year))
    for year in (2018, 2019, 2020, 2021)
)

# Print each year's (rows, columns) on a single line.
print(*(yearly.shape for yearly in (neg_2018, neg_2019, neg_2020, neg_2021)))

# Preview the first three rows of the 2018 table.
neg_2018.head(3)

(14511, 6) (11117, 6) (12540, 6) (9266, 6)


Unnamed: 0,date,news_num,KOSELF_neg_word,news_word,cosine_similarity,frequency
0,2018-01-02,1,둔화,감소,0.534909,1
1,2018-01-02,1,둔화,업황,0.55782,2
2,2018-01-02,1,둔화,하락,0.526078,1


In [7]:
# Combine all years into one table, then renumber the rows from 0.
yearly_neg_frames = [neg_2018, neg_2019, neg_2020, neg_2021]
lexicon_neg = pd.concat(yearly_neg_frames, axis=0)
lexicon_neg = lexicon_neg.reset_index(drop=True)

# Total number of rows across the four years.
len(lexicon_neg)

47434

In [8]:
# Build one list of distinct news words per lower cosine-similarity cut-off
# (0.50 / 0.60 / 0.65 / 0.70). Rows with similarity >= 0.99 are excluded in
# every list — presumably these are (near-)exact matches of the KOSELF seed
# word itself; TODO confirm.
#
# sorted(...unique()) is used instead of list(set(...)): string hashing is
# randomized per interpreter run, so set iteration order (and therefore the
# line order of any file written from these lists) was not reproducible.
# dropna() keeps missing news_word values (NaN floats) out of the word lists,
# where they would break later string concatenation.
neg_similarity = lexicon_neg['cosine_similarity']
neg_below_upper = neg_similarity < 0.99

lexicon_neg_50 = sorted(lexicon_neg.loc[(neg_similarity > 0.50) & neg_below_upper, 'news_word'].dropna().unique())
lexicon_neg_60 = sorted(lexicon_neg.loc[(neg_similarity > 0.60) & neg_below_upper, 'news_word'].dropna().unique())
lexicon_neg_65 = sorted(lexicon_neg.loc[(neg_similarity > 0.65) & neg_below_upper, 'news_word'].dropna().unique())
lexicon_neg_70 = sorted(lexicon_neg.loc[(neg_similarity > 0.70) & neg_below_upper, 'news_word'].dropna().unique())

# Number of distinct words kept at each threshold.
len(lexicon_neg_50), len(lexicon_neg_60), len(lexicon_neg_65), len(lexicon_neg_70)

(79, 10, 5, 2)

## **Add Words to KOSELF**

In [9]:
# Positive words: the KOSELF list followed by the news words selected at each
# similarity threshold. `positive + extra` builds a new list every time, so
# the base list is left untouched.
koself_pos_50, koself_pos_60, koself_pos_65, koself_pos_70 = (
    positive + extra
    for extra in (lexicon_pos_50, lexicon_pos_60, lexicon_pos_65, lexicon_pos_70)
)

# Negative words: same construction on the negative side.
koself_neg_50, koself_neg_60, koself_neg_65, koself_neg_70 = (
    negative + extra
    for extra in (lexicon_neg_50, lexicon_neg_60, lexicon_neg_65, lexicon_neg_70)
)

In [10]:
# Write one lexicon file per polarity and cosine-similarity threshold,
# one word per line.
similar = ['50', '60', '65', '70']

# Explicit lookup tables replace the globals()['koself_pos_' + ...] string
# lookups, so a misspelled or missing list fails here with a clear NameError.
koself_pos_by_threshold = {
    '50': koself_pos_50,
    '60': koself_pos_60,
    '65': koself_pos_65,
    '70': koself_pos_70,
}
koself_neg_by_threshold = {
    '50': koself_neg_50,
    '60': koself_neg_60,
    '65': koself_neg_65,
    '70': koself_neg_70,
}

# Positive files are written first, then negative ones (same order as before).
for polarity, words_by_threshold in (('pos', koself_pos_by_threshold),
                                     ('neg', koself_neg_by_threshold)):
    for threshold in similar:
        lexicon_path = '../../../../Code/Lexicon/lexicon_celltrion_{}_{}.txt'.format(polarity, threshold)
        # encoding='utf-8' matches how the source word lists are read; without
        # it open() uses the platform default, which can garble or reject
        # Korean text on non-UTF-8 locales. The with-block closes the file even
        # if a write fails.
        with open(lexicon_path, 'w', encoding='utf-8') as lexicon_file:
            lexicon_file.writelines(word + '\n' for word in words_by_threshold[threshold])