# **Sentiment Analysis (Lexicon)**

In [1]:
import numpy as np
import pandas as pd

### Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### DB 연결
# !pip install pymysql
import pymysql

### DB에 저장
import sqlalchemy
from sqlalchemy import create_engine

### 실시간 주식가격 데이터
# !pip install finance-datareader
import FinanceDataReader as fdr

### 텍스트 분석
## KoNLPy
# 1) JAVA 설치, 2) Python 버전과 맞는 JPype1-py3 설치, 3) !pip install konlpy, 4) 설치 경로에서 jvm.py 파일 코드 67번 줄 주석 처리 
from konlpy.tag import Okt
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
## FastText
# !pip install gensim
# !pip install fasttext
import fasttext
import fasttext.util
# Facebook 한국어 Embedding 모델 다운로드 → 한 번만 설치하면 됨
# fasttext.util.download_model('ko', if_exists='ignore')   # FastText 모델 사용 시에만 필요
# 유사도 계산
from gensim import models

### 모델 학습 및 평가
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score

### 모델 저장 및 로드
import joblib

### 기타
import datetime
from collections import Counter
import sys
import warnings
warnings.filterwarnings('ignore')

## **Read Data**

### **① KOSELF + Blog 감성사전**

In [2]:
# Basic Korean positive / negative word lists taken from a blog.
# Each file is read line by line and the newline character is removed from every line.
with open('positive_words_self.txt', encoding='utf-8') as pos_blog:
    positive_blog = [line.replace('\n', '') for line in pos_blog]
with open('negative_words_self.txt', encoding='utf-8') as neg_blog:
    negative_blog = [line.replace('\n', '') for line in neg_blog]

# KOSELF sentiment lexicon (same one-entry-per-line handling as above)
with open('KOSELF_pos.txt', encoding='utf-8') as pos:
    positive = [line.replace('\n', '') for line in pos]
with open('KOSELF_neg.txt', encoding='utf-8') as neg:
    negative = [line.replace('\n', '') for line in neg]

In [3]:
# Entry counts, in order: KOSELF positive, blog positive, KOSELF negative, blog negative
tuple(len(words) for words in (positive, positive_blog, negative, negative_blog))

(48, 46, 47, 74)

In [4]:
# Merge the positive lexicons: append each blog word that `positive` does not contain yet
for word in positive_blog:
    if word in positive:
        continue
    positive.append(word)

# Merge the negative lexicons the same way
for word in negative_blog:
    if word in negative:
        continue
    negative.append(word)

In [5]:
# Entry counts of the merged positive and negative lexicons
tuple(map(len, (positive, negative)))

(92, 111)

In [6]:
# Overlap check: count the entries of `positive` that also occur in `negative`
cnt = sum(1 for word in positive if word in negative)

cnt

0

### **② Text Data**

In [7]:
# Combine the data per company and per year.
# For every company this creates two kinds of module-level names through globals():
#   <corp>_news            -> list of all frames read for that company
#   <corp>_<media>_<year>  -> the single frame read from one CSV file
corp_list = ['samsung', 'hyundai', 'lg', 'sk', 'celltrion']
years = ['2018', '2019', '2020', '2021']
media_list = ['maeil', 'asia']

for corp_name in corp_list:
    corp_frames = []
    globals()[corp_name + '_news'] = corp_frames
    for year in years:
        for media in media_list:
            frame = pd.read_csv('../../../../Final Data/{0}_{1}_{2}.csv'.format(year, media, corp_name))
            globals()[corp_name + '_' + media + '_' + year] = frame
            corp_frames.append(frame)

In [8]:
# Stack each company's list of frames row-wise and bind the result to a global named after the company.
# The original row labels of every source frame are kept (no ignore_index).
for corp_name in corp_list:
    corp_frames = globals()[corp_name + '_news']
    globals()[corp_name] = pd.concat(corp_frames, axis=0)

## **Sentiment Score**

In [9]:
# Rows of `samsung` whose 'Tokenization' value is missing
samsung.loc[samsung['Tokenization'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Tokenization
4144,삼성전자,5930,매일경제,2018121204,[2018 대한민국 지식재산대전] 일상의 불편함 없앤 발명품 한자리에,http://news.mk.co.kr/newsRead.php?year=2018&no...,\n\n\nДыХыЗЩЛѓРЛ МіЛѓЧб КЅУГБтОї ОЦГЏЗЮБзЧУЗЏН...,2018-12-12,4,40250,40700,40150,40450,12024279,0.004969,1,
754,삼성전자,5930,매일경제,2021040516,"QLED·OLED·미니LED… 복잡한 TV 제품, 삼성·LG TV 제품별 기능 차이...",http://news.mk.co.kr/newsRead.php?no=325040&ye...,¿ÃÇØ¿¡µµ ´Ù½Ã ÇÑ ¹ø »ï¼ºÀüÀÚ¿Í LGÀüÀÚÀÇ ¡®TV À...,2021-04-06,16,86200,86200,85100,86000,19042023,0.007026,1,


In [10]:
# Rows of `hyundai` whose 'Tokenization' value is missing
hyundai.loc[hyundai['Tokenization'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Tokenization
294,현대차,5380,매일경제,2019021914,"이시한, 취업계 2년 연속 트렌드 예측 경향성 소름 돋는 적중, ‘문어시한’이 떴다...",http://edu.mk.co.kr/edunews/news_view.php?sc=5...,\n\n,2019-02-19,14,119500,120500,119000,119500,470357,-0.008299,-1,
300,현대차,5380,매일경제,2019022011,"이시한이 말하는, 100:1이 일반화된 채용시장에서 살아남는 방법 - 인터뷰 #2",http://edu.mk.co.kr/edunews/news_view.php?sc=5...,\n\n,2019-02-20,11,119500,121500,119000,119000,735087,-0.004184,-1,
349,현대차,5380,매일경제,2020022017,스스로 알아서 최적 기어변속…현대·기아차 세계 최초 개발,http://news.mk.co.kr/newsRead.php?no=178210&ye...,ЧіДыЁЄБтОЦРкЕПТїДТ РќЙц ЕЕЗЮПЭ БГХы ЛѓШВРЛ ТїЗ...,2020-02-21,17,128000,129500,127000,128000,686417,-0.015385,-1,
1891,현대차,5380,매일경제,2020103016,"현대차 `두번` 찾은 문 대통령…""고용안정도 협력사 상생도 1등""",http://news.mk.co.kr/newsRead.php?no=1116169&y...,ЧіДыРкЕПТїИІ ЕЮЙјТА ЙцЙЎЧб ЙЎРчРЮ ДыХыЗЩРЬ Чі...,2020-11-02,16,166000,170500,166000,170500,1052301,0.036474,1,
2198,현대차,5380,매일경제,2020121808,"""로봇 안내견, 내쫓지 마세요""…현대차, `구조·돌봄 로봇` 공개",http://news.mk.co.kr/newsRead.php?no=1297569&y...,\n\n\nНКЦЬ НУПЌ РхИщ [ЛчСјСІАј=ЧіДыТїБзЗь]\n А...,2020-12-18,8,191000,191000,189000,189000,930831,-0.007874,-1,
2216,현대차,5380,매일경제,2020122113,"""없어서 못 팔아""…중고 `싼타페·투싼·포터`, `비대면 수출` 효자",http://news.mk.co.kr/newsRead.php?no=1305818&y...,\n\n\nÄ„·¹æ”¼­ ĄĪ±ā³ōĄŗ Åõ½Ń [»ēĮų Į¦°ų = æĄÅä...,2020-12-21,13,188500,189000,185000,187000,1120752,-0.010582,-1,
2224,현대차,5380,매일경제,2020122215,"""지금 팔면 돈 버네""…중고 `그랜저·쏘렌토·티볼리`, 몸값↑",http://news.mk.co.kr/newsRead.php?no=1310643&y...,\n\n\nĮß°ķĀ÷ ½ĆĄå ŗń¼ö±āæ”µµ ĮĮĄŗ °Ŗæ” ĘČ ¼ö Ą...,2020-12-23,15,183000,188500,183000,185000,1301433,0.005435,1,
2251,현대차,5380,매일경제,2020122814,"""팰리세이드, 두고 보자""…더 강해진 `포드 신형 SUV`가 온다",http://news.mk.co.kr/newsRead.php?no=1326358&y...,\n\n\nЦїЕх РЭНКЧУЗЮЗЏ [ЛчСј УтУГ = ЦїЕхФкИЎОЦ]...,2020-12-28,14,189000,193000,187500,189500,2410348,0.013369,1,
272,현대차,5380,매일경제,2021020518,"2월엔 그랜저 팔고 볼보XC60 사라…중고차 가격, 세단↑ SUV↓",http://news.mk.co.kr/newsRead.php?no=124283&ye...,\n\n\nРгПЕПѕАњ G4ЗКНКХЯ [ЛчСј СІАј=НжПыТї]\n С...,2021-02-08,18,238000,240500,228500,234000,3212977,-0.062124,-1,
467,현대차,5380,매일경제,2021031110,"'카니발 강적' 스타리아, '슈퍼맨 아빠' 이동국 탄다…정의선 회장 선물",http://news.mk.co.kr/newsRead.php?no=232045&ye...,"\n\n\nЧіДыТї НКХИИЎОЦ [ЛчСј СІАј=ЧіДыТї]\n""ФЋД...",2021-03-11,10,228000,232000,227500,228500,1154314,0.004396,1,


In [11]:
# Rows of `lg` whose 'Tokenization' value is missing
lg.loc[lg['Tokenization'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Tokenization
256,LG화학,51910,매일경제,2018042713,배출권시장협의회 출범식 개최,http://news.mk.co.kr/newsRead.php?year=2018&no...,"\n\n\nЧбБЙАХЗЁМвДТ 27РЯ М­ПяЛчПСПЁМ­ ШЏАцКЮ, Й...",2018-04-27,13,354000,360000,354000,354500,156968,0.002829,1,
784,LG화학,51910,매일경제,2020092109,"""신풍제약, 시가총액은 10조 원인데 영업이익은 20억 원? 미스터리""",http://news.mk.co.kr/newsRead.php?no=971416&ye...,½ÅÁ¾ ÄÚ·Î³ª¹ÙÀÌ·¯½º °¨¿°Áõ(ÄÚ·Î³ª19)À¸·Î Áõ½Ã°...,2020-09-21,9,656000,657000,626000,627000,1239438,-0.058559,-1,


In [12]:
# Rows of `sk` whose 'Tokenization' value is missing
sk.loc[sk['Tokenization'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Tokenization
1153,sk하이닉스,660,매일경제,2019112017,이시한의 2020년 취업트렌드코리아 #2 : Implement AI,http://edu.mk.co.kr/edunews/news_view.php?sc=5...,\n\n,2019-11-21,17,81700,82500,80500,80900,3346746,-0.021765,-1,


In [13]:
# Rows of `celltrion` whose 'Tokenization' value is missing
celltrion.loc[celltrion['Tokenization'].isna()]

Unnamed: 0,st_n,st_cd,news,datetime,title,url,text,date,time,Open,High,Low,Close,Volume,Change,UpDown,Tokenization
363,셀트리온,68270,아시아경제,2020060215,"[속보] 당국 ""셀트리온 항체치료제, 유럽서 임상 준비중…혈장치료제 우선 확보 목표""",https://view.asiae.co.kr/article/2020060215162...,,2020-06-03,15,227335,230772,220461,223407,1813508,0.013363,1,


In [14]:
# Remove missing values: dropna() with no arguments discards a row if ANY of its columns is NaN,
# not only 'Tokenization'. The index is then renumbered from 0.
# NOTE(review): if only untokenized rows should go, dropna(subset=['Tokenization']) would be narrower — confirm intent.
for corp_name in corp_list:
    globals()[corp_name] = globals()[corp_name].dropna().reset_index(drop=True)

In [15]:
# NOTE: the original comment called this an "iteration limit". sys.setrecursionlimit only raises
# the interpreter's recursion-depth limit; nothing in this cell recurses. The call is kept so the
# kernel state seen by later cells is unchanged.
limit_number = 15000
sys.setrecursionlimit(limit_number)

# Score columns added to every company frame, in insertion order, with the dtype each one gets.
SCORE_DTYPES = {
    'Positive_Score': 'int64',
    'Negative_Score': 'int64',
    'Ratio': 'float64',
    'Pred': 'int64',
    'NSI': 'float64',
}


def _score_tokens(tokenized, positive_words, negative_words):
    """Score one whitespace-separated token string against the two lexicons.

    Parameters
    ----------
    tokenized : str
        Tokens joined by whitespace; it is split exactly once.
    positive_words, negative_words : container of str
        Lexicons. A token found in `positive_words` is counted as positive only,
        even if it is also present in `negative_words`.

    Returns
    -------
    tuple
        (pos_score, neg_score, ratio, pred, nsi) where
        ratio = pos / (pos + neg), or 0.5 when no lexicon word was found;
        pred  = 1 if ratio >= 0.5 else -1 (so "no lexicon word" is predicted as positive);
        nsi   = (pos - neg) / (pos + neg) * 100 + 100, or 101 when no lexicon word was found.
    """
    pos_score = 0
    neg_score = 0
    for token in tokenized.split():
        if token in positive_words:
            pos_score += 1
        elif token in negative_words:
            neg_score += 1

    matched = pos_score + neg_score
    if matched == 0:
        ratio = 0.5   # neither list matched: treated as positive by the >= 0.5 rule below
        # NOTE(review): 101 (not 100, the value the formula gives for a balanced text) is what the
        # original code stored here — presumably a marker for "no lexicon words"; confirm.
        nsi = 101.0
    else:
        ratio = pos_score / matched
        nsi = (pos_score - neg_score) / matched * 100 + 100

    pred = 1 if ratio >= 0.5 else -1
    return pos_score, neg_score, ratio, pred, nsi


# Sets give constant-time membership tests; the lexicon lists themselves are left untouched.
positive_words = set(positive)
negative_words = set(negative)

for i, corp_name in enumerate(corp_list):
    corp_df = globals()[corp_name]

    # One result tuple per row, in row order.
    scores = [
        _score_tokens(tokenized, positive_words, negative_words)
        for tokenized in corp_df['Tokenization']
    ]
    score_table = pd.DataFrame(scores, columns=list(SCORE_DTYPES)).astype(SCORE_DTYPES)

    # Assign whole columns positionally. The previous element-wise chained assignment
    # (df['col'][j] = value) is not guaranteed to write into the frame and does nothing
    # under pandas Copy-on-Write.
    for column in SCORE_DTYPES:
        corp_df[column] = score_table[column].to_numpy()

    # Save as CSV
    corp_df.to_csv('../../../../Final Data/{}. {}_score.csv'.format(i+1, corp_name), index=False)

    corp = corp_name.upper()
    print("... {}'s Sentiment Score 계산 완료! ...".format(corp))

print('##### 전부 완료-! #####')

... SAMSUNG's Sentiment Score 계산 완료! ...
... HYUNDAI's Sentiment Score 계산 완료! ...
... LG's Sentiment Score 계산 완료! ...
... SK's Sentiment Score 계산 완료! ...
... CELLTRION's Sentiment Score 계산 완료! ...
##### 전부 완료-! #####


In [16]:
# Accuracy check: fraction of rows in which 'Pred' and 'UpDown' have the same sign.
# A row whose 'UpDown' is exactly 0 satisfies neither mask, so it counts as a miss.
for i, corp_name in enumerate(corp_list):
    corp = corp_name.upper()
    corp_df = globals()[corp_name]
    actual = corp_df['UpDown']
    predicted = corp_df['Pred']

    both_down = (actual < 0) & (predicted < 0)
    both_up = (actual > 0) & (predicted > 0)
    # int() keeps the division in plain Python numbers, as len(...) did before
    n_correct = int(both_down.sum()) + int(both_up.sum())

    # An empty frame has no defined accuracy; report NaN instead of raising ZeroDivisionError
    n_rows = len(corp_df)
    accuracy = n_correct / n_rows if n_rows else float('nan')

    print('<{}> Accuracy of {} : {}'.format(i+1, corp, accuracy))

<1> Acccuracy of SAMSUNG : 0.49452925798018266
<2> Acccuracy of HYUNDAI : 0.4659692543216177
<3> Acccuracy of LG : 0.5101088646967341
<4> Acccuracy of SK : 0.4948020581749449
<5> Acccuracy of CELLTRION : 0.49267223667990684
