In [119]:
import pandas as pd
import numpy as np
import re
import pickle
import requests
from bs4 import BeautifulSoup
from datetime import date
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from transformers import TextClassificationPipeline
from transformers import BertTokenizerFast
from transformers import TFBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [120]:
# LSTM tokenizer (pickled object; used later through tokenizer.texts_to_sequences).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so tokenizer.pickle must come from a trusted source.
with open('./src/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)  
# LSTM model
model = load_model('./src/model.h5')
# BERT tokenizer and model, both read from the ./src/bert directory.
loaded_tokenizer = BertTokenizerFast.from_pretrained('./src/bert', from_pt=True)
loaded_model = TFBertForSequenceClassification.from_pretrained('./src/bert', from_pt=True)
# return_all_scores=True is relied on by the scoring code, which indexes the
# pipeline result as classifier(text)[0][k]['score'] for k = 0 and 1.
classifier = TextClassificationPipeline(tokenizer=loaded_tokenizer, model=loaded_model,
                                            framework='tf', return_all_scores=True) 

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [121]:
def get_code(symbol):
    """Look up the KRX short code for a stock by its Korean short name.

    Reads ``./src/krx_code.csv``, indexes it by the ``한글 종목약명`` column and
    returns the ``단축코드`` cell of the row labelled ``symbol``.

    Args:
        symbol: Stock name exactly as written in the ``한글 종목약명`` column.

    Returns:
        The ``단축코드`` value for ``symbol``, or ``0`` (after printing a
        message) when the name is not present in the file.

    NOTE(review): pandas infers the column dtype, so purely numeric codes are
    returned as integers and lose any leading zeros — confirm this matches
    what the callers expect.
    """
    krx = pd.read_csv('./src/krx_code.csv').set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except KeyError:
        # Only a failed lookup means "unknown symbol". A bare ``except`` would
        # also swallow KeyboardInterrupt and unrelated programming errors.
        print('종목명을 다시 확인해주세요.')
        return 0

def _parse_board_row(row):
    """Split one board ``<tr>`` element into ``(date, title)``.

    Returns ``None`` when the date field cannot be read. ``title`` is ``None``
    for rows that are not plain posts (the 14-field layout is a reply; any
    other field count is treated as an unknown layout).
    """
    fields = row.text.split('\n')
    try:
        posted = fields[1].split()[0].replace('.', '-')
    except IndexError:
        return None
    # 13 newline-separated fields is the plain-post layout; field 3 is the title.
    title = fields[3] if len(fields) == 13 else None
    return posted, title


def get_comment(df,symbol):
    """Crawl Naver Finance discussion-board post titles for ``symbol``.

    Pages of ``https://finance.naver.com/item/board.naver`` are fetched in
    order, starting at page 1, until a row dated before the cutoff day is
    seen. The cutoff day is the ``날짜`` value of the first row of ``df``;
    dates are compared as ``YYYY-MM-DD`` strings.

    Args:
        df: DataFrame with a ``날짜`` column; only its first row is read.
        symbol: Korean short name of the stock, resolved with ``get_code``.

    Returns:
        A new DataFrame with columns ``날짜`` and ``댓글``, one row per plain
        post. It is empty when ``get_code`` could not resolve ``symbol``.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    code = get_code(symbol)
    # Positional access: the first row is used whatever the index labels are.
    day = df['날짜'].iloc[0]
    date_list = []
    raw_comment_list = []
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    # get_code returns the int 0 (and prints a message) for an unknown name;
    # crawling with that value would never reach the cutoff, so skip the loop.
    done = isinstance(code, int) and code == 0
    page = 1
    while not done:
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={page}'
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, headers = headers, timeout=10)
        bs = BeautifulSoup(res.text, 'html.parser')
        # Query the DOM once per page instead of once per row index.
        board = bs.find('div',{'class':'section inner_sub'})
        rows = board.find_all('tr',{'onmouseover':'mouseOver(this)'})[:20] if board else []
        if not rows:
            # No rows means there is nothing left to read (or the page layout
            # changed); stop instead of requesting pages forever.
            print(f'\n{page}페이지에서 게시글을 찾지 못해 크롤링을 중단합니다.')
            break
        for row in rows:
            parsed = _parse_board_row(row)
            if parsed is not None:
                posted, title = parsed
                if day > posted:
                    done = True
                    break
                if title is not None:
                    date_list.append(posted)
                    raw_comment_list.append(title)
            print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링중..',end='')
        page += 1
    print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링완료')
    result = pd.DataFrame()
    result['날짜'] = date_list
    result['댓글'] = raw_comment_list
    return result

def BERT_feargreed(df,symbol):
    """Crawl comments for ``symbol`` and attach a BERT-based score column.

    Each comment returned by ``get_comment`` is passed to ``classifier``. With
    ``s0`` and ``s1`` the scores of the first and second label, the stored
    value is ``1 - s0`` when ``s0 >= s1`` and ``s1`` otherwise.

    Args:
        df: Frame forwarded to ``get_comment``.
        symbol: Stock name forwarded to ``get_comment``.

    Returns:
        The crawled comment frame with an added ``BERT`` column.
    """
    comments = get_comment(df, symbol)
    scores = []
    for text in comments['댓글'].to_list():
        label_scores = classifier(text)[0]
        s0 = label_scores[0]['score']
        s1 = label_scores[1]['score']
        scores.append(1 - s0 if s0 >= s1 else s1)
        print(f'\rBERT 댓글{len(scores)}개 분석중..',end='')
    comments['BERT'] = scores
    return comments

def konlpy_okt(df,symbol):
    """Run the BERT stage, then tokenize every comment with Okt.

    Comments are POS-tagged with ``stem=True``; only tokens tagged Noun,
    Verb, Adjective or VerbPrefix are kept, and each kept token is wrapped
    in single quotes before being stored.

    Args:
        df: Frame forwarded to ``BERT_feargreed``.
        symbol: Stock name forwarded to ``BERT_feargreed``.

    Returns:
        The frame from ``BERT_feargreed`` with an ``LSTM`` column holding, per
        row, the list of quoted tokens.
    """
    scored = BERT_feargreed(df, symbol)
    analyzer = Okt()
    kept_tags = ['Noun','Verb','Adjective','VerbPrefix']
    quoted_tokens = []
    for sentence in scored['댓글'].to_list():
        tagged = analyzer.pos(sentence, stem=True)
        quoted_tokens.append(
            ["'" + word + "'" for word, tag in tagged if tag in kept_tags]
        )
    scored['LSTM'] = quoted_tokens
    return scored
    
def feargreed_index(df,symbol):
    """Run the full pipeline for ``symbol`` and return per-comment scores.

    The token lists produced by ``konlpy_okt`` are converted to integer
    sequences with the module-level ``tokenizer``, padded to length 15, and
    fed to the module-level ``model``. The predictions overwrite the ``LSTM``
    column and are rounded to 6 decimals.

    Args:
        df: Frame forwarded to ``konlpy_okt``.
        symbol: Stock name forwarded to ``konlpy_okt``.

    Returns:
        The frame from ``konlpy_okt`` with ``LSTM`` replaced by model output.
    """
    scored = konlpy_okt(df, symbol)
    sequences = tokenizer.texts_to_sequences(scored['LSTM'].to_list())
    padded = pad_sequences(sequences, maxlen=15)
    scored['LSTM'] = model.predict(padded)
    scored['LSTM'] = scored['LSTM'].round(6)
    return scored

In [129]:
# Incrementally update the stored NAVER scores: crawl and score comments dated
# from the first stored row's date onward, then merge them with the stored rows.
# This call needs the two-argument feargreed_index(df, symbol) definition.
df = pd.read_csv('./src/naver_score_0501.csv')
df2 = feargreed_index(df,'NAVER')
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0. New rows come first; keep='last' keeps the stored row
# whenever the same (날짜, 댓글) pair appears in both frames.
df_naver = pd.concat([df2, df]).drop_duplicates(subset=['날짜','댓글'],keep='last')
df_naver = df_naver.reset_index(drop=True)
df_naver.to_csv('./src/naver_score_0501.csv',index=False)
df_naver

2022-05-27 댓글81개 크롤링완료.
BERT 댓글81개 분석중..

Unnamed: 0,날짜,댓글,BERT,LSTM
0,2022-05-27,곧 쏘겠네,0.034122,0.017184
1,2022-05-27,주시 하고 계시는게 좋겠습니다,0.852904,1.000000
2,2022-05-27,카카오는 종가 올리고,0.859154,1.000000
3,2022-05-27,네이버는 ‘보’합을 좋아해~,0.553982,0.402284
4,2022-05-27,오늘 종가는 271000원이다,0.423871,0.004942
...,...,...,...,...
3101,2022-05-01,많은 전문가들 월요일부터,0.806393,0.555548
3102,2022-05-01,15플로 물림.27까지 촉수엄금,0.190780,0.000000
3103,2022-05-01,네이버 종목,0.408528,0.064982
3104,2022-05-01,[아젠다의노래가아닌 우리의노래로이노랠부르...,0.476696,0.006255


In [133]:
# Load the two BERT-scored Kakao files and stack them, 0501 file first.
df1 = pd.read_csv('./src/kakao_0401_BERT.csv')
df2 = pd.read_csv('./src/kakao_0501_BERT.csv')
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0. Row order and the original index labels are kept.
df = pd.concat([df2, df1])
df

Unnamed: 0,날짜,댓글,BERT
0,2022-05-26,외국인 기관 프로그램 매수다,0.621421
1,2022-05-26,이게 이제야나오네..,0.425924
2,2022-05-26,자회사 관계사 또 분할상장해서 5만원까지...,0.053937
3,2022-05-26,카카오에서,0.556082
4,2022-05-26,앵두형님이 바닥이란다,0.152784
...,...,...,...
4730,2022-04-01,카카오는 계열사분할로 망한다,0.109831
4731,2022-04-01,"앵두 방송준비중,,..",0.115401
4732,2022-04-01,딱보니,0.225211
4733,2022-04-01,105000에서 절대안사짐!,0.304911


In [135]:
# Score the combined frame with the LSTM model and store the result as a column.
# NOTE(review): this call passes only `df`, so the one-argument
# feargreed_index(df) definition must be the active one in the kernel; the
# two-argument feargreed_index(df, symbol) would raise TypeError here.
df['LSTM'] = feargreed_index(df)
df['LSTM'] = df['LSTM'].round(6)
# Replace the index labels with a fresh 0..n-1 range.
df = df.reset_index(drop=True)
df

9931개 형태소분리중

Unnamed: 0,날짜,댓글,BERT,LSTM
0,2022-05-26,외국인 기관 프로그램 매수다,0.621421,1.000000
1,2022-05-26,이게 이제야나오네..,0.425924,0.010115
2,2022-05-26,자회사 관계사 또 분할상장해서 5만원까지...,0.053937,1.000000
3,2022-05-26,카카오에서,0.556082,0.004687
4,2022-05-26,앵두형님이 바닥이란다,0.152784,0.000000
...,...,...,...,...
9926,2022-04-01,카카오는 계열사분할로 망한다,0.109831,0.000000
9927,2022-04-01,"앵두 방송준비중,,..",0.115401,0.075082
9928,2022-04-01,딱보니,0.225211,0.001256
9929,2022-04-01,105000에서 절대안사짐!,0.304911,0.036745


In [137]:
# Write the scored frame to CSV; index=False leaves the row index out of the file.
df.to_csv('./src/kakao_score_0401.csv',index=False)

In [134]:
def konlpy_okt(df):
    """Tokenize the ``댓글`` column of ``df`` with Okt.

    Every comment is converted with ``str()`` and POS-tagged with
    ``stem=True``. Only tokens tagged Noun, Verb, Adjective or VerbPrefix are
    kept, and each kept token is wrapped in single quotes. A progress counter
    is printed after every comment.

    Args:
        df: DataFrame with a ``댓글`` column.

    Returns:
        A list with one list of quoted tokens per row of ``df``.
    """
    analyzer = Okt()
    kept_tags = ['Noun','Verb','Adjective','VerbPrefix']
    quoted_tokens = []
    for position, comment in enumerate(df['댓글'].to_list(), start=1):
        tagged = analyzer.pos(str(comment), stem=True)
        quoted_tokens.append(
            ["'" + word + "'" for word, tag in tagged if tag in kept_tags]
        )
        print(f'\r{position}개 형태소분리중',end='')
    return quoted_tokens
    
def tokenize(df):
    """Turn the comments in ``df`` into padded integer sequences.

    The token lists from ``konlpy_okt`` are mapped to integer sequences by the
    module-level ``tokenizer`` and padded with ``maxlen=15``.

    Args:
        df: DataFrame with a ``댓글`` column.

    Returns:
        The array produced by ``pad_sequences``.
    """
    sequences = tokenizer.texts_to_sequences(konlpy_okt(df))
    return pad_sequences(sequences, maxlen=15)

def feargreed_index(df):
    """Return the LSTM model's predictions for the comments in ``df``.

    Args:
        df: DataFrame with a ``댓글`` column.

    Returns:
        The output of ``model.predict`` on the padded sequences from
        ``tokenize``.
    """
    return model.predict(tokenize(df))

In [61]:
# Store the LSTM predictions as a column; needs the one-argument feargreed_index(df).
df['LSTM'] = feargreed_index(df)

2952개 형태소분리중

In [62]:
# Round the stored LSTM scores to 6 decimal places.
df['LSTM'] = df['LSTM'].round(6)