In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import requests
from bs4 import BeautifulSoup
from datetime import date
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from transformers import TextClassificationPipeline
from transformers import BertTokenizerFast
from transformers import TFBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [2]:
# LSTM tokenizer: the fitted Keras tokenizer that feargreed_index() uses to turn
# token lists into integer sequences.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project.
with open('./src/lstm/goodtokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)  
# LSTM model: called through model.predict() in feargreed_index().
model = load_model('./src/lstm/goodmodel.h5')
# BERT tokenizer and model, both read from the same ./src/bert directory.
# from_pt=True asks transformers to build the TF model from PyTorch weights
# (the log printed below this cell confirms the conversion).
loaded_tokenizer = BertTokenizerFast.from_pretrained('./src/bert', from_pt=True)
loaded_model = TFBertForSequenceClassification.from_pretrained('./src/bert', from_pt=True)
# return_all_scores=True makes the pipeline return one score per label;
# BERT_feargreed() reads the scores at positions 0 and 1 of that list.
classifier = TextClassificationPipeline(tokenizer=loaded_tokenizer, model=loaded_model,
                                            framework='tf', return_all_scores=True) 

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [5]:
def read_comment_csv():
    """Load the stored comment tables for both stocks.

    Returns:
        tuple: ``(df_naver, df_kakao)``, the DataFrames read from
        ``feargreed_naver.csv`` and ``feargreed_kakao.csv`` (in that order)
        under ``./streamlit/data``.
    """
    csv_paths = (
        './streamlit/data/feargreed_naver.csv',
        './streamlit/data/feargreed_kakao.csv',
    )
    # Read in declaration order: NAVER first, then Kakao.
    naver_frame, kakao_frame = [pd.read_csv(csv_path) for csv_path in csv_paths]
    return naver_frame, kakao_frame

def get_code(symbol):
    """Map a stock name to the ticker code used in the board URL.

    Args:
        symbol: Stock name. ``'카카오'`` selects the Kakao code; any other
            value falls back to the NAVER code.

    Returns:
        str: ``'035720'`` for Kakao, ``'035420'`` (NAVER) otherwise.
    """
    # '035720' is Kakao; '035420' is NAVER and doubles as the default.
    return '035720' if symbol == '카카오' else '035420'

def get_comment(df, symbol, max_pages=None):
    """Crawl recent posts from the Naver Finance board of one stock.

    Board pages are fetched from page 1 onward and scanned until a post dated
    earlier than the first date stored in ``df`` appears. Posts dated on that
    cut-off day itself are collected again.

    Args:
        df: Previously stored comments. Only the first value of the ``'날짜'``
            column is read; it is compared as a string with the board dates
            after their ``.`` separators are replaced by ``-``, so both sides
            must share the same layout (the stored CSV shows ``YYYY-MM-DD``).
        symbol: Stock name, translated to a ticker code by ``get_code``.
        max_pages: Optional upper bound on the number of pages to request.
            ``None`` (the default) keeps the crawl bounded only by the
            cut-off date and by the first page that has no post rows.

    Returns:
        pandas.DataFrame: A new frame with the columns ``'날짜'`` (post date)
        and ``'댓글'`` (post text), in the order the board lists them.

    Raises:
        requests.RequestException: If a page cannot be fetched within the
            timeout or the server answers with an HTTP error status.
    """
    code = get_code(symbol)
    # Positional access: the cut-off is the first stored row regardless of index labels.
    day = df['날짜'].iloc[0]
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    date_list = []
    raw_comment_list = []
    reached_cutoff = False
    page = 1
    while not reached_cutoff and (max_pages is None or page <= max_pages):
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={page}'
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        bs = BeautifulSoup(res.text, 'html.parser')
        # Look the post rows up once per page instead of once per row.
        section = bs.find('div', {'class': 'section inner_sub'})
        rows = [] if section is None else section.find_all('tr', {'onmouseover': 'mouseOver(this)'})
        if not rows:
            # No post rows (blocked request, changed layout, end of board):
            # the cut-off can never be reached from here, so stop paging
            # rather than requesting pages forever.
            break
        for row in rows[:20]:
            root = row.text.split('\n')
            try:
                posted = root[1].split()[0].replace('.', '-')
            except IndexError:
                # Row without a readable date cell: skip it.
                continue
            if day > posted:
                reached_cutoff = True
                break
            # 13 fields: a regular post (text at index 3).
            # 14 fields: a reply, skipped. Any other length is ignored as malformed.
            if len(root) == 13:
                date_list.append(posted)
                raw_comment_list.append(root[3])
            print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링중..',end='')
        page += 1
    print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링완료')
    return pd.DataFrame({'날짜': date_list, '댓글': raw_comment_list})

def BERT_feargreed(df,symbol):
    """Crawl fresh comments and attach a BERT-derived score to each one.

    Each comment goes through the module-level ``classifier`` pipeline. The
    scores at positions 0 and 1 of its first result are compared: if the
    first is at least as large as the second the stored value is
    ``1 - first``, otherwise it is the second score.

    Args:
        df: Previously stored comments, forwarded to ``get_comment``.
        symbol: Stock name, forwarded to ``get_comment``.

    Returns:
        pandas.DataFrame: The crawled frame with an added ``'BERT'`` column.
    """
    df = get_comment(df,symbol)
    scores = []
    for text in df['댓글'].to_list():
        label_scores = classifier(text)[0]
        first_score = label_scores[0]['score']
        second_score = label_scores[1]['score']
        scores.append(1-first_score if first_score >= second_score else second_score)
        # Carriage return keeps the progress counter on a single line.
        print(f'\rBERT 댓글{len(scores)}개 분석중..',end='')
    df['BERT'] = scores
    print('BERT분석 완료.')
    return df

def konlpy_okt(df,symbol):
    """Tokenize every crawled comment with Okt and store the kept tokens.

    Runs ``BERT_feargreed`` first, then part-of-speech tags each comment
    (with stemming) and keeps only nouns, verbs, adjectives and verb
    prefixes. Every kept token is wrapped in single quotes before storage.

    Args:
        df: Previously stored comments, forwarded to ``BERT_feargreed``.
        symbol: Stock name, forwarded to ``BERT_feargreed``.

    Returns:
        pandas.DataFrame: The scored frame with an ``'LSTM'`` column that
        holds one list of quoted tokens per comment.
    """
    df = BERT_feargreed(df,symbol)
    okt = Okt()
    kept_tags = ['Noun','Verb','Adjective','VerbPrefix']
    quoted_tokens = []
    for sentence in df['댓글'].to_list():
        # NOTE(review): the surrounding quotes presumably mirror how the
        # pickled tokenizer's vocabulary was built; confirm against training code.
        quoted_tokens.append([
            "'"+word+"'"
            for word, tag in okt.pos(sentence, stem=True)
            if tag in kept_tags
        ])
    df['LSTM'] = quoted_tokens
    return df
    
def feargreed_index(df,symbol):
    """Score the crawled comments with the LSTM model.

    Runs ``konlpy_okt`` (crawl, BERT score, tokenization), converts the token
    lists in ``'LSTM'`` to padded integer sequences with the module-level
    ``tokenizer``, and replaces that column with the output of
    ``model.predict`` rounded to six decimals.

    Args:
        df: Previously stored comments, forwarded to ``konlpy_okt``.
        symbol: Stock name, forwarded to ``konlpy_okt``.

    Returns:
        pandas.DataFrame: The crawled frame with a numeric ``'LSTM'`` column.
        When the crawl produced no rows the column is present but empty.
    """
    df = konlpy_okt(df,symbol)
    if df.empty:
        # Keras refuses to predict on a zero-row batch, so a crawl that found
        # nothing would crash the whole update. Hand back an empty numeric
        # column instead; the caller can still merge and save.
        df['LSTM'] = pd.Series(dtype='float64')
        print('LSTM분석 완료.')
        return df
    sequences = tokenizer.texts_to_sequences(df['LSTM'].to_list())
    # NOTE(review): maxlen=15 presumably matches the sequence length the model
    # was trained with; confirm against the training notebook.
    padded = pad_sequences(sequences, maxlen=15)
    pred = model.predict(padded)
    df['LSTM'] = pred
    df['LSTM'] = df['LSTM'].round(6)
    print('LSTM분석 완료.')
    return df

def update_comment():
    """Refresh the stored comment tables for NAVER and Kakao.

    For each stock the stored CSV is read, new posts are crawled and scored
    through ``feargreed_index``, and the new rows are placed in front of the
    stored ones. Rows that share both ``'날짜'`` and ``'댓글'`` are collapsed,
    keeping the last occurrence. Both CSV files are rewritten only after both
    stocks have been processed.

    Returns:
        tuple: ``(df_naver, df_kakao)``, the merged DataFrames that were
        written to disk.
    """
    df_naver, df_kakao = read_comment_csv()

    merged = {}
    for symbol, stored in (('NAVER', df_naver), ('카카오', df_kakao)):
        print(f'{symbol} 댓글 갱신중...')
        fresh = feargreed_index(stored, symbol)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed;
        # the row order (fresh rows first) is the same as before.
        merged[symbol] = (
            pd.concat([fresh, stored])
            .drop_duplicates(subset=['날짜','댓글'],keep='last')
            .reset_index(drop=True)
        )
        print(f'{symbol} 댓글 갱신완료.')

    df_naver, df_kakao = merged['NAVER'], merged['카카오']
    df_naver.to_csv('./streamlit/data/feargreed_naver.csv',index=False)
    df_kakao.to_csv('./streamlit/data/feargreed_kakao.csv',index=False)
    return df_naver, df_kakao

In [6]:
# Run the full refresh: crawl new posts for both stocks, score them with BERT
# and the LSTM, and rewrite both CSV files. The returned (df_naver, df_kakao)
# tuple is echoed as this cell's output.
update_comment()

NAVER 댓글 갱신중...
2022-06-07 댓글104개 크롤링완료.
BERT 댓글104개 분석중..BERT분석 완료.
LSTM분석 완료.
NAVER 댓글 갱신완료.
카카오 댓글 갱신중...
2022-06-07 댓글313개 크롤링완료.
BERT 댓글313개 분석중..BERT분석 완료.
LSTM분석 완료.
카카오 댓글 갱신완료.


(               날짜                   댓글      BERT      LSTM
 0      2022-06-07          오늘은 왜 내렸어요?  0.255915  0.000000
 1      2022-06-07                   코닥  0.088301  0.711319
 2      2022-06-07            어디를 뺐다는거야  0.101173  0.045362
 3      2022-06-07       분할하던가 배당을 늘리던가  0.085969  0.994658
 4      2022-06-07                 x수여니  0.801379  0.109405
 ...           ...                  ...       ...       ...
 32651  2021-06-01              네이버 ㅋㅋㅋ  0.404388  0.014867
 32652  2021-06-01         역시 네이버가 갑 !!  0.803849  0.392791
 32653  2021-06-01  공매도 관련 전쟁 동영상 (한투연)  0.030869  0.000000
 32654  2021-06-01         네이버 분들 오세요!!  0.690160  0.119831
 32655  2021-06-01         네이버 주가예상 가즈아  0.617649  1.000000
 
 [32656 rows x 4 columns],
                 날짜                         댓글      BERT      LSTM
 0       2022-06-07                    일찍 좀 볼걸  0.295157  0.811517
 1       2022-06-07       한 3만5천원이면 사볼만하긴 한데..  0.117217  0.856992
 2       2022-06-07           어차피 자회사 무한 상장 ㅋㅋ  0.

In [11]:
# df_naver = pd.read_csv('./streamlit/data/feargreed_naver.csv')
# df_kakao = pd.read_csv('./streamlit/data/feargreed_kakao.csv')
# df_naver = df_naver[::-1].reset_index(drop=True)
# df_kakao = df_kakao[::-1].reset_index(drop=True)
# df_naver.to_csv('./streamlit/data/feargreed_naver_rev.csv',index=False)
# df_kakao.to_csv('./streamlit/data/feargreed_kakao_rev.csv',index=False)  