In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import requests
from bs4 import BeautifulSoup
from datetime import date
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from transformers import TextClassificationPipeline
from transformers import BertTokenizerFast
from transformers import TFBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [2]:
# LSTM tokenizer (restored from a pickle file).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('./src/lstm/goodtokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)  
# LSTM model
model = load_model('./src/lstm/goodmodel.h5')
# BERT tokenizer and model, both read from ./src/bert
# (from_pt=True asks transformers to convert the PyTorch checkpoint to TensorFlow weights)
loaded_tokenizer = BertTokenizerFast.from_pretrained('./src/bert', from_pt=True)
loaded_model = TFBertForSequenceClassification.from_pretrained('./src/bert', from_pt=True)
# return_all_scores=True makes the pipeline return a score for every label, not just the top one
classifier = TextClassificationPipeline(tokenizer=loaded_tokenizer, model=loaded_model,
                                            framework='tf', return_all_scores=True) 

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [3]:
def get_code(symbol, krx_path='./src/krx_code.csv'):
    """Look up the KRX short code for a Korean stock name.

    Args:
        symbol: Stock name as it appears in the '한글 종목약명' column.
        krx_path: CSV file holding the listing table. Defaults to the path
            this notebook has always read from.

    Returns:
        The '단축코드' value stored for ``symbol``, or 0 (after printing a hint)
        when the name cannot be looked up.
    """
    # NOTE(review): read_csv infers dtypes, so an all-digit code column is parsed
    # as integers and any leading zeros are lost — confirm callers tolerate that.
    krx = pd.read_csv(krx_path).set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except (KeyError, TypeError, ValueError):
        # Only lookup failures map to the 0 sentinel; a bare `except:` would also
        # swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        print('종목명을 다시 확인해주세요.')
        return 0

def _parse_board_row(row_text):
    """Split the text of one board row into (post date, title or None).

    The date is the first whitespace-separated token of the second line with
    '.' replaced by '-'. The title is the fourth line, returned only when the
    row splits into exactly 13 lines (a regular post); rows of any other
    length, such as 14-line replies, yield None. Raises IndexError when the
    row does not have the expected layout.
    """
    fields = row_text.split('\n')
    posted = fields[1].split()[0].replace('.', '-')
    title = fields[3] if len(fields) == 13 else None
    return posted, title


def get_comment(df, symbol, timeout=10, max_pages=None):
    """Crawl Naver Finance board titles posted on or after the first date in ``df``.

    Args:
        df: DataFrame whose '날짜' column starts with the cutoff date; it is
            compared as a string against the 'YYYY-MM-DD' dates of the posts.
        symbol: Stock name, resolved to a code through ``get_code``.
        timeout: Seconds to wait for each HTTP request.
        max_pages: Optional cap on the number of board pages to fetch;
            None means no cap.

    Returns:
        A new DataFrame with columns '날짜' (post date) and '댓글' (post title).
        It is empty when the symbol cannot be resolved or nothing is found.

    Raises:
        IndexError: if ``df`` has no rows.
        requests.exceptions.RequestException: if a request fails or times out.
    """
    code = get_code(symbol)
    # Positional access, so the cutoff does not depend on the index containing label 0.
    day = str(df['날짜'].iloc[0])
    date_list = []
    raw_comment_list = []

    # get_code signals a failed lookup with the plain int 0; crawling with that
    # code could never reach the cutoff date and would loop forever.
    if isinstance(code, int) and code == 0:
        return pd.DataFrame({'날짜': date_list, '댓글': raw_comment_list})

    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    page = 1
    previous_rows = None
    reached_cutoff = False
    while not reached_cutoff and (max_pages is None or page <= max_pages):
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={page}'
        res = requests.get(url, headers=headers, timeout=timeout)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Locate the rows once per page instead of once per row.
        section = bs.find('div', {'class': 'section inner_sub'})
        rows = [] if section is None else section.find_all('tr', {'onmouseover': 'mouseOver(this)'})[:20]
        row_texts = [row.text for row in rows]

        # No rows (error page, past the last page) or the same rows as the
        # previous page means further paging cannot make progress.
        if not row_texts or row_texts == previous_rows:
            break
        previous_rows = row_texts

        for row_text in row_texts:
            try:
                posted, title = _parse_board_row(row_text)
            except IndexError:
                # Malformed row: skip it, as the original best-effort parsing did.
                continue
            if day > posted:
                reached_cutoff = True
                break
            if title is not None:
                date_list.append(posted)
                raw_comment_list.append(title)
        print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링중..',end='')
        page += 1

    print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링완료')
    return pd.DataFrame({'날짜': date_list, '댓글': raw_comment_list})

def BERT_feargreed(df,symbol):
    """Crawl fresh board posts for ``symbol`` and attach a BERT score to each.

    The posts come from ``get_comment(df, symbol)``. For every post the
    classifier returns one score per label; the stored value is the second
    label's score when it is the larger one, otherwise one minus the first
    label's score.
    # NOTE(review): presumably label 0 is "fear" and label 1 is "greed" — confirm
    # against the fine-tuned model's label mapping.

    Returns the crawled DataFrame with an added 'BERT' column.
    """
    df = get_comment(df, symbol)
    bert_scores = []
    for done, text in enumerate(df['댓글'].to_list(), start=1):
        label_scores = classifier(text)[0]
        first = label_scores[0]['score']
        second = label_scores[1]['score']
        bert_scores.append(1 - first if first >= second else second)
        # `done` equals the number of scores collected so far.
        print(f'\rBERT 댓글{done}개 분석중..', end='')
    df['BERT'] = bert_scores
    return df

def konlpy_okt(df,symbol):
    """Run the BERT stage, then tokenize every post with Okt for the LSTM stage.

    Each post is POS-tagged with stemming; only nouns, verbs, adjectives and
    verb prefixes are kept, and every kept token is wrapped in single quotes.
    The per-post token lists are stored in the 'LSTM' column.
    # NOTE(review): the quoting presumably mirrors how the pickled tokenizer's
    # vocabulary was built — confirm before changing it.

    Returns the DataFrame from ``BERT_feargreed`` with the added 'LSTM' column.
    """
    df = BERT_feargreed(df, symbol)
    analyzer = Okt()
    kept_tags = ('Noun', 'Verb', 'Adjective', 'VerbPrefix')
    quoted_tokens = [
        ["'" + word + "'"
         for word, tag in analyzer.pos(text, stem=True)
         if tag in kept_tags]
        for text in df['댓글'].to_list()
    ]
    df['LSTM'] = quoted_tokens
    return df
    
def feargreed_index(df,symbol):
    """Run the full pipeline for ``symbol`` and return the scored posts.

    Calls ``konlpy_okt`` (crawl, BERT score, tokenization), converts the token
    lists into integer sequences padded to length 15, and replaces the 'LSTM'
    column with the LSTM model's predictions rounded to six decimals.
    """
    df = konlpy_okt(df, symbol)
    sequences = tokenizer.texts_to_sequences(df['LSTM'].to_list())
    padded = pad_sequences(sequences, maxlen=15)
    # Overwrite the token lists with the model output, then round in a second step.
    df['LSTM'] = model.predict(padded)
    df['LSTM'] = df['LSTM'].round(6)
    return df

In [6]:
# Refresh the stored NAVER scores: score newly crawled posts, merge them with the
# existing file, and write the result back to the same CSV.
df = pd.read_csv('./streamlit/data/feargreed_naver.csv')
df2 = feargreed_index(df,'NAVER')
# DataFrame.append was removed in pandas 2.0; pd.concat([new, old]) gives the same
# row order. keep='last' means an already stored row wins over a re-crawled duplicate.
df_naver = pd.concat([df2, df]).drop_duplicates(subset=['날짜','댓글'],keep='last')
df_naver = df_naver.reset_index(drop=True)
df_naver.to_csv('./streamlit/data/feargreed_naver.csv',index=False)
df_naver

2022-06-04 댓글17개 크롤링완료.
BERT 댓글17개 분석중..

Unnamed: 0,날짜,댓글,BERT,LSTM
0,2022-06-04,증권사리포트 적정가,0.393901,0.989159
1,2022-06-04,도대체왜 걸어다니면서 담배피는거냐?니들은...,0.055868,0.229988
2,2022-06-04,ㄲ ㅓ어어어어억 ㅋㅋ 금요일날포식 ㅋㅋㅋ,0.901422,0.005606
3,2022-06-04,대주전문가 김대주입니다,0.755341,0.876730
4,2022-06-04,안티들이 아직 많다는건?,0.300678,0.000000
...,...,...,...,...
32510,2021-06-01,네이버 ㅋㅋㅋ,0.404388,0.014867
32511,2021-06-01,역시 네이버가 갑 !!,0.803849,0.392791
32512,2021-06-01,공매도 관련 전쟁 동영상 (한투연),0.030869,0.000000
32513,2021-06-01,네이버 분들 오세요!!,0.690160,0.119831


In [None]:
# Refresh the stored Kakao scores: score newly crawled posts, merge them with the
# existing file, and write the result back to the same CSV.
df = pd.read_csv('./streamlit/data/feargreed_kakao.csv')
df2 = feargreed_index(df,'카카오')
# DataFrame.append was removed in pandas 2.0; pd.concat([new, old]) gives the same
# row order. keep='last' means an already stored row wins over a re-crawled duplicate.
df_kakao = pd.concat([df2, df]).drop_duplicates(subset=['날짜','댓글'],keep='last')
df_kakao = df_kakao.reset_index(drop=True)
df_kakao.to_csv('./streamlit/data/feargreed_kakao.csv',index=False)
df_kakao

2022-06-04 댓글29개 크롤링완료.
BERT 댓글16개 분석중..

In [6]:
# def konlpy_okt(df):
#     okt = Okt()
#     tag_list = ['Noun','Verb','Adjective','VerbPrefix'] 
#     comment_list = df['댓글'].to_list()
#     tokenized_data = []
#     for i in range(len(comment_list)):
#         tokenized_sentence = okt.pos(str(comment_list[i]), stem=True) 
#         tag_checked_sentence = []
#         for j in tokenized_sentence:
#             x,y = j
#             if y in tag_list:
#                 tag_checked_sentence.append(x)
#         tokenized_data.append(tag_checked_sentence)   
#         print(f'\r{i+1}개 형태소분리중',end='')
#     for i in tokenized_data:
#         for j in range(len(i)):
#             i[j] = "'"+i[j]+"'"
#     return tokenized_data
    
# def tokenize(df):
#     tokenized_data = konlpy_okt(df)
#     test = tokenizer.texts_to_sequences(tokenized_data)
#     test = pad_sequences(test, maxlen=15)
#     return test

# def feargreed_indexx(df): 
#     test = tokenize(df)
#     pred = model.predict(test)
#     return pred

In [7]:
# df['LSTM'] = feargreed_indexx(df)
# df['LSTM'] = df['LSTM'].round(6)
# df = df.reset_index(drop=True)
# df

In [16]:
# Load the saved NAVER scores and show them with the LSTM column rounded to six decimals.
df = pd.read_csv('./src/score_naver.csv').assign(
    LSTM=lambda scores: scores['LSTM'].round(6)
)
df

Unnamed: 0,날짜,댓글,BERT,LSTM
0,2022-06-02,강보합 가즈아,0.841521,1.000000
1,2022-06-02,NAVER 최저점262500원으로 제시...,0.011697,0.727272
2,2022-06-02,루나 충격 코인 시장 어디로 가나…오늘 ...,0.165881,0.222555
3,2022-06-02,하락장에서도 버티라,0.170349,0.000000
4,2022-06-02,'동반 추락' 네이버·카카오는 '바겐세일...,0.294569,0.000001
...,...,...,...,...
32291,2021-06-01,네이버 ㅋㅋㅋ,0.404388,0.014867
32292,2021-06-01,역시 네이버가 갑 !!,0.803849,0.392791
32293,2021-06-01,공매도 관련 전쟁 동영상 (한투연),0.030869,0.000000
32294,2021-06-01,네이버 분들 오세요!!,0.690160,0.119831
