In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import requests
from bs4 import BeautifulSoup
from datetime import date
from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from transformers import TextClassificationPipeline
from transformers import BertTokenizerFast
from transformers import TFBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the inference artifacts once; the functions defined in later cells read
# `tokenizer`, `model` and `classifier` as module-level globals.

# LSTM tokenizer
# NOTE: pickle.load executes arbitrary code embedded in the file — only load a
# tokenizer.pickle that was produced by this project.
with open('./src/lstm/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)  
# LSTM model
model = load_model('./src/lstm/model.h5')
# BERT tokenizer and model
# NOTE(review): from_pt is a model-weight loading option; it presumably has no
# effect on the tokenizer call — confirm and drop it there if so.
loaded_tokenizer = BertTokenizerFast.from_pretrained('./src/bert', from_pt=True)
loaded_model = TFBertForSequenceClassification.from_pretrained('./src/bert', from_pt=True)
# return_all_scores=True makes the pipeline return one score per label for each
# input; BERT_feargreed indexes the first two of them.
# NOTE(review): newer transformers releases appear to deprecate
# return_all_scores in favour of top_k — confirm against the installed version.
classifier = TextClassificationPipeline(tokenizer=loaded_tokenizer, model=loaded_model,
                                            framework='tf', return_all_scores=True) 

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [3]:
def get_code(symbol):
    """Look up the KRX short code for a stock name.

    Reads ./src/krx_code.csv on every call and indexes it by the
    '한글 종목약명' column.

    Args:
        symbol: Stock name exactly as it appears in the '한글 종목약명' column.

    Returns:
        The '단축코드' value for ``symbol`` as a string, or the integer 0
        (after printing a message) when the name cannot be looked up.
    """
    # Read the code column as text: codes such as 035420 start with a zero,
    # which is lost if pandas infers the column as an integer.
    krx = pd.read_csv('./src/krx_code.csv', dtype={'단축코드': str})
    krx = krx.set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except (KeyError, TypeError):
        # KeyError: unknown name (or missing column); TypeError: unhashable key.
        print('종목명을 다시 확인해주세요.')
        return 0

def get_comment(df,symbol):
    """Crawl Naver Finance discussion-board posts for ``symbol``.

    Board pages are requested starting from page 1. Rows are collected until a
    post dated earlier than the first '날짜' value of ``df`` is seen. Crawling
    also stops when a page yields no rows or repeats the previous page, so a
    bad code or a blocked request can no longer loop forever.

    Args:
        df: Existing score frame; only the first value of its '날짜' column is
            read (compared as a 'YYYY-MM-DD' string).
        symbol: Stock name passed to get_code().

    Returns:
        A new DataFrame with columns '날짜' (post date, 'YYYY-MM-DD') and
        '댓글' (post text). It is empty when get_code() reports an unknown
        name or when nothing was collected.
    """
    code = get_code(symbol)
    if isinstance(code, int) and code == 0:
        # get_code() returns 0 for an unknown name; there is nothing to crawl.
        return pd.DataFrame({'날짜': [], '댓글': []})

    day = df['날짜'].iloc[0]  # first row of the existing file, by position
    date_list = []
    raw_comment_list = []
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}

    page = 1
    previous_rows = None
    reached_older_post = False
    while not reached_older_post:
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={page}'
        res = requests.get(url, headers=headers, timeout=10)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Parse the board once per page instead of once per row.
        board = bs.find('div',{'class':'section inner_sub'})
        rows = [] if board is None else board.find_all('tr',{'onmouseover':'mouseOver(this)'})
        row_texts = [row.text for row in rows[:20]]

        # No rows: invalid code, blocked request, or past the last page.
        # Identical to the previous page: the server is repeating itself.
        if not row_texts or row_texts == previous_rows:
            break
        previous_rows = row_texts

        for text in row_texts:
            root = text.split('\n')
            try:
                posted = root[1].split()[0].replace('.','-')
            except IndexError:
                continue  # malformed row without a date field
            if day > posted:
                reached_older_post = True
                break
            # 13 fields is a regular post; 14 fields is a reply and any other
            # layout is unexpected — both are skipped.
            if len(root) == 13:
                date_list.append(posted)
                raw_comment_list.append(root[3])
            print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링중..',end='')
        page += 1

    print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링완료')
    return pd.DataFrame({'날짜': date_list, '댓글': raw_comment_list})

def BERT_feargreed(df,symbol):
    """Crawl the comments for ``symbol`` and attach a BERT score to each.

    For every comment the pipeline's per-label scores are read. The stored
    value is ``1 - score[0]`` when the first label's score is at least the
    second's, otherwise ``score[1]``.
    (Presumably label 0 is "fear" and label 1 is "greed" — confirm against the
    model's label mapping.)

    Args:
        df: Existing score frame, forwarded to get_comment().
        symbol: Stock name, forwarded to get_comment().

    Returns:
        The crawled frame with an added 'BERT' column.
    """
    df = get_comment(df,symbol)
    pred_list = []
    for text in df['댓글'].to_list():
        label_scores = classifier(text)[0]
        first = label_scores[0]['score']
        second = label_scores[1]['score']
        pred_list.append(1-first if first >= second else second)
        # Overwrite the same console line with the running count.
        print(f'\rBERT 댓글{len(pred_list)}개 분석중..',end='')
    df['BERT'] = pred_list
    return df

def konlpy_okt(df,symbol):
    """Run the BERT stage, then tokenize each comment with Okt for the LSTM.

    Each comment is POS-tagged with stemming. Only nouns, verbs, adjectives
    and verb prefixes are kept, and every kept token is wrapped in single
    quotes before being stored.

    Args:
        df: Existing score frame, forwarded to BERT_feargreed().
        symbol: Stock name, forwarded to BERT_feargreed().

    Returns:
        The BERT-scored frame with an 'LSTM' column holding, per row, the list
        of quoted tokens.
    """
    df = BERT_feargreed(df,symbol)
    okt = Okt()
    tag_list = ['Noun','Verb','Adjective','VerbPrefix']
    tokenized_data = [
        ["'" + word + "'"
         for word, tag in okt.pos(sentence, stem=True)
         if tag in tag_list]
        for sentence in df['댓글'].to_list()
    ]
    df['LSTM'] = tokenized_data
    return df
    
def feargreed_index(df,symbol):
    """Crawl, BERT-score and LSTM-score the latest comments for ``symbol``.

    Runs the full pipeline (konlpy_okt -> BERT_feargreed -> get_comment), then
    replaces the token lists in the 'LSTM' column with the LSTM model's
    prediction, rounded to 6 decimals.

    Args:
        df: Existing score frame, forwarded down the pipeline.
        symbol: Stock name, forwarded down the pipeline.

    Returns:
        DataFrame with columns '날짜', '댓글', 'BERT' and 'LSTM'. When nothing
        was crawled, the frame is returned with an empty float 'LSTM' column.
    """
    df = konlpy_okt(df,symbol)
    if len(df) == 0:
        # model.predict fails on an empty batch; with no comments there is
        # nothing to score, so just give the column its final dtype.
        df['LSTM'] = df['LSTM'].astype(float)
        return df
    tokenized_data = df['LSTM'].to_list()
    sequences = tokenizer.texts_to_sequences(tokenized_data)
    # maxlen=15 presumably matches the length used when the LSTM was trained
    # — confirm before changing.
    padded = pad_sequences(sequences, maxlen=15)
    pred = model.predict(padded)
    df['LSTM'] = pred
    df['LSTM'] = df['LSTM'].round(6)
    return df

In [4]:
# Incrementally update the stored NAVER scores: score the newly crawled
# comments, put them on top of the existing rows, and write the file back.
df = pd.read_csv('./src/score_naver_0301.csv')
df2 = feargreed_index(df,'NAVER')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# keep='last' keeps the already-stored row when a comment was crawled again.
df_naver = pd.concat([df2, df]).drop_duplicates(subset=['날짜','댓글'],keep='last')
df_naver = df_naver.reset_index(drop=True)
df_naver.to_csv('./src/score_naver_0301.csv',index=False)
df_naver

2022-05-27 댓글117개 크롤링완료.
BERT 댓글117개 분석중..

Unnamed: 0,날짜,댓글,BERT,LSTM
0,2022-05-29,아주 아주 나쁜 악재,0.090395,0.000008
1,2022-05-29,[부동산시장도 폰지의관점으로도볼수도있다....,0.779604,0.000872
2,2022-05-29,손절 타이밍 못잡아서,0.119956,0.000000
3,2022-05-29,북한 미사일 쏘면 문재인의 반응,0.400954,0.422165
4,2022-05-29,절대사지마,0.026471,1.000000
...,...,...,...,...
9421,2022-03-01,어르신들 .,0.404791,0.002084
9422,2022-03-01,차트는 좋네,0.153072,0.999999
9423,2022-03-01,[잘쳐먹고잘산다는미명하에 온갖곳에 악질이...,0.706786,0.999999
9424,2022-03-01,"●●●● 찢재명,,, 왈 ~~",0.845221,0.013839


In [5]:
# Incrementally update the stored Kakao scores: score the newly crawled
# comments, put them on top of the existing rows, and write the file back.
df = pd.read_csv('./src/score_kakao_0401.csv')
df2 = feargreed_index(df,'카카오')
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# keep='last' keeps the already-stored row when a comment was crawled again.
df_kakao = pd.concat([df2, df]).drop_duplicates(subset=['날짜','댓글'],keep='last')
df_kakao = df_kakao.reset_index(drop=True)
df_kakao.to_csv('./src/score_kakao_0401.csv',index=False)
df_kakao

2022-05-27 댓글214개 크롤링완료.
BERT 댓글214개 분석중..

Unnamed: 0,날짜,댓글,BERT,LSTM
0,2022-05-29,성장주도 가치주도 아니다,0.003523,0.002275
1,2022-05-29,주식투자 단순원리,0.061108,0.040277
2,2022-05-29,◆카카오 월요일 상승 합니다!!◆,0.887065,1.000000
3,2022-05-29,유투브나 주식방송에 나오는 인간들의,0.633766,0.246460
4,2022-05-29,상한가 기대한다,0.712382,1.000000
...,...,...,...,...
9908,2022-04-01,카카오는 계열사분할로 망한다,0.109831,0.000000
9909,2022-04-01,"앵두 방송준비중,,..",0.115401,0.075082
9910,2022-04-01,딱보니,0.225211,0.001256
9911,2022-04-01,105000에서 절대안사짐!,0.304911,0.036745


In [6]:
# DISABLED (kept for reference): standalone LSTM re-scoring of an
# already-loaded frame `df`, without crawling or BERT scoring.
# It differs from the active pipeline in that konlpy_okt takes only `df`,
# coerces each comment with str(), and returns the token lists instead of a
# frame. Un-commenting it would shadow the active konlpy_okt defined above.
# def konlpy_okt(df):
#     okt = Okt()
#     tag_list = ['Noun','Verb','Adjective','VerbPrefix'] 
#     comment_list = df['댓글'].to_list()
#     tokenized_data = []
#     for i in range(len(comment_list)):
#         tokenized_sentence = okt.pos(str(comment_list[i]), stem=True) 
#         tag_checked_sentence = []
#         for j in tokenized_sentence:
#             x,y = j
#             if y in tag_list:
#                 tag_checked_sentence.append(x)
#         tokenized_data.append(tag_checked_sentence)   
#         print(f'\r{i+1}개 형태소분리중',end='')
#     for i in tokenized_data:
#         for j in range(len(i)):
#             i[j] = "'"+i[j]+"'"
#     return tokenized_data
    
# def tokenize(df):
#     tokenized_data = konlpy_okt(df)
#     test = tokenizer.texts_to_sequences(tokenized_data)
#     test = pad_sequences(test, maxlen=15)
#     return test

# def feargreed_indexx(df): 
#     test = tokenize(df)
#     pred = model.predict(test)
#     return pred

# df['LSTM'] = feargreed_indexx(df)
# df['LSTM'] = df['LSTM'].round(6)
# df = df.reset_index(drop=True)
# df

In [7]:
# df.to_csv('./src/score_naver_0301.csv',index=False)