In [7]:
import os
import re
from datetime import date
from datetime import datetime,timedelta

import numpy as np
import pandas as pd
import pickle5 as pickle
import requests
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from pymongo import MongoClient
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizerFast
from transformers import TextClassificationPipeline
from transformers import TFBertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [8]:
# MongoDB connection. The URI can be overridden with the MONGO_URI environment
# variable so the host is not tied to this notebook; the default is the
# address that was previously hard-coded here.
my_client = MongoClient(os.environ.get('MONGO_URI', 'mongodb://18.181.49.139:27017'))
mydb = my_client['final_project']

In [9]:
# LSTM tokenizer: unpickled from a local file (only load pickles you trust).
with open('data/lstm/goodtokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# LSTM model
model = load_model('data/lstm/goodmodel.h5')

# BERT tokenizer and model, both read from the same local directory
loaded_tokenizer = BertTokenizerFast.from_pretrained('data/bert', from_pt=True)
loaded_model = TFBertForSequenceClassification.from_pretrained('data/bert', from_pt=True)

# Text-classification pipeline; return_all_scores=True asks for a score per
# label, which is the shape BERT_feargreed indexes into.
classifier = TextClassificationPipeline(
    tokenizer=loaded_tokenizer,
    model=loaded_model,
    framework='tf',
    return_all_scores=True,
)



ResourceExhaustedError: OOM when allocating tensor with shape[119547,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:TruncatedNormal]

In [4]:
def _load_feargreed_for_day(collection_name, day_str):
    """Return the documents of ``collection_name`` whose '날짜' equals ``day_str``.

    The Mongo ``_id`` field is dropped when present. A query that matches
    nothing yields an empty DataFrame instead of raising.
    """
    records = list(mydb[collection_name].find({'날짜': day_str}))
    return pd.DataFrame(records).drop(columns='_id', errors='ignore')


def read_comment_csv():
    """Load the stored NAVER and Kakao comment frames for the most recent day.

    Despite the name, the data comes from the 'naver_feargreed' and
    'kakao_feargreed' MongoDB collections, not from CSV files. Today's date
    is tried first; if either collection has no rows for today, BOTH frames
    are loaded for yesterday instead, so the pair always shares one date.

    Returns:
        tuple: ``(df_naver, df_kakao)``, each without the ``_id`` column.

    Raises:
        KeyError: if either collection is also empty for yesterday. The
            exception type matches what the failed ``drop('_id')`` on an
            empty frame used to raise.
    """
    # Evaluate "now" once so both collections are queried for the same day
    # even if the call straddles midnight.
    today = datetime.now().date()
    for day in (today, today - timedelta(days=1)):
        day_str = day.strftime("%Y-%m-%d")
        df_naver = _load_feargreed_for_day('naver_feargreed', day_str)
        df_kakao = _load_feargreed_for_day('kakao_feargreed', day_str)
        if not (df_naver.empty or df_kakao.empty):
            return df_naver, df_kakao

    raise KeyError(
        'no stored comments for today or yesterday in '
        'naver_feargreed / kakao_feargreed'
    )
    
def get_code(symbol):
    """Return the six-digit stock code for ``symbol``.

    Only two tickers are distinguished: '카카오' maps to Kakao's code and
    every other value falls through to NAVER's code. (An earlier CSV-based
    lookup of arbitrary stock names is no longer used.)
    """
    kakao_code = '035720'  # Kakao
    naver_code = '035420'  # NAVER
    return kakao_code if symbol == '카카오' else naver_code

def get_comment(df,symbol):
    """Crawl Naver Finance board posts dated on or after ``df``'s first date.

    Board pages for the stock code returned by ``get_code(symbol)`` are
    fetched in increasing page order. Crawling stops at the first row whose
    date is earlier than the reference day, or when a page yields no rows
    at all (previously that case looped forever because every parsing error
    was silently swallowed).

    Args:
        df: DataFrame with a '날짜' column. Its first value is the reference
            day and is compared as a string against the page's dates after
            '.' has been replaced by '-'.
        symbol: stock name forwarded to ``get_code``.

    Returns:
        pandas.DataFrame: a new frame with columns '날짜' (post date) and
        '댓글' (post text), one row per collected post.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    code = get_code(symbol)
    # Positional access: do not depend on the frame having a label 0.
    day = df['날짜'].iloc[0]
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    rows_per_page = 20  # only the first 20 matching rows of a page are examined

    date_list = []
    raw_comment_list = []
    page = 1
    reached_older_post = False
    while not reached_older_post:
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={page}'
        res = requests.get(url, headers=headers, timeout=10)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Query the document once per page instead of once per row.
        section = bs.find('div',{'class':'section inner_sub'})
        rows = section.find_all('tr',{'onmouseover':'mouseOver(this)'}) if section is not None else []
        if not rows:
            # Empty or unexpected page: nothing more to read, so stop
            # rather than requesting ever-higher page numbers.
            break

        for row in rows[:rows_per_page]:
            root = row.text.split('\n')
            try:
                posted = root[1].split()[0].replace('.','-')
            except IndexError:
                # Row without a parsable date cell: skip it.
                continue
            if day > posted:
                reached_older_post = True
                break
            # 13 fields: a regular post. 14 fields is a reply and any other
            # length is malformed; both are skipped.
            if len(root) == 13:
                date_list.append(posted)
                raw_comment_list.append(root[3])
        print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링중..',end='')
        page += 1

    print(f'\r{day} 댓글{len(raw_comment_list)}개 크롤링완료')
    return pd.DataFrame({'날짜': date_list, '댓글': raw_comment_list})

def BERT_feargreed(df,symbol):
    """Crawl comments via ``get_comment`` and add a 'BERT' score column.

    Each comment is passed to the module-level ``classifier``. With the
    first two label scores called s0 and s1, the stored value is ``1 - s0``
    when ``s0 >= s1`` and ``s1`` otherwise.

    Args:
        df: frame forwarded to ``get_comment``.
        symbol: stock name forwarded to ``get_comment``.

    Returns:
        The crawled DataFrame with an added 'BERT' column.
    """
    df = get_comment(df, symbol)
    scores = []
    for text in df['댓글'].to_list():
        label_scores = classifier(text)[0]
        first_score = label_scores[0]['score']
        second_score = label_scores[1]['score']
        scores.append(second_score if first_score < second_score else 1 - first_score)
        print(f'\rBERT 댓글{len(scores)}개 분석중..', end='')
    df['BERT'] = scores
    print('BERT분석 완료.')
    return df

def konlpy_okt(df,symbol):
    """Run ``BERT_feargreed`` and add an 'LSTM' column of filtered tokens.

    Every comment is POS-tagged with Okt (stemming on). Only tokens tagged
    Noun, Verb, Adjective or VerbPrefix are kept, and each kept token is
    wrapped in single quotes.

    Args:
        df: frame forwarded to ``BERT_feargreed``.
        symbol: stock name forwarded to ``BERT_feargreed``.

    Returns:
        The DataFrame with an 'LSTM' column holding one token list per row.
    """
    df = BERT_feargreed(df, symbol)
    okt = Okt()
    kept_tags = ['Noun','Verb','Adjective','VerbPrefix']
    # NOTE(review): the quoting presumably mirrors how the LSTM tokenizer's
    # training text was formatted; confirm before changing it.
    quoted_tokens = [
        ["'" + word + "'" for word, tag in okt.pos(sentence, stem=True) if tag in kept_tags]
        for sentence in df['댓글'].to_list()
    ]
    df['LSTM'] = quoted_tokens
    return df
    
def feargreed_index(df,symbol):
    """Run ``konlpy_okt`` and replace the 'LSTM' tokens with model predictions.

    The token lists are converted to integer sequences by the module-level
    ``tokenizer``, padded to length 15, and scored by the module-level
    ``model``. Predictions are rounded to 6 decimal places.

    Args:
        df: frame forwarded to ``konlpy_okt``.
        symbol: stock name forwarded to ``konlpy_okt``.

    Returns:
        The DataFrame with 'LSTM' now holding the rounded predictions.
    """
    df = konlpy_okt(df, symbol)
    sequences = tokenizer.texts_to_sequences(df['LSTM'].to_list())
    padded = pad_sequences(sequences, maxlen=15)
    df['LSTM'] = model.predict(padded)
    df['LSTM'] = df['LSTM'].round(6)
    print('LSTM분석 완료.')
    return df

def update_comment():
    """Refresh the stored NAVER and Kakao comment scores in MongoDB.

    For each stock, the stored frame from ``read_comment_csv`` is used as
    the reference, new comments are crawled and scored by
    ``feargreed_index``, and the two are merged. When a ('날짜', '댓글') pair
    appears in both, the stored row wins. Both stocks are analysed before
    any write, so a failure during analysis leaves the database untouched.

    The rows deleted before re-insertion are those whose '날짜' occurs in
    the merged frame. Always deleting today's date is wrong when the loader
    fell back to yesterday: yesterday's rows would be inserted a second time.

    Returns:
        None
    """
    df_naver, df_kakao = read_comment_csv()
    targets = (
        ('NAVER', 'naver_feargreed', df_naver),
        ('카카오', 'kakao_feargreed', df_kakao),
    )

    merged_frames = []
    for symbol, collection_name, stored in targets:
        print(f'{symbol} 댓글 갱신중...')
        fresh = feargreed_index(stored, symbol)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        # Stored rows come last so keep='last' prefers them on duplicates.
        merged = (
            pd.concat([fresh, stored])
            .drop_duplicates(subset=['날짜','댓글'], keep='last')
            .reset_index(drop=True)
        )
        merged_frames.append((collection_name, merged))
        print(f'{symbol} 댓글 갱신완료.')

    for collection_name, merged in merged_frames:
        dates = merged['날짜'].unique().tolist()
        mydb[collection_name].delete_many({'날짜': {'$in': dates}})
        mydb[collection_name].insert_many(merged.to_dict('records'))
    

In [6]:
# Run the whole refresh pipeline. The model-loading cell must have completed
# successfully first: the pipeline reads the module-level `classifier`,
# `tokenizer` and `model`, and fails with NameError if they are missing.
update_comment()

NAVER 댓글 갱신중...
2022-06-08 댓글71개 크롤링완료.


NameError: name 'classifier' is not defined