In [1]:
import ast
import pickle
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# LSTM 학습 데이터 생성

In [2]:
def get_code(symbol):
    """Look up the short code of a stock by its Korean short name.

    Reads ``./src/krx_code.csv``, indexes it by the '한글 종목약명' column and
    returns the matching value of the '단축코드' column.

    Args:
        symbol: Stock name as it appears in the '한글 종목약명' column.

    Returns:
        The short code as a string (leading zeros preserved), or ``0`` after
        printing a hint when the lookup fails.
    """
    # Read the code column as text: if every code in the file is numeric,
    # dtype inference would turn it into integers and drop leading zeros.
    krx = pd.read_csv('./src/krx_code.csv', dtype={'단축코드': str})
    krx = krx.set_index('한글 종목약명')
    try:
        return krx.at[symbol, '단축코드']
    except Exception:
        # Best-effort lookup: any failure maps to the 0 sentinel, but unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit propagate.
        print('종목명을 다시 확인해주세요.')
        return 0

def get_comment_df(symbol, page):
    """Crawl the Naver Finance discussion board of a stock into a DataFrame.

    Args:
        symbol: Stock name; resolved to a code with ``get_code``.
        page: Number of board pages to fetch, starting at page 1.

    Returns:
        A DataFrame with the columns '날짜', '댓글', '조회수', '좋아요', '싫어요'
        holding exactly 20 rows per requested page, or ``None`` when
        ``get_code`` cannot resolve ``symbol``. Rows that could not be parsed
        carry the placeholder 'error' (with zero counts) so that callers can
        filter them out.

    Raises:
        requests.RequestException: if a page cannot be fetched, including
            when the request times out.
    """
    rows_per_page = 20
    columns = ['날짜', '댓글', '조회수', '좋아요', '싫어요']

    # Resolve the code once; every get_code call re-reads the CSV.
    code = get_code(symbol)
    if code == 0:
        return None

    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.50'}
    records = []
    for i in range(1, page + 1):
        url = f'https://finance.naver.com/item/board.naver?code={code}&page={i}'
        # A timeout keeps a stalled connection from hanging the notebook.
        res = requests.get(url, headers=headers, timeout=10)
        bs = BeautifulSoup(res.text, 'html.parser')

        # Locate the post rows once per page instead of once per row.
        section = bs.find('div', {'class': 'section inner_sub'})
        if section is None:
            rows = []
        else:
            rows = section.find_all('tr', {'onmouseover': 'mouseOver(this)'})

        for j in range(rows_per_page):
            try:
                root = rows[j].text.split('\n')
                date = root[1].split()[0].replace('.', '-')
            except IndexError:
                # Missing row or a row without a date cell.
                records.append(('error', 'error', 0, 0, 0))
                continue

            if len(root) == 14:  # reply post: fields are shifted by one
                records.append((date, '답글:' + root[4], root[10], root[11], root[12]))
            elif len(root) == 13:  # regular post
                records.append((date, root[3], root[9], root[10], root[11]))
            else:  # unexpected row layout
                records.append((date, 'error', 0, 0, 0))
        print(f'\r{i}페이지 크롤링 완료.', end='')

    return pd.DataFrame(records, columns=columns)

def preprocess_df(symbol, page):
    """Crawl comments and add a cleaned, Hangul-only text column.

    Args:
        symbol: Stock name; resolved to a code with ``get_code``.
        page: Number of board pages to crawl.

    Returns:
        The crawled DataFrame without 'error' rows and missing values, with an
        extra '한글댓글' column containing only Hangul syllables separated by
        single spaces; rows whose cleaned text is empty are dropped and the
        index is reset. ``None`` when ``get_code`` cannot resolve ``symbol``.
    """
    if get_code(symbol) == 0:
        return None
    df = get_comment_df(symbol, page)
    # Copy after filtering so the column assignment below works on an
    # independent frame rather than on a slice of the crawled data.
    df = df[df['댓글'] != 'error'].dropna().copy()

    # regex is passed explicitly: the default of Series.str.replace changed
    # from True to False in pandas 2.0, which would make these patterns match
    # literally and silently skip the cleaning.
    df['한글댓글'] = (
        df['댓글']
        .str.replace(r'\[삭제된 게시물의 답글\]', ' ', regex=True)  # deleted-parent marker
        .str.replace('답글:', ' ', regex=False)                     # reply prefix
        .str.replace('[^가-힣]', ' ', regex=True)                   # keep Hangul syllables only
        .str.replace(' +', ' ', regex=True)                         # collapse repeated spaces
        .str.strip()
    )

    df = df[df['한글댓글'] != '']
    df = df.reset_index(drop=True)
    return df

def preprocess_okt_df(symbol,page):
    """Add a '토큰화댓글' column of stemmed Okt tokens to the cleaned comments.

    Only tokens tagged Noun, Verb, Adjective or VerbPrefix are kept, and
    comments with fewer than two kept tokens are dropped.

    Returns:
        The filtered DataFrame with a fresh index, or ``None`` when
        ``get_code`` cannot resolve ``symbol``.
    """
    if get_code(symbol) == 0:
        return
    comments = preprocess_df(symbol,page)
    analyzer = Okt()
    kept_tags = ['Noun','Verb','Adjective','VerbPrefix']
    token_rows = []
    print()
    for count, sentence in enumerate(comments['한글댓글'], start=1):
        # Stemmed part-of-speech tagging; each item is a (word, tag) pair.
        morphemes = analyzer.pos(sentence, stem=True)
        token_rows.append([word for word, tag in morphemes if tag in kept_tags])
        print(f'\r{count}개 형태소분석 완료',end='')
    comments['토큰화댓글'] = token_rows
    has_several_tokens = comments['토큰화댓글'].str.len() > 1
    return comments[has_several_tokens].reset_index(drop=True)

# Keyword lexicons used to label a comment: each token found in greed_word
# adds one to the comment's score, each token found in fear_word subtracts one.
greed_word = [
    # words about buying stock
    '매수', '사', '사다', '사라', '사면', '사고', '줍다', '들어오다', '들어가다', '타다',
    '수급', '매집', '올라타다', '탑승', '불나방', '담다',
    # words about the price going up
    '오르다', '올라가다', '올리다', '올려주다', '올린다', '오름', '올려놓다', '오른', '상향',
    '양봉', '상방', '상승', '살아나다', '양전', '상한', '반등', '폭등', '퍽등', '급등',
    # greed words
    '탐욕', '찬티', '좋다', '간다', '가다', '가즈', '싸다', '익절', '제발', '최고',
    '돌파', '수익', '위대하다', '먹다',
    '기회', '호재', '감사', '감사하다', '대박', '대단하다', '승리', '찬양', '믿다', '회복',
    '갓', '부활', '영차', '개꿀',
]
fear_word = [
    # words about selling stock
    '공매도', '공매', '매도', '팔', '파다', '팔다', '팔고', '팔면', '던지다', '털다',
    '탈출', '튀다', '튀어', '설거지', '손절', '버리다',
    # words about the price going down
    '떨어지다', '떨구다', '빠지다', '하락', '폭락', '떡락', '조정', '급락', '음봉', '하방',
    '폭포수', '음전',
    '반토막', '내리다', '내려오다', '깨지다', '대퍽락', '나락', '붕괴', '추락',
    # fear words
    '공포', '안티', '망하다', '물리다', '끝나다', '손해', '폭망', '거품', '무섭다', '자살',
    '악재', '상폐', '개미지옥',
    '시발', '염병', '욕', '짜증나다', '걸레', '어휴', '개', '놈', '아가리', '빡치다',
    '지랄', '손실', '버티다', '존버',
    '개관', '주가조작', '쓰레기', '죽다', '패닉', '홀딩', '바닥', '흑우', '추매', '추미애',
]

def preprocess_label_df(symbol,page):
    """Label each tokenized comment as greed (1), fear (0) or undecided ('m').

    The score of a comment is the number of its tokens found in ``greed_word``
    minus the number found in ``fear_word``; a positive score gives 1, a
    negative score gives 0 and a zero score gives 'm'. The labelled frame is
    also written to ``./src/네이버종토방댓글_{symbol}_{page}_전처리.csv``.

    Returns:
        The DataFrame with the added '공포탐욕' column, or ``None`` when
        ``get_code`` cannot resolve ``symbol``.
    """
    if get_code(symbol) == 0:
        return
    df = preprocess_okt_df(symbol,page)
    df['공포탐욕'] = 0
    labels = []
    print()
    for done, tokens in enumerate(df['토큰화댓글'].to_list(), start=1):
        score = 0
        for token in tokens:
            # A token may appear in both lexicons; both contributions count.
            score += (token in greed_word) - (token in fear_word)
        if score > 0:
            labels.append(1)
        elif score < 0:
            labels.append(0)
        else:
            labels.append('m')
        print(f'\r{done}개 라벨링 완료',end='')
    df['공포탐욕'] = labels
    df.to_csv(f'./src/네이버종토방댓글_{symbol}_{page}_전처리.csv', index=False)
    return df

def make_train(symbol,page):
    """Merge freshly crawled, labelled comments with the stored training set.

    Reads ``./src/train.csv``, labels new comments with
    ``preprocess_label_df`` and stacks both on the '토큰화댓글' and '공포탐욕'
    columns. All values are converted to ``str`` and rows with a duplicated
    '토큰화댓글' are dropped, keeping the first occurrence (new rows come
    first).

    Returns:
        The combined DataFrame with a fresh index, or ``None`` when
        ``get_code`` cannot resolve ``symbol``.
    """
    if get_code(symbol) == 0:
        return None
    train = pd.read_csv('./src/train.csv')
    train = train[['토큰화댓글','공포탐욕']]
    df = preprocess_label_df(symbol,page)
    df = df[['토큰화댓글','공포탐욕']]
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat stacks the rows the same way and works on every version.
    df = pd.concat([df, train])
    df = df.astype(str)
    df = df.drop_duplicates('토큰화댓글')
    df = df.reset_index(drop=True)
    return df

def make_train_token(symbol,page):
    """Fit a Keras Tokenizer on the training comments and store the results.

    Fits the tokenizer on the '토큰화댓글' column returned by ``make_train``,
    adds the integer sequences as a '토큰' column, pickles the tokenizer to
    ``./src/tokenizer.pickle`` and writes the rows whose label is not 'm' to
    ``./src/train.csv``.

    Returns:
        The saved DataFrame, or ``None`` when ``get_code`` cannot resolve
        ``symbol``.
    """
    if get_code(symbol) == 0:
        return
    corpus = make_train(symbol,page)
    print()
    print('토큰화 진행중..',end='')
    # NOTE(review): oov_token is passed the boolean True rather than a string
    # marker -- confirm this is intended before changing it, since the fitted
    # tokenizer is persisted below.
    word_tokenizer = Tokenizer(num_words=40000, oov_token = True)
    word_tokenizer.fit_on_texts(corpus['토큰화댓글'])
    corpus['토큰'] = word_tokenizer.texts_to_sequences(corpus['토큰화댓글'])
    print('\r토큰화 완료.    ')
    with open('./src/tokenizer.pickle', 'wb') as pickle_file:
        pickle.dump(word_tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    # Undecided rows take part in fitting the tokenizer but are not saved.
    labelled = corpus[corpus['공포탐욕'] != 'm'].reset_index(drop=True)
    labelled.to_csv('./src/train.csv', index=False)
    return labelled

In [3]:
# make_train_token('NAVER',10)

# LSTM 학습

In [4]:
def train_LSTM():
    """Train a binary LSTM classifier on ``./src/train.csv`` and save it.

    Reads the '토큰' (integer sequences) and '공포탐욕' (label) columns, pads
    the sequences to length 15, trains an Embedding -> LSTM -> Dense(sigmoid)
    model for 5 epochs and writes it to ``./src/model.h5``.

    Returns:
        None.
    """
    # Size of the embedding table. It has to cover every index the tokenizer
    # can emit; the tokenizer in this file is built with num_words=40000, so
    # a 30000-row table would be indexed out of range by rarer words.
    vocab_size = 40000

    df = pd.read_csv('./src/train.csv')

    # After the CSV round-trip each sequence is the text form of a list, e.g.
    # "[3, 17, 5]". pad_sequences needs real integer lists, so parse them back.
    sequences = [
        ast.literal_eval(seq) if isinstance(seq, str) else seq
        for seq in df['토큰']
    ]
    train = pad_sequences(sequences, maxlen=15)

    encoder = LabelEncoder()
    label = encoder.fit_transform(df['공포탐욕'])
    # One target column per sample, matching the single sigmoid output unit.
    label = np.reshape(label, (label.shape[0], 1))

    model = Sequential()
    model.add(Embedding(vocab_size, 128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(train, label, batch_size=32, epochs=5)
    model.save('./src/model.h5')

In [5]:
# train_LSTM()