In [None]:
import pandas as pd
from afinn import Afinn
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# 기사제목 엑셀 파일 불러오기

title = pd.read_csv("stock_title.csv", engine='python', encoding = "utf-8")

In [None]:
title

Unnamed: 0,회사명,날짜,등락률,뉴스 타이틀
0,삼성전자,20190103,-3.06,"삼성전자, 기술전문가 名匠제도 신설"
1,삼성전자,20190103,-3.06,"삼성전자, 기술 전문가 육성 위한 '삼성명장' 제도 신설"
2,삼성전자,20190103,-3.06,2019년삼성전자시무식
3,삼성전자,20190103,-3.06,"삼성전자""초일류, 초격차 100년 기업 만들자"""
4,삼성전자,20190103,-3.06,삼성전자김기남 부회장 “초일류·초격차 100년 기업 도약하자”
...,...,...,...,...
7900,LG디스플레이,20221216,-3.73,"삼성, 애증의 ‘QD-OLED TV’ 내년 국내 출시"
7901,LG디스플레이,20221216,-3.73,[재송]14일 장 마감 후 주요 종목 뉴스
7902,LG디스플레이,20221216,-3.73,"[내 종목을 부탁해] 복잡한 계좌 해결 방법은? <두산에너빌리티>, <대원제약>..."
7903,LG디스플레이,20221216,-3.73,"삼성전자 여유자금 10.7조→3.9조, SK하이닉스 3.5조→―8500억"


In [None]:
# 날짜 칼럼 datatype 변경

title['날짜'] = title['날짜'].astype(str)

In [None]:
# 상승/하락을 판단하여 이진 분류 시켜주는 함수
def classify_size(size):
    if size >= 0:
        return 1
    else:
        return 0

In [None]:
# 상승 하락 판단

title['Either'] = title['등락률'].apply(classify_size)

In [None]:
title

Unnamed: 0,회사명,날짜,등락률,뉴스 타이틀,Either
0,삼성전자,20190103,-3.06,"삼성전자, 기술전문가 名匠제도 신설",0
1,삼성전자,20190103,-3.06,"삼성전자, 기술 전문가 육성 위한 '삼성명장' 제도 신설",0
2,삼성전자,20190103,-3.06,2019년삼성전자시무식,0
3,삼성전자,20190103,-3.06,"삼성전자""초일류, 초격차 100년 기업 만들자""",0
4,삼성전자,20190103,-3.06,삼성전자김기남 부회장 “초일류·초격차 100년 기업 도약하자”,0
...,...,...,...,...,...
7900,LG디스플레이,20221216,-3.73,"삼성, 애증의 ‘QD-OLED TV’ 내년 국내 출시",0
7901,LG디스플레이,20221216,-3.73,[재송]14일 장 마감 후 주요 종목 뉴스,0
7902,LG디스플레이,20221216,-3.73,"[내 종목을 부탁해] 복잡한 계좌 해결 방법은? <두산에너빌리티>, <대원제약>...",0
7903,LG디스플레이,20221216,-3.73,"삼성전자 여유자금 10.7조→3.9조, SK하이닉스 3.5조→―8500억",0


In [None]:
# 등락률 제거

title = title.drop('등락률', axis=1)

In [None]:
# 칼럼명 변경

title.rename(columns = {'뉴스 타이틀':'title'},inplace=True)

In [None]:
title

Unnamed: 0,회사명,날짜,title,Either
0,삼성전자,20190103,"삼성전자, 기술전문가 名匠제도 신설",0
1,삼성전자,20190103,"삼성전자, 기술 전문가 육성 위한 '삼성명장' 제도 신설",0
2,삼성전자,20190103,2019년삼성전자시무식,0
3,삼성전자,20190103,"삼성전자""초일류, 초격차 100년 기업 만들자""",0
4,삼성전자,20190103,삼성전자김기남 부회장 “초일류·초격차 100년 기업 도약하자”,0
...,...,...,...,...
7900,LG디스플레이,20221216,"삼성, 애증의 ‘QD-OLED TV’ 내년 국내 출시",0
7901,LG디스플레이,20221216,[재송]14일 장 마감 후 주요 종목 뉴스,0
7902,LG디스플레이,20221216,"[내 종목을 부탁해] 복잡한 계좌 해결 방법은? <두산에너빌리티>, <대원제약>...",0
7903,LG디스플레이,20221216,"삼성전자 여유자금 10.7조→3.9조, SK하이닉스 3.5조→―8500억",0


In [None]:
# 종가/거래량 엑셀 파일 불러오기

fin  = pd.read_excel("final_fin.xlsx")

In [None]:
fin

Unnamed: 0,회사명,날짜,등락률,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량,거래량,종가
0,삼성전자,2019.01.03,-3.06,0.02,-0.02,0.00,-0.01,-0.01,-0.05,0.32,-0.33,-0.23,0.40,1177579,17800
1,삼성전자,2019.01.07,3.35,0.00,-0.01,-0.01,0.00,0.02,-0.33,-0.23,0.40,0.06,0.05,1307934,18400
2,삼성전자,2019.01.09,3.79,-0.01,0.00,0.02,0.01,0.03,0.40,0.06,0.05,0.06,0.96,1830546,19000
3,삼성전자,2019.01.25,3.80,0.00,0.01,-0.03,0.01,0.00,-0.20,1.37,-0.21,-0.52,-0.01,2055931,20300
4,삼성전자,2019.02.08,-3.12,0.02,-0.04,-0.04,0.01,0.01,0.25,1.53,-0.15,-0.56,-0.05,965641,18850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,LG디스플레이,2022.11.11,7.14,0.04,0.00,0.08,-0.01,0.00,0.51,-0.26,2.19,-0.68,0.31,3409680,15400
830,LG디스플레이,2022.11.14,-5.12,0.00,0.08,-0.01,0.00,0.08,-0.26,2.19,-0.68,0.31,0.67,2298394,14650
831,LG디스플레이,2022.11.18,3.03,0.08,-0.05,0.01,-0.01,-0.02,0.67,-0.33,-0.55,0.21,-0.32,2393482,14850
832,LG디스플레이,2022.12.14,4.21,-0.02,-0.01,0.01,0.00,-0.03,-0.28,2.13,-0.58,-0.44,0.13,1055419,14250


In [None]:
# 날짜 형식을 제목 엑셀 파일과 일치시키기 위한 작업

for i in range(len(fin['날짜'])):
    fin.날짜[i] = fin.날짜[i].replace('.', '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fin.날짜[i] = fin.날짜[i].replace('.', '')


In [None]:
# 상승/하락 판단

fin['Either'] = fin['등락률'].apply(classify_size)

In [None]:
fin

Unnamed: 0,회사명,날짜,등락률,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량,거래량,종가,Either
0,삼성전자,20190103,-3.06,0.02,-0.02,0.00,-0.01,-0.01,-0.05,0.32,-0.33,-0.23,0.40,1177579,17800,0
1,삼성전자,20190107,3.35,0.00,-0.01,-0.01,0.00,0.02,-0.33,-0.23,0.40,0.06,0.05,1307934,18400,1
2,삼성전자,20190109,3.79,-0.01,0.00,0.02,0.01,0.03,0.40,0.06,0.05,0.06,0.96,1830546,19000,1
3,삼성전자,20190125,3.80,0.00,0.01,-0.03,0.01,0.00,-0.20,1.37,-0.21,-0.52,-0.01,2055931,20300,1
4,삼성전자,20190208,-3.12,0.02,-0.04,-0.04,0.01,0.01,0.25,1.53,-0.15,-0.56,-0.05,965641,18850,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,LG디스플레이,20221111,7.14,0.04,0.00,0.08,-0.01,0.00,0.51,-0.26,2.19,-0.68,0.31,3409680,15400,1
830,LG디스플레이,20221114,-5.12,0.00,0.08,-0.01,0.00,0.08,-0.26,2.19,-0.68,0.31,0.67,2298394,14650,0
831,LG디스플레이,20221118,3.03,0.08,-0.05,0.01,-0.01,-0.02,0.67,-0.33,-0.55,0.21,-0.32,2393482,14850,1
832,LG디스플레이,20221214,4.21,-0.02,-0.01,0.01,0.00,-0.03,-0.28,2.13,-0.58,-0.44,0.13,1055419,14250,1


In [None]:
# 불필요한 거래량/종가/등락률 칼럼 제거

main = fin.drop(['Either','거래량','종가','등락률'], axis=1)

In [None]:
main

Unnamed: 0,회사명,날짜,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량
0,삼성전자,20190103,0.02,-0.02,0.00,-0.01,-0.01,-0.05,0.32,-0.33,-0.23,0.40
1,삼성전자,20190107,0.00,-0.01,-0.01,0.00,0.02,-0.33,-0.23,0.40,0.06,0.05
2,삼성전자,20190109,-0.01,0.00,0.02,0.01,0.03,0.40,0.06,0.05,0.06,0.96
3,삼성전자,20190125,0.00,0.01,-0.03,0.01,0.00,-0.20,1.37,-0.21,-0.52,-0.01
4,삼성전자,20190208,0.02,-0.04,-0.04,0.01,0.01,0.25,1.53,-0.15,-0.56,-0.05
...,...,...,...,...,...,...,...,...,...,...,...,...
829,LG디스플레이,20221111,0.04,0.00,0.08,-0.01,0.00,0.51,-0.26,2.19,-0.68,0.31
830,LG디스플레이,20221114,0.00,0.08,-0.01,0.00,0.08,-0.26,2.19,-0.68,0.31,0.67
831,LG디스플레이,20221118,0.08,-0.05,0.01,-0.01,-0.02,0.67,-0.33,-0.55,0.21,-0.32
832,LG디스플레이,20221214,-0.02,-0.01,0.01,0.00,-0.03,-0.28,2.13,-0.58,-0.44,0.13


In [None]:
# 위에서 생성한 main 데이터프레임을 x, 상승/하락을 표기한 Either 칼럼을 y 값으로 설정하고 데이터 분리

x_main = main
y_main = fin['Either']

x_train_main, x_test_main, y_train_main, y_test_main = train_test_split(x_main, y_main,
                                                    stratify=y_main,
                                                    test_size=0.3)

In [None]:
# 위에서 생성한 기사 제목 데이터프레임을 x, 상승/하락을 표기한 Either 칼럼을 y 값으로 설정하고 데이터 분리

x = title
y = title['Either']

x_train_title, x_test_title, y_train_title, y_test_title = train_test_split(x, y,
                                                    stratify=y,
                                                    test_size=0.3)

In [None]:
print(x_train_main.shape, x_test_main.shape) # 훈련세트, 테스트세트 비율 확인
np.unique(y_train_main, return_counts=True) # 훈련세트의 타깃(라벨) 확인

(583, 12) (251, 12)


(array([0, 1], dtype=int64), array([280, 303], dtype=int64))

In [None]:
# 회사명과 날짜는 의미가 없는 칼럼이기 때문에 제거

x_train_main_sub = x_train_main.drop(['회사명','날짜'], axis=1)

In [None]:
x_train_main_sub

Unnamed: 0,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량
149,-0.02,0.02,-0.02,-0.01,0.02,0.34,-0.28,-0.31,0.83,-0.14
414,0.00,0.01,0.03,-0.02,0.00,0.09,0.17,1.16,-0.41,-0.04
243,0.05,0.00,-0.01,-0.03,0.03,0.40,-0.25,-0.31,0.07,0.31
771,0.01,0.00,-0.02,-0.03,-0.01,-0.13,0.25,0.78,-0.01,-0.20
749,0.02,0.01,-0.02,0.02,0.02,0.00,-0.39,0.80,-0.27,0.03
...,...,...,...,...,...,...,...,...,...,...
645,0.00,0.02,0.02,-0.04,-0.03,-0.29,0.39,0.15,-0.32,0.44
618,-0.02,0.03,0.00,-0.02,-0.01,-0.30,0.22,-0.07,-0.21,-0.25
803,0.00,-0.02,0.01,0.01,-0.01,-0.63,0.12,-0.38,0.88,-0.39
808,0.05,-0.01,0.01,-0.01,-0.02,2.17,-0.69,0.09,-0.33,0.06


In [None]:
x_train_main

Unnamed: 0,회사명,날짜,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량
149,삼성에스디에스,20221228,-0.02,0.02,-0.02,-0.01,0.02,0.34,-0.28,-0.31,0.83,-0.14
414,SK하이닉스,20201209,0.00,0.01,0.03,-0.02,0.00,0.09,0.17,1.16,-0.41,-0.04
243,삼성SDI,20210222,0.05,0.00,-0.01,-0.03,0.03,0.40,-0.25,-0.31,0.07,0.31
771,LG디스플레이,20210818,0.01,0.00,-0.02,-0.03,-0.01,-0.13,0.25,0.78,-0.01,-0.20
749,LG디스플레이,20210114,0.02,0.01,-0.02,0.02,0.02,0.00,-0.39,0.80,-0.27,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...
645,LG전자,20220913,0.00,0.02,0.02,-0.04,-0.03,-0.29,0.39,0.15,-0.32,0.44
618,LG전자,20211221,-0.02,0.03,0.00,-0.02,-0.01,-0.30,0.22,-0.07,-0.21,-0.25
803,LG디스플레이,20220613,0.00,-0.02,0.01,0.01,-0.01,-0.63,0.12,-0.38,0.88,-0.39
808,LG디스플레이,20220819,0.05,-0.01,0.01,-0.01,-0.02,2.17,-0.69,0.09,-0.33,0.06


In [None]:
x_test_main

Unnamed: 0,회사명,날짜,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량
419,SK하이닉스,20210105,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69
679,LG디스플레이,20191002,-0.01,-0.01,-0.02,0.03,-0.01,-0.10,0.01,-0.36,0.33,-0.09
712,LG디스플레이,20200504,0.00,-0.03,0.03,0.02,0.00,-0.16,0.06,-0.22,0.62,-0.20
569,LG전자,20210122,-0.02,-0.02,0.02,0.01,0.10,-0.40,-0.37,-0.07,0.03,6.72
408,SK하이닉스,20201013,0.02,-0.02,-0.01,0.04,0.00,-0.05,0.38,-0.20,0.94,-0.41
...,...,...,...,...,...,...,...,...,...,...,...,...
752,LG디스플레이,20210129,0.01,0.00,0.02,-0.05,0.00,-0.66,-0.58,0.72,0.25,-0.27
661,LG디스플레이,20190328,0.01,0.00,-0.02,0.00,-0.01,0.88,-0.27,-0.32,0.00,0.39
230,삼성SDI,20201116,0.02,-0.02,0.02,-0.01,-0.02,0.00,0.37,-0.15,0.29,-0.23
248,삼성SDI,20210311,-0.01,-0.01,-0.02,0.00,-0.01,0.13,0.60,-0.47,0.55,-0.32


In [78]:
# 제목을 이용한 예측과 종가/거래량을 이용한 예측의 학습과 테스트가 같은 데이터로 이루어져야하기 때문에 merge를 통해
# 제목을 이용한 예측을 위한 test/train data 생성

test_newtitle = pd.merge(x_test_main,title, on=["회사명","날짜"])

In [79]:
test_newtitle

Unnamed: 0,회사명,날짜,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량,title,Either
0,SK하이닉스,20210105,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69,"[신년사] 박정호SK하이닉스부회장 ""경쟁자와도 손잡고 협업해야""",1
1,SK하이닉스,20210105,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69,"SK하이닉스, IT 수요 호조세 지속…목표가↑-유진",1
2,SK하이닉스,20210105,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69,"[특징주]삼성전자·SK하이닉스, 반도체 대장주 나란히 신고가",1
3,SK하이닉스,20210105,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69,[특징주] 삼성전자·SK하이닉스신고가…반도체株 새해 첫날 강세,1
4,SK하이닉스,20210105,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69,"[채널Who]SK하이닉스D램과 낸드 두 날개, 이석희 비메모리 씨 뿌려",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380,삼성SDI,20210318,0.03,0.01,-0.02,0.01,-0.01,0.41,-0.34,-0.24,0.33,0.18,"캐나다 네오배터리,삼성SDI·LG엔솔 인력 영입",1
2381,삼성SDI,20210318,0.03,0.01,-0.02,0.01,-0.01,0.41,-0.34,-0.24,0.33,0.18,"삼성전기 ""조직문화 혁신, 5년내 매출 2배""삼성SDI""절대적 품질 우위로 게임...",1
2382,삼성SDI,20210318,0.03,0.01,-0.02,0.01,-0.01,0.41,-0.34,-0.24,0.33,0.18,폭스바겐 파워데이 후폭풍…LG화학삼성SDISK이노 이틀째 하락,1
2383,삼성SDI,20210318,0.03,0.01,-0.02,0.01,-0.01,0.41,-0.34,-0.24,0.33,0.18,"대전여상,삼성 SDI·KT&G 등에 '44명' 합격",1


In [71]:
train_newtitle = pd.merge(x_train_main,title, on=["회사명","날짜"])

In [72]:
train_newtitle

Unnamed: 0,회사명,날짜,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량,title,Either
0,삼성에스디에스,20221228,-0.02,0.02,-0.02,-0.01,0.02,0.34,-0.28,-0.31,0.83,-0.14,"삼성SDS, 최고권위 AI 국제학회에 기술논문 잇따라 등재",0
1,삼성에스디에스,20221228,-0.02,0.02,-0.02,-0.01,0.02,0.34,-0.28,-0.31,0.83,-0.14,[삼성그룹] 시가총액 1570억 증가...삼성바이오로직스 1.99% 상승 '미소',0
2,SK하이닉스,20201209,0.00,0.01,0.03,-0.02,0.00,0.09,0.17,1.16,-0.41,-0.04,"SK하이닉스, 반도체 中企에 기술 무상으로 나눠준다",1
3,SK하이닉스,20201209,0.00,0.01,0.03,-0.02,0.00,0.09,0.17,1.16,-0.41,-0.04,"SK하이닉스, 최첨단 ‘176단 4D 낸드’ 개발",1
4,SK하이닉스,20201209,0.00,0.01,0.03,-0.02,0.00,0.09,0.17,1.16,-0.41,-0.04,'外人 러브콜'SK하이닉스신고가 행진에도 증권사 목표가 '쑥쑥',1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5515,삼성SDI,20190307,0.01,0.01,-0.02,0.00,0.03,0.73,0.73,-0.22,-0.41,0.75,"삼성SDI, 신사업 확장에 그룹 매출비중 20%대로 낮춰... ‘홀로서기’ 눈앞",0
5516,삼성SDI,20190307,0.01,0.01,-0.02,0.00,0.03,0.73,0.73,-0.22,-0.41,0.75,"1월 전기차 배터리 중국 약진·한국 후진…LG화학 4위,삼성SDI7위",0
5517,삼성SDI,20190307,0.01,0.01,-0.02,0.00,0.03,0.73,0.73,-0.22,-0.41,0.75,삼성SDI우 주가종목 그래프 외 증시동향.... 6일 현재 94000원,0
5518,삼성SDI,20190307,0.01,0.01,-0.02,0.00,0.03,0.73,0.73,-0.22,-0.41,0.75,[코스피 기관 순매도]삼성전자·삼성전기·LG화학,0


In [76]:
x_test_main_sub = x_test_main.drop(['회사명','날짜'], axis=1)

In [77]:
x_test_main_sub

Unnamed: 0,5일전_종가,4일전_종가,3일전_종가,2일전_종가,1일전_종가,5일전_거래량,4일전_거래량,3일전_거래량,2일전_거래량,1일전_거래량
419,-0.02,0.01,0.03,0.00,0.03,-0.83,0.34,-0.39,-0.46,0.69
679,-0.01,-0.01,-0.02,0.03,-0.01,-0.10,0.01,-0.36,0.33,-0.09
712,0.00,-0.03,0.03,0.02,0.00,-0.16,0.06,-0.22,0.62,-0.20
569,-0.02,-0.02,0.02,0.01,0.10,-0.40,-0.37,-0.07,0.03,6.72
408,0.02,-0.02,-0.01,0.04,0.00,-0.05,0.38,-0.20,0.94,-0.41
...,...,...,...,...,...,...,...,...,...,...
752,0.01,0.00,0.02,-0.05,0.00,-0.66,-0.58,0.72,0.25,-0.27
661,0.01,0.00,-0.02,0.00,-0.01,0.88,-0.27,-0.32,0.00,0.39
230,0.02,-0.02,0.02,-0.01,-0.02,0.00,0.37,-0.15,0.29,-0.23
248,-0.01,-0.01,-0.02,0.00,-0.01,0.13,0.60,-0.47,0.55,-0.32


In [None]:
print(x_train_title.shape, x_test_title.shape) # 훈련세트, 테스트세트 비율 확인
np.unique(y_train_title, return_counts=True) # 훈련세트의 타깃(라벨) 확인

(5533, 4) (2372, 4)


(array([0, 1], dtype=int64), array([2666, 2867], dtype=int64))

In [81]:
# title을 그대로 학습할 수 없기 때문에 가중치를 TF-IDF로 설정하고 문장을 벡터화시켜줌

vect = TfidfVectorizer().fit(train_newtitle['title'])
x_train_vectorized = vect.transform(train_newtitle['title'])

x_train_vectorized

<5520x12975 sparse matrix of type '<class 'numpy.float64'>'
	with 38835 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
RFC = RandomForestClassifier()


## 최적화 파라미터 설정
rf_param_grid = {"max_depth": [None],
              "max_features": [3, 8, 8],
              "min_samples_split": [2, 3, 8],
              "min_samples_leaf": [1, 3, 8],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


gsRFC = GridSearchCV(RFC, rf_param_grid, scoring="accuracy",  verbose = 1)

gsRFC.fit(x_train_main_sub,y_train_main)

RFC_best = gsRFC.best_estimator_

gsRFC.best_score_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


0.6843353964043619

In [None]:
RFC_predict=RFC_best.predict(x_test_main_sub)

In [None]:
RFC_best

In [None]:
GBC = GradientBoostingClassifier()

## 최적화 파라미터 설정
gb_param_grid = {'loss' : ["deviance"],
              'n_estimators' : [100,200,300],
              'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [4, 8],
              'min_samples_leaf': [100,150],
              'max_features': [0.3, 0.1]
              }

gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, scoring="accuracy", n_jobs= 4, verbose = 1)

gsGBC.fit(x_train_main_sub,y_train_main)

GBC_best = gsGBC.best_estimator_

# Best score
gsGBC.best_score_

Fitting 5 folds for each of 72 candidates, totalling 360 fits




0.6139994105511347

In [None]:
GBC_predict=GBC_best.predict(x_test_main_sub)

In [83]:
SVMC = SVC(probability=True)

## 최적화 파라미터 설정
svc_param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                  'gamma': [ 0.001, 0.01, 0.1, 1]
                  }

gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, scoring="accuracy", n_jobs= 4, verbose = 1)

gsSVMC.fit(x_train_vectorized,train_newtitle['Either'])
SVMC_best = gsSVMC.best_estimator_

# Best score
gsSVMC.best_score_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


0.5153985507246377

In [86]:
svm_predict = SVMC_best.predict(vect.transform(test_newtitle['title']))

In [48]:
# test 데이터셋에 대한 RF의 예측값

RFC = pd.DataFrame({
        "날짜": x_test_main['날짜'],
        "회사": x_test_main['회사명'],
        "real": y_test_main,
        "RFC":RFC_predict,
   })

In [49]:
RFC

Unnamed: 0,날짜,회사,real,RFC
419,20210105,SK하이닉스,1,1
679,20191002,LG디스플레이,0,1
712,20200504,LG디스플레이,0,0
569,20210122,LG전자,0,1
408,20201013,SK하이닉스,1,1
...,...,...,...,...
752,20210129,LG디스플레이,0,0
661,20190328,LG디스플레이,0,0
230,20201116,삼성SDI,0,1
248,20210311,삼성SDI,1,0


In [97]:
# test 데이터셋에 대한 GB의 예측값

GBC = pd.DataFrame({
        "날짜": x_test_main['날짜'],
        "회사": x_test_main['회사명'],
        "real": y_test_main,
        "GBC":GBC_predict,
   })

In [98]:
GBC

Unnamed: 0,날짜,회사,real,GBC
419,20210105,SK하이닉스,1,1
679,20191002,LG디스플레이,0,1
712,20200504,LG디스플레이,0,0
569,20210122,LG전자,0,1
408,20201013,SK하이닉스,1,1
...,...,...,...,...
752,20210129,LG디스플레이,0,0
661,20190328,LG디스플레이,0,0
230,20201116,삼성SDI,0,1
248,20210311,삼성SDI,1,0


In [87]:
# test 데이터셋에 대한 SVM의 예측값

SVM = pd.DataFrame({
        "날짜": test_newtitle['날짜'],
        "회사": test_newtitle['회사명'],
        "real": test_newtitle["Either"],
        "SVM":svm_predict,
   })

In [88]:
SVM

Unnamed: 0,날짜,회사,real,SVM
0,20210105,SK하이닉스,1,1
1,20210105,SK하이닉스,1,1
2,20210105,SK하이닉스,1,1
3,20210105,SK하이닉스,1,1
4,20210105,SK하이닉스,1,1
...,...,...,...,...
2380,20210318,삼성SDI,1,1
2381,20210318,삼성SDI,1,1
2382,20210318,삼성SDI,1,1
2383,20210318,삼성SDI,1,1


In [99]:
# RF와 GB에 대한 결과 값 결합

first_merge = pd.merge(RFC,GBC, on=["날짜","회사","real"])

In [100]:
first_merge

Unnamed: 0,날짜,회사,real,RFC,GBC
0,20210105,SK하이닉스,1,1,1
1,20191002,LG디스플레이,0,1,1
2,20200504,LG디스플레이,0,0,0
3,20210122,LG전자,0,1,1
4,20201013,SK하이닉스,1,1,1
...,...,...,...,...,...
246,20210129,LG디스플레이,0,0,0
247,20190328,LG디스플레이,0,0,0
248,20201116,삼성SDI,0,1,1
249,20210311,삼성SDI,1,0,0


In [101]:
# RF, GB, SVM에 대한 결과값 결합

second_merge = pd.merge(first_merge,SVM, on=["날짜","회사","real"])

In [103]:
# 2개의 분류기 이상이 상승이라고 판단할 경우 상승이라고 판단하고 반대의 경우 하락이라고 판단할 예정임
# 따라서 분류기에 따른 결과값의 합이 2 이상일 경우 상승임
# 이를 위해 결과값들을 모두 더해줌

second_merge['total'] = second_merge['RFC'] + second_merge['GBC'] + second_merge['SVM']

In [110]:
# 2 이상일 경우 상승(1) 아닐 경우 하락이라는 표시를 해주는 칼럼 생성

second_merge['prediction'] = second_merge['total'].apply(lambda x: 1 if x > 2 else 0)

In [111]:
second_merge

Unnamed: 0,날짜,회사,real,RFC,GBC,SVM,total,prediction
0,20210105,SK하이닉스,1,1,1,1,3,1
1,20210105,SK하이닉스,1,1,1,1,3,1
2,20210105,SK하이닉스,1,1,1,1,3,1
3,20210105,SK하이닉스,1,1,1,1,3,1
4,20210105,SK하이닉스,1,1,1,1,3,1
...,...,...,...,...,...,...,...,...
2380,20210318,삼성SDI,1,0,0,1,1,0
2381,20210318,삼성SDI,1,0,0,1,1,0
2382,20210318,삼성SDI,1,0,0,1,1,0
2383,20210318,삼성SDI,1,0,0,1,1,0


In [115]:
# 예측값과 실제값을 모두 더해줌

prediction_sum = second_merge['prediction'].sum()
real_sum = second_merge['real'].sum()

In [118]:
# 예측값/실제값을 진행하여 정확도 측정

accuracy  = prediction_sum/real_sum
accuracy

0.7130295763389288

In [None]:
prediction_sum