In [None]:
!pip install finance-datareader

# 삼성전자 주가 예측

In [3]:
import pandas as pd
import FinanceDataReader as fdr

In [5]:
# 분석할 데이터의 시작점과 끝점을 설정
# business_day(월화수목금)를 구함 (주식 시장은 business_day에만 열림)

start_date = '20160104'
end_date = '20220520'

business_day = pd.DataFrame(pd.date_range(start_date,end_date,freq='B'), columns = ['Date'])

print(f'business_day shape : {business_day.shape}', )
business_day

business_day shape : (1665, 1)


Unnamed: 0,Date
0,2016-01-04
1,2016-01-05
2,2016-01-06
3,2016-01-07
4,2016-01-08
...,...
1660,2022-05-16
1661,2022-05-17
1662,2022-05-18
1663,2022-05-19


In [6]:
# 삼성전자에 대하여 진행

stock_code = '005930'

# 위에서 설정한 기간에 해당하는 삼성전자 주식 데이터를 불러온다.
stock_df = fdr.DataReader(stock_code, start = start_date, end = end_date).reset_index()

# 토, 일 외에도 휴장하는 날이 있으므로 business_day와 stock_df의 행 수는 다르다.
# 따라서 merge하고 NaN인 부분을 전날 데이터로 채워준다.
stock_df = pd.merge(business_day, stock_df, how = 'outer')
stock_df = stock_df.ffill()
stock_df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Change
0,2016-01-04,25200.0,25200.0,24100.0,24100.0,306939.0,-0.043651
1,2016-01-05,24040.0,24360.0,23720.0,24160.0,216002.0,0.002490
2,2016-01-06,24160.0,24160.0,23360.0,23500.0,366752.0,-0.027318
3,2016-01-07,23320.0,23660.0,23020.0,23260.0,282388.0,-0.010213
4,2016-01-08,23260.0,23720.0,23260.0,23420.0,257763.0,0.006879
...,...,...,...,...,...,...,...
1660,2022-05-16,67100.0,67400.0,66100.0,66300.0,11937555.0,-0.003008
1661,2022-05-17,66600.0,67900.0,66600.0,67600.0,15680447.0,0.019608
1662,2022-05-18,68300.0,68700.0,67600.0,68100.0,16486319.0,0.007396
1663,2022-05-19,66500.0,67600.0,66500.0,67500.0,17073727.0,-0.008811


In [7]:
# NaN이 전날 데이터로 잘 채워졌는지 확인 (모두 0인것으로 보아 잘 채워졌음을 알 수 있음)
stock_df.isna().sum()

Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
Change    0
dtype: int64

In [8]:
# 가격에 대한 데이터만을 사용할 것이기 때문에 Change(등락률)와 Volume(거래량) 컬럼을 제거

stock_df = stock_df.drop(['Change', 'Volume'], axis=1)
stock_df.head()

Unnamed: 0,Date,Open,High,Low,Close
0,2016-01-04,25200.0,25200.0,24100.0,24100.0
1,2016-01-05,24040.0,24360.0,23720.0,24160.0
2,2016-01-06,24160.0,24160.0,23360.0,23500.0
3,2016-01-07,23320.0,23660.0,23020.0,23260.0
4,2016-01-08,23260.0,23720.0,23260.0,23420.0


In [17]:
# 20일 전부터의 가격 데이터와 기준일의 가격 데이터를 이용하여 10일 후 까지의 시가, 고가, 저가, 종가를 예측한다.
# 이에 맞게 데이터셋을 preprocessing 한다.


# shift 함수를 통해 데이터를 한 행씩 아래로 이동하며 기준일 이전 총 20일치의 가격 데이터를 만든다.
x_lag_size = 20

lag_df = stock_df.copy()
for lag in range(1, x_lag_size+1):
    temp_df = stock_df.shift(lag).drop('Date', axis=1)
    temp_df.columns = [f'{lag}_lag_Open', f'{lag}_lag_High', f'{lag}_lag_Low',
                      f'{lag}_lag_Close']
    lag_df = pd.concat([lag_df, temp_df], axis=1)


# shift 함수를 통해 데이터를 항 행씩 위로 이동하며 기준일 이후 총 10일치의 가격 데이터를 만든다.
y_lag_size = 10

for lag in range(1, y_lag_size+1):
    temp_df = stock_df.shift(-lag).drop('Date', axis=1)
    temp_df.columns = [f'{lag}_target_Open', f'{lag}_target_High', f'{lag}_target_Low',
                      f'{lag}_target_Close']
    lag_df = pd.concat([lag_df, temp_df], axis=1)


# 위 아래로 행을 이동하였기 때문에 맨 위와 맨 아래쪽에 NaN 데이터들이 만들어졌기 때문에 해당 행들을 drop한다.
lag_df = lag_df.dropna()

In [18]:
# preprocessed data를 확인
# (20 + 1 + 10) * 4 = 124 개의 컬럼에 Date 컬럼까지 총 125컬럼인 것을 확인할 수 있다.
print(f'lag_data shape : {lag_df.shape}')
lag_df

lag_data shape : (1635, 125)


Unnamed: 0,Date,Open,High,Low,Close,1_lag_Open,1_lag_High,1_lag_Low,1_lag_Close,2_lag_Open,...,8_target_Low,8_target_Close,9_target_Open,9_target_High,9_target_Low,9_target_Close,10_target_Open,10_target_High,10_target_Low,10_target_Close
20,2016-02-01,23040.0,23260.0,23020.0,23260.0,22800.0,23000.0,22320.0,23000.0,23280.0,...,22360.0,22600.0,22600.0,23020.0,22440.0,22600.0,23080.0,23200.0,22880.0,23080.0
21,2016-02-02,23220.0,23320.0,22940.0,23120.0,23040.0,23260.0,23020.0,23260.0,22800.0,...,22440.0,22600.0,23080.0,23200.0,22880.0,23080.0,23160.0,23580.0,23140.0,23360.0
22,2016-02-03,23000.0,23040.0,22740.0,22920.0,23220.0,23320.0,22940.0,23120.0,23040.0,...,22880.0,23080.0,23160.0,23580.0,23140.0,23360.0,23580.0,24020.0,23380.0,23700.0
23,2016-02-04,23000.0,23220.0,22960.0,23120.0,23000.0,23040.0,22740.0,22920.0,23220.0,...,23140.0,23360.0,23580.0,24020.0,23380.0,23700.0,24060.0,24060.0,23560.0,23740.0
24,2016-02-05,23120.0,23380.0,23120.0,23280.0,23000.0,23220.0,22960.0,23120.0,23000.0,...,23380.0,23700.0,24060.0,24060.0,23560.0,23740.0,23740.0,23900.0,23480.0,23800.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1650,2022-05-02,66600.0,67600.0,66500.0,67300.0,65100.0,67600.0,65000.0,67400.0,65400.0,...,64900.0,64900.0,65300.0,66700.0,65200.0,66500.0,67100.0,67400.0,66100.0,66300.0
1651,2022-05-03,67400.0,68400.0,67300.0,67500.0,66600.0,67600.0,66500.0,67300.0,65100.0,...,65200.0,66500.0,67100.0,67400.0,66100.0,66300.0,66600.0,67900.0,66600.0,67600.0
1652,2022-05-04,68000.0,68400.0,67500.0,67900.0,67400.0,68400.0,67300.0,67500.0,66600.0,...,66100.0,66300.0,66600.0,67900.0,66600.0,67600.0,68300.0,68700.0,67600.0,68100.0
1653,2022-05-05,68000.0,68400.0,67500.0,67900.0,68000.0,68400.0,67500.0,67900.0,67400.0,...,66600.0,67600.0,68300.0,68700.0,67600.0,68100.0,66500.0,67600.0,66500.0,67500.0


In [19]:
# 20210104를 기준으로 train data와 test data를 split한다.

split_date = '20210104'
split_index = lag_df.index[lag_df.Date == '20210104'].to_list()[0]

# 활용이 끝난 Date 컬럼을 drop한다.
lag_df = lag_df.drop('Date', axis=1)

train_df = lag_df.iloc[:split_index]
test_df = lag_df.iloc[split_index:]
train_df.shape, test_df.shape

((1305, 124), (330, 124))

In [20]:
# feature 데이터와 label 데이터를 나눈다.

x_train = train_df.values[:,:-40]
y_train = train_df.values[:,-40:]
x_test = test_df.values[:,:-40]
y_test = test_df.values[:,-40:]

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((1305, 84), (1305, 40), (330, 84), (330, 40))

In [21]:
# XGBRegressor와 MultiOutputRegressor를 이용하여 학습한다.

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

estimator = XGBRegressor(objective = 'reg:squarederror', n_estimators=200)

model = MultiOutputRegressor(estimator = estimator)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric='rmse', early_stopping_rounds=30)

[0]	validation_0-rmse:69219.5
Will train until validation_0-rmse hasn't improved in 30 rounds.
[1]	validation_0-rmse:62303
[2]	validation_0-rmse:56109.9
[3]	validation_0-rmse:50588.1
[4]	validation_0-rmse:45583.2
[5]	validation_0-rmse:41016
[6]	validation_0-rmse:36909.7




[7]	validation_0-rmse:33205.3
[8]	validation_0-rmse:29930
[9]	validation_0-rmse:26978.5
[10]	validation_0-rmse:24369.3
[11]	validation_0-rmse:21702.9
[12]	validation_0-rmse:19278.6
[13]	validation_0-rmse:17379.4
[14]	validation_0-rmse:15708.1
[15]	validation_0-rmse:14009.2
[16]	validation_0-rmse:12632.3
[17]	validation_0-rmse:11154.8
[18]	validation_0-rmse:9947.17
[19]	validation_0-rmse:8989.88
[20]	validation_0-rmse:8118.58
[21]	validation_0-rmse:7319.81
[22]	validation_0-rmse:6649.02
[23]	validation_0-rmse:5899.94
[24]	validation_0-rmse:5362.9
[25]	validation_0-rmse:4860.36
[26]	validation_0-rmse:4353.89
[27]	validation_0-rmse:3879.12
[28]	validation_0-rmse:3495.9
[29]	validation_0-rmse:3179.16
[30]	validation_0-rmse:2873.49
[31]	validation_0-rmse:2640.6
[32]	validation_0-rmse:2409.01
[33]	validation_0-rmse:2192.81
[34]	validation_0-rmse:2011.51
[35]	validation_0-rmse:1889.12
[36]	validation_0-rmse:1769
[37]	validation_0-rmse:1648.96
[38]	validation_0-rmse:1579.25
[39]	validation_0-r

MultiOutputRegressor(estimator=XGBRegressor(n_estimators=200,
                                            objective='reg:squarederror'))

In [22]:
# metric 측정 (rmse, r2 score)

from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

pred = model.predict(x_test)

print(f'rmse : {np.sqrt(mean_squared_error(y_test, pred))}, r2 score : {r2_score(y_test, pred)}')

rmse : 2997.074672457524, r2 score : 0.6784291339846875
