# **1. 데이터 불러오기**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import os
import gc
import pyarrow.parquet as pq
import time

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('/content/drive/MyDrive/2024 BDA 연합공모전_코인채굴꾼/OHLCV.csv')
df

Unnamed: 0,Time,Open,High,Low,Close,Volume,Maker_ratio
0,2023-01-01 00:00:00,16537.5,16540.9,16504.0,16527.0,5381.399,0.523137
1,2023-01-01 01:00:00,16527.1,16554.3,16524.1,16550.4,3210.826,0.439935
2,2023-01-01 02:00:00,16550.5,16557.1,16534.8,16542.4,2399.668,0.538677
3,2023-01-01 03:00:00,16542.5,16542.5,16515.0,16529.3,3214.480,0.517398
4,2023-01-01 04:00:00,16529.2,16530.4,16508.8,16517.8,3150.954,0.452596
...,...,...,...,...,...,...,...
9403,2024-01-27 19:00:00,41785.0,41977.0,41775.1,41896.4,7765.565,0.473508
9404,2024-01-27 20:00:00,41896.5,42070.0,41896.4,42049.6,5408.243,0.474818
9405,2024-01-27 21:00:00,42049.6,42165.6,42001.8,42137.8,5859.277,0.505293
9406,2024-01-27 22:00:00,42137.8,42187.1,42057.6,42135.3,5271.863,0.527780


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9408 entries, 0 to 9407
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Time         9408 non-null   object 
 1   Open         9408 non-null   float64
 2   High         9408 non-null   float64
 3   Low          9408 non-null   float64
 4   Close        9408 non-null   float64
 5   Volume       9408 non-null   float64
 6   Maker_ratio  9408 non-null   float64
dtypes: float64(6), object(1)
memory usage: 514.6+ KB


In [None]:
### Time 처리

df['Time'] = pd.to_datetime(df['Time'])
df.set_index('Time', inplace=True)

In [None]:
# maker_ratio는 일단 필요 없을 것 같으므로 제거

df.drop('Maker_ratio', axis = 1, inplace = True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 9408 entries, 2023-01-01 00:00:00 to 2024-01-27 23:00:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    9408 non-null   float64
 1   High    9408 non-null   float64
 2   Low     9408 non-null   float64
 3   Close   9408 non-null   float64
 4   Volume  9408 non-null   float64
dtypes: float64(5)
memory usage: 441.0 KB


In [None]:
df.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,9408.0,9408.0,9408.0,9408.0,9408.0
mean,29771.803178,29865.354645,29676.537495,29774.602227,16831.177838
std,6740.667365,6768.684152,6708.999383,6740.419939,20470.45631
min,16504.3,16524.9,16488.0,16504.2,0.0
25%,26033.35,26079.65,25981.575,26033.925,5985.10075
50%,28007.35,28110.5,27909.65,28009.25,10366.2045
75%,31043.7,31199.1,30892.775,31047.2,19305.60375
max,48577.1,49027.5,47209.8,48577.9,355275.447


## **2. 모델링**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
### Toy Data 생성
# 임시로 2024/01/28 0:00:00 ~ 2024/01/31 0:00:00에 대한 Open, High, Low, Volume을 만들어둠

np.random.seed(42)

date_rng = pd.date_range('2024-01-28 0:00:00', '2024-01-31 0:00:00', freq='H')  # 괄호 수정

pred_ftrs = pd.DataFrame({
    'Time': date_rng,
    'Open': np.random.uniform(16504.3, 48577.1, size=(len(date_rng))),
    'High': np.random.uniform(16524.9, 49027.5, size=(len(date_rng))),
    'Low': np.random.uniform(6708.9, 47209.8, size=(len(date_rng))),
    'Volume': np.random.uniform(0, 355275.447, size=(len(date_rng))),
})

pred_ftrs['Time'] = pd.to_datetime(pred_ftrs['Time'])
pred_ftrs.set_index('Time', inplace=True)

pred_ftrs

Unnamed: 0_level_0,Open,High,Low,Volume
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-01-28 00:00:00,28516.850324,43029.516624,31396.804134,301708.226766
2024-01-28 01:00:00,46996.369807,39499.601504,27067.852852,233633.714265
2024-01-28 02:00:00,39981.395297,40219.528380,8793.835757,201906.093084
2024-01-28 03:00:00,35704.953832,41593.191570,17994.332583,33280.345013
2024-01-28 04:00:00,21508.254651,18931.543697,43494.485821,130640.396301
...,...,...,...,...
2024-01-30 20:00:00,18895.347885,24708.479248,35552.799426,192788.315158
2024-01-30 21:00:00,48156.527340,32686.769288,16317.701568,101801.071434
2024-01-30 22:00:00,41272.352037,26304.227353,19887.880635,209908.550751
2024-01-30 23:00:00,22877.668311,25782.956653,36942.473750,10835.989931


In [None]:
### 기술적 지표 생성
# 해당 부분은 실제 코드로 바꿔야 함

df['feature1'] = df['Close'] - df['Open']
df['feature2'] = df['High'] - df['Low']

In [None]:
# NaN 값이 있는 행 제거

df.dropna(inplace=True)

In [None]:
df_ftrs = df.drop('Close', axis = 1)
df_target = df['Close']

In [None]:
df_ftrs.head()

Unnamed: 0_level_0,Open,High,Low,Volume,feature1,feature2
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:00:00,16537.5,16540.9,16504.0,5381.399,-10.5,36.9
2023-01-01 01:00:00,16527.1,16554.3,16524.1,3210.826,23.3,30.2
2023-01-01 02:00:00,16550.5,16557.1,16534.8,2399.668,-8.1,22.3
2023-01-01 03:00:00,16542.5,16542.5,16515.0,3214.48,-13.2,27.5
2023-01-01 04:00:00,16529.2,16530.4,16508.8,3150.954,-11.4,21.6


In [None]:
df_target.head()

Time
2023-01-01 00:00:00    16527.0
2023-01-01 01:00:00    16550.4
2023-01-01 02:00:00    16542.4
2023-01-01 03:00:00    16529.3
2023-01-01 04:00:00    16517.8
Name: Close, dtype: float64

In [None]:
### Sliding Window 방식으로 데이터 생성

def create_dataset(ftrs, target, window):
  x_data = []
  y_data = []

  for i in range(len(ftrs) - window - 1):
    data = ftrs.iloc[i:(i + window)].values.flatten()  # 2D 배열로 변환
    x_data.append(data)
    y_data.append(target.iloc[i + window])

  return np.array(x_data), np.array(y_data)

In [None]:
window = 20
x_train, y_train = create_dataset(df_ftrs, df_target, window)
print(x_train.shape, y_train.shape)

(9387, 120) (9387,)


In [None]:
# 모델 객체 생성
model = RandomForestRegressor()

# Train the model
model.fit(x_train, y_train)

In [None]:
# 가상의 예측 데이터

window = 20
X_prediction = df_ftrs.tail(20).copy() # 일단 마지막 20시간의 데이터 선택

# 예측 과정
y_pred = []

for i in range(73):
  x_pred = X_prediction.iloc[-window:].values.flatten().reshape(1, -1) # 지난 20시간의 데이터
  pred = model.predict(x_pred)[0]  # 모델 예측

  y_pred.append(pred)  # 예측 결과 저장

  ### 데이터 업데이트
  open = pred_ftrs.iloc[i,0]
  high = pred_ftrs.iloc[i,1]
  low = pred_ftrs.iloc[i,2]
  volume = pred_ftrs.iloc[i,3]
  new_feature1 = pred * 0.1
  new_feature2 = pred * 0.2

  ### 업데이트된 값을 새로운 행으로 추가
  # 이후 제일 마지막 20시간의 값을 가져와서 예측(for문 맨 윗줄에서 마지막 20시간의 데이터를 가져옴)
  new_data = pd.DataFrame([[open, high, low, volume, new_feature1, new_feature2]],
                          columns=['Open', 'High', 'Low', 'Volume', 'feature1', 'feature2'])
  X_prediction = pd.concat([X_prediction, new_data], ignore_index=True)

In [None]:
X_prediction # 93개인 이유: 73시간 + 제일 앞에 20시간 붙어서..

Unnamed: 0,Open,High,Low,Volume,feature1,feature2
0,41805.100000,41868.300000,41780.000000,2413.204000,23.6000,88.3000
1,41828.700000,41880.500000,41721.000000,3788.355000,-80.5000,159.5000
2,41748.100000,41803.600000,41672.500000,4141.470000,-53.3000,131.1000
3,41694.800000,41723.700000,41594.000000,7069.355000,4.3000,129.7000
4,41699.000000,41720.200000,41400.000000,13171.464000,-243.6000,320.2000
...,...,...,...,...,...,...
88,18895.347885,24708.479248,35552.799426,192788.315158,3069.3313,6138.6626
89,48156.527340,32686.769288,16317.701568,101801.071434,3385.0620,6770.1240
90,41272.352037,26304.227353,19887.880635,209908.550751,2782.7595,5565.5190
91,22877.668311,25782.956653,36942.473750,10835.989931,2817.5998,5635.1996


In [None]:
pd.DataFrame({'pred_close': y_pred}, index = pred_ftrs.index)

Unnamed: 0_level_0,pred_close
Time,Unnamed: 1_level_1
2024-01-28 00:00:00,42141.289
2024-01-28 01:00:00,37744.105
2024-01-28 02:00:00,31929.293
2024-01-28 03:00:00,31945.649
2024-01-28 04:00:00,31443.290
...,...
2024-01-30 20:00:00,30693.313
2024-01-30 21:00:00,33850.620
2024-01-30 22:00:00,27827.595
2024-01-30 23:00:00,28175.998
