In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Bidirectional
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory holding the competition CSV files (relative to this notebook).
# NOTE(review): `dir` shadows the Python builtin; it is kept because the
# test-data cell reads the same variable.
dir = '../Big_Data/[DACON]Bit_Trader'

# Read the train inputs/targets and keep only the rows whose sample_id is
# below 300, so the per-sample experiments further down stay small.
train_x_df = pd.read_csv(dir + '/train_x_df.csv').query('sample_id < 300')
train_y_df = pd.read_csv(dir + '/train_y_df.csv').query('sample_id < 300')

In [3]:
# Read the full test input table from the same data directory (no subsetting).
test_x_df = pd.read_csv(dir + '/test_x_df.csv')

In [4]:
def df2d_to_array3d(df_2d):
    """Reshape a long-format table into a (sample, time, feature) array.

    The first two columns are skipped and every remaining column is treated
    as a feature. The number of samples and of time steps is the number of
    distinct values in the ``sample_id`` and ``time`` columns.

    The reshape is purely positional: rows must already be grouped by sample
    and ordered by time within each sample, and every sample must contain
    the same number of rows.

    Returns:
        numpy array of shape [n_samples, n_times, n_features].
    """
    features = df_2d.iloc[:, 2:]
    n_samples = df_2d['sample_id'].nunique()
    n_times = df_2d['time'].nunique()
    n_features = features.shape[1]
    return features.values.reshape([n_samples, n_times, n_features])

In [5]:
# Convert the long-format train tables to 3-D arrays indexed as
# [sample, time step, feature] (see df2d_to_array3d).
train_x_array = df2d_to_array3d(train_x_df)  # input part of each sample
train_y_array = df2d_to_array3d(train_y_df)  # target part of each sample

# Sanity-check the resulting shapes.
print('train_x shape : ', train_x_array.shape)
print('train_y shape : ', train_y_array.shape)

train_x shape :  (300, 1380, 10)
train_y shape :  (300, 120, 10)


In [6]:
# Same conversion for the test inputs: [sample, time step, feature].
test_x_array = df2d_to_array3d(test_x_df)  # input part of each test sample

# Sanity-check the resulting shape.
print('test_x shape : ', test_x_array.shape)

test_x shape :  (529, 1380, 10)


In [7]:
# seq_len = 120
# model = Sequential()
# model.add(LSTM(50, activation='tanh', return_sequences= True, input_shape = [seq_len, 1]))
# model.add(LSTM(50, activation='tanh')) 
# model.add(Dense(1)) 

# model.compile(optimizer = 'adam', loss = 'mse', metrics=['mse'])

# model.summary()

In [8]:
def build_model(seq_len=120, units=50):
    """Build and compile a stacked-LSTM regressor for a univariate series.

    Architecture: LSTM(units, return_sequences=True) -> LSTM(units) -> Dense(1),
    taking inputs of shape (seq_len, 1) and producing a single value.

    Args:
        seq_len: Number of time steps in each input window. Defaults to 120,
            the value that was previously hard-coded.
        units: Hidden size of each LSTM layer. Defaults to 50.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, MSE loss).
        'mse' is also registered as a metric so that callbacks and history
        lookups keyed on 'mse' work.
    """
    model = Sequential()
    model.add(LSTM(units, activation='tanh', return_sequences=True, input_shape=[seq_len, 1]))
    model.add(LSTM(units, activation='tanh'))
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mse', metrics=['mse'])

    return model

In [9]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler

# Schedule function for LearningRateScheduler: keep the learning rate
# unchanged for the first five epochs (epoch index 0-4), then decay it by a
# factor of exp(-0.1) on every subsequent epoch.
def scheduler(epoch, learning_rate):
    """Return the learning rate to use for ``epoch``.

    Args:
        epoch: Zero-based epoch index supplied by the callback.
        learning_rate: Learning rate currently set on the optimizer.

    Returns:
        The new learning rate as a plain Python float. A float (rather than
        a TensorFlow tensor) is returned because the LearningRateScheduler
        contract is a float output, and newer Keras releases reject tensors.
    """
    if epoch < 5:
        return float(learning_rate)
    return float(learning_rate * np.exp(-0.1))

# print(round(model.optimizer.lr.numpy(), 5))

# Callback that asks `scheduler` for the learning rate at the start of each epoch.
lr = LearningRateScheduler(scheduler)
# Stop training once the monitored 'mse' metric (the metric registered in
# build_model's compile call) has not decreased for 10 consecutive epochs.
es = EarlyStopping(monitor='mse', patience=10, mode='min')

In [10]:
# Per-sample rolling forecast on the test set: for every test sample, fit a
# fresh LSTM on sliding windows of one feature series, then feed the model's
# own predictions back in to forecast the next `horizon` steps.
seq_len = 120       # input window length; must match build_model's input shape
horizon = 120       # number of future steps forecast for each sample
feature_idx = 1     # feature column used as the univariate series
                    # NOTE(review): presumably the open price — confirm against the CSV columns
epoch = 30
batch_size = 120

test_pred_array = np.zeros([len(test_x_array), horizon, 1])
print(test_pred_array.shape)

# Loop over every test sample (one model is trained per sample).
for idx in tqdm(range(test_x_array.shape[0])):
    series = test_x_array[idx, :, feature_idx]

    # Each window row holds seq_len inputs followed by the value to predict.
    # The range bound is `len - sequence_length + 1` so that the final
    # observation is also used as a training target (the previous bound
    # silently dropped the last window).
    sequence_length = seq_len + 1
    windows = np.array([series[start : start + sequence_length]
                        for start in range(len(series) - sequence_length + 1)])
    x_train = windows[:, :-1, np.newaxis]   # (n_windows, seq_len, 1)
    y_train = windows[:, -1]                # (n_windows,)

    # Release the previous sample's model/graph before building a new one,
    # otherwise Keras state accumulates across the loop.
    tf.keras.backend.clear_session()
    model = build_model()
    history = model.fit(x_train, y_train, epochs=epoch, batch_size=batch_size, verbose=0, callbacks=[lr, es])
    print('sample_id : ', idx, '번')
    print('loss : ', history.history['loss'][-1])
    print('mse : ', history.history['mse'][-1])
    print('lr : ', round(model.optimizer.lr.numpy(), 5))

    # Seed the forecast with the most recent seq_len observations so that the
    # first prediction is for the step right after the end of the series.
    # (Seeding with windows[-1, :-1] would "forecast" already-observed points
    # and shift the whole horizon.)
    window = series[-seq_len:].astype(float)
    for m in range(horizon):
        pred = model.predict(window.reshape(1, seq_len, 1), verbose=0)
        test_pred_array[idx, m, 0] = pred[0, 0]

        # Slide the window forward: drop the oldest value, append the prediction.
        window = np.append(window[1:], test_pred_array[idx, m, 0])

  0%|          | 0/529 [00:00<?, ?it/s]

(529, 120, 1)


  0%|          | 0/529 [00:06<?, ?it/s]


UnknownError:  [_Derived_]  Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[sequential/lstm/StatefulPartitionedCall]] [Op:__inference_distributed_function_5216]

Function call stack:
distributed_function -> distributed_function -> distributed_function


In [None]:
# Confirm the forecast array's shape after the prediction loop.
print(test_pred_array.shape)

In [None]:
# Drop the trailing singleton axis of the forecasts:
# (n_samples, horizon, 1) -> (n_samples, horizon).
# A single slice replaces the per-row Python loop and no longer hard-codes the
# horizon length; .copy() keeps pred_array_2d independent of test_pred_array,
# just as filling a freshly allocated array did.
pred_array_2d = test_pred_array[:, :, 0].copy()

In [None]:
# For each sample, print the largest predicted value over the forecast
# horizon (np.argmax gives the step at which that maximum occurs).
for idx, sell_time in enumerate(np.argmax(pred_array_2d, axis=1)):
        print(pred_array_2d[idx, sell_time])

In [None]:
# Display the shape of the 2-D prediction array.
pred_array_2d.shape

In [None]:
# 2) Turn the predictions into the submission table.
def array_to_submission(pred_array, threshold=1.08, fee_factor=0.99900025):
    """Build the submission table from per-sample price forecasts.

    For every sample (row) the sell time is the step with the highest
    predicted value. The sample is bought (buy_quantity = 1) only when that
    highest value, multiplied by ``fee_factor``, exceeds ``threshold``;
    otherwise buy_quantity is 0.

    Args:
        pred_array: 2-D array-like of shape [n_samples, n_steps].
        threshold: Minimum fee-adjusted predicted value required to buy.
            Defaults to 1.08, the value that was previously hard-coded.
        fee_factor: Multiplier applied to the predicted value before the
            comparison. Defaults to the previously hard-coded 0.99900025.

    Returns:
        DataFrame with integer columns ['sample_id', 'buy_quantity',
        'sell_time'], where sample_id is the row position in ``pred_array``.
    """
    pred_array = np.asarray(pred_array)
    n_samples = pred_array.shape[0]

    # Step with the highest predicted value for each sample, and that value.
    sell_time = np.argmax(pred_array, axis=1)
    sell_price = pred_array[np.arange(n_samples), sell_time]

    # 1 where the fee-adjusted best prediction clears the threshold, else 0.
    buy_quantity = (sell_price * fee_factor > threshold).astype(np.int64)

    return pd.DataFrame({
        'sample_id': np.arange(n_samples),
        'buy_quantity': buy_quantity,
        'sell_time': sell_time,
    })


In [None]:
# Scratch check: a sample value before and after multiplying by 0.99900025,
# the same factor array_to_submission applies before its threshold comparison.
print((0.998392641544342/1))
print((0.998392641544342/1)*0.99900025)

In [None]:
# Build the submission table from the 2-D test predictions.
final_submission = array_to_submission(pred_array_2d)

In [None]:
# How many samples were assigned each buy_quantity value (0 = skip, 1 = buy).
final_submission.buy_quantity.value_counts()

In [None]:
# Save final_submission as a CSV file.
# A dedicated variable is used for the output folder instead of rebinding
# `dir`: the data-loading cells read `dir`, so overwriting it here would make
# them look in the wrong directory when re-run.
# NOTE(review): this is an absolute Google Drive path — confirm it matches the
# environment the notebook is run in.
submission_dir = '/content/drive/MyDrive/데이콘/비트트레이더_경진대회/answer/'
final_submission.to_csv(submission_dir + 'submission_lstm(5050).csv', index = False)

In [None]:
# Load a previously saved submission file and display it.
answer = pd.read_csv('/content/drive/MyDrive/데이콘/비트트레이더_경진대회/answer/submission.csv')
answer

# NOTE(review): author's note, translated — "the code after this uses the
# Train data: X". The "X" presumably marks the following cells as not needed;
# confirm before relying on them.

In [None]:
# Per-sample rolling forecast on the train set, mirroring the test-set cell:
# fit a fresh LSTM on sliding windows of one feature series, then feed the
# model's own predictions back in to forecast the next `horizon` steps.
seq_len = 120       # input window length; must match build_model's input shape
horizon = 120       # number of future steps forecast for each sample
feature_idx = 1     # feature column used as the univariate series
                    # NOTE(review): presumably the open price — confirm against the CSV columns
epoch = 30
batch_size = 120

train_pred_array = np.zeros([len(train_x_array), horizon, 1])
print(train_pred_array.shape)

for idx in tqdm(range(train_x_array.shape[0])):
    series = train_x_array[idx, :, feature_idx]

    # Each window row holds seq_len inputs followed by the value to predict.
    # The range bound is `len - sequence_length + 1` so that the final
    # observation is also used as a training target.
    sequence_length = seq_len + 1
    windows = np.array([series[start : start + sequence_length]
                        for start in range(len(series) - sequence_length + 1)])
    x_train = windows[:, :-1, np.newaxis]   # (n_windows, seq_len, 1)
    y_train = windows[:, -1]                # (n_windows,)

    # Build a new model for every sample. Previously this cell called
    # model.fit on whatever `model` was left over from an earlier cell, so it
    # failed on a fresh kernel and carried weights from one sample to the next.
    tf.keras.backend.clear_session()
    model = build_model()
    history = model.fit(x_train, y_train, epochs=epoch, batch_size=batch_size, verbose=0, callbacks=[lr, es])
    print('sample_id : ', idx, '번')
    print('loss : ', history.history['loss'][-1])
    print('mse : ', history.history['mse'][-1])

    # Seed the forecast with the most recent seq_len observations so that the
    # first prediction is for the step right after the end of the series.
    window = series[-seq_len:].astype(float)
    for m in range(horizon):
        pred = model.predict(window.reshape(1, seq_len, 1), verbose=0)
        train_pred_array[idx, m, 0] = pred[0, 0]

        # Slide the window forward: drop the oldest value, append the prediction.
        window = np.append(window[1:], train_pred_array[idx, m, 0])

In [None]:
# Display the train-set forecasts.
# NOTE(review): this renders the whole array; a slice or .shape may be enough.
train_pred_array

---
train 데이터로 계산

In [None]:
def df2d_to_answer(df_2d):
    """Extract the ``open`` column of a long-format target table as a 2-D array.

    Args:
        df_2d: DataFrame with ``sample_id``, ``time`` and ``open`` columns,
            with rows grouped by sample and ordered by time within a sample.

    Returns:
        A tuple ``(array_2d, sample_index)``:
        * ``array_2d`` has shape [n_samples, n_times] and holds the ``open``
          values, one row per sample.
        * ``sample_index`` is the list of sample ids in the same order as the
          rows of ``array_2d``.
    """
    # unique() keeps first-appearance order, which is the order the reshape
    # below lays the samples out in. value_counts().index is ordered by
    # frequency instead, so it is not guaranteed to line up with the rows.
    sample_ids = df_2d['sample_id'].unique()
    time_size = df_2d['time'].nunique()
    array_2d = df_2d['open'].values.reshape([len(sample_ids), time_size])
    return array_2d, sample_ids.tolist()

def COIN(y_df, submission, df2d_to_answer = df2d_to_answer):
    """Simulate sequential trading on ``y_df`` following ``submission``.

    Starting from 10000, each sample is processed in turn: the fraction
    ``buy_quantity`` of the current balance is invested at the sample's first
    ``open`` value and sold at the ``open`` value at step ``sell_time``; the
    proceeds are multiplied by 0.9995 twice (presumably a buy fee and a sell
    fee — confirm) and added back to the uninvested remainder.

    Args:
        y_df: Long-format target table accepted by ``df2d_to_answer``.
        submission: DataFrame whose first column is the sample id and which
            has 'buy_quantity' and 'sell_time' columns.
        df2d_to_answer: Function returning ``(open_array, sample_ids)`` where
            row i of ``open_array`` belongs to ``sample_ids[i]``.

    Returns:
        ``(final_balance, balance_history)`` where ``balance_history`` lists
        the balance after each sample.
    """
    # Open values per sample as a 2-D array, plus the sample ids of its rows.
    y_array, index = df2d_to_answer(y_df)

    # Select the submission rows for those sample ids, in the same order.
    # The index holds sample ids (labels), so label-based .loc is used; the
    # previous positional .iloc only worked when ids equalled row positions.
    submission = submission.set_index(submission.columns[0])
    submission = submission.loc[index, :]

    # Initial balance is 10000 (dollars).
    total_money = 10000
    total_money_list = []

    # Row i of y_array corresponds to the i-th selected submission row, so the
    # position is tracked directly instead of being derived from the sample id
    # (which required contiguous, ascending ids).
    for row_pos, row_idx in enumerate(submission.index):
        sell_time = submission.loc[row_idx, 'sell_time']
        buy_price = y_array[row_pos, 0]
        sell_price = y_array[row_pos, sell_time]
        buy_quantity = submission.loc[row_idx, 'buy_quantity'] * total_money
        residual = total_money - buy_quantity
        ratio = sell_price / buy_price
        total_money = buy_quantity * ratio * 0.9995 * 0.9995 + residual
        total_money_list.append(total_money)

    return total_money, total_money_list

In [None]:
# COIN needs a submission DataFrame (it reads `.columns`, 'sell_time' and
# 'buy_quantity'), not the raw forecast array. Drop the trailing singleton
# axis of the train forecasts and convert them with array_to_submission first.
train_submission = array_to_submission(train_pred_array[:, :, 0])
total_momey, total_momey_list = COIN(train_y_df,
                                     train_submission)
print(total_momey)