In [1]:
import torch
from torch import nn
import torch.optim as optim

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgbm
%matplotlib inline



In [3]:
import warnings
# Install a global "ignore" filter: no Python warning is displayed for the
# rest of the session (this also hides pandas/LightGBM deprecation notices).
warnings.filterwarnings("ignore")

In [4]:
# Load the training set from the competition input directory into a DataFrame.
df_train = pd.read_csv('../input/optiver-trading-at-the-close/train.csv')

In [5]:
def memory_decreasing(df_train):
    """Downcast the numeric columns of ``df_train`` to smaller dtypes.

    The frame is modified in place and also returned, so both
    ``memory_decreasing(df)`` and ``df = memory_decreasing(df)`` work.

    Rules
    -----
    * Signed and unsigned NumPy integer columns are moved to the narrowest
      integer type whose (strict) signed range contains the column's min and
      max; an unsigned type of that width is used when the minimum is >= 0.
      Integer columns are never converted to floats, so calling the function
      again on an already-downcast frame leaves integer columns as integers.
    * NumPy float columns become ``float32`` when their min and max lie
      strictly inside the ``float32`` range, otherwise ``float64``.
      Note that ``float32`` keeps only ~7 significant decimal digits.
    * Every other dtype (object, bool, datetime, categorical, pandas
      extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # (signed, unsigned) candidates, narrowest first.
    int_candidates = (
        (np.int8, np.uint8),
        (np.int16, np.uint16),
        (np.int32, np.uint32),
        (np.int64, np.uint64),
    )
    float32_info = np.finfo(np.float32)

    for col in list(df_train):
        col_type = df_train[col].dtype

        # Only plain NumPy dtypes are handled; extension dtypes are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            if len(df_train[col]) == 0:
                continue  # nothing to measure, keep the dtype as is
            # Python ints avoid mixed signed/unsigned NumPy comparisons.
            c_min = int(df_train[col].min())
            c_max = int(df_train[col].max())
            for signed, unsigned in int_candidates:
                bounds = np.iinfo(signed)
                if c_min > bounds.min and c_max < bounds.max:
                    target = unsigned if c_min >= 0 else signed
                    df_train[col] = df_train[col].astype(target)
                    break
            # If no candidate fits (values at the int64/uint64 extremes) the
            # column keeps its current dtype.
        elif col_type.kind == 'f':
            c_min = df_train[col].min()
            c_max = df_train[col].max()
            # NaN or infinite extrema fail both comparisons and fall back to
            # float64.
            if c_min > float32_info.min and c_max < float32_info.max:
                df_train[col] = df_train[col].astype(np.float32)
            else:
                df_train[col] = df_train[col].astype(np.float64)

    return df_train

In [6]:
# Shrink numeric dtypes; memory_decreasing() modifies the frame in place and
# returns that same frame.
df_train = memory_decreasing(df_train)

df_train

In [7]:
def train(df , x):
    """Build the training feature matrix and target from the raw training frame.

    Steps (column order of the result is significant because the inference
    code feeds the same columns, in the same order, to the fitted models):

    1. Drop ``row_id``, ``time_id``, ``far_price``, ``near_price`` and every
       row that still contains a NaN.
    2. Add ``pressure``, ``bid_ask_size_gap``, ``power0`` and ``power1``.
    3. For seven base columns create lags ``<col>_1 .. <col>_x`` within each
       (``stock_id``, ``date_id``) group and rename the current value to
       ``<col>_0``; rows without a full lag history are dropped.
    4. Negate all ``ask_size_*`` / ``ask_price_*`` columns.
    5. Replace ``reference_price_0 .. reference_price_x`` by the ``x`` first
       differences ``reference_price_i - reference_price_{i+1}``.
    6. Add pairwise sums of ``ask_size``, ``bid_size`` and ``pressure`` per lag.
    7. Add ``sign_i = reference_price_i * matched_size_i`` for ``i < x`` and
       drop the consumed ``matched_size_i`` (only ``matched_size_x`` remains).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw training data. It is not modified.
    x : int
        Number of lags. The ``sign_*`` features previously assumed ``x == 6``;
        they now follow ``x``, so other lag counts work as well.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Feature frame (without ``target``, ``date_id``, ``wap``) and the
        matching ``target`` series.
    """
    df = df.drop(['row_id','time_id','far_price','near_price'] , axis = 1)
    df = df.dropna(axis = 0)

    df['pressure'] = df['imbalance_size']*df['imbalance_buy_sell_flag']
    df['bid_ask_size_gap'] = (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size'] + 0.000001)
    df['power0'] = (df['matched_size'] + df['pressure']) / (df['matched_size'] + abs(df['pressure']) + 0.000001)
    # NOTE(review): unlike power0, this denominator uses the signed pressure
    # (no abs) -- confirm that is intended before changing it, the fitted
    # models depend on the current definition.
    df['power1'] = (df['matched_size'] + df['pressure'] + df['bid_size'] - df['ask_size']) / (df['matched_size'] + df['pressure'] + df['bid_size'] + df['ask_size'] + 0.000001)

    df = df.drop(['imbalance_size','imbalance_buy_sell_flag'] , axis = 1)

    # time series: lagged copies of each base column within a stock/day
    column = ['reference_price' ,'ask_size','bid_size','pressure','ask_price','bid_price','matched_size']
    for c in column:
        grouped = df[["stock_id", "date_id", c]].groupby(["stock_id","date_id"])[c]
        for s in range(1, x+1):
            df[c + '_' + str(s)] = grouped.shift(s)
        # pop + assign moves the current value to the end as <col>_0
        df[c + '_' + str(0)] = df.pop(c)

    df = df.dropna(axis = 0)

    # feature setting: flip the sign of the ask side; each column is removed
    # and re-appended so the ask columns end up at the end in lag order
    for c in ['ask_size','ask_price']:
        for i in range(0, x+1):
            name = c + '_' + str(i)
            df[name] = -df.pop(name)

    # reference price levels -> first differences between consecutive lags
    ref_names = ['reference_price_' + str(i) for i in range(0, x+1)]
    ref_diffs = [df[ref_names[i]] - df[ref_names[i+1]] for i in range(0, x)]
    df = df.drop(ref_names , axis = 1)
    for i in range(0, x):
        df[ref_names[i]] = ref_diffs[i]

    df = df.dropna(axis = 0)

    # feature generating: pairwise sums for every lag
    split_column1 = ['ask_size','bid_size','pressure']
    for i in range(0, len(split_column1)):
        for j in range(i+1, len(split_column1)):
            first, second = split_column1[i], split_column1[j]
            for d in range(0, x+1):
                df[first + '_' + second + '_' + str(d)] = df[first + '_' + str(d)] + df[second + '_' + str(d)]

    # one sign feature per price difference (there are x of them, not a fixed 6)
    for i in range(0, x):
        df['sign_' + str(i)] = df['reference_price_' + str(i)]*df.pop('matched_size_' + str(i))

    df = df.drop(['date_id' , 'wap'] , axis = 1)

    df = df.dropna(axis = 0)

    # From here on the code differs from test_fun(): split off the target.
    val_y = df.pop('target')

    return df,val_y

In [8]:
# Build the training features and target; the second argument (x = 6) is the
# number of lags generated per base column.
df_x, val_y = train(df_train , 6)

In [9]:
#df_x.columns[0:]

In [10]:
def _columns_without_lag(skipped_lag):
    """Return the feature column names with every lag-``skipped_lag`` column left out.

    The order mirrors the hand-written lists this replaces: the five
    static features, then the lag families in a fixed sequence.
    """
    def keep(lags):
        return [lag for lag in lags if lag != skipped_lag]

    names = ['stock_id', 'seconds_in_bucket', 'bid_ask_size_gap', 'power0', 'power1']

    # These families list lags 1..6 first and the current value (0) last.
    for stem in ('bid_size', 'pressure', 'bid_price'):
        names += [stem + '_' + str(lag) for lag in keep([1, 2, 3, 4, 5, 6, 0])]

    names.append('matched_size_6')

    # These families are listed in ascending lag order.
    for stem in ('ask_size', 'ask_price'):
        names += [stem + '_' + str(lag) for lag in keep(range(7))]

    names += ['reference_price_' + str(lag) for lag in keep(range(6))]

    for stem in ('ask_size_bid_size', 'ask_size_pressure', 'bid_size_pressure'):
        names += [stem + '_' + str(lag) for lag in keep(range(7))]

    names += ['sign_' + str(lag) for lag in keep(range(6))]
    return names


# Feature subset that omits every lag-5 column.
df_x5_column = _columns_without_lag(5)

# Feature subset that omits every lag-4 column.
df_x4_column = _columns_without_lag(4)

In [11]:
# Column subsets of the full feature frame for the two reduced models.
df_x5, df_x4 = df_x[df_x5_column], df_x[df_x4_column]
# Both reduced models are fitted against the same target series.
val_y5 = val_y4 = val_y

In [12]:
# Number of feature columns in the full feature frame.
df_x.shape[1]

74

In [13]:
# Model fitted on every feature column of df_x.
# A tiny alternative configuration (learning_rate=0.5, max_depth=1,
# n_estimators=2, num_leaves=2, no GPU) was kept here commented out as "model6".
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq (bagging_freq) is > 0; it is not set here, so `subsample`
# may be a no-op -- confirm before relying on it.
model_params = dict(
    objective='mae',
    learning_rate=0.01,
    n_estimators=500,
    num_leaves=256,
    max_depth=-1,
    subsample=0.75,
    random_state=1234,
    device="gpu",
)
model = lgbm.LGBMRegressor(**model_params)
model.fit(df_x, val_y)

In [14]:
# Model fitted on the df_x5 column subset.
# A tiny alternative configuration (learning_rate=0.5, max_depth=1,
# n_estimators=2, num_leaves=2, no GPU) was kept here commented out as "model7".
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq (bagging_freq) is > 0; it is not set here, so `subsample`
# may be a no-op -- confirm before relying on it.
model5_params = dict(
    objective='mae',
    learning_rate=0.01,
    n_estimators=500,
    num_leaves=256,
    max_depth=-1,
    subsample=0.75,
    random_state=1234,
    device="gpu",
)
model5 = lgbm.LGBMRegressor(**model5_params)
model5.fit(df_x5, val_y5)

In [15]:
# Model fitted on the df_x4 column subset.
# A tiny alternative configuration (learning_rate=0.5, max_depth=1,
# n_estimators=2, num_leaves=2, no GPU) was kept here commented out as "model8".
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq (bagging_freq) is > 0; it is not set here, so `subsample`
# may be a no-op -- confirm before relying on it.
model4_params = dict(
    objective='mae',
    learning_rate=0.01,
    n_estimators=500,
    num_leaves=256,
    max_depth=-1,
    subsample=0.75,
    random_state=1234,
    device="gpu",
)
model4 = lgbm.LGBMRegressor(**model4_params)
model4.fit(df_x4, val_y4)

df_test: feature engineering applied to test batches at inference time

In [16]:
def test_fun(df , x):

    df = df.fillna(0.000000001)
    
    df['pressure'] = df['imbalance_size']*df['imbalance_buy_sell_flag']
    df['bid_ask_size_gap'] = (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size'] + 0.000001)
    df['power0'] = (df['matched_size'] + df['pressure']) / (df['matched_size'] + abs(df['pressure']) + 0.000001)
    df['power1'] = (df['matched_size'] + df['pressure'] + df['bid_size'] - df['ask_size']) / (df['matched_size'] + df['pressure'] + df['bid_size'] + df['ask_size'] + 0.000001)

    df = df.drop(['imbalance_size','imbalance_buy_sell_flag'] , axis = 1)
    
    #tiem series
    column = ['reference_price' ,'ask_size','bid_size','pressure','ask_price','bid_price','matched_size']
    
    shift_list = []
    for i in range(1, x+1):
        shift_list.append(i)
    
    for c in column:
        gv = df[["stock_id", "date_id", c]].groupby(["stock_id","date_id"])
        for s in shift_list:
            df[c + '_' + str(s)] = gv.shift(s)
        df[c + '_' + str(0)] = df[c]
        df = df.drop([c] , axis = 1)
    
    # feaeture setting
    
    column1 = ['ask_size','ask_price']
    for c in column1:
        for i in range(0,x+1):
            df['neg' + '_' + c + '_' + str(i)] = -df[c + '_' + str(i)]
            df = df.drop([c + '_' + str(i)] , axis = 1)
    for c in column1:
        for i in range(0,x+1):
            df[c + '_' + str(i)] = df['neg' + '_' + c + '_' + str(i)]
            df = df.drop(['neg' + '_' + c + '_' + str(i)], axis = 1)
    
    for i in range(0,x):
        j = i+1
        df['reference_price_deri_' + str(i)] = df['reference_price_' + str(i)] - df['reference_price_' + str(j)]
    for i in range(0, x+1):
        df = df.drop(['reference_price_' + str(i)] , axis = 1)
    
    for i in range(0,x):
        df['reference_price_' + str(i)] = df['reference_price_deri_' + str(i)]
    for i in range(0,x):
        df = df.drop(['reference_price_deri_' + str(i)] , axis = 1)

    #feature generating
    
    split_column1 = ['ask_size','bid_size','pressure']
    
    day = []
    for i in range(0, x+1):
        day.append(i)
    
    for i in range(0, len(split_column1)):
        for j in range(0, len(split_column1)):
            for d in day:
                if i < j and i != j:
                    df[split_column1[i] + '_' + split_column1[j] + '_' + str(d)] = df[split_column1[i] + '_' + str(d)] + df[split_column1[j] + '_' + str(d)]
                    
    
    for i in range(0,6):
        df['sign_' + str(i)] = df['reference_price_' + str(i)]*df['matched_size_' + str(i)]
        df = df.drop(['matched_size_' + str(i)] , axis = 1)
    
    df = df.drop(['date_id' , 'wap'] , axis = 1)
    

    df = df.fillna(0.000000001)    
    

    return df

제출 (Submission)

In [17]:
# Raw columns passed to test_fun() for each batch, in the order used there.
test_column = [
    'stock_id', 'date_id', 'seconds_in_bucket',
    'imbalance_size', 'imbalance_buy_sell_flag',
    'reference_price', 'matched_size',
    'bid_price', 'bid_size',
    'ask_price', 'ask_size',
    'wap',
]

# Seed the inference history with the last 4000 training rows.
df_pro = df_train.loc[:, test_column].tail(4000)

In [18]:
# The inference API module is imported here, right before its first use.
import optiver2023
# NOTE(review): presumably resets the API's "already called" guard so that
# make_env() can be invoked again in the same kernel -- confirm against the
# optiver2023 module.
optiver2023.make_env.func_dict['__called__'] = False
env = optiver2023.make_env()
# Iterator consumed by the prediction loop, where each item is unpacked as
# (test, revealed_targets, sample_prediction).
iter_test = env.iter_test()

In [19]:
# Prediction loop: one iteration per batch delivered by the API.
# cnt counts the batches that were actually predicted with the models.
cnt = 0
for (test, revealed_targets, sample_prediction) in iter_test:

    # Batches flagged as not scored get a constant 0 prediction and are not
    # added to the lag history.
    if not test.iloc[0]['currently_scored']:
        sample_prediction['target'] = 0
        env.predict(sample_prediction)
        continue

    test_pro = test[test_column]
    df_pro = pd.concat([df_pro , test_pro] , axis = 0)
    # test_fun() builds its lags per (stock_id, date_id), so rows from other
    # dates can never influence the current batch. Dropping them keeps the
    # history bounded instead of recomputing features over every row seen so
    # far on each iteration, without changing the features of this batch.
    df_pro = df_pro[df_pro['date_id'].isin(test_pro['date_id'].unique())]

    # Features are computed on history + batch; only the batch rows (the
    # last len(test) rows) are scored.
    test_pro = test_fun(df_pro , 6).iloc[-len(test):]

    pred6 = model.predict(test_pro)
    pred5 = model5.predict(test_pro[df_x5.columns])
    pred4 = model4.predict(test_pro[df_x4.columns])

    # Simple average of the three models.
    pred = (pred5 + pred6 + pred4)/3

    sample_prediction['target'] = pred
    env.predict(sample_prediction)
    cnt += 1

cnt

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


0

In [20]:
# Show the first lines of the generated submission file.
!head submission.csv

row_id,target
478_0_0,0
478_0_1,0
478_0_2,0
478_0_3,0
478_0_4,0
478_0_5,0
478_0_6,0
478_0_7,0
478_0_8,0
