# Import required libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras.layers import BatchNormalization
from keras.models import Sequential, Model
from keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Reshape
from keras import backend as K
from keras import regularizers 
from tensorflow.keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.regularizers import l2
from tensorflow.keras.losses import Loss

# Load Dataset

In [2]:
stock_price_df = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv")

# A glance at the data

In [3]:
print('(rows, columns) =', stock_price_df.shape)
stock_price_df.tail()

(rows, columns) = (2332531, 12)


Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
2332526,20211203_9990,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816
2332527,20211203_9991,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478
2332528,20211203_9993,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302
2332529,20211203_9994,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098
2332530,20211203_9997,2021-12-03,9997,690.0,711.0,686.0,696.0,381100,1.0,,False,0.018414


In [4]:
stock_price_df['ExpectedDividend'] = stock_price_df['ExpectedDividend'].fillna(0)
stock_price_df['SupervisionFlag'] = stock_price_df['SupervisionFlag'].map({True: 1, False: 0})
stock_price_df['Date'] = pd.to_datetime(stock_price_df['Date'])
stock_price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2332531 entries, 0 to 2332530
Data columns (total 12 columns):
 #   Column            Dtype         
---  ------            -----         
 0   RowId             object        
 1   Date              datetime64[ns]
 2   SecuritiesCode    int64         
 3   Open              float64       
 4   High              float64       
 5   Low               float64       
 6   Close             float64       
 7   Volume            int64         
 8   AdjustmentFactor  float64       
 9   ExpectedDividend  float64       
 10  SupervisionFlag   int64         
 11  Target            float64       
dtypes: datetime64[ns](1), float64(7), int64(3), object(1)
memory usage: 213.5+ MB


# Import stock list


In [5]:
stock_list = pd.read_csv("../input/jpx-tokyo-stock-exchange-prediction/stock_list.csv")

In [6]:
stock_list = stock_list[['SecuritiesCode','NewMarketSegment','33SectorCode','17SectorCode','Universe0','Section/Products','NewIndexSeriesSize']]
stock_list = stock_list.replace(np.nan,'-')
stock_list['Universe0'] = np.where(stock_list['Universe0'], 1, 0)
stock_list = stock_list.drop_duplicates()
stock_list

Unnamed: 0,SecuritiesCode,NewMarketSegment,33SectorCode,17SectorCode,Universe0,Section/Products,NewIndexSeriesSize
0,1301,Prime Market,50,1,1,First Section (Domestic),TOPIX Small 2
1,1305,-,-,-,0,ETFs/ ETNs,-
2,1306,-,-,-,0,ETFs/ ETNs,-
3,1308,-,-,-,0,ETFs/ ETNs,-
4,1309,-,-,-,0,ETFs/ ETNs,-
...,...,...,...,...,...,...,...
4412,9994,Standard Market,6100,14,1,First Section (Domestic),TOPIX Small 2
4413,9995,Prime Market,6050,13,0,First Section (Domestic),TOPIX Small 2
4414,9996,Standard Market,6050,13,0,JASDAQ(Standard / Domestic),-
4415,9997,Prime Market,6100,14,1,First Section (Domestic),TOPIX Small 1


# Some Feature Engineering

In [7]:
# 计算特征
def FE(stock_price_df):
    stock_price_df['BOP'] = (stock_price_df['Open']-stock_price_df['Close'])/(stock_price_df['High']-stock_price_df['Low'])
    stock_price_df['wp'] = (stock_price_df['Open']+stock_price_df['High']+stock_price_df['Low'])/3
    stock_price_df['TR'] = stock_price_df['High'] - stock_price_df['Low']
    # stock_price_df['AD'] = ta.AD(High, Low, Close, Volume)
    # stock_price_df['OBV']  = ta.OBV(Close, Volume)
    stock_price_df['OC'] = stock_price_df['Open'] * stock_price_df['Close']
    stock_price_df['HL'] = stock_price_df['High'] * stock_price_df['Low']
    stock_price_df['logC'] = np.log(stock_price_df['Close']+1)
    stock_price_df['OHLCstd'] = stock_price_df[['Open','Close','High','Low']].std(axis=1)
    stock_price_df['OHLCskew'] = stock_price_df[['Open','Close','High','Low']].skew(axis=1)
    stock_price_df['OHLCkur'] = stock_price_df[['Open','Close','High','Low']].kurtosis(axis=1)
    stock_price_df['Cpos'] = (stock_price_df['Close']-stock_price_df['Low'])/(stock_price_df['High']-stock_price_df['Low']) -0.5
    stock_price_df['bsforce'] = stock_price_df['Cpos'] * stock_price_df['Volume']
    stock_price_df['Opos'] = (stock_price_df['Open']-stock_price_df['Low'])/(stock_price_df['High']-stock_price_df['Low']) -0.5
    stock_price_df['Date'] = pd.to_datetime(stock_price_df['Date'])
    stock_price_df['weekday'] = stock_price_df['Date'].dt.weekday+1
    stock_price_df['Monday'] = np.where(stock_price_df['weekday']==1,1,0)
    stock_price_df['Tuesday'] = np.where(stock_price_df['weekday']==2,1,0)
    stock_price_df['Wednesday'] = np.where(stock_price_df['weekday']==3,1,0)
    stock_price_df['Thursday'] = np.where(stock_price_df['weekday']==4,1,0)
    stock_price_df['Friday'] = np.where(stock_price_df['weekday']==5,1,0)
    return stock_price_df
stock_price_df = FE(stock_price_df)
stock_price_df = pd.merge(stock_price_df,stock_list, on='SecuritiesCode')

In [8]:
subf = ['Open', 'High', 'Low', 'Close',
       'Volume', 'ExpectedDividend',
       'SupervisionFlag', 'BOP', 'wp', 'TR', 'OC', 'HL', 'logC',
       'OHLCstd', 'OHLCskew', 'OHLCkur', 'Cpos', 'bsforce', 'Opos']

# standardize features according to daily stock data (at the same day only)

In [9]:
# 按天分组进行计算
def daily_standardize(df,col):
    avg = df[[col,'Date']].groupby('Date').mean()
    avg.columns = ['avg']
    avg['Date'] = avg.index
    avg = avg.reset_index(drop=True)
    std = df[[col,'Date']].groupby('Date').std()
    std.columns = ['std']
    std['Date'] = std.index
    std = std.reset_index(drop=True)
    df = pd.merge(df, avg, on='Date')
    df = pd.merge(df,std,on='Date')
    df[col] = (df[col] - df['avg'])/df['std']
    df = df.drop(['avg','std'],axis=1)
    df[col] = df[col].fillna(0)
    return df

In [10]:
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()

In [11]:
for i in subf:
    stock_price_df = daily_standardize(stock_price_df,i)

In [12]:
stock_price_df = stock_price_df.fillna(0)
stock_price_df.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,...,Tuesday,Wednesday,Thursday,Friday,NewMarketSegment,33SectorCode,17SectorCode,Universe0,Section/Products,NewIndexSeriesSize
0,20170104_1301,2017-01-04,1301,0.206799,0.198286,0.211367,0.199778,-0.119214,1.0,0.0,...,0,1,0,0,Prime Market,50,1,1,First Section (Domestic),TOPIX Small 2
1,20170104_1332,2017-01-04,1332,-0.501139,-0.504093,-0.501782,-0.504558,0.151123,1.0,0.0,...,0,1,0,0,Prime Market,50,1,1,First Section (Domestic),TOPIX Mid400
2,20170104_1333,2017-01-04,1333,0.342765,0.344951,0.346297,0.351611,-0.095825,1.0,0.0,...,0,1,0,0,Prime Market,50,1,1,First Section (Domestic),TOPIX Mid400
3,20170104_1376,2017-01-04,1376,-0.193255,-0.190134,-0.190129,-0.186942,-0.121178,1.0,0.0,...,0,1,0,0,Standard Market,50,1,1,First Section (Domestic),TOPIX Small 2
4,20170104_1377,2017-01-04,1377,0.381986,0.390078,0.389079,0.390542,-0.107549,1.0,0.0,...,0,1,0,0,Prime Market,50,1,1,First Section (Domestic),TOPIX Small 1


# standardize "Target" as well

In [13]:
stock_price_df = daily_standardize(stock_price_df,'Target')

In [14]:
stock_price_df['Target'].head()

0   -0.028649
1    0.679476
2    0.302609
3    0.601867
4    0.111545
Name: Target, dtype: float64

# Tranform the "SecuritiesCode" of stock as well

In [15]:
investment_ids = list(stock_price_df['SecuritiesCode'].unique())
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
with tf.device("cpu"):
    investment_id_lookup_layer.adapt(stock_price_df['SecuritiesCode'])

2022-07-05 17:23:54.573343: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-07-05 17:23:54.735190: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


## Define the DNN model that separately trains "SecuritiesCode" & features, and then concatenate them together for the final training

## You can adjust number of layers, neurons, and etc.

In [16]:
# 神经网络
# 两个特征
def get_model():
    investment_id_inputs = tf.keras.Input((1, ), dtype=tf.uint16)
    features_inputs = tf.keras.Input((len(subf), ), dtype=tf.float16)
    # 对输入ID做Embedding
    investment_id_x = investment_id_lookup_layer(investment_id_inputs)
    investment_id_x = layers.Embedding(investment_id_size, 4, input_length=1)(investment_id_x)
    investment_id_x = layers.Reshape((-1, ))(investment_id_x)
    # 全连接
    investment_id_x = layers.Dense(16, activation='swish')(investment_id_x)
    investment_id_x = layers.Dense(16, activation='swish')(investment_id_x)
    
    feature_x = layers.Dense(16, activation='swish')(features_inputs)
    feature_x = layers.Dense(16, activation='swish')(feature_x)
    
    x = layers.Concatenate(axis=1)([investment_id_x, feature_x])
    x = layers.Dense(16, activation='swish', kernel_regularizer="l2")(x)
    x = layers.Dense(16, activation='swish', kernel_regularizer="l2")(x)
    output = layers.Dense(1)(x)
    rmse = keras.metrics.RootMeanSquaredError(name="rmse")
    model = tf.keras.Model(inputs=[investment_id_inputs, features_inputs], outputs=[output])
    model.compile(optimizer=tf.optimizers.Adam(0.05), loss='mse', metrics=['mse', "mae", "mape", rmse])
    return model


## Transform the train data so that it can used for DNN model training.

In [17]:
def preprocess(X, y):
    print(X)
    print(y)
    return X, y
def make_dataset(feature, investment_id, y, batch_size=32, mode="train"):
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y))
    ds = ds.map(preprocess)
    if mode == "train":
        # 打乱，256为随机种子
        ds = ds.shuffle(256)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds

## Use GroupKFold to train 5 DNN models (the same "Sector" stock should always be in the same group)

In [18]:
# 交叉验证
# 根据股票代码分组
from sklearn.model_selection import GroupKFold
kfold = GroupKFold(n_splits = 5)

In [19]:
count=0
df_x = stock_price_df[subf]
df_y=stock_price_df['Target']
time_id = stock_price_df['33SectorCode']
dnn_models = []
for train_index, val_index in kfold.split(df_x, df_y,time_id):
    # Split training dataset.
    train_x, train_y = df_x.iloc[train_index], df_y.iloc[train_index]
    train_inv =stock_price_df['SecuritiesCode'].iloc[train_index]
    # Split validation dataset.
    val_x, val_y = df_x.iloc[val_index], df_y.iloc[val_index]
    val_inv =  stock_price_df['SecuritiesCode'].iloc[val_index]
    # Make tensor dataset.
    tf_train = make_dataset(train_x, train_inv, train_y, batch_size=4096, mode="train")
    tf_val = make_dataset(val_x, val_inv, val_y, batch_size=4096, mode="train")
    # Load model
    model = get_model()
  
    model.fit(tf_train, epochs = 3,
             validation_data = (tf_val), shuffle=True)
    model.save_weights('my_dnn_'+str(count)+'.tf')
    dnn_models.append(model)
    count+=1

    del tf_train
    del tf_val
    del model, train_x, train_y, val_x, val_y

(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' shape=(19,) dtype=float64>)
Tensor("args_2:0", shape=(), dtype=float64)
(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' shape=(19,) dtype=float64>)
Tensor("args_2:0", shape=(), dtype=float64)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' shape=(19,) dtype=float64>)
Tensor("args_2:0", shape=(), dtype=float64)
(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' shape=(19,) dtype=float64>)
Tensor("args_2:0", shape=(), dtype=float64)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' shape=(19,) dtype=float64>)
Tensor("args_2:0", shape=(), dtype=float64)
(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' shape=(19,) dtype=float64>)
Tensor("args_2:0", shape=(), dtype=float64)
Epoch 1/3
Epoch 2/3
Epoch 3/3
(<tf.Tensor 'args_0:0' shape=() dtype=int64>, <tf.Tensor 'args_1:0' sh

## average model predictions

In [20]:
def infer(models, ds):
    y_preds = []
    for model in models:
        y_pred = model.predict(ds)
        y_preds.append(y_pred)
    return np.mean(y_preds, axis=0)

## Transform test set so that it can be used for prediction

In [21]:
def preprocess_test(investment_id, feature):
    return (investment_id, feature), 0

def make_test_dataset(feature, investment_id, batch_size=1024):
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature)))
    ds = ds.map(preprocess_test)
    ds = ds.batch(batch_size).cache().prefetch(tf.data.experimental.AUTOTUNE)
    return ds

## Create features for the test set, standardize them, and use 5 DNN models averaging to make final prediction.

In [22]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    
    prices['ExpectedDividend'] = prices['ExpectedDividend'].fillna(0)
    prices['SupervisionFlag'] = prices['SupervisionFlag'].map({True: 1, False: 0})
    prices = pd.merge(prices,stock_list, on='SecuritiesCode')
    
    prices = FE(prices)
    
    prices[subf] = StandardScaler().fit_transform(prices[subf])
    prices = prices.fillna(0)


    prices['inference'] = -infer(dnn_models, make_test_dataset(prices[subf],prices['SecuritiesCode']))
    prices['rank'] = prices['inference'].rank(method='first')-1
    prices['rank'] = prices['rank'].apply(lambda x: int(x))
    prices = prices.drop('Date',axis=1)

    sample_prediction = pd.merge(sample_prediction, prices, on=['SecuritiesCode'])[['Date','SecuritiesCode','rank']]
    sample_prediction['rank'] = sample_prediction['rank'].fillna(1000)
    sample_prediction.columns = ['Date','SecuritiesCode','Rank']
    print(sample_prediction)
    
    env.predict(sample_prediction)   # register your predictions

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
            Date  SecuritiesCode  Rank
0     2021-12-06            1301  1931
1     2021-12-06            1332   238
2     2021-12-06            1333  1540
3     2021-12-06            1375   360
4     2021-12-06            1376   773
...          ...             ...   ...
1995  2021-12-06            9990   320
1996  2021-12-06            9991  1124
1997  2021-12-06            9993   870
1998  2021-12-06            9994  1650
1999  2021-12-06            9997   328

[2000 rows x 3 columns]
            Date  SecuritiesCode  Rank
0     2021-12-07            1301  1967
1     2021-12-07            1332   282
2     2021-12-07            1333  1554
3     2021-12-07            1375   533
4     2021-12-07            1376   766
...          ...             ...   ...
1995  2021-12-07            9990   177
1996  2021-12-07            9991   655
1997  2021-12-07            999

In [23]:
len(np.unique(sample_prediction['Rank']))

2000

In [24]:
alt = prices.copy()
alt.head()

Unnamed: 0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,...,bsforce,Opos,weekday,Monday,Tuesday,Wednesday,Thursday,Friday,inference,rank
0,20211207_1301,1301,0.048464,0.049647,0.053,0.052898,-0.278169,1.0,0.0,-0.050063,...,-0.218825,-0.686894,2,0,1,0,0,0,0.00398,1967
1,20211207_1332,1332,-0.502494,-0.505402,-0.509387,-0.508151,2.579301,1.0,0.0,-0.050063,...,0.569352,2.783155,2,0,1,0,0,0,0.003868,282
2,20211207_1333,1333,-0.091261,-0.094452,-0.088799,-0.093793,-0.230086,1.0,0.0,-0.050063,...,-0.1852,-0.172354,2,0,1,0,0,0,0.003943,1554
3,20211207_1375,1375,-0.353243,-0.350406,-0.350864,-0.349832,-0.229508,1.0,0.0,-0.050063,...,-0.168163,-1.101229,2,0,1,0,0,0,0.003887,533
4,20211207_1376,1376,-0.3199,-0.32172,-0.319023,-0.320538,-0.284079,1.0,0.0,-0.050063,...,-0.225821,-0.029674,2,0,1,0,0,0,0.0039,766
