In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

# Parameters

In [2]:
# Forecast horizons: how many future steps form the target of one sample.
# TF2 / TF3 are not referenced in the cells visible here — presumably used
# for longer-horizon runs; TODO confirm.
TF1, TF2, TF3 = 1, 2, 3

# Number of trailing windows held out for testing.
# NVal is not referenced in the cells visible here — presumably the
# validation hold-out size; TODO confirm.
NTest, NVal = 96, 100

# Look-back window length (time steps per input sample).
T1 = 30

# Number of input columns per time step.
D = 15

# NOTE(review): this value is reassigned further down in the notebook.
NFeature = 6

In [3]:
def split_X_Y(ar, T, TF):
  """Cut a 2-D array into sliding-window inputs and their following targets.

  Every window takes T consecutive rows of all columns except the last as
  the input, and the last column of the next TF rows as the target.

  Args:
    ar: 2-D array; the last column is the target, the others are inputs.
    T: number of rows in each input window.
    TF: number of rows after the window that make up the target.

  Returns:
    (X, Y) as float arrays. X stacks the input windows. Y has shape
    (n_windows,) when TF == 1 and (n_windows, TF) otherwise.
  """
  starts = range(len(ar) - T - TF + 1)

  inputs = [ar[s : s + T, :-1] for s in starts]
  targets = [ar[s + T : s + T + TF, -1:] for s in starts]

  X = np.array(inputs).astype(float)
  # A single-step horizon is returned as a flat vector, not a column.
  target_shape = (len(targets),) if TF == 1 else (len(targets), TF)
  Y = np.array(targets).astype(float).reshape(target_shape)

  print("X.shape", X.shape, "Y.shape", Y.shape)
  return X, Y

In [4]:
def feature_selection(x_train, y_train, n_features_to_select=18, random_state=None):
  """Count, for each input column, how many of its time steps RFE keeps.

  The windows are flattened to (n_samples, n_steps * n_columns) and passed to
  recursive feature elimination with a random-forest regressor. Each kept
  flattened position is mapped back to its column, and the kept positions are
  tallied per column.

  Args:
    x_train: array of shape (n_samples, n_steps, n_columns).
    y_train: targets passed unchanged to RFE.fit.
    n_features_to_select: number of flattened positions RFE keeps.
    random_state: seed forwarded to RandomForestRegressor; pass an int for
      repeatable results. None keeps the estimator unseeded.

  Returns:
    A list of n_columns ints; entry d is the number of kept positions that
    belong to column d. The list is also printed.

  Raises:
    ValueError: if x_train is not 3-dimensional.
  """
  x_train = np.asarray(x_train)
  if x_train.ndim != 3:
    raise ValueError(
      "x_train must have shape (n_samples, n_steps, n_columns), got %r"
      % (x_train.shape,)
    )

  # Take the dimensions from the data itself so the function works for any
  # window length / column count, not only the notebook-level T1 and D.
  n_samples, n_steps, n_columns = x_train.shape
  flat = x_train.reshape(n_samples, n_steps * n_columns)

  rfe = RFE(
    RandomForestRegressor(random_state=random_state),
    n_features_to_select=n_features_to_select,
  )
  fit = rfe.fit(flat, y_train)

  # Row-major flattening puts (step t, column d) at position t * n_columns + d,
  # so the column of a flattened position is position % n_columns.
  selected = np.flatnonzero(fit.support_)
  calMostFeature = np.bincount(selected % n_columns, minlength=n_columns).tolist()
  print(calMostFeature)
  return calMostFeature

# Normalized Dataset

In [8]:
# Load the normalization.csv dataset; the first column is used as the index
# and "Date" is parsed as datetimes.
df_normalize = pd.read_csv(
    "G:/My Drive/CNTMDT/_Final/Dataset/normalization.csv",
    index_col=0,
    parse_dates=["Date"],
)
df_normalize.head(5)

Unnamed: 0_level_0,Gold_Close,Oil_Close,Volume_Top2,Volume_Top3,Volume_Top4,Volume_Top5,X1,X2,X3,X4,X5,X6,X7,X8,X9,Y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-03-12,0.195054,0.428501,0.252108,0.035745,0.666943,0.028726,0.090175,0.096398,0.215437,0.385386,0.294139,0.478063,0.760187,0.813189,0.055378,0.0
2020-03-13,0.066875,0.429926,0.236596,0.038759,0.662361,0.030444,0.023122,0.096398,0.145379,0.473294,0.327844,0.241249,0.699492,0.549101,0.038502,0.009472
2020-03-14,0.066875,0.429926,0.224523,0.036015,0.668942,0.03007,0.027929,0.096398,0.153463,0.453264,0.315684,0.060818,0.682915,0.560035,0.160002,0.003668
2020-03-15,0.066875,0.429926,0.241015,0.036838,0.654064,0.026986,0.015865,0.096398,0.129212,0.405045,0.227655,0.26753,0.500701,0.355787,0.2678,0.006734
2020-03-16,0.014977,0.411145,0.231549,0.03391,0.652156,0.028229,0.012953,0.096398,0.129212,0.416914,0.271336,0.282272,0.733482,0.526008,0.069977,0.000698


In [9]:
# Convert the frame's values to a plain NumPy array (the index is not included).
# split_X_Y treats the last column as the target and the others as inputs.
ar_original_minmax = df_normalize.to_numpy()
ar_original_minmax

array([[0.195054  , 0.4285006 , 0.25210841, ..., 0.81318927, 0.05537789,
        0.        ],
       [0.06687549, 0.42992625, 0.23659559, ..., 0.5491007 , 0.03850218,
        0.00947215],
       [0.06687549, 0.42992625, 0.22452314, ..., 0.56003522, 0.16000223,
        0.00366761],
       ...,
       [0.66997559, 0.70854771, 0.19900319, ..., 0.73869017, 0.08532489,
        0.24308976],
       [0.66997559, 0.70854771, 0.19728917, ..., 0.7039487 , 0.07841119,
        0.25020148],
       [0.66997559, 0.70854771, 0.20929626, ..., 0.50852748, 0.11842803,
        0.27466851]])

## Sliding Windows

In [10]:
# Build (window, target) pairs: T1 past rows as input, the next TF1 row(s) as target.
x13_original_minmax, y13_original_minmax = split_X_Y(
    ar_original_minmax, T=T1, TF=TF1
)

X.shape (1066, 30, 15) Y.shape (1066,)


## Train Val Test

In [22]:
# Chronological hold-out: the last NTest windows form the test set and
# everything before them is used for training.
x13_original_minmax_train = x13_original_minmax[:-NTest]
y13_original_minmax_train = y13_original_minmax[:-NTest]

x13_original_minmax_test = x13_original_minmax[-NTest:]
y13_original_minmax_test = y13_original_minmax[-NTest:]

In [23]:
# Sanity check: shapes of the training and test partitions.
print(x13_original_minmax_train.shape, y13_original_minmax_train.shape )
print(x13_original_minmax_test.shape, y13_original_minmax_test.shape )

(970, 30, 15) (970,)
(96, 30, 15) (96,)


## Feature Selection

In [13]:
# Per-column count of time steps kept by RFE (the function also prints it).
# NOTE(review): the random forest is created without a fixed seed, so these
# counts may differ between runs — confirm before relying on the saved result.
calMostFeature = feature_selection(x13_original_minmax_train, y13_original_minmax_train)

[3, 2, 0, 1, 0, 0, 11, 1, 0, 0, 0, 0, 0, 0, 0]


In [15]:
# Indices of the 5 columns with the highest RFE counts, in ascending index order.
# NOTE(review): the 5 is hard-coded rather than taken from NFeature, and ties
# between equal counts are resolved by argsort's ordering — confirm intended.
top_idx =  np.sort(np.argsort(calMostFeature)[-5:])
top_idx

array([0, 1, 3, 6, 7], dtype=int64)

In [None]:
# Number of columns for which RFE kept at least one time step.
NFeature = sum(1 for count in calMostFeature if count != 0)

In [26]:
# NOTE(review): hard-coded override of NFeature; it replaces whatever value was
# assigned earlier — confirm it still matches the feature-selection result.
NFeature = 5

In [16]:
# Persist the selected column indices, one per line, in a file named after the
# window length and forecast horizon. fmt="%d" writes them as plain integers
# instead of savetxt's default scientific-notation floats ("%.18e").
np.savetxt(
    f"G:/My Drive/CNTMDT/_Final/FeatureSelection/{T1}_{TF1}.csv",
    top_idx,
    fmt="%d",
    delimiter=",",
)