In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
%matplotlib inline
import numpy as np
from tqdm import tqdm
from joblib import *
from emulator.apply_factor import get_factors

from datetime import datetime
from datetime import *
from IPython.display import clear_output
import time

In [2]:
def data_processing(data_df, rolling):
    Index = data_df.index
    High = data_df.high.values.astype('float64')
    Low = data_df.low.values.astype('float64')
    Close = data_df.close.values.astype('float64')
    Open = data_df.open.values.astype('float64')
    Volume = data_df.volume.values.astype('float64')
    quotes = get_factors(Index, Open, Close, High, Low, Volume, rolling=rolling, drop=True)
    return quotes

# Import Data

In [3]:
min_data_df = pd.read_csv('KQ.m@SHFE.rb_3min.csv').drop(columns=['datetime_nano','KQ.m@SHFE.rb.open_oi','KQ.m@SHFE.rb.close_oi'])
min_data_df.columns = ['datadate','open','high','low','close','volume']
min_data_df.datadate = pd.to_datetime(min_data_df.datadate)

min_data_df.shape, min_data_df.columns, min_data_df.head(), min_data_df.isnull().values.any(), min_data_df.dtypes

((168625, 6),
 Index(['datadate', 'open', 'high', 'low', 'close', 'volume'], dtype='object'),
              datadate    open    high     low   close  volume
 0 2016-01-04 21:00:00  1778.0  1782.0  1777.0  1778.0   46531
 1 2016-01-04 21:03:00  1778.0  1783.0  1778.0  1782.0   29321
 2 2016-01-04 21:06:00  1782.0  1784.0  1781.0  1783.0   17180
 3 2016-01-04 21:09:00  1783.0  1783.0  1775.0  1775.0   34118
 4 2016-01-04 21:12:00  1775.0  1777.0  1775.0  1775.0   32035,
 False,
 datadate    datetime64[ns]
 open               float64
 high               float64
 low                float64
 close              float64
 volume               int64
 dtype: object)

# Split Training Data

In [4]:
begin_date = '2019-02-01'
end_date = '2019-06-01'
min_train_data = min_data_df[(min_data_df.datadate>=begin_date) & (min_data_df.datadate<=end_date)]
min_train_data = min_train_data.reset_index(drop=True)

min_train_data.head(5), min_train_data.tail(5), min_train_data.shape

(             datadate    open    high     low   close  volume
 0 2019-02-01 09:00:00  3731.0  3732.0  3720.0  3729.0   33599
 1 2019-02-01 09:03:00  3729.0  3732.0  3723.0  3730.0   13187
 2 2019-02-01 09:06:00  3730.0  3732.0  3726.0  3728.0   10914
 3 2019-02-01 09:09:00  3728.0  3735.0  3724.0  3729.0   21272
 4 2019-02-01 09:12:00  3729.0  3732.0  3728.0  3732.0    9194,
                 datadate    open    high     low   close  volume
 8730 2019-05-31 22:45:00  3750.0  3754.0  3749.0  3750.0    8993
 8731 2019-05-31 22:48:00  3750.0  3755.0  3750.0  3755.0    6660
 8732 2019-05-31 22:51:00  3755.0  3763.0  3755.0  3755.0   31856
 8733 2019-05-31 22:54:00  3755.0  3758.0  3752.0  3753.0   11229
 8734 2019-05-31 22:57:00  3753.0  3756.0  3748.0  3754.0   20754,
 (8735, 6))

## Transform Data Type to Meet Ta-lib's Requirement

In [5]:
min_train_data['open'] = min_train_data['open'].values.astype('float64')
min_train_data['high'] = min_train_data['high'].values.astype('float64')
min_train_data['low']  = min_train_data['low'].values.astype('float64')
min_train_data['close'] = min_train_data['close'].values.astype('float64')
min_train_data['volume'] = min_train_data['volume'].values.astype('float64')
rolling = 60
background = rolling*3
window_size = rolling

In [6]:
tmp_processed_df= data_processing(min_train_data, rolling)
drop = min_train_data.shape[0] - tmp_processed_df.shape[0]

drop

122

In [7]:
min_processed_list = []
def min_data_processing(i):
    strat = i
    end = i+window_size+drop+background
    quotes = min_train_data.iloc[strat:end].copy()
    quotes = get_factors(quotes.index,quotes.open.values,quotes.close.values,quotes.high.values,quotes.low.values,\
                                                  quotes.volume.values, rolling=rolling, drop=False)
    min_processed_list.append(quotes.iloc[-window_size:].to_numpy())
    clear_output()
    return min_processed_list
min_processed_list = Parallel(n_jobs=46)(delayed(min_data_processing)(i) for i in tqdm(range(len(min_train_data)-background-drop-window_size)))
min_processed_array = np.asarray(min_processed_list)
for i in tqdm(range(min_processed_array.shape[0])):
    if True in np.isnan(min_processed_array[i]):
        print(i)  
    else:
        pass

min_processed_array.shape

100%|█████████████████████████████████████████████████████████████████████████████| 8373/8373 [00:10<00:00, 831.23it/s]
100%|███████████████████████████████████████████████████████████████████████████| 8373/8373 [00:00<00:00, 22991.18it/s]


(8373,)

In [8]:
sorted_processed_array = []
for i in range(len(min_processed_array)):
    tmp_obs = min_processed_array[i][0]
    sorted_processed_array.append(tmp_obs)
sorted_processed_array = np.asarray(sorted_processed_array)

sorted_processed_array.shape

(8373, 60, 29)

In [9]:
min_processed_array[0]

[array([[-2.04604729e+05, -2.10165001e+00,  2.35076362e+00, ...,
         -2.71176729e+00, -3.32029022e+00,  1.25040319e-01],
        [-1.94143729e+05, -1.49492362e+00,  2.08165841e+00, ...,
         -2.27483507e+00, -2.91440434e+00, -7.14031458e-02],
        [-2.01590329e+05, -1.53326826e+00,  1.72639933e+00, ...,
         -2.16740472e+00, -2.18075616e+00,  1.58835434e-01],
        ...,
        [-2.76790136e+05, -5.65711610e-01,  5.64738592e-01, ...,
         -1.69653774e+00, -1.86629713e+00, -2.08625205e-01],
        [-2.53678136e+05,  3.07575218e-02, -1.11196714e-01, ...,
         -1.32235624e+00, -1.30681111e+00,  4.51918986e-01],
        [-2.65688136e+05,  1.00551291e-01, -6.39063995e-01, ...,
         -1.16739998e+00, -9.72876357e-01,  9.55569687e-02]])]