In [1]:
import tensorflow as tf
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import pandas as pd
import autoencoder
import model
from datetime import datetime
from datetime import timedelta
sns.set()

## Deep Feed-forward Auto-Encoder Neural Network  to reduce dimension + Deep Recurrent Neural Network + ARIMA + Extreme Boosting Gradient Regressor

### Our target is Close market

In [2]:
google = pd.read_csv('../dataset/predict-set-20190527-0531.csv')
# 取500提高速度
#google = google.iloc[:500, :]

In [3]:
date_ori = pd.to_datetime(google.iloc[:, 0]).tolist()
google.head()


Unnamed: 0,Timestamp,High,Low,Open,Volume,Close
0,1558915200,8660.0,8607.303062,8616.502752,20803.498272,8653.011422
1,1558915500,8729.091604,8653.81505,8653.81505,122780.290759,8689.545605
2,1558915800,8698.578593,8658.14632,8682.595359,48788.811247,8682.085232
3,1558916100,8733.177325,8700.0,8700.0,36294.412783,8733.177325
4,1558916400,8815.225937,8734.0,8734.0,164608.486049,8811.826574


In [16]:
minmax = MinMaxScaler().fit(google.iloc[:, 5].values.reshape((-1,1)))
df_log = MinMaxScaler().fit_transform(google.iloc[:, 1:].astype('float32'))
df_log = pd.DataFrame(df_log)
df_log.head()

Unnamed: 0,0,1,2,3,4
0,0.561347,0.603841,0.526613,0.022655,0.573219
1,0.638965,0.65451,0.569281,0.133711,0.614819
2,0.604686,0.659228,0.602192,0.053132,0.606323
3,0.643555,0.704822,0.622094,0.039525,0.664501
4,0.735728,0.741861,0.660975,0.179262,0.754056


In [5]:
df_log.values[0, 4]
#google.iloc[:, 5]

0.5732186214446511

In [6]:
thought_vector = autoencoder.reducedimension(df_log.values, 4, 0.001, 128, 100)

Instructions for updating:
Colocations handled automatically by placer.
epoch: 10 loss: 0.4378075 time: 0.002884387969970703
epoch: 20 loss: 0.43741098 time: 0.008697032928466797
epoch: 30 loss: 0.43672013 time: 0.009076833724975586
epoch: 40 loss: 0.4354934 time: 0.008598566055297852
epoch: 50 loss: 0.43324736 time: 0.00574493408203125
epoch: 60 loss: 0.4289481 time: 0.010927438735961914
epoch: 70 loss: 0.4203464 time: 0.005697727203369141
epoch: 80 loss: 0.40349907 time: 0.011595010757446289
epoch: 90 loss: 0.37255806 time: 0.004644155502319336
epoch: 100 loss: 0.30822793 time: 0.002702951431274414


In [7]:
thought_vector.shape

(1294, 4)

In [8]:
num_layers = 1
size_layer = 128
timestamp = 5
epoch = 250
dropout_rate = 0.1

In [9]:
tf.reset_default_graph()
modelnn = model.Model(0.01, num_layers, thought_vector.shape[1], size_layer, 1, dropout_rate)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
for i in range(epoch):
    init_value = np.zeros((1, num_layers * 2 * size_layer))
    total_loss = 0
    for k in range(0, (thought_vector.shape[0] // timestamp) * timestamp, timestamp):
        batch_x = np.expand_dims(thought_vector[k: k + timestamp, :], axis = 0)
        batch_y = df_log.values[k + 1: k + timestamp + 1, 4].reshape([-1, 1])
        last_state, _, loss = sess.run([modelnn.last_state, 
                                        modelnn.optimizer, 
                                        modelnn.cost], feed_dict={modelnn.X: batch_x, 
                                                                  modelnn.Y: batch_y, 
                                                                  modelnn.hidden_layer: init_value})
        init_value = last_state
        total_loss += loss
    total_loss /= (thought_vector.shape[0] // timestamp)
    if (i + 1) % 50 == 0:
        print('epoch:', i + 1, 'avg loss:', total_loss)

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




epoch: 50 avg loss: 0.011157123428079492
epoch: 100 avg loss: 0.008497152921251633
epoch: 150 avg loss: 0.007799944285227323
epoch: 200 avg loss: 0.005400152584200462
epoch: 250 avg loss: 0.00542403273766115
epoch: 300 avg loss: 0.006404403195359524
epoch: 350 avg loss: 0.005237310977280185
epoch: 400 avg loss: 0.005206809209317556
epoch: 450 avg loss: 0.005116332863148865
epoch: 500 avg loss: 0.005020703507778858


In [10]:
output_predict = np.zeros(((thought_vector.shape[0] // timestamp) * timestamp, 1))
init_value = np.zeros((1, num_layers * 2 * size_layer))
for k in range(0, (thought_vector.shape[0] // timestamp) * timestamp, timestamp):
    out_logits, last_state = sess.run([modelnn.logits, modelnn.last_state], feed_dict = {modelnn.X:np.expand_dims(thought_vector[k: k + timestamp, :], axis = 0),
                                     modelnn.hidden_layer: init_value})
    init_value = last_state
    output_predict[k: k + timestamp, :] = out_logits

In [11]:
print('Mean Square Error:', np.mean(np.square(output_predict[:, 0] - df_log.iloc[1: (thought_vector.shape[0] // timestamp) * timestamp + 1, 0].values)))

Mean Square Error: 0.041774962109934186


In [12]:
print(output_predict)

[[0.34684539]
 [0.37494174]
 [0.40746546]
 ...
 [0.24420828]
 [0.14625014]
 [0.13179977]]


Import ARIMA model using stats model

In [13]:
import statsmodels.api as sm
from itertools import product
from scipy import stats
    
Qs = range(0, 1)
qs = range(0, 2)
Ps = range(0, 2)
ps = range(0, 2)
D=1
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
best_aic = float("inf")
for param in parameters_list:
    try:
        arima=sm.tsa.statespace.SARIMAX(df_log.iloc[:,4].values, order=(param[0], D, param[1]), seasonal_order=(param[2], D, param[3], 1)).fit(disp=-1)
    except:
        continue
    aic = arima.aic
    if aic < best_aic and aic:
        best_arima = arima
        best_aic = aic
        
best_aic

  return matrix[[slice(None)]*(matrix.ndim-1) + [0]]


-6237.931474951886

In [14]:
def reverse_close(array):
    return minmax.inverse_transform(array.reshape((-1,1))).reshape((-1))

In [17]:
%matplotlib notebook

pred_arima = best_arima.predict()
x_range = np.arange(df_log.shape[0])
fig = plt.figure(figsize = (15,6))
ax = plt.subplot(111)
ax.plot(x_range, reverse_close(df_log.iloc[:,4].values), label = 'true Close')
ax.plot(x_range, reverse_close(pred_arima), label = 'predict Close using Arima')
box = ax.get_position()
ax.set_position([box.x0, box.y0 + box.height * 0.1, box.width, box.height * 0.9])
ax.legend(loc = 'upper center', bbox_to_anchor= (0.5, -0.05), fancybox = True, shadow = True, ncol = 5)
plt.xticks(x_range[::5], date_ori[::5])
plt.title('overlap market Close')
plt.show()

<IPython.core.display.Javascript object>

In [None]:
boundary = (thought_vector.shape[0] // timestamp) * timestamp
stack_predict = np.vstack([pred_arima[:boundary], output_predict.reshape((-1))]).T

In [18]:
where_below_0 = np.where(stack_predict < 0)
where_higher_1 = np.where(stack_predict > 1)
stack_predict[where_below_0[0], where_below_0[1]] = 0
stack_predict[where_higher_1[0], where_higher_1[1]] = 1

NameError: name 'stack_predict' is not defined

In [19]:
corr_df = pd.DataFrame(np.hstack([stack_predict, df_log.values[:boundary, 3].reshape((-1,1))]))

NameError: name 'stack_predict' is not defined

In [20]:
sns.heatmap(corr_df.corr(), annot= True)
plt.show()

NameError: name 'corr_df' is not defined

ARIMA able to predict data that correlate 0.61 originally from original Close

Deep Recurrent Neural Network able to predict data that correlate 0.48 originally from original Close

In [None]:
params_xgd = {
    'max_depth': 7,
    'objective': 'reg:logistic',
    'learning_rate': 0.05,
    'n_estimators': 10000
    }
train_Y = df_log.values[:boundary, 4]
clf = xgb.XGBRegressor(**params_xgd)
clf.fit(stack_predict,train_Y, eval_set=[(stack_predict,train_Y)], 
        eval_metric='rmse', early_stopping_rounds=20, verbose=False)

In [None]:
stacked = clf.predict(stack_predict)

In [None]:
plt.figure(figsize = (15,6))
x_range = np.arange(boundary)
plt.plot(x_range, reverse_close(train_Y), label = 'Real Close')
plt.plot(x_range, reverse_close(pred_arima[:boundary]), label = 'ARIMA Close')
plt.plot(x_range, reverse_close(output_predict), label = 'RNN Close')
plt.plot(x_range, reverse_close(stacked), label = 'Stacked Close')
plt.legend()
plt.xticks(x_range[::5], date_ori[:boundary][::5])
plt.title('stacked RNN + ARIMA with XGB')
plt.show()

# Pretty insane i can say!

In [None]:
from xgboost import plot_importance
plot_importance(clf)
plt.show()

## Arima is more important than RNN