In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate

# Clear TensorFlow's default graph so that ops/variables left over from an
# earlier run in the same kernel do not clash with the ones built below.
tf.reset_default_graph()

In [None]:
def MinMaxScaler(data):
    """Rescale ``data`` along axis 0 with min-max normalisation.

    Each column is shifted by its minimum and divided by its (max - min)
    range, so values end up in roughly the [0, 1] interval.

    Args:
        data: array-like accepted by ``np.min`` / ``np.max`` with ``axis=0``.

    Returns:
        The rescaled data, same shape as ``data``.
    """
    column_min = np.min(data, axis=0)
    column_range = np.max(data, axis=0) - column_min
    shifted = data - column_min
    # The small epsilon keeps the division finite when a column is constant.
    return shifted / (column_range + 1e-7)

def gen_features(X):
    """Summarise a one-dimensional signal with 13 descriptive statistics.

    Args:
        X: a ``pandas.Series``, a single-column ``pandas.DataFrame`` or a
            1-D array-like of numbers. A single-column DataFrame is reduced
            to its column first, so every statistic comes back as a plain
            scalar instead of a 1-element Series. A DataFrame with several
            columns is passed through unchanged (pandas reductions are then
            per column, as before).

    Returns:
        ``pandas.Series`` of length 13, in this fixed order: mean, std, min,
        max, kurtosis, skew, the 1% / 5% / 95% / 99% quantiles, and finally
        the max, mean and std of the absolute values.
    """
    if isinstance(X, pd.DataFrame):
        if X.shape[1] == 1:
            # e.g. a row slice of a one-column CSV read with pd.read_csv
            X = X.iloc[:, 0]
    elif not isinstance(X, pd.Series):
        # Lists / ndarrays have no .kurtosis() / .skew(); wrap them.
        X = pd.Series(np.asarray(X, dtype=float))

    abs_X = np.abs(X)  # computed once, reused for the last three features

    strain = [
        X.mean(),
        X.std(),
        X.min(),
        X.max(),
        X.kurtosis(),
        X.skew(),
        np.quantile(X, 0.01),
        np.quantile(X, 0.05),
        np.quantile(X, 0.95),
        np.quantile(X, 0.99),
        abs_X.max(),
        abs_X.mean(),
        abs_X.std(),
    ]
    return pd.Series(strain)

In [None]:
if "DISPLAY" not in os.environ:
    # Headless environment (e.g. Travis CI): use the non-interactive Agg
    # backend. pyplot is already imported at the top of the notebook, so the
    # switch goes through pyplot; matplotlib.use() is only guaranteed to take
    # effect when it runs before pyplot is imported.
    plt.switch_backend('Agg')

# train Parameters
seq_length = 100       # consecutive feature rows fed to the LSTM per sample
data_dim = 13          # feature columns per row
hidden_dim = 10        # LSTM units
output_dim = 1         # one regression target per sample
learning_rate = 0.01
iterations = 500

# Column 0 of both CSV files is skipped (presumably a row index - TODO confirm);
# the next `data_dim` columns of the X file are the features and column 1 of
# the y file is the regression target.
xy = np.loadtxt('preprocessed/X_train_1.5K.csv', delimiter=',',
                usecols=tuple(range(1, data_dim + 1)))
y = np.loadtxt('preprocessed/y_train_1.5K.csv', delimiter=',', usecols=1)
y = y.reshape(-1, 1)
print("xy shape : ", xy.shape, "y shape : ",y.shape)

# Features and targets are paired by row position, so a length mismatch would
# silently misalign every training sample.
assert xy.shape[0] == y.shape[0], "feature and target files have different row counts"

# train/test split (chronological, first 70% for training)
train_size = int(len(xy) * 0.7)
train_setx = xy[:train_size]
train_sety = y[:train_size]
# Start seq_length rows early so the first test window has a full history.
test_setx = xy[train_size - seq_length:]
test_sety = y[train_size - seq_length:]

# NOTE: features and targets are used unscaled; MinMaxScaler (defined in an
# earlier cell) is available if normalisation is wanted.
print("train_setx shape : ", train_setx.shape, "train_sety shape : ", train_sety.shape)
print("test_setx shape : ", test_setx.shape, "test_sety shape : ", test_sety.shape)

# build datasets
def build_dataset(XX, YY, seq_length):
    """Cut aligned feature/target arrays into sliding-window samples.

    For every start offset ``i`` in ``range(len(XX) - seq_length)`` one sample
    is produced: its input is rows ``i`` .. ``i + seq_length - 1`` of ``XX``
    and its label is ``YY[i + seq_length]``, i.e. the target row directly
    after the window.

    Args:
        XX: 2-D array of feature rows.
        YY: array of targets, indexed like ``XX``.
        seq_length: number of rows per window.

    Returns:
        Tuple ``(windows, labels)`` of numpy arrays.
    """
    window_starts = range(len(XX) - seq_length)
    windows = [XX[start:start + seq_length, :] for start in window_starts]
    labels = [YY[start + seq_length] for start in window_starts]
    return np.array(windows), np.array(labels)

trainX, trainY = build_dataset(train_setx, train_sety, seq_length)
testX, testY = build_dataset(test_setx, test_sety, seq_length)

print("trainX shape : ", trainX.shape, "trainY shape : ", trainY.shape)
print("testX shape : ", testX.shape, "testY shape : ", testY.shape)

# input place holders: a batch of windows and one target value per window
X = tf.placeholder(tf.float32, [None, seq_length, data_dim])
Y = tf.placeholder(tf.float32, [None, output_dim])

# build a LSTM network
# tf.contrib.rnn.BasicLSTMCell is deprecated; TensorFlow's own deprecation
# notice names this call as the drop-in replacement (the explicit name keeps
# the variable scope identical to the old cell's).
cell = tf.nn.rnn_cell.LSTMCell(
    num_units=hidden_dim, state_is_tuple=True, activation=tf.tanh,
    name='basic_lstm_cell')
outputs, _states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
# Only the output of the last time step feeds the linear regression head.
Y_pred = tf.contrib.layers.fully_connected(
    outputs[:, -1], output_dim, activation_fn=None)
print("Y_pred shape : ", Y_pred.shape)

# cost/loss: sum (not mean) of squared errors, so it scales with batch size
loss = tf.reduce_sum(tf.square(Y_pred - Y))
# optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)

# RMSE is evaluated through its own placeholders so it can be computed from
# already-fetched predictions without re-running the network.
targets = tf.placeholder(tf.float32, [None, 1])
predictions = tf.placeholder(tf.float32, [None, 1])
rmse = tf.sqrt(tf.reduce_mean(tf.square(targets - predictions)))

# The session is intentionally left open: the prediction cell further down
# reuses `sess`, `X` and `Y_pred`.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

# Training step: every iteration is one full-batch update on the training set
for i in range(iterations):
    _, step_loss = sess.run([train, loss], feed_dict={
                            X: trainX, Y: trainY})
    print("[step: {}] loss: {}".format(i, step_loss))

# Test step
test_predict = sess.run(Y_pred, feed_dict={X: testX})
print("test_predict shape : ", test_predict.shape)
rmse_val = sess.run(rmse, feed_dict={
                targets: testY, predictions: test_predict})
print("RMSE: {}".format(rmse_val))

# Plot predictions against the held-out targets
plt.plot(testY, label="actual")
plt.plot(test_predict, label="predicted")
plt.xlabel("Time Period")
plt.ylabel("Time To Failure")
plt.legend()
plt.show()


  This is separate from the ipykernel package so we can avoid doing imports until


xy shape :  (419431, 13) y shape :  (419431, 1)
train_setx shape :  (293601, 13) train_sety shape :  (293601, 1)
test_setx shape :  (125930, 13) test_sety shape :  (125930, 1)
trainX shape :  (293501, 100, 13) trainY shape :  (293501, 1)
testX shape :  (125830, 100, 13) testY shape :  (125830, 1)
Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').
Y_pred shape :  (?, 1)
[step: 0] loss: 12432017.0
[step: 1] loss: 11858707.0
[step: 2] loss: 10659521.0
[step: 3] loss: 8797360.0
[step: 4] loss: 7515975.0
[step: 5] loss: 6917855.0
[step: 6] loss: 6435944.5
[step: 7] loss: 5946326.5
[step: 8] loss: 5659695.5
[step: 9] loss: 5530247.5
[step: 10] loss: 5426175.0
[step: 11] loss: 5327609.0
[step: 12] loss: 5233472.0
[step: 13] loss: 5143450.0
[step: 14] loss: 5057409.0
[step: 15] loss: 4975326.0
[step: 16] loss: 48971

In [None]:
#s_t = time.time()

test_path = 'test'
# os.listdir() returns entries in arbitrary order and may include non-CSV
# entries (e.g. a .ipynb_checkpoints folder). Keep only CSV files and sort
# them so the row order of X_test is reproducible between runs.
test_list = sorted(
    name for name in os.listdir(test_path) if name.lower().endswith('.csv'))
X_test = []
# Derived from the directory listing (was hard-coded as 2624) so that it can
# never disagree with the number of files actually processed below.
number_of_csvs = len(test_list)
number_of_rows_in_each_csvs = 150000
# Rows per chunk: each file is cut into seq_length consecutive chunks.
divide_size = number_of_rows_in_each_csvs // seq_length
print("divide_size : ", divide_size)


for path in test_list:
    test = pd.read_csv(os.path.join(test_path, path), dtype=np.float64)
    # One feature row per chunk, i.e. seq_length feature rows per file.
    # NOTE(review): a file with fewer than number_of_rows_in_each_csvs rows
    # yields short or empty trailing chunks - confirm all files are full length.
    for i in range(seq_length):
        test_divided_by_length = test.iloc[i * divide_size:(i + 1) * divide_size]
        X_test.append(gen_features(test_divided_by_length).values)

X_test = np.array(X_test)
print("X_test shape : ", X_test.shape)  # (number_of_csvs * seq_length, 13)
#print(X_test[0:1000, :])

def build_dataset_for_predict(XX, seq_length):
    """Group consecutive feature rows into non-overlapping windows.

    Rows ``0 .. seq_length-1`` form the first window, the next ``seq_length``
    rows the second, and so on. The number of windows is derived from
    ``len(XX)`` rather than from a module-level file count, so the result is
    always rectangular; trailing rows that do not fill a whole window are
    dropped.

    Args:
        XX: array-like of feature rows, shape ``(n_rows, ...)``.
        seq_length: rows per window; must be positive.

    Returns:
        numpy array of shape ``(n_rows // seq_length, seq_length, ...)``.

    Raises:
        ValueError: if ``seq_length`` is not positive.
    """
    if seq_length <= 0:
        raise ValueError("seq_length must be a positive integer")
    XX = np.asarray(XX)
    n_windows = len(XX) // seq_length
    usable = XX[:n_windows * seq_length]
    # copy() so the result does not alias the caller's array
    return usable.reshape((n_windows, seq_length) + XX.shape[1:]).copy()

X_test_final = build_dataset_for_predict(X_test, seq_length)
# Expected shape: (number of test files, seq_length, data_dim)
print("X_test_final shape : ", X_test_final.shape)

# The test features are fed unscaled, matching how the network was trained.
predict = sess.run(Y_pred, feed_dict={X: X_test_final})
print("predict shape : ", predict.shape)

# Label every prediction with the stem of the file it came from, so rows can be
# matched to their segment regardless of directory-listing order. Predictions
# are in the same order as test_list because X_test was built by iterating it.
segment_ids = [os.path.splitext(name)[0] for name in test_list]
if len(segment_ids) == len(predict):
    prediction_frame = pd.DataFrame(predict, index=segment_ids)
else:
    # Counts disagree: fall back to a positional index rather than mislabel.
    prediction_frame = pd.DataFrame(predict)
prediction_frame.to_csv('prediction_LSTM.csv', header=None)

#e_t = time.time()
#duration = (e_t - s_t) / 60
#print("{:.1f} min tooked".format(duration))