In [50]:
import pandas as pd

# Import data

In [51]:
df = pd.read_csv('stationary_data.csv')

In [52]:
df.head()

Unnamed: 0,start,close_price,tweet_count,twitter_sentiment,volume,volatility,rsi,macd,reddit_sentiment,reddit_post_count
0,2018-01-03,-0.078084,0.06845,6.096648,25299970.0,-0.141918,0.016552,30.100003,0.213028,-104.0
1,2018-01-04,0.011353,-0.066855,4.738479,4911300000.0,-0.021054,0.030055,53.719591,-0.83322,-17.0
2,2018-01-05,0.085086,-0.084542,5.809624,2057699000.0,0.243666,0.114442,185.854342,0.342103,48.0
3,2018-01-06,-0.105366,-0.24669,6.64363,-5526299000.0,0.163605,0.005255,149.031677,-0.065387,-185.0
4,2018-01-07,-0.067319,0.091045,8.091246,-2448600000.0,0.006811,-0.090538,26.993014,0.184959,53.0


## imputer

In [53]:
from sklearn.impute import KNNImputer

In [54]:
imputer = KNNImputer(n_neighbors=2)
df['reddit_sentiment'],df['reddit_post_count'],df['twitter_sentiment'] = imputer.fit_transform(df[['reddit_sentiment','reddit_post_count','twitter_sentiment']]).T

## Train test split

In [55]:
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# mpl.rcParams['figure.figsize'] = (8, 6)
# mpl.rcParams['axes.grid'] = False

In [56]:
column_indices = {name: i for i, name in enumerate(df.columns)}

n = len(df)
train_df = df[0:int(n*0.7)]
val_df = df[int(n*0.7):int(n*0.9)]
test_df = df[int(n*0.9):]

num_features = df.shape[1]

### Standarize

In [57]:
train_mean = train_df.mean()
train_std = train_df.std()

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std

  train_mean = train_df.mean()
  train_std = train_df.std()


In [149]:
train_mean

close_price         -7.657057e-05
tweet_count         -9.431487e-04
twitter_sentiment    4.868430e+00
volume               4.045745e+06
volatility          -1.625830e-03
rsi                  3.539409e-04
macd                -1.177125e-02
reddit_sentiment    -1.306929e-03
reddit_post_count   -1.290079e+00
dtype: float64

# Model?

## Creating datasets

In [17]:
train_df.shape

(905, 10)

In [18]:
test_df.shape

(130, 10)

In [58]:
sequence_length = 7

In [59]:
train_df['target'] = train_df['close_price'].shift(-sequence_length)

In [60]:
train_df.drop('start', axis = 1, inplace = True)

In [61]:
train_df.dropna()

Unnamed: 0,close_price,macd,reddit_post_count,reddit_sentiment,rsi,tweet_count,twitter_sentiment,volatility,volume,target
0,-1.282915,0.659219,-1.244836,0.563656,0.145075,0.409370,0.708600,-0.820883,0.005212,1.056800
1,0.187977,1.176309,-0.190403,-2.187755,0.266023,-0.388832,-0.074973,-0.113681,1.203402,-2.237758
2,1.400588,4.069058,0.597392,0.903096,1.021837,-0.493172,0.543006,1.435256,0.503616,2.510337
3,-1.731595,3.262921,-2.226549,-0.168516,0.043896,-1.449731,1.024172,0.966801,-1.356202,-0.248610
4,-1.105872,0.591200,0.657991,0.489839,-0.814077,0.542666,1.859351,0.049367,-0.601460,-1.127149
...,...,...,...,...,...,...,...,...,...,...
893,0.058953,0.013762,-0.578240,-0.571180,-0.005860,0.982887,-1.434736,-0.163125,0.137616,-0.110625
894,-0.087907,-0.071556,0.645871,-1.534328,-0.360344,7.794126,-2.789252,0.042164,-0.550381,-0.164714
895,-0.017652,-0.155452,-1.305435,0.530168,-0.445726,-0.612211,-2.407624,-0.030022,-0.033059,-0.170655
896,0.144526,-0.080090,0.209554,1.456391,0.175874,-6.587683,-1.985591,-0.329861,-0.434900,0.317315


In [62]:
X_train = train_df.drop('target', axis = 1)
y_train = train_df['target']

In [90]:
dataset = tf.keras.preprocessing.timeseries_dataset_from_array(
    X_train, y_train, sequence_length=10, batch_size = 16)


In [91]:
exx, exxy = next(iter(dataset))

In [92]:
exxy

<tf.Tensor: shape=(16,), dtype=float64, numpy=
array([ 1.05679971, -2.23775785,  2.51033709, -0.24861039, -1.1271494 ,
        0.74606285, -3.09135903,  2.59901617,  0.85467572, -0.22546441,
        1.5478712 , -3.47993858,  0.77056102,  0.88267402,  0.82300126,
       -0.87084659])>

In [93]:
train_df.iloc[:10]

Unnamed: 0,close_price,macd,reddit_post_count,reddit_sentiment,rsi,tweet_count,twitter_sentiment,volatility,volume,target
0,-1.282915,0.659219,-1.244836,0.563656,0.145075,0.40937,0.7086,-0.820883,0.005212,1.0568
1,0.187977,1.176309,-0.190403,-2.187755,0.266023,-0.388832,-0.074973,-0.113681,1.203402,-2.237758
2,1.400588,4.069058,0.597392,0.903096,1.021837,-0.493172,0.543006,1.435256,0.503616,2.510337
3,-1.731595,3.262921,-2.226549,-0.168516,0.043896,-1.449731,1.024172,0.966801,-1.356202,-0.24861
4,-1.105872,0.5912,0.657991,0.489839,-0.814077,0.542666,1.859351,0.049367,-0.60146,-1.127149
5,-0.343035,-1.959509,2.572938,0.28039,-0.987329,1.151345,-0.66836,-0.184221,0.623827,0.746063
6,0.725799,-2.655829,-1.281196,-0.540476,-0.43382,0.538181,0.855073,-0.026156,-0.4311,-3.091359
7,1.0568,-1.507488,-0.505521,1.080213,0.307582,0.20973,-0.650843,-0.044257,0.450426,2.599016
8,-2.237758,-3.98806,1.118548,-1.073881,-1.205913,0.811707,0.019531,0.342525,-0.483284,0.854676
9,2.510337,-2.144758,-2.299269,-0.547355,0.537809,-1.164093,1.101803,0.122919,-1.096774,-0.225464


In [96]:
exx;

In [97]:
test_df['target'] = test_df['close_price'].shift(-sequence_length)

In [102]:
#test_df.drop('start', axis = 1, inplace = True)

In [103]:
X_test = test_df.drop('target', axis = 1)
y_test = test_df['target']

In [104]:
dataset_test = tf.keras.preprocessing.timeseries_dataset_from_array(
    X_test, y_test, sequence_length=10)

In [105]:
etx, ety = next(iter(dataset_test))

In [106]:
val_df['target'] = val_df['close_price'].shift(-sequence_length)

In [83]:
val_df.drop('start', axis = 1, inplace = True)

In [107]:
X_val = val_df.drop('target', axis = 1)
y_val =val_df['target']

In [108]:
dataset_val = tf.keras.preprocessing.timeseries_dataset_from_array(
    X_val, y_val, sequence_length=10)

In [109]:
evx, evy = next(iter(dataset_val))

In [111]:
len(dataset_test)

1

## train model

In [213]:
lstm_model = tf.keras.models.Sequential([
    # Shape [batch, time, features] => [batch, time, lstm_units]
    tf.keras.layers.GRU(50, return_sequences=True),
    tf.keras.layers.GRU(20),
    #tf.keras.layers.Dropout(0.2),
    # Shape => [batch, time, features]
    tf.keras.layers.Dense(units=20),
    
    tf.keras.layers.Dense(units=1)
    # Adding a second LSTM layer and some Dropout regularisation
])

# regularization?
# GRU layer

In [214]:
lstm_model.compile(optimizer = 'Adam', loss = 'mae')

In [215]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [216]:
es = EarlyStopping(patience = 20, restore_best_weights=True)

In [217]:
reduce_lr = ReduceLROnPlateau(patience = 15)

In [None]:
lstm_model.fit(dataset, validation_data = dataset_val, epochs = 2_000, callbacks = [es, reduce_lr])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
 3/56 [>.............................] - ETA: 1s - loss: 0.0458

In [None]:
lstm_model.evaluate(dataset_test)

In [None]:
y_pred = lstm_model.predict(dataset_test)

In [None]:
etx, ety = next(iter(dataset_test))

In [None]:
y_test = ety.numpy()

In [None]:
y_pred = y_pred*train_std['close_price']+train_mean['close_price']

In [None]:
y_test = y_test*train_std['close_price']+train_mean['close_price']

In [None]:
plt.plot(y_test, label = 'test')
plt.plot(y_pred, label = 'actual')
plt.legend()

In [None]:
((y_pred.flatten() > 0) == (y_test>0)).mean()

In [None]:
y_pred.flatten()>0

In [None]:
from tensorflow.keras import models

In [209]:
lstm_model.save('lstm_gru_7_9')

2021-09-07 17:34:09.650495: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: lstm_gru_7_9/assets


INFO:tensorflow:Assets written to: lstm_gru_7_9/assets


In [210]:
loaded_model = models.load_model('lstm_gru_7_9')

In [211]:
loaded_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, None, 50)          9150      
_________________________________________________________________
gru_1 (GRU)                  (None, 20)                4320      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 21        
Total params: 13,491
Trainable params: 13,491
Non-trainable params: 0
_________________________________________________________________
