# Bitcoin Time Series Prediction with LSTM

In [None]:
!sudo pip3 install keras

In [None]:
!sudo pip3 install pandas

In [None]:
!sudo pip3 install plotly

In [None]:
!sudo pip3 install seaborn

In [None]:
!sudo pip3 install numpy

In [None]:
!sudo pip3 install sklearn

In [None]:
!sudo pip3 install tensorflow

In [None]:
!sudo pip3 install quandl

#### Import necessary library needed for the model training

In [1]:
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import plotly.offline as py
import plotly.graph_objs as go
import numpy as np
import seaborn as sns
py.init_notebook_mode(connected=True)
%matplotlib inline

Using TensorFlow backend.


#### Use Quandl api to get BTC/USD data from kraken exchange

In [2]:
import quandl
data = quandl.get('BCHARTS/KRAKENUSD', returns='pandas')

In [None]:
bitcoin_data= quandl.get('BITFINEX/BTCUSD',returns='pandas')
eth_data= quandl.get("BITFINEX/ETHUSD",returns='pandas')

#### Read data set

In [3]:
data1 = pd.read_csv(filepath_or_buffer="bitcoinprices.txt")
data2 = pd.read_csv(filepath_or_buffer="sentiment6.txt")

#### View data info

In [4]:
data1['stamp']=data1.iloc[:,0]
data1['price']=data1.iloc[:,1]
data1.drop(data1.columns[ [0,1]],axis=1,inplace=True)

In [5]:
data1.head()

Unnamed: 0,stamp,price
0,20161118,750.9
1,20160613,690.9
2,20160808,588.7
3,20170206,1047.3
4,20161125,737.3


In [6]:
data2['stamp']=data2.iloc[:,0]
data2['sentiment']=data2.iloc[:,1]
data2.drop(data2.columns[0],axis=1,inplace=True)
data2.drop(data2.columns[0],axis=1,inplace=True)

In [7]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 642 entries, 0 to 641
Data columns (total 2 columns):
stamp        642 non-null int64
sentiment    642 non-null object
dtypes: int64(1), object(1)
memory usage: 10.1+ KB


In [8]:

#Remove any entries with sentiment=None
data2.drop( data2.index[data2['sentiment'].str.contains("None") ]  ,axis=0,inplace=True)

In [9]:
data2[data2['sentiment'].str.contains("None") ]

Unnamed: 0,stamp,sentiment


In [10]:
data2.info()
data2.dropna(axis=1,how='all')
data2['sentiment'] = data2.sentiment.astype(float)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 641 entries, 0 to 641
Data columns (total 2 columns):
stamp        641 non-null int64
sentiment    641 non-null object
dtypes: int64(1), object(1)
memory usage: 15.0+ KB


#### View data rows

In [11]:
data1.head()

Unnamed: 0,stamp,price
0,20161118,750.9
1,20160613,690.9
2,20160808,588.7
3,20170206,1047.3
4,20161125,737.3


In [12]:
data2.head()

Unnamed: 0,stamp,sentiment
0,20171113,0.380219
1,20171112,0.380219
2,20171111,0.380219
3,20171110,0.380219
4,20171109,0.380219


#### Join on data frame

In [18]:
data = pd.merge(data1,data2, on='stamp', how='inner')

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 640 entries, 0 to 639
Data columns (total 3 columns):
stamp        640 non-null int64
price        640 non-null float64
sentiment    640 non-null float64
dtypes: float64(2), int64(1)
memory usage: 20.0 KB


In [20]:
data.describe()

Unnamed: 0,stamp,price,sentiment
count,640.0,640.0,640.0
mean,20165620.0,1618.316562,0.302382
std,4940.311,1591.560667,0.095483
min,20160210.0,379.2,-0.040372
25%,20160720.0,603.5,0.249485
50%,20161230.0,868.4,0.304928
75%,20170610.0,2351.95,0.379694
max,20171110.0,7489.9,0.51851


In [21]:
data['stamp'] = pd.to_datetime(data['stamp'].apply(str),format='%Y%m%d')
data = data.sort_values(by='stamp', ascending=[ True])

data.head()

Unnamed: 0,stamp,price,sentiment
571,2016-02-11,379.2,0.306488
425,2016-02-12,389.0,0.260142
211,2016-02-13,397.7,0.304902
112,2016-02-14,406.8,0.308635
588,2016-02-15,400.9,0.321268


#### Plot line graph base on `Weighted Price`

In [22]:
btc_trace = go.Scatter(x=data['stamp'], y=data['price'], name= 'Price')
py.iplot([btc_trace])

#### Fill value 0 data points on `Weighted Price` with NAN and then use ffill method to fill values

In [23]:
data['price'].replace(0, np.nan, inplace=True)
data['price'].fillna(method='ffill', inplace=True)

#### Plot new line graph again on `Weighted Price` with newly filled values

In [24]:
btc_trace = go.Scatter(x=data['stamp'], y=data['price'], name= 'Price')
py.iplot([btc_trace])

In [25]:
data.head()

Unnamed: 0,stamp,price,sentiment
571,2016-02-11,379.2,0.306488
425,2016-02-12,389.0,0.260142
211,2016-02-13,397.7,0.304902
112,2016-02-14,406.8,0.308635
588,2016-02-15,400.9,0.321268


### Using `Weighted Price` as a feature to train the LSTM model 

#### Use MinMaxScaler to normalize `Weighted Price` to range from 0 to 1

In [None]:
from sklearn.preprocessing import MinMaxScaler
values = data['price'].values.reshape(-1,1)
sentiment = data['sentiment'].values.reshape(-1,1)
values = values.astype('float32')
sentiment = values.astype('float32')
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

#### Split 70% of data for training and 30% for testing

In [None]:
train_size = int(len(scaled) * 0.7)
test_size = len(scaled) - train_size
train, test = scaled[0:train_size,:], scaled[train_size:len(scaled),:]
print(len(train), len(test))
split = train_size

#### Create function for creating dataset with look back

In [None]:
def create_dataset(dataset, look_back, sentiment):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        a = dataset[i:(i + look_back), 0]
        np.append(a,sentiment[i])
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    print(len(dataY))
    return np.array(dataX), np.array(dataY)

#### Generate dataset for trainX, trainY, testX, testY

In [None]:
look_back = 1
trainX, trainY = create_dataset(train, look_back, sentiment[0:train_size])
testX, testY = create_dataset(test, look_back, sentiment[train_size:len(scaled)])

In [None]:
trainX.shape

#### Reshape X for model training

In [None]:
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

#### Running the LSTM model with 300 epochs

In [None]:
model = Sequential()
model.add(LSTM(100, input_shape=(trainX.shape[1], trainX.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
history = model.fit(trainX, trainY, epochs=300, batch_size=100, validation_data=(testX, testY), verbose=0, shuffle=False)

#### Plot line graph to show amount loss according the the epoch

In [None]:
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

#### Make prediction using textX and plotting line graph against testY

In [None]:
yhat = model.predict(testX)
pyplot.plot(yhat, label='predict')
pyplot.plot(testY, label='true')
pyplot.legend()
pyplot.show()

#### Scaler Inverse Y back to normal value

In [None]:
yhat_inverse = scaler.inverse_transform(yhat.reshape(-1, 1))
testY_inverse = scaler.inverse_transform(testY.reshape(-1, 1))

#### RMSE

In [None]:
rmse = sqrt(mean_squared_error(testY_inverse, yhat_inverse))
print('Test RMSE: %.3f' % rmse)

#### Plot line graph with Y as USD

In [None]:
pyplot.plot(yhat_inverse, label='predict')
pyplot.plot(testY_inverse, label='actual', alpha=0.5)
pyplot.legend()
pyplot.show()

#### Convert X to dates

In [None]:
predictDates = data.tail(len(testX)).stamp

#### Reshape testY and yhat for plotly

In [None]:
testY_reshape = testY_inverse.reshape(len(testY_inverse))
yhat_reshape = yhat_inverse.reshape(len(yhat_inverse))

#### Plot predicted and actual line graph with X=dates, Y=USD

In [None]:
actual_chart = go.Scatter(x=predictDates, y=testY_reshape, name= 'Actual Price')
predict_chart = go.Scatter(x=predictDates, y=yhat_reshape, name= 'Predict Price')
py.iplot([predict_chart, actual_chart])

### Using additional features for model training

#### Find corrleration in features to `Weighted Price`

In [None]:
sns.heatmap(data.corr(), annot=True, cmap='RdYlGn', linewidths=0.1, vmin=0)

Observation: `Volume` is corrlerated to `Weighted Price`. 
`Open`, `High`, `Low`, `Close` are directly related to `Weighted Price`

#### Function to convert series to supervised learning

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

#### Get all data values

In [None]:
values = data[['price']].values
values = values.astype('float32')

#### Normalize features to range from 0 to 1

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

#### Frame as supervised learning

In [None]:
reframed = series_to_supervised(scaled, 1, 1)
reframed.head()

#### Drop unncessary columns 

In [None]:
#reframed.drop(reframed.columns[[1,3]], axis=1, inplace=True)
#print(reframed.head())

#### Split data to 70% training, 30% testing

In [None]:
values = reframed.values
n_train_hours = int(len(values) * 0.7)
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

#### Training the LSTM model with 300 epochs

In [None]:
multi_model = Sequential()
multi_model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
multi_model.add(Dense(1))
multi_model.compile(loss='mae', optimizer='adam')
multi_history = multi_model.fit(train_X, train_y, epochs=300, batch_size=100, validation_data=(test_X, test_y), verbose=0, shuffle=False)

#### Plot line graph to show amount loss according the the epoch

In [None]:
pyplot.plot(multi_history.history['loss'], label='multi_train')
pyplot.plot(multi_history.history['val_loss'], label='multi_test')
pyplot.legend()
pyplot.show()

#### Make prediction using textX and plotting line graph against testY

In [None]:
yhat = multi_model.predict(test_X)
pyplot.plot(yhat, label='predict')
pyplot.plot(test_y, label='true')
pyplot.legend()
pyplot.show()

#### Scaler Inverse Y back to normal value

In [None]:
test_X = test_X.reshape((test_X.shape[0], test_X.shape[1]))
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]

#### RMSE

In [None]:
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

#### Plot line graph with actual price, predicted price with feature `Weighted Price`, predicted price with features `Volume` and `Weighted Price`

In [None]:
actual_chart = go.Scatter(x=predictDates, y=inv_y, name= 'Actual Price')
multi_predict_chart = go.Scatter(x=predictDates, y=inv_yhat, name= 'Multi Predict Price')
predict_chart = go.Scatter(x=predictDates, y=yhat_reshape, name= 'Predict Price')
py.iplot([predict_chart, multi_predict_chart, actual_chart])

- LSTM with single feature of `Weighted Price` have RMSE of 159.194
- LSTM with features of `Volume(BTC)`, `Volume(Currency)` and `Weighted Price` have RMSE of 96.184
- LSTM with multi features shows more accurate results as show in line chart above