In [1]:
#imports các thư viện
import numpy as np
from math import sqrt
from numpy import concatenate

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle

from matplotlib import pyplot
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat


import keras.utils
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Activation
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.layers import GRU
from keras.optimizers import Adam

import tensorflow as tf
import seaborn as sn
import seed
import os
# Restrict TensorFlow's Python logger to messages of level ERROR.
tf.get_logger().setLevel('ERROR')

In [None]:
def rsi(data, period: int = 14):
    """Compute the Relative Strength Index of the ``"Close"`` column.

    The one-step differences of ``data["Close"]`` are split into their
    non-negative part (gains) and non-positive part (losses). Each part is
    smoothed with an exponentially weighted mean using ``com=period - 1``,
    which only produces values once ``period`` observations are available.

    Args:
        data: Object indexable by ``"Close"`` that yields a pandas Series.
        period: Smoothing period, also used as the minimum number of
            observations required before a value is emitted.

    Returns:
        Series holding ``100 - 100 / (1 + gain / loss)``; entries are NaN
        until enough observations have been seen.
    """
    close_diff = data["Close"].diff()

    # Negative moves become 0 in `rises`, positive moves become 0 in `falls`.
    rises = close_diff.clip(lower=0)
    falls = close_diff.clip(upper=0)

    smoothing = {"com": period - 1, "min_periods": period}
    avg_gain = rises.ewm(**smoothing).mean()
    avg_loss = falls.abs().ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

In [None]:
# Sentiment analyser whose dataset is loaded: "vader" or "TextBlob".
analyser = "vader"
# Aggregation level of the dataset: "day" or "hour".
dataset_grouped_by = "hour"
# Unit of the lag: "days" or "hours".
lag_granularity = "hours"
# Lag value, expressed in `lag_granularity` units.
lag = 1

In [None]:
# Build the dataset path from the configured analyser, grouping and lag,
# then load it with the DateTime column parsed and used as the index.
folder = (
    "./../../datasets/tweets_prices_volumes_sentiment/"
    f"{analyser}/{dataset_grouped_by}_datasets/cleaned"
)
if lag > 0:
    filename = f"{folder}/final_data_lag_{lag_granularity}_{lag}.csv"
else:
    # A non-positive lag selects the dataset without any lag applied.
    filename = f"{folder}/final_data_no_lag.csv"
df = pd.read_csv(filename, index_col='DateTime', parse_dates=True)


In [None]:
# Collapse rows sharing the same DateTime into one row holding the
# per-column mean, then display the result.
rows_by_datetime = df.groupby('DateTime')
df = rows_by_datetime.agg(lambda column: column.mean())
df

In [None]:
# Derive the target label and the technical indicators (RSI, moving averages).

# Label: 1 when Close is higher than the previous row's Close, otherwise 0.
df["Change"] = (df["Close"] > df["Close"].shift(1)).astype(int)

# Switches for the optional indicator columns (also read when the feature
# list is assembled).
add_RSI = True
add_longMAvg = True
add_shortMAvg = True

if add_RSI:
    # RSI period. rsi() needs `RSI` price differences before it emits a
    # value, so the first `RSI` rows are NaN and are dropped here - once.
    RSI = 14
    df['RSI'] = rsi(df, RSI)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice (which triggers SettingWithCopyWarning).
    df = df.iloc[RSI:].copy()

# Moving averages of Close. A rolling mean over `window` rows is NaN for the
# first `window - 1` rows only, so that is the number of warm-up rows to drop.
warmup_rows = 0

if add_shortMAvg:
    short_window = 9
    df['short_mavg'] = df["Close"].rolling(window=short_window).mean()
    warmup_rows = max(warmup_rows, short_window - 1)

if add_longMAvg:
    long_window = 21
    df["long_mavg"] = df["Close"].rolling(window=long_window).mean()
    warmup_rows = max(warmup_rows, long_window - 1)

# Drop the moving-average warm-up rows. The previous if/elif chain removed the
# RSI rows a second time when the long average was disabled, and removed one
# valid row too many (`window` instead of `window - 1`) otherwise.
df = df.iloc[warmup_rows:].copy()

In [None]:
# Keep only the columns used downstream, depending on the chosen analyser.
# The analyser name is compared case-insensitively: the configuration cell
# spells the value "TextBlob", which the former exact comparison against
# "Textblob" never matched, so the vader column list was selected silently.
if analyser.lower() == "textblob":
    # NOTE(review): this list has no 'Close' column, while later cells take
    # the first two columns as targets and report metrics for "Close" -
    # confirm the intended TextBlob column order before using this branch.
    features = ['Change', 'subjectivity', 'polarity', 'Tweet_vol', 'Volume_(BTC)']
else:
    features = ['Change', 'Close', 'pos_pol', 'neg_pol', 'Tweet_vol']

# Append the optional indicator columns that were enabled.
if add_RSI:
    features.append("RSI")

if add_longMAvg:
    features.append("long_mavg")

if add_shortMAvg:
    features.append("short_mavg")

df = df[features]

In [None]:
# Annotated heatmap of the pairwise correlations between the kept columns.
correlations = df.corr()
sn.heatmap(data=correlations, annot=True)
plt.show()


In [None]:
# Work on a copy from here on so that `df` stays available and the dataset
# does not have to be loaded again.
df_copy = df.copy()

In [None]:
# Window length: number of previous records that form one example.
n_lag = 7
# Number of selected columns.
n_features = len(features)
# Values per window; falls back to n_features when the product is 0.
total_features = n_lag * n_features or n_features

In [None]:
# Alias (same object, not a copy) of df_copy under the name the following
# cells use. No lagged columns are added in this cell despite the name.
data_with_lagged = df_copy
data_with_lagged

In [None]:
# Chronological split: the first `train_ratio` share of the rows is used
# for training.
train_ratio = 0.85
data_len = len(data_with_lagged)
train_size = int(data_len * train_ratio)

# Only the first two columns are modelled. The DateTime index is turned into
# a column and dropped, leaving a positional index.
first_two_cols = data_with_lagged.iloc[:, :2]
data_hours = first_two_cols.reset_index().drop(columns=['DateTime'])
train = first_two_cols.iloc[:train_size].reset_index().drop(columns=['DateTime'])

data_hours

In [None]:
# Min-max scaling to the range (0, 1). The scaler is fitted on the training
# rows only and then applied to both the training rows and the full series.
# Note: `train` is overwritten with the scaled array, so re-running this cell
# without re-running the split cell would fit the scaler on scaled data.
xscaler = MinMaxScaler(feature_range=(0, 1))
xscaler.fit(train)
train = xscaler.transform(train)
scaler_data = xscaler.transform(data_hours)
print(train.shape, scaler_data.shape)

In [None]:
# Sliding windows over the scaled training rows: every sample holds n_lag
# consecutive rows and its target is the row that follows them.
window_ends = range(n_lag, len(train))
X_train = np.asarray([train[end - n_lag:end, :] for end in window_ends])
y_train = np.asarray([train[end, :] for end in window_ends])

print(X_train.shape, y_train.shape)

In [None]:
# Seeding is left disabled, as in the original cell; uncomment for
# repeatable weight initialisation.
# np.random.seed(1)
# tf.random.set_seed(1)

# Hyper-parameters. `epochs` and `batch_size` are read when the network is
# fitted; `dropout` is not referenced by any layer defined in this cell.
neurons = 64
epochs = 50
dropout = 0.25
batch_size = 80
activ_func = "relu"

# Network: two stacked GRU layers followed by a 2-unit dense output layer.
# The input shape is taken from the training windows (axes 1 and 2).
sequence_shape = (X_train.shape[1], X_train.shape[2])
model_gru_hour_1 = Sequential([
    GRU(neurons, return_sequences=True, input_shape=sequence_shape,
        activation=activ_func),
    GRU(neurons, return_sequences=False),
    Dense(2),
])
model_gru_hour_1.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Early stopping: end training once val_loss has not improved for 20 epochs.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

# Share of the training windows held out for validation. Without validation
# data Keras never computes 'val_loss', so the callback had nothing to
# monitor. This reduces the data the weights are trained on by that share.
validation_split = 0.1

# Fit the network. The callback was previously created but never passed to
# fit(), so early stopping was not active at all.
history = model_gru_hour_1.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=validation_split,
    callbacks=[callback],
)

In [None]:
# Build the test set. The scaled slice starts n_lag rows before the split
# point so that the first test row already has a full window of history.
test_data = scaler_data[train_size - n_lag:, :]
# Unscaled ground truth: first two columns of the rows after the split.
y_test = data_with_lagged.iloc[train_size:, :2]

x_test = np.asarray(
    [test_data[end - n_lag:end, :] for end in range(n_lag, len(test_data))]
)
print(x_test.shape, y_test.shape)

In [None]:
# Predict on the test windows, then undo the min-max scaling so that the
# values are on the original scale of the two modelled columns.
scaled_predictions = model_gru_hour_1.predict(x_test)
predictions = xscaler.inverse_transform(scaled_predictions)
predictions

In [None]:
# Error metrics, one value per column of y_test.
# DataFrame methods with their per-column defaults are used instead of
# np.mean / np.sum, which pass axis=None to pandas; depending on the pandas
# version that reduces over both axes and returns a scalar, which would break
# the ["Close"] lookups below.
errors = y_test - predictions

mse = (errors ** 2).mean()
# MAE is the mean of the absolute errors. The previous abs(mean(error)) let
# errors of opposite sign cancel out and under-reported the error.
mae = errors.abs().mean()
mape = (errors / y_test).abs().mean() * 100
smape = (100 / len(y_test)) * (
    2 * errors.abs() / (np.abs(predictions) + y_test.abs())
).sum(axis=0)

print(mse["Close"], mae["Close"], mape["Close"], smape["Close"])

In [None]:
# Rows after the split point, extended with the model output.
# .copy() makes `valid` an independent frame; adding a column to the bare
# slice writes into a view of data_with_lagged and raises
# SettingWithCopyWarning.
valid = data_with_lagged[train_size:].copy()
# Column 1 of the predictions corresponds to the second modelled column,
# which is plotted against Close below.
valid['Predictions'] = predictions[:, 1]

# Plot the actual Close values against the predictions for the test period.
plt.figure(figsize=(16,8))
plt.title('Model')
plt.xlabel('Date', fontsize=18)
plt.ylabel('Giá (USD)', fontsize=18)
plt.plot(valid[['Close','Predictions']])
plt.legend(['Val', 'Predictions'], loc='lower right')
plt.show()

In [None]:
# Collect the Close-column metrics into a table, write it to
# result-hours.csv (the file is written on every run) and display it.
metric_names = ['MAE', 'MSE', 'MAPE', 'sMAPE']
price_metrics = [metric["Close"] for metric in (mae, mse, mape, smape)]
data = {
    'Model': 'GRU',
    'Thông tin': metric_names,
    'Price': price_metrics,
    # A 'Price + Twitter' column (mae1, mse1, mape1, smape1) was left disabled.
}
result_table = pd.DataFrame(data)
result_table.to_csv("result-hours.csv", index=False)
result_table