# **Global Cryptocurrency Price Database: EDA and Prediction Using LSTM**

I am a beginner in data analysis, so there may be issues in the code or in my interpretation of the data.  
Please feel free to leave suggestions in the comments. Thank you :)

In [None]:
# Packages

# Data Processing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 200
import seaborn as sns

# For Ignoring Warning
import warnings

# Statistics
import math
from scipy import stats
from scipy.stats import norm

# Deep Learning
import tensorflow as tf

# File Path
import os
# print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

In [None]:
# seed NumPy and TensorFlow with the same fixed value
SEED = 123
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# metadata table: load_data() below looks up a coin pair's CSV file in it,
# using the 'Coin Pair Name' and 'File Path' columns
df_metadata = pd.read_csv("/kaggle/input/global-cryptocurrency-price-database/metadata.csv")

def load_data(name, metadata = None, data_dir = "/kaggle/input/global-cryptocurrency-price-database/data"):
    """Load the price table of one coin pair.

    Parameters
    ----------
    name : value to look up in the 'Coin Pair Name' column of the metadata.
    metadata : optional DataFrame with 'Coin Pair Name' and 'File Path'
        columns. Defaults to the module-level ``df_metadata``.
    data_dir : directory that holds the per-coin CSV files.

    Returns
    -------
    DataFrame read from the CSV file of the first matching metadata row.

    Raises
    ------
    IndexError
        If no metadata row matches ``name``.
    """
    if metadata is None:
        metadata = df_metadata

    matches = metadata.loc[metadata['Coin Pair Name'] == name, 'File Path']

    # fail with a readable message instead of "index 0 is out of bounds"
    if matches.empty:
        raise IndexError(f"no coin pair named {name!r} in the metadata table")

    filename = matches.values[0]
    path = f"{data_dir}/{filename}"

    return pd.read_csv(path)

I will proceed with the analysis using Ethereum data.

In [None]:
# price table for the "Ethereum USD" pair; preview the first rows
df_ETH = load_data(name = "Ethereum USD")
df_ETH.head()

# 1. Overview

In [None]:
# column dtypes and non-null counts
df_ETH.info()

In [None]:
# convert the 'Date' column to datetime
df_ETH["Date"] = pd.to_datetime(df_ETH["Date"])

# check that the dtype changed
df_ETH.info()

In [None]:
# number of missing values per column
df_ETH.isna().sum()

In [None]:
# summary statistics, rounded to 3 decimals
df_ETH.describe().round(3)

# 2. EDA

## 2.1. Time series

In [None]:
# closing price over time
plt.figure(figsize = (8, 4), facecolor = "white")
sns.lineplot(x = df_ETH["Date"], y = df_ETH["Close"])
plt.show()

In [None]:
# traded volume over time
plt.figure(figsize = (8, 4), facecolor = "white")
sns.lineplot(x = df_ETH["Date"], y = df_ETH["Volume"])
plt.show()

In [None]:
# percentage change of 'Close' relative to the previous row
prev_close = df_ETH["Close"].shift(1)
df_ETH["Change(%)"] = (df_ETH["Close"] - prev_close) / prev_close * 100

plt.figure(figsize = (8, 4), facecolor = "white")
sns.lineplot(x = df_ETH["Date"], y = df_ETH["Change(%)"])
plt.show()

## 2.2. Distribution

In [None]:
def summary_numerical_dist(df_data, col, q_min, q_max):
    """Draw a 2x2 distribution summary of one numerical column.

    Panels, row by row: histogram with a KDE curve, QQ plot against the
    normal distribution, boxplot, and a line plot of the column's quantiles.

    Parameters
    ----------
    df_data : DataFrame that holds the column.
    col : name of the column to summarise.
    q_min, q_max : the last panel plots the quantiles at levels starting at
        ``q_min`` and increasing in steps of 0.01 towards ``q_max``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # plot size
    plt.figure(figsize = (10, 8), facecolor = "white")

    # 2x2 grid, one axes per cell, filled row by row
    layout_plot = (2, 2)
    axes = [
        plt.subplot2grid(layout_plot, loc)
        for loc in [(0, 0), (0, 1), (1, 0), (1, 1)]
    ]

    # histplot: axes[0]
    sns.histplot(
        data = df_data,
        x = col,
        kde = True,
        ax = axes[0]
    )

    # QQ plot: axes[1]
    stats.probplot(
        x = df_data[col],
        dist = stats.norm,
        plot = axes[1]
    )

    # boxplot, to detect outlier: axes[2]
    sns.boxplot(
        data = df_data,
        x = col,
        ax = axes[2]
    )

    # lineplot, to detect outlier: axes[3]
    # np.arange with a float step can overshoot its stop value through
    # rounding error; clip the levels to [0, 1] so that Series.quantile
    # never receives an out-of-range level
    levels = np.clip(np.arange(q_min, q_max, 0.01), 0.0, 1.0)
    pts = df_data[col].quantile(q = levels)
    sns.lineplot(
        x = pts.index,
        y = pts,
        ax = axes[3]
    )
    axes[3].grid(True)

    # titles are set after plotting so they replace any title a plotting call set
    list_title = ["Histogram", "QQ plot", "Boxplot", "Outlier"]
    for ax, title in zip(axes, list_title):
        ax.set_title(title)

    # suptitle
    plt.suptitle(f"Distribution of: {col}", fontsize = 15)

    # interval adjustment and output
    plt.tight_layout()
    plt.show()

In [None]:
summary_numerical_dist(df_data = df_ETH, col = 'Open', q_min = .95, q_max = 1)

In [None]:
summary_numerical_dist(df_data = df_ETH, col = 'High', q_min = .95, q_max = 1)

In [None]:
summary_numerical_dist(df_data = df_ETH, col = 'Low', q_min = .95, q_max = 1)

In [None]:
summary_numerical_dist(df_data = df_ETH, col = 'Close', q_min = .95, q_max = 1)

In [None]:
summary_numerical_dist(df_data = df_ETH, col = 'Volume', q_min = .95, q_max = 1)

Overall, the distributions are right-skewed, so a logarithmic transformation appears to be needed.

# 3. Data Transformation

In [None]:
# empty frame; the next cell fills it with 'Date' and the log-transformed columns
df_ETH_transformed = pd.DataFrame()

In [None]:
# keep the dates alongside the transformed values
df_ETH_transformed['Date'] = df_ETH['Date']

# apply log(1 + x) to each price/volume column and inspect the result
log_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
for column in log_columns:
    df_ETH_transformed[column] = np.log1p(df_ETH[column])
    summary_numerical_dist(df_ETH_transformed, column, .95, 1)

In [None]:
# use 'Date' as the index so only the feature columns remain as columns
df_ETH_transformed = df_ETH_transformed.set_index('Date')
df_ETH_transformed.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# two independent scalers: one for 'Close' (the target), one for the other columns
scaler_not_y, scaler_y = MinMaxScaler(), MinMaxScaler()

In [None]:
# scaling
def scale_data(data, fit_rows = None):
    """Min-max scale all columns and place 'Close' in column 0.

    Parameters
    ----------
    data : DataFrame with a 'Close' column plus the remaining feature columns.
    fit_rows : optional int. When given, both scalers learn their ranges
        from the first ``fit_rows`` rows only (for example the training
        period) and are then applied to every row, so later rows do not
        influence the scaling. When None (default), the scalers are fitted
        on all rows.

    Returns
    -------
    2-D array with the scaled 'Close' first, followed by the other scaled
    columns in their original order.

    Side effect: refits the module-level ``scaler_not_y`` and ``scaler_y``.
    """
    arr_not_y = data.drop(['Close'], axis = 1).values
    arr_y = data['Close'].values.reshape(-1, 1)

    # slicing with None keeps every row, which reproduces the default behaviour
    scaler_not_y.fit(arr_not_y[:fit_rows])
    scaler_y.fit(arr_y[:fit_rows])

    arr_result = np.concatenate(
        [scaler_y.transform(arr_y), scaler_not_y.transform(arr_not_y)], 1
    )

    return arr_result

In [None]:
# scaled array; column 0 is 'Close'
arr_ETH_transformed = scale_data(data = df_ETH_transformed)

# 4. Modeling

## 4.1. Make Sequence and Split

In [None]:
# function that builds the sequence data
def create_sequences(data, seq_length):
    """Cut ``data`` into sliding windows and their next-row targets.

    Each window holds ``seq_length`` consecutive rows of ``data``; its
    target is column 0 of the row that follows the window.

    Returns a tuple ``(windows, targets)`` of NumPy arrays. Both are empty
    when ``data`` has no more than ``seq_length`` rows.
    """
    n_windows = len(data) - seq_length

    # rows start .. start + seq_length - 1: data of past days
    windows = [data[start:start + seq_length] for start in range(n_windows)]

    # column 0 of the row right after each window: 'Close' of next day
    targets = [data[start + seq_length][0] for start in range(n_windows)]

    return np.array(windows), np.array(targets)

In [None]:
seq_length = 10  # the number of past days to be used for predictions

# windows of seq_length rows and, for each window, the next row's 'Close'
X, y = create_sequences(data = arr_ETH_transformed, seq_length = seq_length)

In [None]:
# ordered split (no shuffling): first 70% of the windows train, the rest test
train_size = int(len(X) * 0.70)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

## 4.2. Model Building

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [None]:
# two stacked 50-unit LSTM layers followed by a single-output dense layer;
# the input shape (time steps, features) is read from the training windows
# instead of hard-coding the number of feature columns
model_LSTM = Sequential([
    LSTM(units = 50, activation = 'relu', return_sequences = True, input_shape = X_train.shape[1:]),
    LSTM(units = 50, activation = 'relu', return_sequences = False),
    Dense(units = 1)
])

model_LSTM.compile(optimizer = 'adam', loss = 'mean_squared_error')

## 4.3. Training

In [None]:
# train for 50 epochs; the history keeps the per-epoch losses
# NOTE(review): the test split is also used as the validation set here
model_LSTM_history = model_LSTM.fit(
    x = X_train,
    y = y_train,
    batch_size = 64,
    epochs = 50,
    validation_data = (X_test, y_test)
)

In [None]:
# loss curve: training loss in blue, validation loss in orange
plt.figure(figsize = (5, 3))

for key, colour in (('loss', "blue"), ('val_loss', "orange")):
    plt.plot(model_LSTM_history.history[key], color = colour)

plt.xlabel("epoch")
plt.legend(['loss','val_loss'])

plt.show()

## 4.4. Evaluation

In [None]:
# prediction on both splits, train first and test second
pred_train = model_LSTM.predict(X_train)
pred_test = model_LSTM.predict(X_test)
y_pred = np.concatenate([pred_train, pred_test])

# undo the min-max scaling, then undo the log1p transform
y_pred_rescaled = np.expm1(scaler_y.inverse_transform(y_pred))

In [None]:
# inverse scaling: actual y (undo min-max scaling, then undo log1p)
y_unscaled = scaler_y.inverse_transform(y.reshape(-1, 1))
y_rescaled = np.expm1(y_unscaled)

In [None]:
# actual vs. predicted close; the grey dashed line marks the train/test boundary
plt.figure(figsize = (8, 4), facecolor = "white")

plt.plot(y_rescaled, color = "red")
plt.plot(y_pred_rescaled, color = "blue", linestyle = "--")

pred_low = y_pred_rescaled.min()
pred_high = y_pred_rescaled.max()
plt.vlines(train_size, ymin = pred_low, ymax = pred_high, color = "grey", linestyle = "--", alpha = 0.5)

plt.legend(['actual','prediction'])

plt.show()

In [None]:
# df for analyzing residuals
df_ETH_LSTM_resid = pd.concat([pd.DataFrame(y_rescaled, columns = ['Close_actual']),
                               pd.DataFrame(y_pred_rescaled, columns = ['Close_pred'])],
                              axis = 1)

# y[i] is the 'Close' of row i + seq_length, so the matching dates start at
# row seq_length; assigning df_ETH['Date'] directly would align on the index
# and label every row seq_length rows too early
df_ETH_LSTM_resid['Date'] = df_ETH['Date'].iloc[seq_length:].to_numpy()
df_ETH_LSTM_resid['resid'] = df_ETH_LSTM_resid['Close_pred'] - df_ETH_LSTM_resid['Close_actual']

# check
df_ETH_LSTM_resid.head()

In [None]:
summary_numerical_dist(df_data = df_ETH_LSTM_resid, col = 'resid', q_min = .95, q_max = 1)

In [None]:
# residual plot: residuals against predictions with a LOWESS trend line
plt.figure(figsize = (4, 3), facecolor = "white")

sns.regplot(
    x = 'Close_pred', y = 'resid',
    data = df_ETH_LSTM_resid,
    lowess = True,
    scatter_kws = {'alpha': 0.25},
    line_kws = {'color': 'red'}
)

# zero-residual reference line across the range of predictions
pred_lo = df_ETH_LSTM_resid['Close_pred'].min()
pred_hi = df_ETH_LSTM_resid['Close_pred'].max()
plt.hlines(y = 0, xmin = pred_lo, xmax = pred_hi, linestyle = "--", color = "grey")

# symmetric y-limits around zero, wide enough for the largest absolute residual
ylim_max = df_ETH_LSTM_resid['resid'].abs().max()
plt.ylim(-ylim_max, ylim_max)

plt.show()

# 5. Conclusion

Overall, the prediction performance was not bad. However, **relying on such time series models for investment decisions can be challenging and risky in a real market**.  
The residuals also appear to deviate slightly from the assumptions of normality and homoskedasticity.

Again, please feel free to comment for suggestions! Thank you :)