**This is the final project in Data Analysis.**

Goals:
1. Analyze the cryptocurrency market in the selected time period
2. Try to predict the results of exchange rates (without focusing on events)
3. Analyze current profitability and risk
4. Prepare forecasted financial statements
5. Analyze the share of cryptocurrencies in the economy

Description of variables

* slug - unique name of cryptocurrency (text)
* symbol - unique short name (text)
* name - name of cryptocurrency (text)
* date - dates (categorical)
* ranknow - market entry (ordinal)
* open - starting bid price (numerical)
* high - highest bid price (numerical)
* low - lowest bid price (numerical)
* close - closing bid price (numerical)
* volume - number of transactions (quantitative)
* market - market capitalization (numerical)
* close_ratio - difference between open and close price (numerical)
* spread - difference between the lowest and the highest price (numerical)

In [134]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import datetime as dt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.express as px
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE, SelectKBest, chi2, VarianceThreshold

import xgboost as xgb

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/all-crypto-currencies/crypto-markets.csv


Read the data and drop the symbols

In [135]:
df = pd.read_csv("/kaggle/input/all-crypto-currencies/crypto-markets.csv")

In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 942297 entries, 0 to 942296
Data columns (total 13 columns):
slug           942297 non-null object
symbol         942297 non-null object
name           942297 non-null object
date           942297 non-null object
ranknow        942297 non-null int64
open           942297 non-null float64
high           942297 non-null float64
low            942297 non-null float64
close          942297 non-null float64
volume         942297 non-null float64
market         942297 non-null float64
close_ratio    942297 non-null float64
spread         942297 non-null float64
dtypes: float64(8), int64(1), object(4)
memory usage: 93.5+ MB


In [137]:
df.describe()

Unnamed: 0,ranknow,open,high,low,close,volume,market,close_ratio,spread
count,942297.0,942297.0,942297.0,942297.0,942297.0,942297.0,942297.0,942297.0,942297.0
mean,1000.170608,348.3522,408.593,296.2526,346.1018,8720383.0,172506000.0,0.459499,112.34
std,587.575283,13184.36,16163.86,10929.31,13098.22,183980200.0,3575590000.0,0.32616,6783.713
min,1.0,2.5e-09,3.2e-09,2.5e-10,2e-10,0.0,0.0,-1.0,0.0
25%,465.0,0.002321,0.002628,0.002044,0.002314,175.0,29581.0,0.1629,0.0
50%,1072.0,0.023983,0.026802,0.021437,0.023892,4278.0,522796.0,0.4324,0.0
75%,1484.0,0.22686,0.250894,0.204391,0.225934,119090.0,6874647.0,0.7458,0.03
max,2072.0,2298390.0,2926100.0,2030590.0,2300740.0,23840900000.0,326502500000.0,1.0,1770563.0


In [138]:
dfnum = df.drop(['symbol', 'slug', 'name', 'date'], axis=1)

In [139]:
dfnum.mean()  # Mean value

ranknow        1.000171e+03
open           3.483522e+02
high           4.085930e+02
low            2.962526e+02
close          3.461018e+02
volume         8.720383e+06
market         1.725060e+08
close_ratio    4.594995e-01
spread         1.123400e+02
dtype: float64

In [140]:
def dataframe_range(dataframe):  # Data range
    df_range = pd.DataFrame(dataframe.max() - dataframe.min())
    return df_range

In [141]:
dataframe_range(dfnum)

Unnamed: 0,0
ranknow,2071.0
open,2298390.0
high,2926100.0
low,2030590.0
close,2300740.0
volume,23840900000.0
market,326502500000.0
close_ratio,2.0
spread,1770563.0


In [142]:
dfnum.std()  # Standard deviation

ranknow        5.875753e+02
open           1.318436e+04
high           1.616386e+04
low            1.092931e+04
close          1.309822e+04
volume         1.839802e+08
market         3.575590e+09
close_ratio    3.261605e-01
spread         6.783713e+03
dtype: float64

In [143]:
dfnum.std() ** 2  # Dispersion is squared degree of standard deviation

ranknow        3.452447e+05
open           1.738273e+08
high           2.612703e+08
low            1.194499e+08
close          1.715634e+08
volume         3.384870e+16
market         1.278484e+19
close_ratio    1.063807e-01
spread         4.601876e+07
dtype: float64

In [144]:
df.isnull().sum()  # Checking NULLs, we're lucky

slug           0
symbol         0
name           0
date           0
ranknow        0
open           0
high           0
low            0
close          0
volume         0
market         0
close_ratio    0
spread         0
dtype: int64

In [145]:
df = df.drop(['slug'], axis=1)  # Drop useless columns

df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')  # Transform date to date object

In [146]:
df.head(10)

Unnamed: 0,symbol,name,date,ranknow,open,high,low,close,volume,market,close_ratio,spread
0,BTC,Bitcoin,2013-04-28,1,135.3,135.98,132.1,134.21,0.0,1488567000.0,0.5438,3.88
1,BTC,Bitcoin,2013-04-29,1,134.44,147.49,134.0,144.54,0.0,1603769000.0,0.7813,13.49
2,BTC,Bitcoin,2013-04-30,1,144.0,146.93,134.05,139.0,0.0,1542813000.0,0.3843,12.88
3,BTC,Bitcoin,2013-05-01,1,139.0,139.89,107.72,116.99,0.0,1298955000.0,0.2882,32.17
4,BTC,Bitcoin,2013-05-02,1,116.38,125.6,92.28,105.21,0.0,1168517000.0,0.3881,33.32
5,BTC,Bitcoin,2013-05-03,1,106.25,108.13,79.1,97.75,0.0,1085995000.0,0.6424,29.03
6,BTC,Bitcoin,2013-05-04,1,98.1,115.0,92.5,112.5,0.0,1250317000.0,0.8889,22.5
7,BTC,Bitcoin,2013-05-05,1,112.9,118.8,107.14,115.91,0.0,1288693000.0,0.7521,11.66
8,BTC,Bitcoin,2013-05-06,1,115.98,124.66,106.64,112.3,0.0,1249023000.0,0.3141,18.02
9,BTC,Bitcoin,2013-05-07,1,112.25,113.44,97.7,111.5,0.0,1240594000.0,0.8767,15.74


Traders still love to analyze the concept of HLC (and OHLC|HL) [proof](https://www.mypivots.com/dictionary/definition/92/hlc-3)

In [147]:
df['ohlc_average'] = (df['open'] + df['high'] + df['low'] + df['close']) / 4

Checking other currencies

In [148]:
top10 = df[(df['ranknow'] >= 1) & (df['ranknow'] <= 10)]
top10.name.unique()

array(['Bitcoin', 'XRP', 'Ethereum', 'Stellar', 'Bitcoin Cash', 'EOS',
       'Litecoin', 'Tether', 'Bitcoin SV', 'Cardano'], dtype=object)

*Volume* - All trades buys and sells that were made during that time (for example 24 hours like coinmarketcap does by default).

*Circulating supply* - number of coins mined and existing right now.

*Marketcap* = circulating supply multiply by price of coin.

In [149]:
fig = px.pie(top10, values='volume', names='name', title='Cryptocurrencies Top-10 by Transaction Volume')
fig.show()

In [150]:
fig = px.pie(top10, values='market', names='name', title='Cryptocurrencies Top-10 by Market capitalization')
fig.show()

In [151]:
fig = tools.make_subplots(subplot_titles=('Time'))
for name in top10.name.unique():
    currency = top10[top10['name'] == name]
    trace = go.Scatter(x=currency['date'], y=currency['ohlc_average'], name=name)
    fig.append_trace(trace, 1, 1)
    
fig['layout'].update(title='Top-10 Cryptocurrencies exchange rates comparison')
fig['layout']['yaxis1'].update(title='USD')
fig.show()


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



Adding minor cryptocurrencies that not affect too much on the market

In [152]:
top10minorCurrencies = df[(df['ranknow'] >= 11) & (df['ranknow'] <= 21)]

top10minorCurrencies.name.unique()

array(['Monero', 'TRON', 'IOTA', 'Dash', 'NEM', 'Binance Coin', 'NEO',
       'Ethereum Classic', 'Zcash', 'Bitcoin Gold', 'Tezos'], dtype=object)

In [153]:
fig = px.pie(top10minorCurrencies, values='volume', names='name', title='Minor Cryptocurrencies by Transaction Volume')
fig.show()

In [154]:
fig = px.pie(top10minorCurrencies, values='market', names='name', title='Minor Cryptocurrencies by Market capitalization')
fig.show()

In [155]:
fig = tools.make_subplots(subplot_titles=('Time'))
for name in top10minorCurrencies.name.unique():
    currency = top10minorCurrencies[top10minorCurrencies['name'] == name]
    trace = go.Scatter(x=currency['date'], y=currency['ohlc_average'], name=name)
    fig.append_trace(trace, 1, 1)
    
fig['layout'].update(title='Top-10 Cryptocurrencies exchange rates comparison')
fig['layout']['yaxis1'].update(title='USD')
fig.show()

In [210]:
currency = df[df['name']=='Bitcoin']
currency.head()

Unnamed: 0,symbol,name,date,ranknow,open,high,low,close,volume,market,close_ratio,spread,ohlc_average
0,BTC,Bitcoin,2013-04-28,1,135.3,135.98,132.1,134.21,0.0,1488567000.0,0.5438,3.88,134.3975
1,BTC,Bitcoin,2013-04-29,1,134.44,147.49,134.0,144.54,0.0,1603769000.0,0.7813,13.49,140.1175
2,BTC,Bitcoin,2013-04-30,1,144.0,146.93,134.05,139.0,0.0,1542813000.0,0.3843,12.88,140.995
3,BTC,Bitcoin,2013-05-01,1,139.0,139.89,107.72,116.99,0.0,1298955000.0,0.2882,32.17,125.9
4,BTC,Bitcoin,2013-05-02,1,116.38,125.6,92.28,105.21,0.0,1168517000.0,0.3881,33.32,109.8675


In [221]:
currency['target'] = currency['close'].shift(-30)

In [222]:
X = currency.dropna().copy()
X['year'] = X['date'].apply(lambda x: x.year)
X['month'] = X['date'].apply(lambda x: x.month)
X['day'] = X['date'].apply(lambda x: x.day)
X = X.drop(['date', 'symbol', 'name', 'ranknow', 'target'], axis=1)

y = currency.dropna()['target']

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=1)

X_train.shape, X_test.shape

((1609, 12), (403, 12))

In [223]:
forecast = currency[currency['target'].isnull()]
forecast = forecast.drop('target', axis=1)

X_forecast = forecast.copy()
X_forecast['year'] = X_forecast['date'].apply(lambda x: x.year)
X_forecast['month'] = X_forecast['date'].apply(lambda x: x.month)
X_forecast['day'] = X_forecast['date'].apply(lambda x: x.day)
X_forecast = X_forecast.drop(['date', 'symbol', 'name', 'ranknow'], axis=1)

In [224]:
currency = currency.drop('target', axis=1)

In [225]:
classifiers = {
    'LinearRegression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=1)
}

summary = list()
for name, clf in classifiers.items():
    print(name)
    nada = clf.fit(X_train, y_train)
    
    print(f'R2: {r2_score(y_test, clf.predict(X_test)):.2f}')
    print(f'MAE: {mean_absolute_error(y_test, clf.predict(X_test)):.2f}')
    print(f'MSE: {mean_squared_error(y_test, clf.predict(X_test)):.2f}')
    print()
    
    summary.append({
        'MSE': mean_squared_error(y_test, clf.predict(X_test)),
        'MAE': mean_absolute_error(y_test, clf.predict(X_test)),
        'R2': r2_score(y_test, clf.predict(X_test)),
        'name': name,
    })

LinearRegression
R2: 0.87
MAE: 613.77
MSE: 1854888.16

Random Forest Regressor
R2: 0.99
MAE: 153.39
MSE: 182977.71



In [226]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train.values, y_train.values)
dtest = xgb.DMatrix(X_test.values)

param = {
    'max_depth': 10,
    'eta': 0.3
}
num_round = 20
bst = xgb.train(param, dtrain, num_round)
# make prediction
print('XGBoost')
print(f'R2: {r2_score(y_test, bst.predict(dtest)):.2f}')
print(f'MAE: {mean_absolute_error(y_test, bst.predict(dtest)):.2f}')
print(f'MSE: {mean_squared_error(y_test, bst.predict(dtest)):.2f}')

summary.append({
    'MSE': mean_squared_error(y_test, bst.predict(dtest)),
    'MAE': mean_absolute_error(y_test, bst.predict(dtest)),
    'R2': r2_score(y_test, bst.predict(dtest)),
    'name': 'XGBoost',
})

XGBoost
R2: 0.99
MAE: 148.76
MSE: 160077.83


In [227]:
summary = pd.DataFrame(summary)

fig = tools.make_subplots(rows=1, cols=3, subplot_titles=(
    'R-квадратичная ошибка', 'Средняя абсолютная ошибка', 'Среднеквадратичная ошибка'
))

trace0 = go.Bar(x=summary['name'], y=summary['R2'], name='R2')
fig.append_trace(trace0, 1, 1)

trace1 = go.Bar(x=summary['name'], y=summary['MAE'], name='MAE')
fig.append_trace(trace1, 1, 2)

trace2 = go.Bar(x=summary['name'], y=summary['MSE'], name='MSE')
fig.append_trace(trace2, 1, 3)

fig['layout'].update(title='Сравнение метрик')
fig['layout'].update(showlegend=False)

py.iplot(fig)


plotly.tools.make_subplots is deprecated, please use plotly.subplots.make_subplots instead



In [230]:
clf = LinearRegression()
clf.fit(X_train, y_train)
target = clf.predict(X_forecast)

final = pd.concat([currency, forecast])
final = final.groupby('date').sum()

day_one_forecast = currency.iloc[-1].date + dt.timedelta(days=1)
date = pd.date_range(day_one_forecast, periods=30, freq='D')
predictions = pd.DataFrame(target, columns=['target'], index=date)
final = final.append(predictions)
final.index.names = ['date']
final = final.reset_index()


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [231]:
trace0 = go.Scatter(
    x=final['date'], y=final['close'],
    name='Close'
)

trace1 = go.Scatter(
    x=final['date'], y=final['target'],
    name='Target'
)

data = [trace0, trace1]
layout = go.Layout(
    title='Визуализация результатов предсказания курса BTC по линейной регрессии',
    yaxis={
        'title': 'USD',
        'nticks': 10,
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [228]:
clf = RandomForestRegressor(n_estimators=100, random_state=1)
clf.fit(X_train, y_train)
target = clf.predict(X_forecast)

final = pd.concat([currency, forecast])
final = final.groupby('date').sum()

day_one_forecast = currency.iloc[-1].date + dt.timedelta(days=1)
date = pd.date_range(day_one_forecast, periods=30, freq='D')
predictions = pd.DataFrame(target, columns=['target'], index=date)
final = final.append(predictions)
final.index.names = ['date']
final = final.reset_index()


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [229]:
trace0 = go.Scatter(
    x=final['date'], y=final['close'],
    name='Close'
)

trace1 = go.Scatter(
    x=final['date'], y=final['target'],
    name='Target'
)

data = [trace0, trace1]
layout = go.Layout(
    title='Визуализация результатов предсказания курса BTC по случайным деревьям',
    yaxis={
        'title': 'USD',
        'nticks': 10,
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [168]:
dtrain = xgb.DMatrix(X_train.values, y_train.values)
dtest = xgb.DMatrix(X_test.values)

param = {
    'max_depth': 10,
    'eta': 0.3
}
num_round = 20
bst = xgb.train(param, dtrain, num_round)

X_forecast = xgb.DMatrix(X_forecast.values)
target = bst.predict(X_forecast)

final = pd.concat([currency, forecast])
final = final.groupby('date').sum()

day_one_forecast = currency.iloc[-1].date + dt.timedelta(days=1)
date = pd.date_range(day_one_forecast, periods=30, freq='D')
predictions = pd.DataFrame(target, columns=['target'], index=date)
final = final.append(predictions)
final.index.names = ['date']
final = final.reset_index()


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [169]:
trace0 = go.Scatter(
    x=final['date'], y=final['close'],
    name='Close'
)

trace1 = go.Scatter(
    x=final['date'], y=final['target'],
    name='Target'
)

data = [trace0, trace1]
layout = go.Layout(
    title='Визуализация результатов предсказания курса BTC по xgboost',
    yaxis={
        'title': 'USD',
        'nticks': 10,
    },
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)