In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import quandl #where quant data comes from
from datetime import datetime

In [2]:
import plotly
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)

### Retriving Bitcoin data thru API 
First we get data by using API.

In [3]:
f = open("access.txt", "r")
access = f.read()
quandl.ApiConfig.api_key = access

In [4]:
def get_quandl_data(quandl_id):
    '''Download and cache Quandl dataseries'''
    cache_path = '{}.pkl'.format(quandl_id).replace('/','-')
    try:
        f = open(cache_path,'rb')
        df = pickle.load(f)
        print('Loaded {} from cache'.format(quandl_id))
    except(OSError,IOError)as e:
        print('Downloading {} from Quandl'.format(quandl_id))
        df = quandl.get(quandl_id,returns = 'pandas')
        df.to_pickle(cache_path)
        print('Cacheed {} at {}'.format(quandl_id,cache_path))
    return df

Here in the code we using `pickle` to serialize and save the downloaded data as a file, which will prevent the script from re-downloading the same data each time run the script. The function return the data as a pd dataframe.

In [5]:
# pull the historical btc exchange rate for the kraken btc exchange
btc_usd_price_kraken = get_quandl_data('BCHARTS/KRAKENUSD')

Downloading BCHARTS/KRAKENUSD from Quandl
Cacheed BCHARTS/KRAKENUSD at BCHARTS-KRAKENUSD.pkl


In [6]:
btc_usd_price_kraken.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume (BTC),Volume (Currency),Weighted Price
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2014-01-07,874.6704,892.06753,810.0,810.0,15.622378,13151.472844,841.835522
2014-01-08,810.0,899.84281,788.0,824.98287,19.182756,16097.329584,839.156269
2014-01-09,825.56345,870.0,807.42084,841.86934,8.158335,6784.249982,831.572913
2014-01-10,839.99,857.34056,817.0,857.33056,8.02451,6780.220188,844.938794
2014-01-11,858.2,918.05471,857.16554,899.84105,18.748285,16698.566929,890.671709


In [7]:
# chart rh btc priding data
btc_trace = go.Scatter(x=btc_usd_price_kraken.index, y = btc_usd_price_kraken['Weighted Price'])
py.iplot([btc_trace])

### Get the price data from more BTC exchange
Since there are some hitch in the dataset, we don't want them to impact the result of our analysis, so we need to import the data from other exchange markets and fill out the spikes.

In [8]:
exchanges = ['COINBASE', 'BITSTAMP','ITBIT']
exchange_data = {}
exchange_data['KRAKEN'] = btc_usd_price_kraken

for item in exchanges:
    exchange_code = 'BCHARTS/{}USD'.format(item)
    exchange_data[item] = get_quandl_data(exchange_code)

Downloading BCHARTS/COINBASEUSD from Quandl
Cacheed BCHARTS/COINBASEUSD at BCHARTS-COINBASEUSD.pkl
Downloading BCHARTS/BITSTAMPUSD from Quandl
Cacheed BCHARTS/BITSTAMPUSD at BCHARTS-BITSTAMPUSD.pkl
Downloading BCHARTS/ITBITUSD from Quandl
Cacheed BCHARTS/ITBITUSD at BCHARTS-ITBITUSD.pkl


In [9]:
def merge_dfs(dfs,labels,col):
    '''merge the dataframes on columns'''
    series = {}
    for index in range(len(dfs)):
        series[labels[index]]= dfs[index][col]
    return pd.DataFrame(series)

btc_usd = merge_dfs(list(exchange_data.values()),list(exchange_data.keys()),'Weighted Price')

In [10]:
btc_usd.tail()

Unnamed: 0_level_0,KRAKEN,COINBASE,BITSTAMP,ITBIT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-07-25,10012.416316,,10008.689763,
2019-07-26,9775.856187,,9774.153324,
2019-07-27,9666.544853,,9609.387919,
2019-07-28,9437.472739,,9428.827935,
2019-07-29,9537.708035,,9532.31155,


After filling out the gaps in the dataset, we try to plot the data first, now these data shown below are from four different exchange markets.

In [11]:
def df_scatter(df,title,separate_y_axis = False, y_axis_label='', scale = 'linear',initial_hide =False):
    '''generate a scatter plot of the entire dataframe
    There might be problms with plotly version >= 3.0.0. downgrade to 2.7.0 will fix the issue
    '''
    label_arr = list(df)
    series_arr = list(map(lambda col:df[col],label_arr))
    
    layout = go.Layout(
        title = title,legend = dict(orientation = 'h'),xaxis=dict(type='date'),
        yaxis = dict(title=y_axis_label,showticklabels=not separate_y_axis,
                    type=scale)
    )
    y_axis_config = dict(overlaying='y',showticklabels=False,type=scale)
    visibility = 'visible'
    if initial_hide:
        visibility = 'legendonly'
        
    # form trace for each series
    trace_arr = []
    for index,series in enumerate(series_arr):
        trace = go.Scatter(x=series.index,y=series,
                           name=label_arr[index],visible = visibility)
        # Add separate axis for the series
        if separate_y_axis:
            trace['yaxis'] = 'y{}'.format(index + 1)
            layout['yaxis{}'.format(index+1)]=y_axis_config
        trace_arr.append(trace)
    
    fig = go.Figure(data=trace_arr,layout = layout)
    py.iplot(fig)
    
df_scatter(btc_usd,'Bitcoin Price (USD) by Exchange')

### Do further data cleaning 

The goal is to remove all the 0's in the dataset to make sure the precision of the analysis, since the bitcoin price never has 0 as its value.

In [12]:
btc_usd.replace(0,np.nan,inplace=True)

In [13]:
# plot again
df_scatter(btc_usd,'Bitcoin Price (USD) by Exchange')

#### Average prices  

After removing all the 0s in the dataset, now we calculate the average price for each cryptocurrency for later use.

In [14]:
# add a new col to calculate the avg of the prices
btc_usd['avg_btc'] = btc_usd.mean(axis=1)

In [15]:
btc_trace = go.Scatter(x=btc_usd.index, y = btc_usd['avg_btc'])
py.iplot([btc_trace])

### Retrive Altcoin price data  

Our ultimate goal is to find out the relationships between the different currencies, and then to decide what to do next based on the results.

In [16]:
def get_json_data(url,cache_path):
    '''Download and cache json data and return a dataframe'''
    try:
        f = open(cache_path,'rb')
        df = pickle.load(f)
        print('Loaded {} from cache'.format(url))
    except(OSError,IOError)as e:
        print('Downloading {} from Quandl'.format(url))
        df = pd.read_json(url)
        df.to_pickle(cache_path)
        print('Cached response at {}'.format(url,cache_path))
    return df

In [19]:
base_polo_url = "https://poloniex.com/public?command=returnChartData&currencyPair={}&start={}&end={}&period={}"
start_date = datetime.strptime('2015-01-01','%Y-%m-%d')
end_date = datetime.now()
period = 86400 # 86400s per day

def get_crypto_data(poloniex):
    url = base_polo_url.format(poloniex,start_date.timestamp(),end_date.timestamp(),period)
    data_df = get_json_data(url,poloniex)
    data_df = data_df.set_index('date')
    return data_df
    

In [20]:
altcoins = ['ETH', 'LTC','XRP','ETC','STR','DASH','SC','XMR','XEM']
alt_data = {}
for coin in altcoins:
    coinpair = 'BTC_{}'.format(coin)
    alt_data[coin] = get_crypto_data(coinpair)

Downloading https://poloniex.com/public?command=returnChartData&currencyPair=BTC_ETH&start=1420099200.0&end=1564531699.406939&period=86400 from Quandl
Cached response at https://poloniex.com/public?command=returnChartData&currencyPair=BTC_ETH&start=1420099200.0&end=1564531699.406939&period=86400
Downloading https://poloniex.com/public?command=returnChartData&currencyPair=BTC_LTC&start=1420099200.0&end=1564531699.406939&period=86400 from Quandl
Cached response at https://poloniex.com/public?command=returnChartData&currencyPair=BTC_LTC&start=1420099200.0&end=1564531699.406939&period=86400
Downloading https://poloniex.com/public?command=returnChartData&currencyPair=BTC_XRP&start=1420099200.0&end=1564531699.406939&period=86400 from Quandl
Cached response at https://poloniex.com/public?command=returnChartData&currencyPair=BTC_XRP&start=1420099200.0&end=1564531699.406939&period=86400
Downloading https://poloniex.com/public?command=returnChartData&currencyPair=BTC_ETC&start=1420099200.0&end=1

In [23]:
display(alt_data['ETH'].tail())
display(alt_data['LTC'].tail())

Unnamed: 0_level_0,close,high,low,open,quoteVolume,volume,weightedAverage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-07-27,0.02187,0.022287,0.0216,0.022266,5220.464942,114.188161,0.021873
2019-07-28,0.022179,0.022327,0.02165,0.02189,2572.842168,56.379416,0.021913
2019-07-29,0.022174,0.02227,0.02192,0.0222,2920.789837,64.448468,0.022065
2019-07-30,0.021891,0.022211,0.02177,0.02217,3991.896369,87.452277,0.021907
2019-07-31,0.02187,0.0219,0.02187,0.021892,60.384376,1.322025,0.021893


Unnamed: 0_level_0,close,high,low,open,quoteVolume,volume,weightedAverage
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-07-27,0.009363,0.009606,0.00921,0.009565,9514.017971,88.908123,0.009345
2019-07-28,0.009405,0.009503,0.00914,0.009375,5324.379894,49.685825,0.009332
2019-07-29,0.009502,0.009581,0.009312,0.00943,7068.883047,66.818554,0.009452
2019-07-30,0.009419,0.009542,0.009386,0.0095,5243.798441,49.58041,0.009455
2019-07-31,0.009403,0.00942,0.009401,0.009411,0.0881,0.000829,0.009405


### Convert price in USD
We now have crypto-btc, and btc-usd, we could have crypto-usd, as a more convinient way in terms of understanding the data.

In [27]:
for coin in alt_data.keys():
    alt_data[coin]['price_usd'] = alt_data[coin]['weightedAverage']*btc_usd['avg_btc']


In [28]:
combined_df = merge_dfs(list(alt_data.values()),list(alt_data.keys()),'price_usd')
combined_df['BTC'] = btc_usd['avg_btc']

In [30]:
df_scatter(combined_df, 'Cryptocurrency Prices (USD)',separate_y_axis = False, 
           y_axis_label = 'Coin Value (USD)',scale = 'log') # if not using log scale, the plot would look very diverse.

### Figure out the relationships between the cryptocurrencies  

Here we want to dig some more insights from the dataset between the cryptocurrencies.  
It's time to do the corellation analysis among the cryptocurrencies. 
There are similar fluctuations appear along the timeline, thus we could use corr() in pandas to do the correlation analysis, which computes the pearson correlation coefficient for each col in the dataframe.  
Compute correlations directly on a non-stationary time series can give biased corr values. We will work around it by using pct_change() method, which will convert each cell in the dataframe from an absolute price value to a daily return percentage.

In [33]:
combined_df_16 = combined_df[combined_df.index.year==2016]
combined_df_17 = combined_df[combined_df.index.year==2017]
combined_df_18 = combined_df[combined_df.index.year==2018]
combined_df_19 = combined_df[combined_df.index.year==2019]
combined_df_19.pct_change().corr(method ='pearson')

Unnamed: 0,ETH,LTC,XRP,ETC,STR,DASH,SC,XMR,XEM,BTC
ETH,1.0,0.741028,0.75102,0.763768,0.765428,0.783579,0.703728,0.782337,0.716728,0.774505
LTC,0.741028,1.0,0.653059,0.654736,0.650853,0.662729,0.630933,0.706168,0.565552,0.671076
XRP,0.75102,0.653059,1.0,0.700428,0.781639,0.699882,0.572589,0.717191,0.570448,0.674407
ETC,0.763768,0.654736,0.700428,1.0,0.718797,0.672953,0.652928,0.689846,0.640564,0.61552
STR,0.765428,0.650853,0.781639,0.718797,1.0,0.699322,0.661898,0.710457,0.724707,0.605965
DASH,0.783579,0.662729,0.699882,0.672953,0.699322,1.0,0.679674,0.794348,0.590207,0.736885
SC,0.703728,0.630933,0.572589,0.652928,0.661898,0.679674,1.0,0.673348,0.630416,0.672646
XMR,0.782337,0.706168,0.717191,0.689846,0.710457,0.794348,0.673348,1.0,0.575475,0.737648
XEM,0.716728,0.565552,0.570448,0.640564,0.724707,0.590207,0.630416,0.575475,1.0,0.567828
BTC,0.774505,0.671076,0.674407,0.61552,0.605965,0.736885,0.672646,0.737648,0.567828,1.0


In [32]:
# now we need to visualize the correlation matrix by heatmap
def corr_heatmap(df,title,absolute_bound = True):
    heatmap = go.Heatmap(
        z = df.corr(method = 'pearson').as_matrix(),
        x = df.columns,
        y = df.columns,
        colorbar = dict(title = 'Pearson Coeeficient'),
    )
    layout = go.Layout(title = title)
    
    if absolute_bound:
        heatmap['zmax'] = 1.0
        heatmap['zmin'] = -1.0
    
    fig = go.Figure(data= [heatmap],layout = layout)
    py.iplot(fig)

In [38]:
corr_heatmap(combined_df_19.pct_change(),'Cryptocurrency Correlations in 2019')


Method .as_matrix will be removed in a future version. Use .values instead.



In [39]:
corr_heatmap(combined_df_18.pct_change(),'Cryptocurrency Correlations in 2018')


Method .as_matrix will be removed in a future version. Use .values instead.



In [40]:
corr_heatmap(combined_df_17.pct_change(),'Cryptocurrency Correlations in 2017')



Method .as_matrix will be removed in a future version. Use .values instead.



In [41]:
corr_heatmap(combined_df_16.pct_change(),'Cryptocurrency Correlations in 2016')


Method .as_matrix will be removed in a future version. Use .values instead.



Since in the market there are many analysis talking about the relationship between the different crypocurrencies, but some of them don't have data to support their conclusions. Here we are using data and visualization tools to display some straightforward insights from the raw data in the market.
The above process could be considered as a EDA.

The correlations between the cryptocurrencies from 2016 to 2019 is getting more and more stronger.Reasons could be as follows:  

1. more and more attensions to the cryptocurrencies/blockchain  
2. hedging funds also has certain impact on the crypto
...


## What could we do after this?

1. blockchain mining datasets 
2. stocks, commdenities, to see the correlations 
3. train a ml model to predict price，(CNN, RNN ...) 
4. trading bot, chatting bot  
5. is quant investment making money? (based on the historical data) 