In [1]:
import os, pickle, quandl
import numpy as np
import pandas as pd
from datetime import datetime
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import time
import math
from scipy import stats


# Initialise plotly's offline notebook mode; the plotting helpers in this
# notebook render their figures with py.iplot.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook - confirm against the installed plotly version.
py.init_notebook_mode(connected = True)


########################################################################
#                        API Functions
########################################################################

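#purpose: download and cache a Quandl data series
#signature: get_quandl_data(quandl_id: String) -> df: pandas dataframe
## NOTE: do not keep the Quandl API key in source. Supply it at runtime (for
## example from an environment variable) and revoke the key that was
## previously written in this comment.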
def get_quandl_data(quandl_id):
    """Return the Quandl dataset ``quandl_id``, using a local pickle cache.

    The cache lives in a ``quandl_cache`` directory under the current working
    directory (created on demand). On a cache miss the dataset is downloaded
    with ``quandl.get`` and pickled so the next call can load it from disk.

    The working directory is never changed, so a failed download cannot
    leave the process stranded inside the cache directory.

    Args:
        quandl_id: Quandl dataset code, e.g. ``'BCHARTS/KRAKENUSD'``.

    Returns:
        The cached object, or the dataframe returned by ``quandl.get``.
    """
    cache_dir = os.path.join(os.getcwd(), 'quandl_cache')
    os.makedirs(cache_dir, exist_ok=True)

    # The file name is kept identical to the historical one (the replace also
    # rewrites the 'quandl_cache/' prefix into 'quandl_cache-') so that caches
    # written by earlier runs are still found.
    cache_name = 'quandl_cache/{}.pkl'.format(quandl_id).replace('/', '-')
    cache_path = os.path.join(cache_dir, cache_name)

    try:
        with open(cache_path, 'rb') as cache_file:
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that this notebook wrote itself.
            df = pickle.load(cache_file)
        print('Loaded {} from cache'.format(quandl_id))
    except OSError:
        # Missing or unreadable cache file: fetch and cache a fresh copy.
        print('Downloading {} from Quandl'.format(quandl_id))
        df = quandl.get(quandl_id, returns="pandas")
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(quandl_id, cache_path))

    return df


# purpose: merge common column of each dataframe into combined dataframe
# signature: merge_dfs_on_column(dataframes: list, labels: list, col: String) 
# -> pd.DataFrame(series_dict): dataframe
def merge_dfs_on_column(dataframes, labels, col):
    """Collect column ``col`` of every dataframe into one combined dataframe.

    The column taken from ``dataframes[i]`` is stored under ``labels[i]``.
    """
    columns_by_label = {}
    for position, frame in enumerate(dataframes):
        label = labels[position]
        columns_by_label[label] = frame[col]
    return pd.DataFrame(columns_by_label)

# purpose: plot every column of a dataframe as a plotly scatter trace
# signature: df_scatter(df:dataframe, title:String, seperate_y_axis:Boolean, y_axis_label:String, scale:String, initial_hide:Boolean) 
# -> plotly scatter plot
def df_scatter(df, title, seperate_y_axis = False, y_axis_label='', scale='linear', initial_hide=False):
    """Plot every column of ``df`` as a plotly scatter trace against the index.

    Args:
        df: dataframe whose columns become one trace each; the index supplies
            the x values and the x axis is configured with type 'date'.
        title: figure title.
        seperate_y_axis: when True every trace is assigned its own overlaid
            y axis and the tick labels of the main y axis are hidden.
        y_axis_label: title of the main y axis.
        scale: axis type used for every y axis (e.g. 'linear').
        initial_hide: when True the traces start hidden and are shown only
            in the legend until the user toggles them.

    Returns:
        None. The figure is rendered with ``py.iplot``.
    """
    # plot layout config
    layout = go.Layout(
        title=title,
        legend=dict(orientation="h"),
        xaxis=dict(type='date'),
        yaxis=dict(
            title=y_axis_label,
            showticklabels=not seperate_y_axis,
            type=scale
        )
    )

    # shared config for the additional per-series axes
    y_axis_config = dict(
        overlaying='y',
        showticklabels=False,
        type=scale)

    # plotly's `visible` attribute is enumerated as True, False or
    # 'legendonly'; the string 'visible' used previously is not a valid value.
    visibility = 'legendonly' if initial_hide else True

    # form one trace per column
    trace_arr = []
    for index, label in enumerate(df):
        series = df[label]
        trace = go.Scatter(
            x=series.index,
            y=series,
            name=label,
            visible=visibility
        )

        # add separate axis for series
        # NOTE(review): for index 0 this targets 'y1' / 'yaxis1' - confirm the
        # installed plotly version accepts that spelling for the primary axis.
        if seperate_y_axis:
            trace['yaxis'] = 'y{}'.format(index + 1)
            layout['yaxis{}'.format(index + 1)] = y_axis_config

        trace_arr.append(trace)

    fig = go.Figure(data=trace_arr, layout=layout)
    py.iplot(fig)
    
# purpose: Download and cache JSON data, return as a dataframe.
# signature: get_json_data(json_url:String, cache_path:String)
# -> df: dataframe
def get_json_data(json_url, cache_path):
    """Return the JSON data at ``json_url`` as a dataframe, cached on disk.

    Args:
        json_url: location handed to ``pd.read_json`` on a cache miss.
        cache_path: path of the pickle file used as the cache.

    Returns:
        The object unpickled from ``cache_path`` when that file can be
        opened; otherwise the dataframe read from ``json_url``, which is
        pickled to ``cache_path`` before being returned.
    """
    try:
        # The context manager closes the cache file even if unpickling fails.
        with open(cache_path, 'rb') as cache_file:
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that this notebook wrote itself.
            df = pickle.load(cache_file)
        print('Loaded {} from cache'.format(cache_path))
    except OSError:
        print('Downloading {}'.format(json_url))
        df = pd.read_json(json_url)
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(json_url, cache_path))
    return df
    
    
#purpose: retrieve crypto data from poloniex 
#signature: get_crypto_data(poloniex_pair:String) -> data_df: dataframe
def get_crypto_data(poloniex_pair):
    """Return daily Poloniex chart data for ``poloniex_pair``, indexed by date.

    The pair name doubles as the cache file path. This is a thin wrapper
    around ``get_serialized_crypto_data`` so the URL construction and
    caching logic exist in one place only.

    Args:
        poloniex_pair: Poloniex currency pair, e.g. ``'BTC_ETH'``.

    Returns:
        The dataframe returned by ``get_serialized_crypto_data``.
    """
    return get_serialized_crypto_data(poloniex_pair, poloniex_pair)

def get_serialized_crypto_data(poloniex_pair, serializedCoinPath):
    """Return daily Poloniex chart data for ``poloniex_pair``, indexed by date.

    Args:
        poloniex_pair: Poloniex currency pair, e.g. ``'BTC_ETH'``.
        serializedCoinPath: path of the pickle file ``get_json_data`` uses
            as its cache for this pair.

    Returns:
        The dataframe from ``get_json_data`` with its 'date' column set as
        the index.
    """
    BASE_POLO_URL = 'https://poloniex.com/public?command=returnChartData&currencyPair={}&start={}&end={}&period={}'
    start_date = datetime.strptime('2015-01-01', '%Y-%m-%d') # get data from start of 2015
    end_date = datetime.now()
    period = 86400 # pull daily data (86,400 seconds per day)

    # datetime.timestamp() returns a float with a fractional part; send whole
    # seconds so the URL carries plain UNIX timestamps.
    json_url = BASE_POLO_URL.format(
        poloniex_pair,
        int(start_date.timestamp()),
        int(end_date.timestamp()),
        period,
    )

    data_df = get_json_data(json_url, serializedCoinPath)
    data_df = data_df.set_index('date')
    return data_df

    

    
    
# a function which takes two numpy arrays of equal size and calculates
# the pearson correlation coefficient for the two sets of data
# @params 2 numpy arrays
# @returns a float corresponding to the pearson r value
def pearson_coefficient(npX, npY):
    """Return the Pearson correlation coefficient of two equal-length samples.

    The coefficient is computed from mean-centred values,
    ``sum(dx * dy) / sqrt(sum(dx**2) * sum(dy**2))``, using vectorised numpy
    operations instead of an element-by-element Python loop.

    Args:
        npX: first sample (numpy array or other sequence of numbers).
        npY: second sample, same length as ``npX``.

    Returns:
        The Pearson r value as a float. ``nan`` is returned when r is
        undefined (empty input or a sample with zero variance). For
        backward compatibility the sentinel ``-23`` is returned when the
        two inputs differ in length.
    """
    if len(npX) != len(npY):
        # Historical error sentinel; deliberately outside the valid [-1, 1].
        return -23

    xs = np.asarray(npX, dtype=float)
    ys = np.asarray(npY, dtype=float)
    if xs.size == 0:
        return float('nan')

    dx = xs - xs.mean()
    dy = ys - ys.mean()
    denominator = math.sqrt(float(np.dot(dx, dx)) * float(np.dot(dy, dy)))
    if denominator == 0:
        # At least one sample is constant, so the correlation is undefined.
        return float('nan')

    return float(np.dot(dx, dy) / denominator)


#purpose: This is a new function for correlating two crypto coins via a plot
# crypto_correlation(pandas.DataFrame, string, pandas.DataFrame, string) 
# --> *returns: pearson coeff *prints: Plot
def crypto_correlation(df_left, name1, df_right, name2, plotBool=False):
    """Correlate the daily returns of two currencies and optionally plot them.

    The two dataframes are aligned with ``match_dates``, a linear regression
    of the right returns on the left returns is computed and printed, and
    the Pearson coefficient of the two return series is returned.

    Args:
        df_left: dataframe of the first currency.
        name1: label of the first currency (used in plot titles).
        df_right: dataframe of the second currency.
        name2: label of the second currency (used in plot titles).
        plotBool: when True, render a scatter plot of the returns together
            with the fitted regression line via ``py.iplot``.

    Returns:
        The value of ``pearson_coefficient`` for the two return series.
    """
    # make sure dates match && read df into arrays
    left_returns, right_returns = match_dates(df_left, name1, df_right, name2)
    print(str(len(left_returns)),str(len(right_returns)))

    # calculate values for linear regression
    slope, intercept = stats.linregress(left_returns, right_returns)[:2]
    print("slope:", slope, "\tintercept:", intercept)

    # The figure is only assembled when it is going to be shown.
    if plotBool:
        # linspace includes both ends of the range and, unlike arange with a
        # computed float step, does not fail when min == max.
        best_fit_x = np.linspace(min(left_returns), max(left_returns), 10000)
        best_fit_y = slope * best_fit_x + intercept

        plot = [
            go.Scatter(
                x=left_returns,
                y=right_returns,
                name='{} vs. {} Correlation'.format(str(name1).title(), str(name2).title()),
                mode='markers',
                marker=dict(
                    size=10,
                    color='rgba(255, 0, 0, .9)',
                    line=dict(width=2),
                ),
            ),
            go.Scatter(
                x=best_fit_x,
                y=best_fit_y,
                mode='lines',
                # plain dict instead of the deprecated go.Marker class
                marker=dict(color='rgb(31, 119, 180)'),
                name='Fit',
            ),
        ]
        layout = dict(
            title=str(name1)+ ' vs. '+str(name2)+' correlation',
        )
        fig = dict(data=plot, layout=layout)
        py.iplot(fig, filename='plot')

    return pearson_coefficient(left_returns, right_returns)



# (HELPER FOR crypto_correlation function) date match function
# purpose: returns 2 numpy arrays of daily returns for corresponding days of 2 different currencies
# match_dates(left_df: dataframe, symbol1: String, right_df: dataframe, symbol2: String) -> np.Array, np.Array
def match_dates(left_df, symbol1, right_df, symbol2):
    """Return the daily returns of two currencies over their shared dates.

    The frames are inner-joined on their index, so only rows whose index
    value occurs in both are kept. Both frames must carry a ``price_usd``
    column; after the merge these become ``price_usd_x`` (left) and
    ``price_usd_y`` (right).

    Returns are computed positionally over the merged rows as
    ``(price[k + 1] - price[k]) / price[k]``, so the result does not depend
    on the index type.

    Args:
        left_df: dataframe of the first currency.
        symbol1: label of the first currency (currently unused).
        right_df: dataframe of the second currency.
        symbol2: label of the second currency (currently unused).

    Returns:
        Two numpy arrays (left returns, right returns), each one element
        shorter than the number of shared rows.
    """
    # merge dataframes on the date (e.g. the index of the df)
    merged_df = pd.merge(left_df, right_df, how='inner', left_index=True, right_index=True)

    left_prices = np.asarray(merged_df['price_usd_x'], dtype=float)
    right_prices = np.asarray(merged_df['price_usd_y'], dtype=float)

    # vectorised day-over-day relative change
    left_returns = (left_prices[1:] - left_prices[:-1]) / left_prices[:-1]
    right_returns = (right_prices[1:] - right_prices[:-1]) / right_prices[:-1]

    return left_returns, right_returns

The cell below retrieves Bitcoin and altcoin price data and computes the Pearson correlation coefficient for every pair of altcoins. The results form a matrix whose rows and columns follow the order of the `altcoins` list, so entry (0, 0) is ETH vs. ETH and entry (7, 7) is GNT vs. GNT.

In [2]:
import os, pickle, quandl, time, math, sys
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib notebook



########################################################
# compile data
########################################################

# exchange info
altcoins = ['ETH','LTC','XRP','ETC','DASH','XMR','XEM', 'GNT']
alt_data = {}

# Download (or load from cache) the BTC-denominated chart data of every
# altcoin. The cache files are named after the pair and live in
# ./poloniex_cache, so the downloads run with that directory as the cwd.
start_dir = os.getcwd()
poloniex_cache_dir = os.path.join(start_dir, 'poloniex_cache')
os.makedirs(poloniex_cache_dir, exist_ok=True)

os.chdir(poloniex_cache_dir)
try:
    for alt in altcoins:
        coinpair = 'BTC_{}'.format(alt)
        alt_data[alt] = get_serialized_crypto_data(coinpair, coinpair)
finally:
    # Always return to the starting directory, even if a download fails,
    # so later cells resolve relative paths correctly.
    os.chdir(start_dir)

exchanges = ['KRAKEN','COINBASE', 'BITSTAMP','ITBIT','OKCOIN', 'GETBTC']#,'COINSBANK','HITBTC','LYBIT','ANXHK','BITME','BITBOX','INTRSNG','BTCE','WEEX','JUST','CBX']

# Load the BTC/USD series of each exchange, keyed by exchange name.
exch_data = {}
for exchange in exchanges:
    quandl_code = 'BCHARTS/{}USD'.format(exchange)
    exch_data[exchange] = get_quandl_data(quandl_code)

# Combine the 'Weighted Price' column of every exchange into one dataframe.
exchange_labels = list(exch_data.keys())
exchange_frames = list(exch_data.values())
btc_usd_datasets = merge_dfs_on_column(exchange_frames, exchange_labels, 'Weighted Price')

# Zero prices are replaced by NaN so they do not take part in the row mean.
btc_usd_datasets.replace(0, np.nan, inplace = True)
btc_usd_datasets['avg_btc_price_usd'] = btc_usd_datasets.mean(axis = 1)

# Add each altcoin's USD price: its BTC price times the average BTC/USD price.
avg_btc_price_usd = btc_usd_datasets['avg_btc_price_usd']
for alt_frame in alt_data.values():
    alt_frame['price_usd'] = alt_frame['weightedAverage'] * avg_btc_price_usd
    

########################################################
# organize into daily returns
########################################################
# Daily return of every altcoin: (price[k + 1] - price[k]) / price[k].
# np.insert returns a new array rather than modifying its argument, so the
# results have to be assigned; here they are computed in one vectorised step.
dailyReturns = {}

for alt, alt_frame in alt_data.items():
    usd_prices = np.asarray(alt_frame['price_usd'], dtype=float)
    dailyReturns[alt] = (usd_prices[1:] - usd_prices[:-1]) / usd_prices[:-1]

        
#matrix to contain preprocessed pearson correlation values between currencies
# Pairwise Pearson correlation values between currencies. Rows and columns
# follow the iteration order of alt_data.
coin_names = list(alt_data.keys())
pearsonValueMatrix = np.empty([len(coin_names), len(coin_names)])
completed = {}
indexVector = []
pVal = []

# One line per row of the matrix (each value followed by a comma), then a
# blank line and the altcoin labels. The context manager closes the file
# even if a correlation fails part-way through.
with open('crypto_pearson.txt', 'w') as pearson_file:
    for i, a1 in enumerate(coin_names):
        indexVector.append(i)
        tmp = []
        for j, a2 in enumerate(coin_names):
            # calculate pearson value (no plot)
            str1 = a1 + "," + a2
            completed[str1] = crypto_correlation(alt_data[a1], a1, alt_data[a2], a2, False)
            tmp.append(completed[str1])
            # np.insert returns a copy, so the matrix is filled by assignment
            pearsonValueMatrix[i, j] = completed[str1]
            pearson_file.write(str(completed[str1]) + ',')
        pearson_file.write('\n')
        pVal.append(tmp)
    pearson_file.write("\n")
    pearson_file.write(str(altcoins))


print("label\t\t", "pearson value")
print("type:\t", type(pVal[0][0]))
print(sys.float_info)
for it in pVal:
    print(it)



Loaded BTC_ETH from cache
Loaded BTC_LTC from cache
Loaded BTC_XRP from cache
Loaded BTC_ETC from cache
Loaded BTC_DASH from cache
Loaded BTC_XMR from cache
Loaded BTC_XEM from cache
Loaded BTC_GNT from cache
Loaded BCHARTS/KRAKENUSD from cache
Loaded BCHARTS/COINBASEUSD from cache
Loaded BCHARTS/BITSTAMPUSD from cache
Loaded BCHARTS/ITBITUSD from cache
Loaded BCHARTS/OKCOINUSD from cache
Loaded BCHARTS/GETBTCUSD from cache
878 878
slope: 1.0 	intercept: 0.0
878 878
slope: 0.203199597598 	intercept: 0.00408444747289
878 878
slope: 0.151533292588 	intercept: 0.00727397609035
527 527
slope: 0.479316710539 	intercept: 0.00598545637588
878 878
slope: 0.236314278873 	intercept: 0.00571015226461
878 878
slope: 0.245515930354 	intercept: 0.00679166007895
878 878
slope: 0.237442147222 	intercept: 0.0116290077862
318 318
slope: 0.809856885822 	intercept: 0.00227038240919
878 878
slope: 0.313542606139 	intercept: 0.00784650498465
1096 1096
slope: 1.0 	intercept: 0.0
1096 1096
slope: 0.3403819109