In [40]:
import luigi
import pandas as pd
import numpy as np
import sys
import os
sys.path.insert(0, '/home/fbuonerba/codes/')
from mp_functions import upload_log_return, upload_factor_loadings
from coinapi_v1 import CoinAPIv1
import datetime
from datetime import datetime, timedelta
import time
import calendar
import json
import urllib.request
import multiprocessing as mp
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [41]:
with open('/home/fbuonerba/codes/meta_data/new_coins.txt') as ff:
    coins=json.load(ff)
coins=np.array(coins)
quotes=['USD','BTC']
#sysm=[str(coin)+'_'+str(quote) for coin in coins for quote in quotes]
def get_returns(beg,end):
    #weights=0 or 1
    t=beg
    matrix=[]
    while t<=end+1:
        ret_t=[]
        for base in coins:
            for quote in quotes:
                returns=upload_log_return(t, base, quote, 86400)
                if np.isnan(returns)==True:
                    returns=0
                ret_t.append(returns)
                #print(base,quote)
        matrix.append(ret_t)
        t+=86400
    R=np.array(matrix)
    norms=np.linalg.norm(R, axis=0)
    W=np.where(norms==0)
    R=np.delete(R, W, axis=1) 
    std=np.std(R, axis=0)
    #return degenerate_indices, returns
    return(W, R)

#####W[0]=those pairs for which no trading activity is recorded
#VET,NPXS have no data for this time period. Indeed they were ranked low on cmc.
#then BTC has no data vs BTC, the rest have no data vs USD.
# ind_usd=np.array([int(x/2) for x in W[0] if x%2==0])
# ind_btc=np.array([int(x/2) for x in W[0] if x%2==1])
# bad_coins_usd=coins[ind_usd]
# bad_coins_btc=coins[ind_btc]
# very_bad_coins=[x for x in bad_coins_usd if x in bad_coins_btc]


In [58]:
#####Factors: January to April#####
#####log_mkcap and coin_ratio computed averaging.
#####Those are the good ones!
factors=[]
keys=['returns_variance', 'returns_strength', 'rates_high_low','turnover', 'log_marketcap','coin_ratio']
folder='/home/fbuonerba/factor_loadings/factors_'

def get_raw_factors(beg,end,W):
    factors=[]
    std=[]
    for coin in coins:
        for quote in quotes:
            with open(folder+coin+'_'+quote+'_'+str(beg)+'_'+str(end)+'_86400.txt') as data:
                fac=json.load(data)
            ordr=[]
            for key in keys:
                if key=='returns_variance':
                    E=fac[key]**.5
                    std.append(E)
                else:
                    E=fac[key]
                ordr.append(E)
            factors.append(ordr)
            #factors.append(fac)
    factors=np.array(factors)
    std=np.array(std)
    factors=np.delete(factors, W, axis=0)
    std=np.delete(std, W, axis=0)
    std=std.reshape(-1,1)
    #####here we do normalization using z-score along coin axis#####
    factors[np.where(np.isnan(factors)==True)]=0
    
    ###There are two NaN entries, corresponding to coin ratio of BCD, 43rd position.
    ###In a random cmc_historical, the supply is nan. Confirmed on cmc website chart:
    #at the time BCD was doing poorly - in march it was ranked 1250th.
    return(std, factors)

def get_processed_factors(beg,end,W):
    std, factors=get_raw_factors(beg,end,W)
    factors=(factors-np.mean(factors,axis=0))/np.var(factors,axis=0)**.5
    return(std, factors)


In [59]:
#####Factors: January to April#####
#####log_mkcap and coin_ratio computed on the day.
factors=[]
keys=['returns_variance', 'returns_strength', 'rates_high_low','turnover']
exact_keys=['log_mkcap_exact_','coin_ratio_exact_']
folder='/home/fbuonerba/factor_loadings/factors_'
exact_folder='/home/fbuonerba/factor_loadings/'
def get_raw_barra_factors(beg,end,W):
    factors=[]
    std=[]
    for coin in coins:
        for quote in quotes:
            with open(folder+coin+'_'+quote+'_'+str(beg)+'_'+str(end)+'_86400.txt') as data:
                fac=json.load(data)
            ordr=[]
            for key in keys:
                if key=='returns_variance':
                    E=fac[key]**.5
                    std.append(E)
                else:
                    E=fac[key]
                ordr.append(E)
            ####exact factors####
            with open(exact_folder+exact_keys[0]+coin+'_'+quote+'_'+str(end)+'.txt') as ff:
                exc=json.load(ff)
            ordr.append(exc)
            with open(exact_folder+exact_keys[1]+coin+'_'+str(end)+'.txt') as ff:
                exc=json.load(ff)
            ordr.append(exc)
            factors.append(ordr)
    actors=np.array(factors)
    std=np.array(std)
    factors=np.delete(factors, W, axis=0)
    std=np.delete(std, W, axis=0)
    std=std.reshape(-1,1)
    factors[np.where(np.isnan(factors)==True)]=0
    return(std, factors)
    ###There are two NaN entries, corresponding to coin ratio of BCD, 43rd position.
    ###In a random cmc_historical, the supply is nan. Confirmed on cmc website chart:
    #at the time BCD was doing poorly - in march it was ranked 1250th.
def get_processed_barra_factors(beg,end,W):
    std, factors=get_raw_barra_factors(beg,end,W)
    factors=(factors-np.mean(factors,axis=0))/np.var(factors,axis=0)**.5
    return(std, factors)

In [60]:
####regression against returns from April 2nd to May 2nd; loadings computed on April 1st.####
def get_stats(beg,end,beg1,end1,method, weight):
    #method=averaged or barra
    #weight=0 or 1
    W,R=get_returns(beg,end)
    if method=='averaged':
        std, factors=get_processed_factors(beg1,end1,W)
        if weight==0:
            pass
        else:
            factors=factors*(std**-1)
    elif method=='barra':
        std, factors=get_processed_barra_factors(beg1,end1,W)
        if weight==0:
            pass
        else:
            factors=factors*(std**-1)
    reg=linear_model.LinearRegression()
    #sklearn automatically preprocesses the data by removing np.mean(axis=0)#
    reg.fit(factors,R.T)
    beta=reg.coef_
    inter=reg.intercept_
    inter=inter.reshape(-1,1)
    total_beta=np.hstack((beta,inter)) 
    total_factors=np.hstack((factors, np.ones(factors.shape[0]).reshape(-1,1)))
    R_hat=reg.predict(factors) #predicted returns = (tot_fact).(tot_beta.T)
    #####compute r_squared and F_scores#####
    #numpy and sklearn have built-in functions computing r2.
    #reg.score computes the mean along axis=0
    #r2_score computes the mean over full matrix. Example:
    #r2_score(R.T,R_hat1,multioutput='variance_weighted')
    residuals=R.T-R_hat
    R_mean=np.mean(R.T,axis=0) #daily average return
    tss=np.sum( (R.T-R_mean)**2, axis=0 ) #daily total_sum_squares
    rss=np.sum( (R.T-R_hat)**2, axis=0 ) #daily residual_sum_squares ~ var residuals
    r2=1-rss/tss #daily R^2
    cov_residuals=np.dot(residuals.T,residuals)/(factors.shape[0]-factors.shape[1])#daily
    var_residuals=cov_residuals.diagonal() #daily unbiased variance of residuals
    F=((tss-rss)/factors.shape[1])/var_residuals #daily F-scores
    #####compute z_scores, using total_beta and total_factors for compactness#####
    inv=np.linalg.inv(np.dot(total_factors.T,total_factors)) #usual (X^T.X)^{-1}
    cov_beta=np.tensordot(inv,var_residuals, axes=0) #daily covariance of betas: inv*var(residual)
    var_beta=cov_beta.diagonal() #daily variance of betas
    std_beta=np.sqrt(var_beta) #daily standard errors
    z_scores=total_beta/std_beta
    return(total_beta, total_factors, R_hat, r2, F, z_scores)
    
# df_list=[]
# for t in range(30):
#     df=pd.DataFrame()
#     df1=pd.DataFrame()
#     df['factor_name']=['returns_variance', 'returns_strength', 'returns_high_low', 'turnover', 'average_log_mkcap', 'average_coin_ratio','1']
#     df['beta']=total_beta[t]
#     df['std_error']=std_beta[t]
#     df['z_score']=z_scores[t]
#     df1['r_squared']=np.array([r2[t]])
#     df1['F_score']=np.array([F[t]])
#     df_list.append([df,df1])
# r2_barra_wt=r2

In [61]:

methods=['averaged','barra']
weights=[0,1]
T=[0,1,2,3]
beg=[1522540800+3600+t*604800 for t in T]
end=[1522540800+(t+1)*604800 for t in T]
beg1=[1515283200+t*604800 for t in T]
end1=[1522540800+t*604800 for t in T]
DDD={}

for m in methods:
    for w in weights:
        for t in T:
            roar=str(m)+str(w)+str(t)
            DDD[roar]=[get_stats(beg[t],end[t],beg1[t],end1[t],m,w)]


ralf0=np.empty((7,))
ralf1=np.empty((7,))
ralf2=np.empty((7,))
ralf3=np.empty((7,))

for key in DDD.keys():
    if key[-1]=='0':
        ralf0=np.vstack((ralf0, DDD[key][0][3]))
        print(key,DDD[key][0][3])
ralf0=ralf0[1:]
print('*******************')
for key in DDD.keys():
    if key[-1]=='1':
        ralf1=np.vstack((ralf1, DDD[key][0][3]))
        print(key,DDD[key][0][3])
ralf1=ralf1[1:]
print('*******************')
for key in DDD.keys():
    if key[-1]=='2':
        ralf2=np.vstack((ralf2, DDD[key][0][3]))
        print(key,DDD[key][0][3])
ralf2=ralf2[1:]
print('*******************')
for key in DDD.keys():
    if key[-1]=='3':
        ralf3=np.vstack((ralf3, DDD[key][0][3]))
        print(key,DDD[key][0][3])
ralf3=ralf3[1:]

totralf=[ralf0,ralf1,ralf2,ralf3]

for i in range(4):
    win=np.argmax(totralf[i], axis=0)
    print(win)

averaged10 [0.21910224 0.21843303 0.1719067  0.19680271 0.19798863 0.17341126
 0.09337745]
barra10 [0.23464047 0.25262882 0.19286133 0.21248771 0.20915593 0.19243795
 0.11491211]
barra00 [0.77553548 0.56876826 0.2654341  0.66683001 0.71535337 0.42790842
 0.09352656]
averaged00 [0.76149581 0.54469087 0.22337914 0.65219095 0.70271566 0.39468982
 0.05424383]
*******************
averaged01 [0.86266332 0.1007106  0.79798622 0.05906898 0.22335251 0.03211415
 0.23419538]
barra11 [0.29160276 0.07003139 0.18620155 0.029991   0.14858962 0.03582689
 0.12926576]
averaged11 [0.29020461 0.05527754 0.18160981 0.02219976 0.11216792 0.02304809
 0.12627529]
barra01 [0.86272134 0.13748312 0.8027595  0.06507402 0.26692414 0.04117157
 0.23486581]
*******************
averaged02 [0.11002444 0.48603144 0.5877402  0.11168411 0.56862633 0.42895364
 0.05912551]
barra02 [0.1354339  0.5168336  0.60562253 0.19247145 0.59909818 0.45504422
 0.06939645]
barra12 [0.22086513 0.14108849 0.14909606 0.14959044 0.29453871 0

In [None]:
#compare R^2-performance of barra/averaged factors, weighted/non weighted regressions.

print(r2_mine_nowt)
print(r2_mine_wt)
print(r2_barra_nowt)
print(r2_barra_wt)

t=list(range(31))
fig = plt.figure(figsize=(13, 13))
plt.title('R^2 comparison')
plt.scatter(t, r2_mine_nowt, c='r', label='mine')
plt.scatter(t, r2_mine_wt, c='b', label='mine_weight')
plt.scatter(t, r2_barra_nowt, c='g', label='barra')
plt.scatter(t, r2_barra_wt, c='y', label='barra_weight')
plt.legend()
plt.savefig('R^2_comparison.png')
plt.show()

FF=np.vstack((r2_mine_nowt,r2_mine_wt,r2_barra_nowt,r2_barra_wt))
scores=np.argmax(FF,axis=0)
print(variances)
[len(np.where(scores==i)[0]) for i in range(4)]

In [None]:
###compute covariance of factor returns###
#either compute cov of total_beta; or compute cov of beta, and attach a 1x1 block with inter.
#results are slightly different - intercept and beta are slightly correlated over time.
ess_cov=np.cov(beta.T)
ess1=np.hstack((ess_cov, np.zeros(6).reshape(-1,1)))
ess2=np.vstack((ess1, np.zeros(7).reshape(1,-1)))
ess2[-1,-1]=np.var(inter)

ess_totcov=np.cov(total_beta.T)
totfac=np.hstack((np.ones((factors.shape[0], 1)), factors))
D=np.cov(residuals, bias=1)
D1=np.var(residuals,axis=1)
D2=D.diagonal()

COV=np.dot(totfac, np.dot(ess_totcov,totfac.T)) + np.diag(D1)

In [None]:
###compute weights of optimal portfolio, given alpha###
#ideally alpha=returns, which you don't know.
#Namely, minimize w^T.C.w-alpha.w 
#solution is w=C^{-1}.alpha
def optimal_weights(alpha):
    #alpha=np.random.rand(factors.shape[0], 1)
    COV_=np.linalg.inv(COV)
    return np.dot(COV_,alpha)
#computeoptimal portfolio return: take returns after May2nd and check performance.
def portfolio_return(alpha, days_after_may2=1):
    test_returns=[]
    for base in coins:
        for quote in quotes:
            returns=upload_log_return(1525219200+days_after_may2*86400, base, quote, 86400)
            if np.isnan(returns)==True:
                returns=0
            test_returns.append(returns)
    test_returns=np.delete(test_returns, W)
    print(test_returns)
    ignorant_returns=np.dot(alpha,test_returns)
    hedged_returns=np.dot(optimal_weights(alpha).T, test_returns)
    return ignorant_returns, hedged_returns


In [None]:
alpha=np.zeros(76) 
alpha[6:]=1
print(alpha)
alpha/=np.sum(alpha)
#optimal_weights(alpha)
print(portfolio_return(alpha, 5))

In [None]:
###looking for homoscedasticity: variance of errors independent of time/estimated returns:
####white test for homoscedasticity: regress squared-error against regressors, check r2.
resi=residuals**2
reg.fit(factors, resi)
residuals_hat=reg.predict(factors)
resmean=np.mean(resi,axis=0) #daily average return
tss=np.sum( (resi-resmean)**2, axis=0 ) #daily total_sum_squares
rss=np.sum( (resi- residuals_hat)**2, axis=0 ) #daily residual_sum_squares ~ var residuals
r2=1-rss/tss #daily R^2

#compute covariance of error timeseries: TT_t,s=cov(err_t,err_s).
#TT should be close to a homothety
TT=np.cov(residuals.T)
TTD=TT.diagonal()
plt.scatter(range(len(TTD)),TTD)
#on each row, pick column with maximal entry and check if it's diagonal.
#works beautifully, T is almost diagonal
MAX=np.argmax(TT,axis=0)
MAX-list(range(len(TT)))



In [None]:
####looking for patterns in variance of returns versus marketcap.

pathh='/home/fbuonerba/factor_loadings/'
mkcap_list_usd=[]
mkcap_list_btc=[]
variance_usd=[]
variance_btc=[]
for coin in coins:
    with open(pathh+'log_mkcap_exact_'+str(coin)+'_USD_1531612800.txt')as uo:
        mcap=json.load(uo)
    mkcap_list_usd.append(mcap)
    with open(pathh+'log_mkcap_exact_'+str(coin)+'_BTC_1531612800.txt')as uoo:
        cap=json.load(uoo)
    mkcap_list_btc.append(cap)
    with open(pathh+'variance_'+str(coin)+'_USD_1524355200_1531612800_604800.txt')as uo:
        var=json.load(uo)
    variance_usd.append(var)
    with open(pathh+'variance_'+str(coin)+'_BTC_1524355200_1531612800_604800.txt')as uoo:
        varr=json.load(uoo)
    variance_btc.append(varr)

order_usd=np.argsort(np.array(mkcap_list_usd))
order_btc=np.argsort(np.array(mkcap_list_btc))
new_var_usd=[variance_usd[x] for x in order_usd]
new_var_btc=[variance_btc[x] for x in order_btc]


In [None]:
####Outliers = HB,DIG,ODE for both USD and BTC.
plt.scatter(list(range(len(new_var_btc))), new_var_btc, c='r', label='btc')
plt.savefig('ordered_var_btc.png')

In [None]:
plt.scatter(list(range(len(new_var_usd))), new_var_usd, c='b', label='usd')
plt.savefig('ordered_var_usd.png')

In [None]:
#report printout
print('Definition of factors, for each coin pair:')
print('*returns_variance = var(returns_t), over weekly returns Jan 7th-Apr 1st.')
print('*returns_strength=sum_t( log(1+return_t) ) over returns as above.')
print('*returns_high_low=log( max_t(return_t)/min_t(return_t) ) over returns as above.')
print('*turnover=(total traded volume)/(average coin supply) over trades Jan 7th-Apr 1st.')
print('*log_mkcap=average_t( log(coin_supply_t*price_t) ) over weekly supply and price Jan 7th-Apr 1st.')
print('*coin_ratio=average_t(coin_supply_t/coin_supply_ever) over weekly supply as above.')
print('Factor loadings have been scaled by z-score.')
print('')
for y in df_list:    
    for x in y:
        print(x)

In [None]:
#####regression against returns on April 1st only#####
t=1522540800
ret=[]
for base in coins:
    for quote in quotes:
        returns=upload_log_return(t, base, quote, 86400)
        if np.isnan(returns)==True:
            returns=0
        ret.append(returns)
ret=np.array(ret)
reg=linear_model.LinearRegression()
reg.fit(factors,ret.T)
beta=reg.coef_
inter=reg.intercept_
ret_hat=reg.predict(factors)
rss=np.linalg.norm(ret-ret_hat)**2
var_error=rss/(factors.shape[0]-factors.shape[1])
tss=np.linalg.norm(ret-np.mean(ret))**2
r_square=1-rss/tss
F_score=(tss-rss)/(factors.shape[1]*var_error)
cov_beta=np.linalg.inv(np.dot(factors.T,factors))*var_error
std_errors=np.sqrt(cov_beta.diagonal())
t_stats=beta/std_errors

df=pd.DataFrame()
df['factor_name']=['returns_variance', 'returns_strength', 'returns_high_low', 'turnover', 'log_mkcap', 'coin_ratio']
df['beta']=beta
df['std_error']=std_errors
df['z_score']=t_stats
df1=pd.DataFrame()
df1['r_squared']=np.array([r_square])
df1['F_score']=np.array([F_score])
print('Regression on April 1st returns.')
print('Definition of factors, for each coin pair:')
print('*returns_variance = var(returns_t), over weekly returns Jan 7th-Apr 1st.')
print('*returns_strength=sum_t( log(1+return_t) ) over returns as above.')
print('*returns_high_low=log( max_t(return_t)/min_t(return_t) ) over returns as above.')
print('*turnover=(total traded volume)/(average coin supply) over trades Jan 7th-Apr 1st.')
print('*log_mkcap=average_t( log(coin_supply_t*price_t) ) over weekly supply and price Jan 7th-Apr 1st.')
print('*coin_ratio=average_t(coin_supply_t/coin_supply_ever) over weekly supply as above.')
print('Factor loadings have been scaled by z-score.')
print('')
print(df)
print(df1)

In [None]:
#####February to May#####

from mp_functions import upload_log_return
with open('/home/fbuonerba/codes/meta_data/top_coins.txt') as ff:
    coins=json.load(ff)
coins=list(coins)
####LIZA is outlier for 2 of the risk factors...####
coins.remove('LIZA')
quotes=['USD','BTC']
#sysm=[str(coin)+'_'+str(quote) for coin in coins for quote in quotes]
t=1517702400
matrix=[]
#matrix of daily log_returns (time, base_quote)
while t<=1525564801:
    ret_t=[]
    for base in coins:
        for quote in quotes:
            returns=upload_log_return(t, base, quote, 86400)
            if np.isnan(returns)==True:
                returns=0
            ret_t.append(returns)
            #print(base,quote)
    matrix.append(ret_t)
    t+=86400
R=np.array(matrix)

factors=[]
factor_names1=['variance_', 'strength_', 'high_low_']
factor_names2=['turnover_', 'log_mkcap_']
factor_names3=['coin_ratio_']
path='/home/fbuonerba/codes/factor_loadings/'
finpath1='_'+str(1517702400)+'_'+str(1525564800)+'_'+str(604800)+'.txt'
finpath2='_'+str(1517702400)+'_'+str(1525564800)+'.txt'
finpath3='.txt'
for base in coins:
    for quote in quotes:
        sym=str(base)+'_'+str(quote)
        sym_row=[]
        for name in factor_names1:
            with open(path+name+sym+finpath1) as file:
                x=json.load(file)
            sym_row.append(x)
        for name in factor_names2:
            with open(path+name+sym+finpath2) as file:
                x=json.load(file)
            sym_row.append(x)
        for name in factor_names3:
            #symll=sym.split('_')
            with open(path+name+base+finpath3) as file:
                x=json.load(file)
            sym_row.append(x)
        #sym_row.append(1)
        factors.append(sym_row)
factors=np.array(factors)  
#normalization using z-score along coin axis:
factors=(factors-np.mean(factors,axis=0))/np.var(factors,axis=0)**.5
#X_with_ones = np.hstack((np.ones((factors.shape[0], 1)), factors))

In [None]:
######################################################
######################################################
######################################################

In [None]:
#regression against each one single time, BTC/USD
factor_test=factors
r=[]
for i in range(8):
    R_test=R[i]
    beta_test=np.linalg.lstsq(factor_test,R_test.T, rcond=None)
    S=np.dot(factors,beta_test[0])
    res_test=np.linalg.norm(R_test-S)**2
    r.append(1-res_test/np.linalg.norm(R_test)**2)
r=np.array(r)
print(np.max(r), np.min(r))

In [None]:
reg=linear_model.LinearRegression()
for i in range(6):
    fac=factors.T[i].reshape(-1,1)
    ret=R[8].T
    #print(fac.shape)
    reg.fit(fac,ret)
    temp_beta=reg.coef_[0]
    temp_inter=reg.intercept_
    print(temp_beta, temp_inter)
    print(reg.score(fac,ret))
    plt.plot(fac, ret,'o')
    plt.plot(fac, temp_beta*fac + temp_inter, 'r', label='Fitted line')
    plt.show()

In [None]:
for t in range(88):
    print(coins[int(t/2)],quotes[t%2])
    plt.plot(range(len(R)), R.T[t])
    plt.show()