In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import plotnine
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt
from time import time

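# References:
# - AgahiMaryam2013.pdf (local copy only: file:///C:/Users/ERIKDR~1/AppData/Local/Temp/AgahiMaryam2013.pdf -- TODO: replace with a public link)
# - Theoretical computation of Kendall's tau: https://stats.stackexchange.com/questions/353834/theoretical-computation-kendalls-tau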

In [2]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied elementwise via NumPy."""
    denom = 1 + np.exp(-x)
    return 1 / denom

def rvec(x):
    """Return `x` as an array with at least two dimensions (a 1-D input becomes one row)."""
    as_2d = np.atleast_2d(x)
    return as_2d

def cvec(x):
    """Return the transpose of `rvec(x)` (a 1-D input becomes a single column)."""
    as_row = rvec(x)
    return as_row.T

def to_3d(mat):
    """Promote `mat` to at least 3-D and move the last axis to the front.

    A 2-D (p, s) input therefore comes back with shape (1, p, s).
    """
    cube = np.atleast_3d(mat)
    return np.transpose(cube, (2, 0, 1))

def srho(x, y):
    """Spearman rank correlation of `x` and `y` (first element of scipy's result)."""
    result = stats.spearmanr(x, y)
    return result[0]

"""
VECTORIZED PAIRWISE CORRELATION
A, B : arrays of identical shape (n x p x s), or (n x p) when there are no copies
n: sample size (rows of the data)
p: columns of the data (could be bootstrapped columns)
s: copies of the n x p matrix (could be studentized copies)
"""

def pairwise_cor(A, B):
    """
    Pearson correlation between matching columns of A and B, taken along axis 0.

    Parameters
    ----------
    A, B : array-like with identical shapes, e.g. (n,), (n, p) or (n, p, s).
        Axis 0 indexes the n observations; every remaining axis indexes an
        independent column/copy whose correlation is computed separately.

    Returns
    -------
    Array of shape A.shape[1:] holding the sample correlations (a scalar for
    1-D input).

    Raises
    ------
    AssertionError if the shapes of A and B differ.
    """
    A, B = np.asarray(A), np.asarray(B)
    assert A.shape == B.shape
    n = A.shape[0]
    # keepdims keeps the reduced axis so the means broadcast against the data
    # for any rank; the previous reshape-based branches mishandled 1-D input
    dev_A = A - A.mean(axis=0, keepdims=True)
    dev_B = B - B.mean(axis=0, keepdims=True)
    # Sample covariance and standard deviations both use the n-1 denominator
    cov_AB = np.sum(dev_A * dev_B, axis=0) / (n - 1)
    sd_A, sd_B = A.std(axis=0, ddof=1), B.std(axis=0, ddof=1)
    return cov_AB / (sd_A * sd_B)

def bs_student_spearman(x, y, n_bs, n_s, alpha=0.05):
    """
    Bootstrap confidence intervals for Spearman's rho built three ways.

    Parameters
    ----------
    x, y : equal-length sequences of observations.
    n_bs : number of bootstrap samples drawn from the ranked data.
    n_s : number of second-level resamples per bootstrap sample, used to get
        the standard error that studentizes each bootstrap statistic.
    alpha : float, list or array of levels, each strictly between 0 and 0.5.

    Returns
    -------
    DataFrame with columns 'lb', 'ub', 'tt' and 'alpha'. 'tt' is one of
    'student' (bootstrap-t), 'normal' (normal quantiles with the bootstrap
    standard error) or 'quant' (quantiles of the bootstrap statistics), and
    there is one row per (tt, alpha) combination.
    """
    methods = ['student', 'normal', 'quant']
    # A bare float or a list gets wrapped once before being coerced to a 2-D row
    if isinstance(alpha, (float, list)):
        alpha = np.array([alpha])
    alpha = rvec(alpha)
    assert len(x) == len(y)
    assert np.all(alpha > 0) & np.all(alpha < 0.5)
    n = len(x)
    # (i) Point estimate on the observed data
    rho = stats.spearmanr(x, y)[0]
    # One row per level: [lower tail probability, upper tail probability]
    pvals = np.concatenate([alpha / 2, 1 - alpha / 2], axis=0).T
    # (ii) Rank the data and draw n * n_bs paired observations with replacement;
    # the pandas sampler is kept so the sequence of random draws is unchanged
    ranks_x, ranks_y = stats.rankdata(x), stats.rankdata(y)
    draw = pd.Series(ranks_x).sample(frac=n_bs, replace=True).index.to_numpy()
    x_bs = ranks_x[draw].reshape(n, n_bs)
    y_bs = ranks_y[draw].reshape(n, n_bs)
    rho_bs = pairwise_cor(x_bs, y_bs)
    se_bs = rho_bs.std(ddof=1)
    # (iii) Resample the rows of the bootstrap matrices n_s times to estimate a
    # standard error for every bootstrap statistic
    redraw = pd.DataFrame(x_bs).sample(frac=n_s, replace=True).index.to_numpy()
    x_s = np.transpose(x_bs[redraw].reshape(n_s, n, n_bs), (1, 2, 0))
    y_s = np.transpose(y_bs[redraw].reshape(n_s, n, n_bs), (1, 2, 0))
    se_s = pairwise_cor(x_s, y_s).std(axis=1, ddof=1)
    del x_s, y_s  # release the large (n, n_bs, n_s) arrays
    # Critical values for each construction
    tail_probs = pvals.ravel()
    z_q = np.quantile(rho_bs, tail_probs).reshape(pvals.shape)
    z_n = stats.norm.ppf(pvals)
    t_s = (rho_bs - rho) / se_s
    z_s = np.quantile(t_s, tail_probs).reshape(pvals.shape)
    # Swapping the two columns makes the upper critical value give the lower bound
    ci_student = rho - se_bs * z_s[:, [1, 0]]
    ci_normal = rho - se_bs * z_n[:, [1, 0]]
    df = pd.DataFrame(np.vstack([ci_student, ci_normal, z_q]), columns=['lb', 'ub'])
    df['tt'] = np.repeat(methods, len(pvals))
    df['alpha'] = np.tile(2 * pvals[:, 0], len(methods))
    return df

In [5]:
# Simulation settings: nsim replications of sample size n, each using n_bs
# bootstrap samples and n_s studentizing resamples
seed = 1234
nsim, n = 1000, 100
n_bs, n_s = 500, 500
np.random.seed(seed)

# Approximate the population Spearman correlation with one very large sample
x_gt = np.random.randn(int(1e7))
y_gt = stats.norm.cdf(x_gt) + np.random.rand(len(x_gt))
rho_gt = stats.spearmanr(x_gt, y_gt)[0]

stime = time()
holder = []  # one DataFrame of intervals per replication
for i in range(nsim):
    x = np.random.randn(n)
    y = stats.norm.cdf(x) + np.random.rand(n)
    # Pass the configured n_bs / n_s through so editing the settings takes effect
    tmp_df = bs_student_spearman(x, y, n_bs=n_bs, n_s=n_s, alpha=[0.05, 0.10, 0.2])
    holder.append(tmp_df.assign(idx=i))
    if (i + 1) % 100 == 0:
        # Estimate the remaining time from the average rate so far
        nleft, nsec = nsim - (i+1), time() - stime
        rate = (i+1) / nsec
        eta = nleft / rate
        print('ETA: %i seconds (%i of %i)' % (eta, i+1, nsim))

ETA: 561 seconds (100 of 1000)
ETA: 490 seconds (200 of 1000)
ETA: 405 seconds (300 of 1000)
ETA: 337 seconds (400 of 1000)
ETA: 283 seconds (500 of 1000)
ETA: 227 seconds (600 of 1000)
ETA: 169 seconds (700 of 1000)
ETA: 111 seconds (800 of 1000)
ETA: 54 seconds (900 of 1000)
ETA: 0 seconds (1000 of 1000)


In [6]:
# Stack the per-replication results and discard the repeated per-frame index.
# `drop` is passed by keyword because pandas 2.x rejects it as a positional argument.
df_sim = pd.concat(holder).reset_index(drop=True)

In [7]:
# Empirical coverage: share of intervals containing the large-sample rho,
# for each interval type and level
covered = df_sim['lb'].le(rho_gt) & df_sim['ub'].ge(rho_gt)
dat_cov = (
    df_sim.assign(coverage=covered)
    .groupby(['tt', 'alpha'])['coverage']
    .mean()
    .reset_index()
)
# pivot takes keyword arguments only in pandas 2.x
dat_cov.pivot(index='tt', columns='alpha', values='coverage')

alpha,0.05,0.10,0.20
tt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
normal,0.942,0.886,0.788
quant,0.936,0.89,0.786
student,0.937,0.897,0.793


In [8]:
# One-sided miss rates (rho below the lower bound / above the upper bound)
# and the average bounds, for each level and interval type
(
    df_sim.assign(err_lb=df_sim['lb'] > rho_gt, err_ub=df_sim['ub'] < rho_gt)
    .groupby(['alpha', 'tt'])[['err_lb', 'err_ub', 'lb', 'ub']]
    .mean()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,err_lb,err_ub,lb,ub
alpha,tt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.05,normal,0.03,0.028,0.603083,0.785184
0.05,quant,0.022,0.042,0.596081,0.776209
0.05,student,0.015,0.048,0.588746,0.774657
0.1,normal,0.062,0.052,0.617722,0.770546
0.1,quant,0.049,0.061,0.613212,0.764671
0.1,student,0.04,0.063,0.6073,0.762554
0.2,normal,0.11,0.102,0.634599,0.753669
0.2,quant,0.104,0.11,0.632474,0.750563
0.2,student,0.089,0.118,0.628031,0.74813


In [9]:
from arch.bootstrap import IIDBootstrap

In [10]:
# holder = np.zeros([nsim, 2])
# for i in range(nsim):
#     x = np.random.randn(n)
#     y = stats.norm.cdf(x) + np.random.rand(n)    

In [11]:
# BCa bootstrap interval for Spearman's rho from the `arch` package, for comparison.
# NOTE(review): x and y here are whatever the kernel last assigned -- presumably
# the final draw of the simulation loop; confirm before relying on this output.
bs = IIDBootstrap(x=x, y=y)
bca_kwargs = dict(func=srho, reps=1000, method='bca', size=0.95)
bs.conf_int(**bca_kwargs)

array([[0.49256373],
       [0.72761943]])

In [12]:
# Sanity check: the Pearson correlation of the ranks should reproduce scipy's Spearman rho
rank_x, rank_y = stats.rankdata(x), stats.rankdata(y)
print(stats.spearmanr(x, y))
print(pairwise_cor(cvec(rank_x), cvec(rank_y))[0])

SpearmanrResult(correlation=0.6223942394239423, pvalue=4.754313796782138e-12)
0.6223942394239425
