Permalink
Browse files

Added a fast unconditional multivariate CDF and tested it with continuous and mixed data.
  • Loading branch information...
gpanterov committed Jul 7, 2012
1 parent 7c7ae88 commit 125c66ae37965ddb8297705b66bce1b9d05cfcd6
@@ -12,6 +12,7 @@
# NOTE: As it is, this module does not interact with the existing API
import numpy as np
+from scipy.special import erf
def AitchisonAitken(h, Xi, x, num_levels=False):
@@ -59,15 +60,6 @@ def AitchisonAitken(h, Xi, x, num_levels=False):
----------
Nonparametric econometrics : theory and practice / Qi Li and Jeffrey Scott Racine.
"""
-## Xi=np.asarray(Xi)
-## N = np.shape(Xi)[0]
-## if Xi.ndim>1:
-## K=np.shape(Xi)[1]
-## else:
-## K=1
-##
-## if K==0: return Xi
-## h = np.asarray(h, dtype = float)
Xi = np.abs(np.asarray(Xi, dtype=int))
x = np.abs(np.asarray(x, dtype=int))
@@ -296,5 +288,87 @@ def AitchisonAitken_Convolution(h, Xi, Xj):
for x in Dom_x[i]:
Sigma_x += AitchisonAitken (h[i], Xi[:, i], int(x),
num_levels=len(Dom_x[i])) * AitchisonAitken(h[i], Xj[i], int(x), num_levels=len(Dom_x[i]))
- Ordered[:, i] = Sigma_x[:, 0]
+ Ordered[:, i] = Sigma_x[:, 0]
return Ordered
+
+def Gaussian_cdf(h, Xi, x):
+ Xi = np.asarray(Xi)
+ x = np.asarray(x)
+ h = np.asarray(h, dtype=float)
+ N = np.shape(Xi)[0]
+ if Xi.ndim > 1:
+ K = np.shape(Xi)[1]
+ else:
+ K = 1
+ if K == 0:
+ return Xi
+ cdf = 0.5 * h * (1 + erf((x - Xi) / (h * np.sqrt(2))))
@josef-pkt

josef-pkt Jul 7, 2012

special.ndtr(x) has the cdf for standard normal, used by scipy.stats.distributions.norm

+ cdf = cdf.reshape([N, K])
+ return cdf
+
def AitchisonAitken_cdf(h, Xi, x_u):
    """Cumulative sum of the Aitchison-Aitken kernel up to level ``x_u``.

    For each variable i, sums ``AitchisonAitken(h[i], Xi[:, i], x)`` over
    every observed (unique) level ``x`` of that variable with ``x <= x_u``.

    Parameters
    ----------
    h : array-like
        Bandwidth(s), one entry per variable (indexed as ``h[i]``).
    Xi : array-like of ints
        Training data, shape (N,) or (N, K); absolute values are taken.
    x_u : int
        Upper level of the cumulative sum; only observed levels
        ``<= x_u`` contribute.

    Returns
    -------
    ndarray of shape (N, K)
        Per-observation, per-variable cumulative kernel values.
    """
    Xi = np.abs(np.asarray(Xi, dtype=int))

    if Xi.ndim > 1:
        K = np.shape(Xi)[1]
        N = np.shape(Xi)[0]
    elif Xi.ndim == 1:
        K = 1
        N = np.shape(Xi)[0]
    else:  # ndim == 0, so Xi is a single point (number)
        K = 1
        N = 1
    if K == 0:
        return Xi

    h = np.asarray(h, dtype=float)
    Xi = Xi.reshape([N, K])
    Dom_x = [np.unique(Xi[:, i]) for i in range(K)]
    Ordered = np.empty([N, K])
    for i in range(K):
        Sigma_x = 0
        # TODO: This can be vectorized
        # NOTE(review): if no observed level satisfies x <= x_u, Sigma_x
        # stays the int 0 and the [:, 0] indexing below fails -- presumably
        # callers always pass x_u >= min(Dom_x[i]); confirm.
        for x in Dom_x[i]:
            if x <= x_u:
                Sigma_x += AitchisonAitken(h[i], Xi[:, i], int(x),
                                           num_levels=len(Dom_x[i]))
        Ordered[:, i] = Sigma_x[:, 0]
    return Ordered
+
def WangRyzin_cdf(h, Xi, x_u):
    """Cumulative sum of the Wang-Ryzin kernel up to level ``x_u``.

    For each (ordered discrete) variable j, sums
    ``WangRyzin(h[j], Xi[:, j], lvl)`` over every observed level ``lvl``
    of that variable with ``lvl <= x_u``.

    Parameters
    ----------
    h : array-like
        Bandwidth(s); reshaped to one entry per variable.
    Xi : array-like of ints
        Training data, shape (N,) or (N, K); absolute values are taken.
    x_u : int
        Upper level of the cumulative sum.

    Returns
    -------
    ndarray of shape (N, K)
        Per-observation, per-variable cumulative kernel values.
    """
    Xi = np.abs(np.asarray(Xi, dtype=int))
    h = np.asarray(h, dtype=float)

    if Xi.ndim > 1:
        N, K = np.shape(Xi)[0], np.shape(Xi)[1]
    elif Xi.ndim == 1:
        N, K = np.shape(Xi)[0], 1
    else:  # ndim == 0, so Xi is a single point (number)
        N, K = 1, 1
    if K == 0:
        return Xi

    Xi = Xi.reshape([N, K])
    h = h.reshape((K, ))
    levels = [np.unique(Xi[:, j]) for j in range(K)]
    Ordered = np.empty([N, K])
    for j in range(K):
        total = 0
        # TODO: Think about vectorizing this for optimal performance
        for lvl in levels[j]:
            if lvl <= x_u:
                total += WangRyzin(h[j], Xi[:, j], int(lvl))
        Ordered[:, j] = total[:, 0]
    return Ordered
+
@@ -20,6 +20,7 @@
from scipy import integrate, stats
import np_tools as tools
import scipy.optimize as opt
+import KernelFunctions as kf
__all__ = ['UKDE', 'CKDE']
@@ -313,7 +314,7 @@ def IMSE(self, bw):
# there is a + because loo_likelihood returns the negative
return (F / (self.N ** 2) + self.loo_likelihood(bw) * 2 / ((self.N) * (self.N - 1)))
- def cdf(self, val):
+ def cdf2(self, val):
"""
Returns the cumulative distribution function evaluated at val
@@ -344,6 +345,18 @@ def cdf(self, val):
elif self.K == 3:
func = tools.IntegrateTrpl
return func(val, self.pdf)
+ def cdf(self, edat=None):
+
+ if edat is None:
+ edat = self.tdat
+
+ cdf_est = tools.GPKE(self.bw, tdat=self.tdat, edat=edat,var_type=self.var_type,
+ ckertype = "gaussian_cdf", ukertype="aitchisonaitken_cdf",
+ okertype='wangryzin_cdf') / self.N
+ cdf_est = np.squeeze(cdf_est)
+
+ return cdf_est
+
def __repr__(self):
"""Provide something sane to print."""
@@ -8,7 +8,10 @@
epanechnikov=kf.Epanechnikov, gaussian=kf.Gaussian,
gauss_convolution=kf.Gaussian_Convolution,
wangryzin_convolution=kf.WangRyzin_Convolution,
- aitchisonaitken_convolution=kf.AitchisonAitken_Convolution)
+ aitchisonaitken_convolution=kf.AitchisonAitken_Convolution,
+ gaussian_cdf=kf.Gaussian_cdf,
+ aitchisonaitken_cdf=kf.AitchisonAitken_cdf,
+ wangryzin_cdf=kf.WangRyzin_cdf)
@rgommers

rgommers Jul 7, 2012

Collaborator

Indentation not equal here.

Convolution_Kernels = dict(gauss_convolution=kf.Gaussian_Convolution,
wangryzin_convolution=kf.WangRyzin_Convolution)
@@ -280,5 +283,5 @@ def GPKE_Reg(bw, tdat, edat, var_type, ckertype='gaussian',
kernel_func[ukertype](bw[isunordered], tdat[:, isunordered], edat[i, isunordered])
), axis=1)
- dens[:,i] = np.prod(Kval, axis=1) #* 1. / (np.prod(bw[iscontinuous]))
+ dens[:,i] = np.prod(Kval, axis=1) * 1. / (np.prod(bw[iscontinuous]))
return dens
@@ -2,9 +2,8 @@
import numpy.testing as npt
import matplotlib.pyplot as plt
import scipy.stats as stats
-#import statsmodels.nonparametric as nparam
-import nonparametric2 as nparam
-reload(nparam)
+import statsmodels.nonparametric as nparam
+
class MyTest(object):
def setUp(self):
N = 60
@@ -229,7 +228,20 @@ def test_poisson(self, a=2, N=250):
plt.title("Nonparametric Estimation of the Density of Poisson Distributed Random Variable")
plt.legend(('Actual','Scott','CV_LS','CV_ML'))
plt.show()
-
+ def test_continuous_cdf(self,edat=None):
+ dens = nparam.UKDE (tdat = [self.Italy_gdp, self.growth], var_type = 'cc',
+ bw = 'cv_ml')
+ sm_result = dens.cdf()[0:5]
+ R_result = [0.192180770,0.299505196,0.557303666,
+ 0.513387712, 0.210985350]
+ npt.assert_allclose(sm_result, R_result, atol = 1e-3)
@rgommers

rgommers Jul 7, 2012

Collaborator

Need a blank line here.

+ def test_mixeddata_cdf(self, edat=None):
+ dens = nparam.UKDE(tdat = [self.Italy_gdp, self.oecd], var_type='cu',
+ bw='cv_ml')
+ sm_result = dens.cdf()[0:5]
+ R_result = [0.54700010, 0.65907039, 0.89676865, 0.74132941, 0.25291361]
+ npt.assert_allclose(sm_result, R_result, atol = 1e-3)
+
class TestCKDE(MyTest):
def test_mixeddata_CV_LS (self):

8 comments on commit 125c66a

Nice — I like having the cdf available. Azzalini (IIRC) mentioned that the rate (as a function of n) for the bandwidth for the cdf should be smaller (?) than for the pdf. Did you see anything about bandwidth choice for the cdf?

Collaborator

rgommers replied Jul 7, 2012

Fast, and seems to work well. At least in 1-D, converges nicely to the empirical CDF.

Collaborator

rgommers replied Jul 7, 2012

@josef-pkt that's what I thought too, bandwidth isn't the same as for pdf.

Collaborator

rgommers replied Jul 7, 2012

Can you factor out all the code related to asarray, K, N and reshaping? It's more than 10 lines that are duplicated in every single kernel function. Should be something like

def _get_shape_and_type(Xi, x, kind='c'):
    ...
    return Xi, x, K, N
Collaborator

rgommers replied Jul 7, 2012

The UKDE, CKDE interface now doesn't allow specifying the kernels to use. The Epannechnikov kernel isn't used at all. Are you planning to expand that interface? In any case, what the default kernels are should be documented.

Collaborator

rgommers replied Jul 7, 2012

I have the feeling that the for-loop in GPKE can still be optimized, it's very expensive now. You can see this easily by profiling in IPython. Use for example %prun dens_scott.cdf() after having run the below script. 33000 function calls for a 1-D example with 1000 points.

import numpy as np

from statsmodels.sandbox.distributions.mixture_rvs import mixture_rvs
from statsmodels.nonparametric import UKDE
from statsmodels.tools.tools import ECDF

import matplotlib.pyplot as plt


np.random.seed(12345)
obs_dist = mixture_rvs([.25,.75], size=1000, dist=[stats.norm, stats.norm],
                kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5)))

dens_scott = UKDE(tdat=[obs_dist], var_type='c', bw='normal_reference')
est_scott = dens_scott.pdf()

xx = np.linspace(-4, 4, 100)
est_scott_cdf = dens_scott.cdf(xx)
ecdf = ECDF(obs_dist)

# Plot the cdf
fig = plt.figure()
plt.plot(xx, est_scott_cdf, 'b.-')
plt.plot(ecdf.x, ecdf.y, 'r.-')

plt.show()
Collaborator

rgommers replied Jul 7, 2012

The for-loop could be over the number of variables instead of the data points, right?

It looks that way to me too. @gpanterov is there any reason this couldn't be vectorized over the number of observations?

Please sign in to comment.