# gpanterov/statsmodels forked from statsmodels/statsmodels

added fast unconditional multivariate cdf and tested with continuous and mixed data
gpanterov committed Jul 7, 2012
1 parent 7c7ae88 commit 125c66ae37965ddb8297705b66bce1b9d05cfcd6
@@ -12,6 +12,7 @@
# NOTE: As it is, this module does not interact with the existing API
import numpy as np
+from scipy.special import erf
def AitchisonAitken(h, Xi, x, num_levels=False):
@@ -59,15 +60,6 @@ def AitchisonAitken(h, Xi, x, num_levels=False):
----------
Nonparametric econometrics : theory and practice / Qi Li and Jeffrey Scott Racine.
"""
-## Xi=np.asarray(Xi)
-## N = np.shape(Xi)[0]
-## if Xi.ndim>1:
-## K=np.shape(Xi)[1]
-## else:
-## K=1
-##
-## if K==0: return Xi
-## h = np.asarray(h, dtype = float)
Xi = np.abs(np.asarray(Xi, dtype=int))
x = np.abs(np.asarray(x, dtype=int))
@@ -296,5 +288,87 @@ def AitchisonAitken_Convolution(h, Xi, Xj):
for x in Dom_x[i]:
Sigma_x += AitchisonAitken (h[i], Xi[:, i], int(x),
num_levels=len(Dom_x[i])) * AitchisonAitken(h[i], Xj[i], int(x), num_levels=len(Dom_x[i]))
- Ordered[:, i] = Sigma_x[:, 0]
+ Ordered[:, i] = Sigma_x[:, 0]
return Ordered
+
def Gaussian_cdf(h, Xi, x):
    """
    Gaussian kernel for estimating the cumulative distribution function
    of a continuous variable.

    Parameters
    ----------
    h : array-like
        Bandwidth(s), one per variable.
    Xi : array-like
        Training data points, shape (N,) or (N, K).
    x : array-like or scalar
        Evaluation point(s).

    Returns
    -------
    cdf : ndarray, shape (N, K)
        Kernel cdf contribution of each training point.

    Notes
    -----
    The extra factor ``h`` is deliberate: GPKE divides the kernel product
    by ``prod(bw)`` for continuous kernels, so the bandwidth cancels and
    the plain normal cdf remains.
    """
    Xi = np.asarray(Xi)
    x = np.asarray(x)
    h = np.asarray(h, dtype=float)
    if Xi.ndim > 1:
        N, K = Xi.shape
    elif Xi.ndim == 1:
        N = Xi.shape[0]
        K = 1
    else:  # ndim == 0: a single point, consistent with the other kernels
        N = K = 1
    if K == 0:
        return Xi
    # NOTE(review): scipy.special.ndtr is the standard normal cdf (used by
    # scipy.stats.distributions.norm) and could replace the erf expression.
    cdf = 0.5 * h * (1 + erf((x - Xi) / (h * np.sqrt(2))))
    return cdf.reshape([N, K])
+
def AitchisonAitken_cdf(h, Xi, x_u):
    """
    Aitchison-Aitken kernel cdf for an unordered discrete variable.

    Sums the AitchisonAitken kernel over every observed level that is
    <= ``x_u``.

    Parameters
    ----------
    h : array-like
        Bandwidth(s), one per variable.
    Xi : array-like
        Training data points, shape (N,), (N, K), or scalar.
    x_u : int
        Upper evaluation point of the cdf.

    Returns
    -------
    Ordered : ndarray, shape (N, K)
        Cumulative kernel values per training point and variable.
    """
    Xi = np.abs(np.asarray(Xi, dtype=int))
    if Xi.ndim > 1:
        N, K = Xi.shape
    elif Xi.ndim == 1:
        N = Xi.shape[0]
        K = 1
    else:  # ndim == 0: Xi is a single point (number)
        N = K = 1
    if K == 0:
        return Xi

    h = np.asarray(h, dtype=float)
    Xi = Xi.reshape([N, K])
    # Support of each variable: the distinct levels observed in the sample.
    Dom_x = [np.unique(Xi[:, i]) for i in range(K)]
    Ordered = np.empty([N, K])
    for i in range(K):
        Sigma_x = 0
        # TODO: This can be vectorized
        for x in Dom_x[i]:
            if x <= x_u:
                Sigma_x += AitchisonAitken(h[i], Xi[:, i], int(x),
                                           num_levels=len(Dom_x[i]))
        Ordered[:, i] = Sigma_x[:, 0]
    return Ordered
+
def WangRyzin_cdf(h, Xi, x_u):
    """
    Wang-Ryzin kernel cdf for an ordered discrete variable.

    Sums the WangRyzin kernel over every observed level that is <= ``x_u``.

    Parameters
    ----------
    h : array-like
        Bandwidth(s), one per variable.
    Xi : array-like
        Training data points, shape (N,), (N, K), or scalar.
    x_u : int
        Upper evaluation point of the cdf.

    Returns
    -------
    ndarray, shape (N, K)
        Cumulative kernel values per training point and variable.
    """
    Xi = np.abs(np.asarray(Xi, dtype=int))
    h = np.asarray(h, dtype=float)
    if Xi.ndim == 0:
        # A single point (number).
        nobs = nvar = 1
    else:
        nobs = Xi.shape[0]
        nvar = Xi.shape[1] if Xi.ndim > 1 else 1
    if nvar == 0:
        return Xi
    Xi = Xi.reshape([nobs, nvar])
    h = h.reshape((nvar, ))
    # Support of each variable: the distinct levels observed in the sample.
    domains = [np.unique(Xi[:, j]) for j in range(nvar)]
    cdf_vals = np.empty([nobs, nvar])
    for j in range(nvar):
        total = 0
        # TODO: Think about vectorizing this for optimal performance
        for level in domains[j]:
            if level <= x_u:
                total += WangRyzin(h[j], Xi[:, j], int(level))
        cdf_vals[:, j] = total[:, 0]
    return cdf_vals
+
@@ -20,6 +20,7 @@
 from scipy import integrate, stats
 import np_tools as tools
 import scipy.optimize as opt
+import KernelFunctions as kf

 __all__ = ['UKDE', 'CKDE']

@@ -313,7 +314,7 @@
     def IMSE(self, bw):
         # there is a + because loo_likelihood returns the negative
         return (F / (self.N ** 2) + self.loo_likelihood(bw) * 2 /
                 ((self.N) * (self.N - 1)))

-    def cdf(self, val):
+    def cdf2(self, val):
         """
         Returns the cumulative distribution function evaluated at val

@@ -344,6 +345,18 @@
         elif self.K == 3:
             func = tools.IntegrateTrpl
         return func(val, self.pdf)

+    def cdf(self, edat=None):
+        if edat is None:
+            edat = self.tdat
+        cdf_est = tools.GPKE(self.bw, tdat=self.tdat, edat=edat,
+                             var_type=self.var_type,
+                             ckertype="gaussian_cdf",
+                             ukertype="aitchisonaitken_cdf",
+                             okertype='wangryzin_cdf') / self.N
+        cdf_est = np.squeeze(cdf_est)
+        return cdf_est

     def __repr__(self):
         """Provide something sane to print."""
@@ -8,7 +8,10 @@
epanechnikov=kf.Epanechnikov, gaussian=kf.Gaussian,
gauss_convolution=kf.Gaussian_Convolution,
wangryzin_convolution=kf.WangRyzin_Convolution,
- aitchisonaitken_convolution=kf.AitchisonAitken_Convolution)
+ aitchisonaitken_convolution=kf.AitchisonAitken_Convolution,
+ gaussian_cdf=kf.Gaussian_cdf,
+ aitchisonaitken_cdf=kf.AitchisonAitken_cdf,
+ wangryzin_cdf=kf.WangRyzin_cdf)

#### rgommers Jul 7, 2012

Collaborator

Indentation not equal here.

Convolution_Kernels = dict(gauss_convolution=kf.Gaussian_Convolution,
wangryzin_convolution=kf.WangRyzin_Convolution)
@@ -280,5 +283,5 @@ def GPKE_Reg(bw, tdat, edat, var_type, ckertype='gaussian',
kernel_func[ukertype](bw[isunordered], tdat[:, isunordered], edat[i, isunordered])
), axis=1)
- dens[:,i] = np.prod(Kval, axis=1) #* 1. / (np.prod(bw[iscontinuous]))
+ dens[:,i] = np.prod(Kval, axis=1) * 1. / (np.prod(bw[iscontinuous]))
return dens
@@ -2,9 +2,8 @@
import numpy.testing as npt
import matplotlib.pyplot as plt
import scipy.stats as stats
-#import statsmodels.nonparametric as nparam
-import nonparametric2 as nparam
+import statsmodels.nonparametric as nparam
+
class MyTest(object):
def setUp(self):
N = 60
@@ -229,7 +228,20 @@ def test_poisson(self, a=2, N=250):
plt.title("Nonparametric Estimation of the Density of Poisson Distributed Random Variable")
plt.legend(('Actual','Scott','CV_LS','CV_ML'))
plt.show()
-
+ def test_continuous_cdf(self,edat=None):
+ dens = nparam.UKDE (tdat = [self.Italy_gdp, self.growth], var_type = 'cc',
+ bw = 'cv_ml')
+ sm_result = dens.cdf()[0:5]
+ R_result = [0.192180770,0.299505196,0.557303666,
+ 0.513387712, 0.210985350]
+ npt.assert_allclose(sm_result, R_result, atol = 1e-3)

#### rgommers Jul 7, 2012

Collaborator

Need a blank line here.

+ def test_mixeddata_cdf(self, edat=None):
+ dens = nparam.UKDE(tdat = [self.Italy_gdp, self.oecd], var_type='cu',
+ bw='cv_ml')
+ sm_result = dens.cdf()[0:5]
+ R_result = [0.54700010, 0.65907039, 0.89676865, 0.74132941, 0.25291361]
+ npt.assert_allclose(sm_result, R_result, atol = 1e-3)
+
class TestCKDE(MyTest):
def test_mixeddata_CV_LS (self):

### josef-pkt replied Jul 7, 2012

 nice, I like having the cdf available, Azzalini (IIRC) mentioned that the rate (as function of n) for the bandwidth for cdf should be smaller (?) than for the pdf. Did you see anything about bw choice for cdf?
Collaborator

### rgommers replied Jul 7, 2012

 Fast, and seems to work well. At least in 1-D, converges nicely to the empirical CDF.
Collaborator

### rgommers replied Jul 7, 2012

 @josef-pkt that's what I thought too, bandwidth isn't the same as for pdf.
Collaborator

### rgommers replied Jul 7, 2012

 Can you factor out all the code related to `asarray`, `K, N` and reshaping? It's more than 10 lines that are duplicated in every single kernel function. Should be something like ``````def _get_shape_and_type(Xi, x, kind='c'): ... return Xi, x, K, N ``````
Collaborator

### rgommers replied Jul 7, 2012

 The `UKDE, CKDE` interface now doesn't allow specifying the kernels to use. The Epanechnikov kernel isn't used at all. Are you planning to expand that interface? In any case, what the default kernels are should be documented.
Collaborator

### rgommers replied Jul 7, 2012

 I have the feeling that the for-loop in `GPKE` can still be optimized, it's very expensive now. You can see this easily by profiling in IPython. Use for example `%prun dens_scott.cdf()` after having run the below script. 33000 function calls for a 1-D example with 1000 points. ``````import numpy as np from statsmodels.sandbox.distributions.mixture_rvs import mixture_rvs from statsmodels.nonparametric import UKDE from statsmodels.tools.tools import ECDF import matplotlib.pyplot as plt np.random.seed(12345) obs_dist = mixture_rvs([.25,.75], size=1000, dist=[stats.norm, stats.norm], kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.5))) dens_scott = UKDE(tdat=[obs_dist], var_type='c', bw='normal_reference') est_scott = dens_scott.pdf() xx = np.linspace(-4, 4, 100) est_scott_cdf = dens_scott.cdf(xx) ecdf = ECDF(obs_dist) # Plot the cdf fig = plt.figure() plt.plot(xx, est_scott_cdf, 'b.-') plt.plot(ecdf.x, ecdf.y, 'r.-') plt.show() ``````
Collaborator

### rgommers replied Jul 7, 2012

 The for-loop could be over the number of variables instead of the data points, right?

### jseabold replied Jul 12, 2012

 It looks that way to me too. @gpanterov is there any reason this couldn't be vectorized over the number of observations?