# Intrinsic Dimension and Density Estimation

You can use external libraries for linear algebra operations but you are expected to write your
own algorithms.

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.preprocessing import OrdinalEncoder

from algorithms import my_two_nn, my_pca, my_histogram_density_estimation, my_kernel_density_estimation

## Exercise 1
Using the `dry_beans_dataset` as we did in previous laboratories (i.e., follow the same preprocessing steps, but do not perform a train-test split), program your own implementation of the two-NN estimate for the Intrinsic Dimension.

Is the result compatible with what you would expect from an analysis of PCA’s spectrum?

In [13]:
# Read the Dry Bean spreadsheet, then separate the 'Class' label column
# from the remaining columns.
df = pd.read_excel("../Datasets/Dry_Bean_Dataset.xlsx")

X = df.drop(columns='Class')
y = df['Class']

In [14]:
# Replace each class label by an ordinal code. The encoder works on 2-D input,
# so the labels are reshaped into a single column before fitting.
encoder = OrdinalEncoder()
y = encoder.fit_transform(np.array(y).reshape(-1, 1))

In [15]:
# Two-NN intrinsic-dimension estimate, computed by the project-local
# `my_two_nn` (imported from `algorithms`) on the whole feature frame X,
# i.e. without a train/test split, as the exercise requires.
# NOTE(review): the extra lines in the recorded output (a shape and an index
# array) are not printed by this cell; presumably they come from inside
# `my_two_nn` — confirm and remove if they are leftover debugging.
d = my_two_nn(X)
print(d)

(13611, 3)
[    0 10926 10918]
1.9102908936513991


In [16]:
# Run the project-local PCA (`my_pca`, imported from `algorithms`) on the same
# feature frame X used for the two-NN estimate.
# NOTE(review): none of the three returned values is displayed or used in the
# visible cells, so the comparison with the PCA spectrum requested by the
# exercise is not shown here — TODO add it or confirm it lives elsewhere.
x_pca, dim, selected_eigenvecs = my_pca(X)

## Exercise 2

Using the following code, create a one-dimensional dataset of size N = 100.
```
X = np.concatenate(
(np.random.standard_t(1, int(0.04*N))-3.5,
np.random.normal(5, 1, int(0.48 * N)),
np.random.normal(7.5, 1, int(0.48 * N))
))[:, np.newaxis]
```

Compute the density estimation with your implementations of: \
• Histogram Density Estimation (Freedman Diaconis rule) \
• Kernel Density Estimation (KDE) - Gaussian kernel (Silverman’s rule)

In [14]:
# Fix the global NumPy seed so the sample below is reproducible.
np.random.seed(42)

# NOTE(review): the exercise statement asks for N = 100; 10000 is used here —
# confirm this is intentional.
N = 10000

# Three components, drawn in this exact order so the random stream is consumed
# the same way on every run: a heavy-tailed Student-t (1 d.o.f.) shifted by
# -3.5, followed by two unit-variance Gaussians centred at 5 and 7.5.
heavy_tail = np.random.standard_t(1, int(0.04*N)) - 3.5
first_mode = np.random.normal(5, 1, int(0.48 * N))
second_mode = np.random.normal(7.5, 1, int(0.48 * N))

# Stack into a single column vector of shape (n_samples, 1).
# NOTE(review): this rebinds X, which held the dry-bean features in Exercise 1.
X = np.concatenate((heavy_tail, first_mode, second_mode)).reshape(-1, 1)

In [15]:
# Quick look at the raw sample: a coarse 10-bin histogram of the single column.
fig = px.histogram(
    X[:, 0],
    nbins=10,
    title="Dataset Histogram",
    labels={"value": "X"},
    width=600,
    height=400,
)
fig.show()

In [16]:
# Histogram density estimation via the project-local
# `my_histogram_density_estimation` (imported from `algorithms`); per the
# exercise statement the bin width should follow the Freedman-Diaconis rule.
# The four returned values are used by the plotting cell below as the per-bin
# values (prob_hist), the left and right ends of the binned range (x0, x1) and
# the bin width (h).
# NOTE(review): in the recorded output prob_hist prints as integer per-bin
# counts, not normalised densities — confirm what the function is meant to
# return. The first output line is not printed by this cell; presumably it is
# debugging output from inside the function.
prob_hist, x0, x1, h = my_histogram_density_estimation(X)
print(prob_hist)

10000 0.24492320139268245 -43.211996600775606 91.99366406194864 553
[  1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   1   0   0   0   0   0   1   1   0   0   0   0   0   0   1   0
   0   0   1   1   1   1   1   0   0   1   0   0   0   3   1   1   0   0
   0   0   0   0   1   2   0   1   1   1   1   0   0   1   0   1   0   0
   2   0   2   2   1   3   2   4   7   7   5   6  14  13  13  18  24  31
  35  25  24  23   8   9  11   9   9   5   3   5   2   4   4   4   4   2
   1   0   1   5   9  13  24  35  64  94 136 206 280 324 416 438 507 472
 511 475 417 444 417 476 416 515 469 483 453 371 317 275 204 147  78  60
  29  21  11   6   1   1   0   0   0   1   0   0   0   1

In [11]:
# Plot the histogram density estimate.
#
# Inputs from the histogram cell above: prob_hist (per-bin values), x0 (left
# end of the binned range) and h (bin width). h must still be the histogram
# bin width here: the KDE cell further down rebinds h, so re-run the histogram
# cell first if this cell is executed out of order.

# The recorded output shows prob_hist holding raw integer counts, so rescale
# the values to integrate to one before plotting them as a density. The
# rescale is a no-op when the values already form a density, so it is safe
# whichever form my_histogram_density_estimation returns.
counts = np.asarray(prob_hist, dtype=float).ravel()
total_mass = counts.sum() * h
density = counts / total_mass if total_mass > 0 else counts  # empty/all-zero guard

# Derive the edges from the number of bins instead of np.arange(x0, x1 + h, h):
# with a floating-point step, rounding can add or drop an edge and leave the
# bar positions out of step with prob_hist.
# Assumes bin i covers [x0 + i*h, x0 + (i+1)*h) — TODO confirm against
# my_histogram_density_estimation.
bins = x0 + h * np.arange(counts.size + 1)

centers = bins[:-1] + h / 2  # bin centers

fig = px.bar(x=centers, y=density,
             title="Histogram Density Estimation",
             labels={"x": "X", "y": "Density"},
             width=700, height=400)
fig.show()

In [12]:
# Kernel density estimation via the project-local
# `my_kernel_density_estimation` (imported from `algorithms`), evaluated with
# grid_points=1000; per the exercise statement this should be a Gaussian
# kernel with Silverman's bandwidth rule.
# The plotting cell below uses x_grid as the x values, probs as the y values
# and h in the figure title.
# NOTE(review): this rebinds h, which until this point held the histogram bin
# width used by the histogram plotting cell; re-running that cell after this
# one would pick up the KDE value instead.
probs, h, x_grid = my_kernel_density_estimation(X, grid_points=1000)

1000 0.4542159576960844


In [13]:
# Draw the KDE curve over the evaluation grid; the value of h returned by the
# KDE cell is shown in the title.
fig = px.line(
    x=x_grid,
    y=probs,
    title=f"Kernel Density Estimation (h={h:.3f})",
    labels={"x": "X", "y": "Density"},
    width=700,
    height=400,
)
fig.show()