# Quickstart Tutorial
This short demo of the ddl package shows the main interfaces and basic ideas of the library.

In [None]:
# Setup imports
from __future__ import division, print_function
import sys, os, warnings
import numpy as np
import matplotlib.pyplot as plt
sys.path.append('..')  # Enable importing from package ddl without installing ddl

# Setup plotting functions
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline 
# Base size used to build every `figsize` below: single plots are
# PLOT_HEIGHT x PLOT_HEIGHT, and grids scale it by their row/column counts.
PLOT_HEIGHT = 3
def get_ax(ax, title):
    """Return an axes ready for plotting, creating a new square figure if needed.

    Parameters
    ----------
    ax : axes object or None
        Axes to configure. When None, a new single-axes figure of size
        ``PLOT_HEIGHT x PLOT_HEIGHT`` is created.
    title : str or None
        Title to set on the axes; skipped when None.

    Returns
    -------
    The configured axes (the one passed in, or the newly created one).
    """
    needs_new_figure = ax is None
    if needs_new_figure:
        side = PLOT_HEIGHT
        fig, ax = plt.subplots(1, 1, figsize=(side, side))
        fig.tight_layout()

    has_title = title is not None
    if has_title:
        ax.set_title(title)

    # Use the same scale on both axes and adjust the box (not the data limits)
    ax.axis('equal')
    ax.set_adjustable('box')
    return ax
def plot_data(X, y, ax=None, title=None):
    """Scatter-plot the first two columns of ``X``, colored by ``y``.

    Parameters
    ----------
    X : array indexable as ``X[:, 0]`` and ``X[:, 1]``
        Points to plot; only the first two columns are used.
    y : per-point color values passed to ``scatter`` as ``c``.
    ax : axes object or None
        Target axes; a new figure is created by ``get_ax`` when None.
    title : str or None
        Optional axes title.
    """
    target = get_ax(ax, title)
    first_coord = X[:, 0]
    second_coord = X[:, 1]
    target.scatter(first_coord, second_coord, c=y, s=4)
def plot_density(density, bounds=((0, 1), (0, 1)), n_grid=40, ax=None, title=None):
    """Draw a 2D density as a grayscale image evaluated on a regular grid.

    Parameters
    ----------
    density : object with a ``score_samples`` method
        Called once with an array of shape ``(n_grid * n_grid, 2)`` holding
        the grid points; its output is exponentiated, i.e. it is treated as
        a log-density.
    bounds : sequence of two ``(low, high)`` pairs
        Plot range for the first and second coordinate. Defaults to the
        unit square. The default is an immutable tuple so it cannot be
        altered between calls.
    n_grid : int
        Number of grid points along each coordinate.
    ax : axes object or None
        Target axes; a new figure is created by ``get_ax`` when None.
    title : str or None
        Optional axes title.
    """
    ax = get_ax(ax, title)
    # Unpack explicitly instead of `np.linspace(*bounds[0], n_grid)`: a
    # positional argument after `*`-unpacking is a SyntaxError before
    # Python 3.5, which the `from __future__` imports suggest is still targeted.
    x_low, x_high = bounds[0]
    y_low, y_high = bounds[1]
    x = np.linspace(x_low, x_high, n_grid)
    y = np.linspace(y_low, y_high, n_grid)
    X_grid, Y_grid = np.meshgrid(x, y)
    grid_points = np.array([X_grid.ravel(), Y_grid.ravel()]).T
    logpdf = density.score_samples(grid_points)
    pdf_grid = np.exp(logpdf).reshape(X_grid.shape)
    # Negate so that high density maps to dark in the 'gray' colormap;
    # zorder=-1 keeps the image underneath anything else drawn on the axes.
    ax.pcolormesh(X_grid, Y_grid, -pdf_grid, cmap='gray', zorder=-1)
def plot_multiple(X_arr, y_arr, titles=None):
    """Plot several datasets and/or densities side by side in a grid.

    The grid has at most 5 columns; extra items wrap onto further rows.

    Parameters
    ----------
    X_arr : sequence
        Items to plot. An item with a ``score_samples`` attribute is drawn
        with ``plot_density``; anything else is drawn with ``plot_data``.
    y_arr : sequence
        One entry per item in ``X_arr``. For data items it is the color
        values given to ``plot_data``. For density items it is a dict of
        keyword arguments for ``plot_density``; the special key
        ``'get_prev_bounds'`` (truthy) means "reuse the axis limits of the
        previous subplot as ``bounds``". The dict is not modified.
    titles : sequence of str/None, or None
        One title per item; no titles when None.
    """
    n_plots = len(X_arr)
    if n_plots == 0:
        # Nothing to draw; also avoids a zero-column grid (division by zero).
        return
    if titles is None:
        titles = [None] * n_plots
    n_cols = int(min(5, n_plots))
    n_rows = (n_plots + n_cols - 1) // n_cols  # ceil(n_plots / n_cols)
    # squeeze=False keeps `axes` a 2D array even for a 1x1 grid, where
    # plt.subplots would otherwise return a bare Axes without `.ravel()`.
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False,
                             figsize=(PLOT_HEIGHT * n_cols, PLOT_HEIGHT * n_rows))
    axes = axes.ravel()
    fig.tight_layout()
    for i, (X, y, ax, title) in enumerate(zip(X_arr, y_arr, axes, titles)):
        if hasattr(X, 'score_samples'):
            # Special case of plotting a density instead.
            # Work on a copy so the caller's kwargs dict is left untouched.
            density_kwargs = dict(y)
            use_prev_bounds = density_kwargs.pop('get_prev_bounds', False)
            # The first subplot has no predecessor (axes[-1] would wrap to
            # the last, still empty, subplot), so keep the default bounds.
            if use_prev_bounds and i > 0:
                prev_ax = axes[i - 1]
                density_kwargs['bounds'] = [prev_ax.get_xlim(), prev_ax.get_ylim()]
            plot_density(X, ax=ax, title=title, **density_kwargs)
        else:
            plot_data(X, y, ax=ax, title=title)

In [None]:
# Build a small 2D toy dataset to experiment with
from ddl.datasets import make_toy_data
D = make_toy_data(data_name='concentric_circles', n_samples=500, random_state=0)
# Stretch the second axis by a factor of 2 to make the data a little more interesting
X = D.X * [1, 2]
y = D.y
plot_data(X, y)

## Destructors
Destructors are the building block of everything in the `ddl` library.
At their core, destructors are invertible transformations that project onto the unit hypercube.
Each destructor has, explicitly or *implicitly*, a corresponding probability density that is equal to the absolute value of the determinant of the destructor's Jacobian, i.e., `abs(det(Jacobian))`.

First, we will give an example of an independent destructor with an *explicit* density.
(Note that when fitting the destructor, the density is fitted first and then the destructor is fitted based on the density. Thus, the density does not need to be fitted a priori.)

In [None]:
# Independent destructor
from ddl.independent import IndependentDensity, IndependentDestructor
from ddl.univariate import ScipyUnivariateDensity
import scipy.stats

# Explicit density: every dimension is modeled independently with a
# univariate Gaussian/normal distribution
univariate_gaussian = ScipyUnivariateDensity(scipy_rv=scipy.stats.norm)
ind_density = IndependentDensity(univariate_estimators=univariate_gaussian)

# Destructor built on top of the explicit density above
ind_destructor = IndependentDestructor(ind_density)

# Fit on X and transform it in one step, then show data, fitted density and result
Z_ind = ind_destructor.fit_transform(X)
plot_multiple(
    [X, ind_destructor.density_, Z_ind],
    # The density panel reuses the axis limits of the data panel to its left
    [y, {'get_prev_bounds': True}, y],
    titles=['Before destructor', 'Density', 'After destructor'],
)

Note that the data has been transformed onto the unit square.

In [None]:
# Print out the mean and standard deviation of each estimated independent Gaussian
# NOTE(review): assumes each `rv_` stores its fitted (mean, standard deviation)
# as its first two `args` -- confirm against ScipyUnivariateDensity.
univ_densities = ind_destructor.density_.univariate_densities_
for i, u in enumerate(univ_densities):
    mu = u.rv_.args[0]
    sigma = u.rv_.args[1]
    print('Mean and standard deviation of variable %d: %g, %g' % (i, mu, sigma))

Note that the estimated standard deviation of the second variable is about twice that of the first, because the second axis was scaled by 2 when the dataset was created.
Thus, the transformation removes the oval shape of the distribution.

We now present a tree destructor to give another example of a destructor with an explicit density.
Because our tree destructor is a *canonical* destructor (i.e. one whose domain is the unit hypercube), we will use our previously destroyed data `Z_ind` instead of `X`.

In [None]:
from ddl.tree import TreeDensity, TreeDestructor, RandomTreeEstimator

# Explicit tree density backed by a random tree estimator
tree_estimator = RandomTreeEstimator(min_samples_leaf=3, random_state=0)
tree_density = TreeDensity(tree_estimator=tree_estimator)
tree_destructor = TreeDestructor(tree_density)

# Fit on the already-destroyed data Z_ind (which lies in the unit square)
Z_tree = tree_destructor.fit_transform(Z_ind)
plot_multiple(
    [Z_ind, tree_destructor.density_, Z_tree],
    # Empty kwargs: the density panel uses plot_density's default bounds
    [y, {}, y],
    titles=['Before tree destructor', 'Tree density', 'After tree destructor'],
)

Note that the tree density is piecewise constant.
(This implies that the destructor is piecewise linear, since the density is equal to `abs(det(Jacobian))`.)

## Composite destructors (Implicit density)
We will now introduce composite destructors which are simply destructors composed of multiple transformations.
The density for composite destructors is *implicit* based on the transformations in the composition.

*Note that the last transformation must be a true destructor but the other transformations merely need to be invertible.*

We will compose the above two destructors into a single composite destructor.

In [None]:
from sklearn.base import clone
from ddl.base import CompositeDestructor

# Chain fresh clones of the two destructors used above into one destructor
stages = [clone(ind_destructor), clone(tree_destructor)]
composite_destructor = CompositeDestructor(stages)
Z_tree_2 = composite_destructor.fit_transform(X)

# Last panel: result of the two separate transformations, for comparison
plot_multiple(
    [X, composite_destructor.density_, Z_tree_2, Z_tree],
    [y, {'get_prev_bounds': True}, y, y],
    titles=['Before composite destructor', 'Implicit density', 'After composite destructor',
            'Two separate transformations'],
)

Notice how the composite destructor produces the same result as applying the two transformations separately, one after the other (far right).

## Deep destructors
Finally, we will show how to construct a deep destructor that automatically builds up a composite destructor by appending destructors as needed.
(The deep destructor chooses the number of destructors/layers based on cross validation.)

A deep destructor starts with an initial destructor whose domain can be anything, but all other transformation layers must be canonical destructors (i.e., destructors whose domain is the unit hypercube).
Thus, we will use a simple initial destructor and then a tree destructor, which is a canonical destructor by construction.

In [None]:
from sklearn.base import clone
from ddl.deep import DeepDestructorCV

# Deep destructor: an initial destructor followed by canonical destructor
# layers, configured with cv=3
deep_destructor = DeepDestructorCV(
    init_destructor=clone(ind_destructor),
    canonical_destructor=clone(tree_destructor),
    cv=3,
)
Z_deep = deep_destructor.fit_transform(X)
print('Number of layers including initial destructor = %d' % len(deep_destructor.fitted_destructors_))

# The middle panel must show the density of the deep destructor itself
# (not that of the composite destructor from the previous section).
plot_multiple([X, deep_destructor.density_, Z_deep], [y, dict(get_prev_bounds=True), y],
              titles=['Before deep destructor', 'Implicit density', 'After deep destructor'])

We can inspect the local densities associated with each destructor/layer of the deep destructor:

In [None]:
n_layers = len(deep_destructor.fitted_destructors_)
# Transform up to layer i
Z = [deep_destructor.transform(X, partial_idx=np.arange(i + 1)) for i in range(n_layers)]
# Extract "local" densities corresponding to each layer
local_densities = [d.density_ for d in deep_destructor.fitted_destructors_]

# Visualize results: original data and global density first, then for each
# layer its local density followed by the data after that layer.
# The lists are interleaved with plain Python rather than via
# np.array(...).ravel().tolist(): mixing density objects and data arrays in
# one array is ragged, which newer NumPy versions reject with a ValueError.
X_arr = [X, deep_destructor.density_]
y_arr = [y, dict(get_prev_bounds=True)]
titles = ['Original data', 'Global deep density']
for i, (local_density, Z_i) in enumerate(zip(local_densities, Z)):
    X_arr += [local_density, Z_i]
    # Only the first local density reuses the limits of the preceding panel;
    # the remaining layers are plotted with plot_density's default bounds.
    y_arr += [dict(get_prev_bounds=True) if i == 0 else {}, y]
    titles += ['Local density L=%d' % (i + 1), 'Data after %dth destructor' % (i + 1)]
plot_multiple(X_arr, y_arr, titles=titles)

In [None]:
# TODO: show cross-validation (cv) results

# TODO: add a deep copula example