Skip to content

Commit

Permalink
EXA: data-partitioning
Browse files Browse the repository at this point in the history
  • Loading branch information
christianbrodbeck committed Jul 25, 2021
1 parent b19221b commit 9de1533
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def section_order(self):
'deconvolution': [
'trf_intro.py',
'mtrf.py',
'partitions.py',
'epoch_impulse.py',
],
})
Expand Down
5 changes: 4 additions & 1 deletion eelbrain/plot/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Union

from matplotlib.patches import Rectangle
from matplotlib.ticker import MaxNLocator
import numpy

from .._trf.shared import Splits, split_data
Expand Down Expand Up @@ -45,7 +46,8 @@ def __init__(
if colors is None:
colors = colors_for_oneway(attrs, unambiguous=[6, 3, 5])

layout = Layout(1, 16/9, 2, **kwargs)
h_default = max(2, 0.5 + 0.15 * len(splits.splits))
layout = Layout(1, 16/9, h_default, **kwargs)
EelFigure.__init__(self, None, layout)
ax = self.figure.axes[0]

Expand All @@ -68,6 +70,7 @@ def __init__(
labels = {key: labels[key] for key in handles}
ax.set_ylabel('Split')
ax.set_ylim(-0.5, len(splits.splits)-0.5)
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_xlabel('Sample')
ax.set_xlim(splits.segments[0, 0], splits.segments[-1, 1])
LegendMixin.__init__(self, legend, handles, labels)
Expand Down
53 changes: 53 additions & 0 deletions examples/deconvolution/partitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: sphinx
#       format_version: '1.1'
#     jupytext_version: 1.11.3
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

"""
.. _exa-data_split:
.. currentmodule:: eelbrain

Data partitions for boosting
============================
The boosting algorithm can use two different forms of cross-validation: cross-validation as stopping criterion (always on), and cross-validation for model evaluation (optional). This requires partitioning the data into different segments. The :func:`eelbrain.plot.preview_partitions` function is for exploring the effect of different parameters on the way the data is split.

Validation
==========
During boosting, every training step consists in modifying one element of the kernel/TRF. After every such step, the new TRF is evaluated against the validation data. For continuous data (without :class:`Case` dimension), the default is to split the data into 10 equal-length segments, and perform 10 model fits, each using one of the segments as validation set. In the plots below, each "Split" shown on the y-axis corresponds to a separate run of the boosting algorithm. The results returned by the :func:`boosting` function would be based on the average TRF of those 10 runs.
"""
# sphinx_gallery_thumbnail_number = 6
from eelbrain import *


p = plot.preview_partitions()

###############################################################################
# The number of partitions can be controlled with the `partitions` parameter:

p5 = plot.preview_partitions(partitions=5)
p7 = plot.preview_partitions(partitions=7)  # renamed from `p2`: it previews 7 partitions, not 2

###############################################################################
# For data with multiple trials (data with a :class:`Case` dimension), the function attempts to use trials evenly across time:

p = plot.preview_partitions(20, partitions=5)
p = plot.preview_partitions(20, partitions=2)

###############################################################################
# Testing
# -------
# Testing the result of a model fit with cross-validation requires data that was never used during training. Testing with cross-validation is enabled in the :func:`boosting` function by setting ``test=True``. When testing is enabled, each data segment is used in turn as testing segment. For each testing segment, the remaining segments are used in different runs as training and validation data. The results of those runs are then averaged to predict responses in the testing data. This nested loop means that the number of boosting runs can get large quickly when using many partitions, so the default is to use just four partitions:

p = plot.preview_partitions(test=True)

""
# use `test=True` (not `1`) for consistency with the call above
p = plot.preview_partitions(20, partitions=5, test=True)

0 comments on commit 9de1533

Please sign in to comment.