Merge pull request #80 from ealcobaca/bootstrap
Metafeature extraction with confidence intervals
ealcobaca committed May 19, 2020
2 parents 156efcc + a93711f commit 8ccd470
Showing 7 changed files with 495 additions and 38 deletions.
16 changes: 16 additions & 0 deletions README.md
@@ -163,6 +163,22 @@ ft = extractor.extract_from_model(
print(ft)
```

You can also extract your metafeatures with confidence intervals computed via bootstrap. Keep in mind that this method extracts each metafeature several times, so it can be very expensive, depending mainly on your data and on the number of metafeature extraction methods selected.

```python
# Extract metafeatures with confidence interval
mfe = MFE(features=["mean", "nr_cor_attr", "sd", "max"])
mfe.fit(X, y)

ft = mfe.extract_with_confidence(
sample_num=256,
confidence=0.99,
verbose=1,
)

print(ft)
```
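
Conceptually, a bootstrap confidence interval is obtained by resampling the fitted data with replacement, re-extracting the metafeatures on each resample, and taking percentiles of the resulting estimates. The snippet below is only a minimal sketch of that percentile idea for a single statistic; `bootstrap_ci` is a hypothetical helper written for illustration and does not reproduce pymfe's internal implementation.

```python
import numpy as np

def bootstrap_ci(values, statistic=np.mean, sample_num=256,
                 confidence=0.99, random_state=0):
    """Percentile bootstrap interval for one statistic (illustration only)."""
    rng = np.random.default_rng(random_state)
    estimates = np.empty(sample_num)
    for i in range(sample_num):
        # Resample with replacement and recompute the statistic
        resample = rng.choice(values, size=values.size, replace=True)
        estimates[i] = statistic(resample)
    alpha = 1.0 - confidence
    # Lower and upper percentile bounds of the bootstrap distribution
    return np.quantile(estimates, [alpha / 2.0, 1.0 - alpha / 2.0])

# Example: interval for the mean of the first attribute of X
# ci_low, ci_high = bootstrap_ci(np.asarray(X)[:, 0])
```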

## Documentation
We provide detailed documentation to guide you in using the pymfe library. You can find it at this [link](https://pymfe.readthedocs.io/en/latest/?badge=latest).
The documentation includes useful pages such as:
18 changes: 14 additions & 4 deletions pymfe/clustering.py
Expand Up @@ -358,14 +358,14 @@ def _calc_pairwise_norm_interclass_dist(

cls_inds = _utils.calc_cls_inds(y=y, classes=classes)

interclass_dists = np.array([
interclass_dists = [
cls._calc_normalized_interclass_dist(
N[cls_inds[id_cls_a, :], :],
N[cls_inds[id_cls_b, :], :],
dist_metric=dist_metric)
for id_cls_a, id_cls_b in itertools.combinations(
np.arange(cls_inds.shape[0]), 2)
])
]

return interclass_dists

@@ -567,7 +567,12 @@ def ft_vdu(
classes=classes,
cls_inds=cls_inds).max()

vdu = pairwise_norm_interclass_dist.min() / intraclass_dists.max()
_min_interclass_dist = np.inf

for vals in pairwise_norm_interclass_dist:
_min_interclass_dist = min(_min_interclass_dist, np.min(vals))

vdu = _min_interclass_dist / intraclass_dists.max()

return vdu

@@ -668,7 +673,12 @@ def ft_int(

norm_factor = 2.0 / (class_num * (class_num - 1.0))

return pairwise_norm_interclass_dist.sum() * norm_factor
_sum_interclass_dist = 0.0

for vals in pairwise_norm_interclass_dist:
_sum_interclass_dist += np.sum(vals)

return _sum_interclass_dist * norm_factor

@classmethod
def ft_sil(cls,
88 changes: 56 additions & 32 deletions pymfe/complexity.py
@@ -57,7 +57,6 @@ class MFEComplexity:
computed in module ``statistical`` can freely be used for any
precomputation or feature extraction method of module ``landmarking``).
"""

@classmethod
def precompute_complexity(cls, y: t.Optional[np.ndarray] = None,
**kwargs) -> t.Dict[str, t.Any]:
@@ -170,25 +169,39 @@ def _calc_ovo_comb(classes: np.ndarray) -> t.List[t.Tuple]:
return np.asarray(list(ovo_comb), dtype=int)

@staticmethod
def _calc_minmax(N: np.ndarray, cls_1: np.ndarray,
cls_2: np.ndarray) -> np.ndarray:
def _calc_minmax(N_cls_1: np.ndarray, N_cls_2: np.ndarray) -> np.ndarray:
"""Compute the minimum of the maximum values per class for all feat.
The index i indicate the minmax of feature i.
"""
minmax = np.min(
(np.max(N[cls_1, :], axis=0), np.max(N[cls_2, :], axis=0)), axis=0)
if N_cls_1.size == 0 or N_cls_2.size == 0:
# Note: if either class has no examples, the 'overlapping region'
# becomes ill-defined. Thus, returning '-np.inf' here, alongside
# '_calc_maxmin()' returning '+np.inf', guarantees that no
# example will fall into the (undefined) 'overlapping region'.
return -np.inf

minmax = np.min((np.max(N_cls_1, axis=0), np.max(N_cls_2, axis=0)),
axis=0)

return minmax

@staticmethod
def _calc_maxmin(N: np.ndarray, cls_1: np.ndarray,
cls_2: np.ndarray) -> np.ndarray:
def _calc_maxmin(N_cls_1: np.ndarray, N_cls_2: np.ndarray) -> np.ndarray:
"""Compute the maximum of the minimum values per class for all feat.
The index i indicate the maxmin of the ith feature.
"""
maxmin = np.max(
(np.min(N[cls_1, :], axis=0), np.min(N[cls_2, :], axis=0)), axis=0)
if N_cls_1.size == 0 or N_cls_2.size == 0:
# Note: if either class has no examples, the 'overlapping region'
# becomes ill-defined. Thus, returning '+np.inf' here, alongside
# '_calc_minmax()' returning '-np.inf', guarantees that no
# example will fall into the (undefined) 'overlapping region'.
return np.inf

maxmin = np.max((np.min(N_cls_1, axis=0), np.min(N_cls_2, axis=0)),
axis=0)

return maxmin

@staticmethod
@@ -261,21 +274,21 @@ def ft_f3(
f3 = np.zeros(ovo_comb.shape[0], dtype=float)

for ind, (cls_id_1, cls_id_2) in enumerate(ovo_comb):
cls_1 = cls_inds[cls_id_1, :]
cls_2 = cls_inds[cls_id_2, :]
N_cls_1 = N[cls_inds[cls_id_1, :], :]
N_cls_2 = N[cls_inds[cls_id_2, :], :]

ind_less_overlap, feat_overlap_num, _ = cls._calc_overlap(
N=N,
minmax=cls._calc_minmax(N=N, cls_1=cls_1, cls_2=cls_2),
maxmin=cls._calc_maxmin(N=N, cls_1=cls_1, cls_2=cls_2))
minmax=cls._calc_minmax(N_cls_1, N_cls_2),
maxmin=cls._calc_maxmin(N_cls_1, N_cls_2))

f3[ind] = (feat_overlap_num[ind_less_overlap] /
(class_freqs[cls_id_1] + class_freqs[cls_id_2]))

# The measure is computed in the literature using the mean. However, it
# is formulated here as a meta-feature. Therefore, the post-processing
# should be used to get the mean and other measures as well.
return np.asarray(f3)
return f3

@classmethod
def ft_f4(
@@ -332,42 +345,53 @@ def ft_f4(
f4 = np.zeros(ovo_comb.shape[0], dtype=float)

for ind, (cls_id_1, cls_id_2) in enumerate(ovo_comb):
cls_subset_intersec = np.logical_or(cls_inds[cls_id_1, :],
cls_inds[cls_id_2, :])
cls_subset_union = np.logical_or(cls_inds[cls_id_1, :],
cls_inds[cls_id_2, :])

cls_1 = cls_inds[cls_id_1, cls_subset_union]
cls_2 = cls_inds[cls_id_2, cls_subset_union]
N_subset = N[cls_subset_union, :]

# Search only on remaining features, without copying any data
valid_attr_inds = np.arange(N_subset.shape[1])
N_view = N_subset[:, valid_attr_inds]

cls_1 = cls_inds[cls_id_1, cls_subset_intersec]
cls_2 = cls_inds[cls_id_2, cls_subset_intersec]
N_subset = N[cls_subset_intersec, :]
while N_view.size > 0:
N_cls_1, N_cls_2 = N_view[cls_1, :], N_view[cls_2, :]

while N_subset.size > 0:
# True if the example is in the overlapping region
# Note: 'feat_overlapped_region' is a boolean vector with
# True values if the example is in the overlapping region
ind_less_overlap, _, feat_overlapped_region = (
cls._calc_overlap(
N=N_subset,
minmax=cls._calc_minmax(N_subset, cls_1, cls_2),
maxmin=cls._calc_maxmin(N_subset, cls_1, cls_2)))
N=N_view,
minmax=cls._calc_minmax(N_cls_1, N_cls_2),
maxmin=cls._calc_maxmin(N_cls_1, N_cls_2)))

# boolean that if True, this example is in the overlapping
# Boolean that if True, this example is in the overlapping
# region
overlapped_region = feat_overlapped_region[:, ind_less_overlap]

# removing the non overlapped features
# Removing the non-overlapping instances
N_subset = N_subset[overlapped_region, :]
cls_1 = cls_1[overlapped_region]
cls_2 = cls_2[overlapped_region]

# removing the most efficient feature
N_subset = np.delete(N_subset, ind_less_overlap, axis=1)
# Removing the most efficient feature
# Note: previous versions used to delete it directly from data
# 'N_subset', but that procedure takes up much more memory
# because each 'np.delete' operation creates a new dataset.
valid_attr_inds = np.delete(valid_attr_inds, ind_less_overlap)
N_view = N_subset[:, valid_attr_inds]

subset_size = N_subset.shape[0]

f4[ind] = subset_size / (
class_freqs[cls_id_1] + class_freqs[cls_id_2])
f4[ind] = subset_size / (class_freqs[cls_id_1] +
class_freqs[cls_id_2])

# The measure is computed in the literature using the mean. However, it
# is formulated here as a meta-feature. Therefore, the post-processing
# should be used to get the mean and other measures as well.
return np.asarray(f4)
return f4

@classmethod
def ft_l2(cls,
@@ -450,7 +474,7 @@ def ft_l2(cls,
# The measure is computed in the literature using the mean. However, it
# is formulated here as a meta-feature. Therefore, the post-processing
# should be used to get the mean and other measures as well.
return np.asarray(l2)
return l2

@classmethod
def ft_n1(cls, N: np.ndarray, y: np.ndarray,
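
As the in-code comments in `ft_f3` and `ft_f4` note, these measures are computed once per class pair and are meant to be aggregated by pymfe's post-processing. A minimal usage sketch, assuming the public API pattern already shown in the README above (the `load_iris` dataset is only an illustrative choice):

```python
from sklearn.datasets import load_iris
from pymfe.mfe import MFE

X, y = load_iris(return_X_y=True)

# 'f3' and 'f4' are returned per class pair; the 'mean' and 'sd'
# summary functions aggregate the per-pair values into single numbers.
mfe = MFE(features=["f3", "f4"], summary=["mean", "sd"])
mfe.fit(X, y)
names, values = mfe.extract()
print(dict(zip(names, values)))
```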
