Finish the combine method

glemaitre · Jun 29, 2016 · ef240ba
1 parent 390e139
commit ef240ba
Show file tree

Hide file tree

Showing 10 changed files with 34 additions and 148 deletions.
diff --git a/imblearn/combine/smote_enn.py b/imblearn/combine/smote_enn.py
@@ -22,8 +22,11 @@ class SMOTEENN(SamplerMixin):
         number of samples in the minority class over the the number of
         samples in the majority class.
 
-    random_state : int or None, optional (default=None)
-        Seed for random number generation.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by np.random.
 
     verbose : bool, optional (default=True)
         Whether or not to print information about the processing.
@@ -60,15 +63,6 @@ class SMOTEENN(SamplerMixin):
 
     Attributes
     ----------
-    ratio : str or float
-        If 'auto', the ratio will be defined automatically to balance
-        the dataset. Otherwise, the ratio is defined as the
-        number of samples in the minority class over the the number of
-        samples in the majority class.
-
-    random_state : int or None
-        Seed for random number generation.
-
     min_c_ : str or int
         The identifier of the minority class.
 
@@ -100,75 +94,21 @@ def __init__(self, ratio='auto', random_state=None, verbose=True,
                  k=5, m=10, out_step=0.5, kind_smote='regular',
                  size_ngh=3, kind_enn='all', n_jobs=-1, **kwargs):
 
-        """Initialise the SMOTE ENN object.
-
-        Parameters
-        ----------
-        ratio : str or float, optional (default='auto')
-            If 'auto', the ratio will be defined automatically to balance
-            the dataset. Otherwise, the ratio is defined as the
-            number of samples in the minority class over the the number of
-            samples in the majority class.
-
-        random_state : int or None, optional (default=None)
-            Seed for random number generation.
-
-        verbose : bool, optional (default=True)
-            Whether or not to print information about the processing.
-
-        k : int, optional (default=5)
-            Number of nearest neighbours to used to construct synthetic
-            samples.
-
-        m : int, optional (default=10)
-            Number of nearest neighbours to use to determine if a minority
-            sample is in danger.
-
-        out_step : float, optional (default=0.5)
-            Step size when extrapolating.
-
-        kind_smote : str, optional (default='regular')
-            The type of SMOTE algorithm to use one of the following
-            options: 'regular', 'borderline1', 'borderline2', 'svm'.
-
-        size_ngh : int, optional (default=3)
-            Size of the neighbourhood to consider to compute the average
-            distance to the minority point samples.
-
-        kind_sel : str, optional (default='all')
-            Strategy to use in order to exclude samples.
-
-            - If 'all', all neighbours will have to agree with the samples of
-            interest to not be excluded.
-            - If 'mode', the majority vote of the neighbours will be used in
-            order to exclude a sample.
-
-        n_jobs : int, optional (default=-1)
-            The number of threads to open if possible.
-
-        Returns
-        -------
-        None
-
-        """
-        super(SMOTEENN, self).__init__(ratio=ratio, random_state=random_state,
+        super(SMOTEENN, self).__init__(ratio=ratio,
                                        verbose=verbose)
-
+        self.random_state = random_state
         self.k = k
         self.m = m
         self.out_step = out_step
         self.kind_smote = kind_smote
+        self.size_ngh = size_ngh
+        self.kind_enn = kind_enn
         self.n_jobs = n_jobs
         self.kwargs = kwargs
-
         self.sm = SMOTE(ratio=self.ratio, random_state=self.random_state,
                         verbose=self.verbose, k=self.k, m=self.m,
                         out_step=self.out_step, kind=self.kind_smote,
                         n_jobs=self.n_jobs, **self.kwargs)
-
-        self.size_ngh = size_ngh
-        self.kind_enn = kind_enn
-
         self.enn = EditedNearestNeighbours(random_state=self.random_state,
                                            verbose=self.verbose,
                                            size_ngh=self.size_ngh,
@@ -192,8 +132,6 @@ def fit(self, X, y):
             Return self.
 
         """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
 
         super(SMOTEENN, self).fit(X, y)
 
@@ -202,7 +140,7 @@ def fit(self, X, y):
 
         return self
 
-    def sample(self, X, y):
+    def _sample(self, X, y):
         """Resample the dataset.
 
         Parameters
@@ -222,10 +160,6 @@ def sample(self, X, y):
             The corresponding label of `X_resampled`
 
         """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
-
-        super(SMOTEENN, self).sample(X, y)
 
         # Transform using SMOTE
         X, y = self.sm.sample(X, y)

diff --git a/imblearn/combine/smote_tomek.py b/imblearn/combine/smote_tomek.py
@@ -23,8 +23,11 @@ class SMOTETomek(SamplerMixin):
         number of samples in the minority class over the the number of
         samples in the majority class.
 
-    random_state : int or None, optional (default=None)
-        Seed for random number generation.
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, random_state is the seed used by the random number generator;
+        If RandomState instance, random_state is the random number generator;
+        If None, the random number generator is the RandomState instance used
+        by np.random.
 
     verbose : bool, optional (default=True)
         Whether or not to print information about the processing.
@@ -61,15 +64,6 @@ class SMOTETomek(SamplerMixin):
 
     Attributes
     ----------
-    ratio : str or float
-        If 'auto', the ratio will be defined automatically to balance
-        the dataset. Otherwise, the ratio is defined as the
-        number of samples in the minority class over the the number of
-        samples in the majority class.
-
-    random_state : int or None
-        Seed for random number generation.
-
     min_c_ : str or int
         The identifier of the minority class.
 
@@ -99,62 +93,18 @@ class SMOTETomek(SamplerMixin):
     def __init__(self, ratio='auto', random_state=None, verbose=True,
                  k=5, m=10, out_step=0.5, kind_smote='regular',
                  n_jobs=-1, **kwargs):
-
-        """Initialise the SMOTE Tomek links object.
-
-        Parameters
-        ----------
-        ratio : str or float, optional (default='auto')
-            If 'auto', the ratio will be defined automatically to balance
-            the dataset. Otherwise, the ratio is defined as the
-            number of samples in the minority class over the the number of
-            samples in the majority class.
-
-        random_state : int or None, optional (default=None)
-            Seed for random number generation.
-
-        verbose : bool, optional (default=True)
-            Whether or not to print information about the processing.
-
-        k : int, optional (default=5)
-            Number of nearest neighbours to used to construct synthetic
-            samples.
-
-        m : int, optional (default=10)
-            Number of nearest neighbours to use to determine if a minority
-            sample is in danger.
-
-        out_step : float, optional (default=0.5)
-            Step size when extrapolating.
-
-        kind_smote : str, optional (default='regular')
-            The type of SMOTE algorithm to use one of the following
-            options: 'regular', 'borderline1', 'borderline2', 'svm'.
-
-        n_jobs : int, optional (default=-1)
-            Number of threads to run the algorithm when it is possible.
-
-        Returns
-        -------
-        None
-
-        """
-        super(SMOTETomek, self).__init__(ratio=ratio,
-                                         random_state=random_state,
-                                         verbose=verbose)
-
+        super(SMOTETomek, self).__init__(ratio=ratio, verbose=verbose)
+        self.random_state = random_state
         self.k = k
         self.m = m
         self.out_step = out_step
         self.kind_smote = kind_smote
         self.n_jobs = n_jobs
         self.kwargs = kwargs
-
         self.sm = SMOTE(ratio=self.ratio, random_state=self.random_state,
                         verbose=self.verbose, k=self.k, m=self.m,
                         out_step=self.out_step, kind=self.kind_smote,
                         n_jobs=self.n_jobs, **self.kwargs)
-
         self.tomek = TomekLinks(random_state=self.random_state,
                                 verbose=self.verbose)
 
@@ -175,8 +125,6 @@ def fit(self, X, y):
             Return self.
 
         """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
 
         super(SMOTETomek, self).fit(X, y)
 
@@ -185,7 +133,7 @@ def fit(self, X, y):
 
         return self
 
-    def sample(self, X, y):
+    def _sample(self, X, y):
         """Resample the dataset.
 
         Parameters
@@ -205,10 +153,6 @@ def sample(self, X, y):
             The corresponding label of `X_resampled`
 
         """
-        # Check the consistency of X and y
-        X, y = check_X_y(X, y)
-
-        super(SMOTETomek, self).sample(X, y)
 
         # Transform using SMOTE
         X, y = self.sm.sample(X, y)

diff --git a/imblearn/combine/tests/data/smote_enn_reg_x.npy b/imblearn/combine/tests/data/smote_enn_reg_x.npy
diff --git a/imblearn/combine/tests/data/smote_enn_reg_x_05.npy b/imblearn/combine/tests/data/smote_enn_reg_x_05.npy
diff --git a/imblearn/combine/tests/data/smote_enn_reg_y.npy b/imblearn/combine/tests/data/smote_enn_reg_y.npy
diff --git a/imblearn/combine/tests/data/smote_enn_reg_y_05.npy b/imblearn/combine/tests/data/smote_enn_reg_y_05.npy
diff --git a/imblearn/combine/tests/data/smote_tomek_reg_x.npy b/imblearn/combine/tests/data/smote_tomek_reg_x.npy
diff --git a/imblearn/combine/tests/data/smote_tomek_reg_x_05.npy b/imblearn/combine/tests/data/smote_tomek_reg_x_05.npy
diff --git a/imblearn/combine/tests/test_smote_enn.py b/imblearn/combine/tests/test_smote_enn.py
@@ -33,19 +33,23 @@ def test_senn_bad_ratio():
 
     # Define a negative ratio
     ratio = -1.0
-    assert_raises(ValueError, SMOTEENN, ratio=ratio)
+    smote = SMOTEENN(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
     # Define a ratio greater than 1
     ratio = 100.0
-    assert_raises(ValueError, SMOTEENN, ratio=ratio)
+    smote = SMOTEENN(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
     # Define ratio as an unknown string
     ratio = 'rnd'
-    assert_raises(ValueError, SMOTEENN, ratio=ratio)
+    smote = SMOTEENN(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
     # Define ratio as a list which is not supported
     ratio = [.5, .5]
-    assert_raises(ValueError, SMOTEENN, ratio=ratio)
+    smote = SMOTEENN(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
 
 def test_smote_fit_single_class():

diff --git a/imblearn/combine/tests/test_smote_tomek.py b/imblearn/combine/tests/test_smote_tomek.py
@@ -33,19 +33,23 @@ def test_smote_bad_ratio():
 
     # Define a negative ratio
     ratio = -1.0
-    assert_raises(ValueError, SMOTETomek, ratio=ratio)
+    smote = SMOTETomek(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
     # Define a ratio greater than 1
     ratio = 100.0
-    assert_raises(ValueError, SMOTETomek, ratio=ratio)
+    smote = SMOTETomek(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
     # Define ratio as an unknown string
     ratio = 'rnd'
-    assert_raises(ValueError, SMOTETomek, ratio=ratio)
+    smote = SMOTETomek(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
     # Define ratio as a list which is not supported
     ratio = [.5, .5]
-    assert_raises(ValueError, SMOTETomek, ratio=ratio)
+    smote = SMOTETomek(ratio=ratio)
+    assert_raises(ValueError, smote.fit, X, Y)
 
 
 def test_smote_fit_single_class():