BUG Don't calculate levels if we aren't expanding

If a user has excluded a column because they know it has too many levels, they shouldn't then get a warning about that column. Make sure we don't count the levels in columns which are going to be dropped anyway. This fix also makes the `_cols_to_expand` attribute more meaningful: it no longer contains columns which we aren't going to expand.
civisanalytics · May 16, 2018 · a11b440 · a11b440
1 parent c231cf6
commit a11b440
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
-
+### Fixed
+- In ``DataFrameETL``, don't check for levels to expand in columns which
+  are slated to be dropped. This will avoid raising a warning for too
+  many levels in a column if the user has intentionally excluded
+  that column (#39).
 
 ## [0.1.8] - 2018-04-19
 ### Fixed

diff --git a/civismlext/preprocessing.py b/civismlext/preprocessing.py
@@ -241,6 +241,8 @@ def fit(self, X, y=None):
             else:
                 self._cols_to_expand = [c for c in self.cols_to_expand if
                                         c in X.columns]
+            self._cols_to_expand = [c for c in self._cols_to_expand if
+                                    c not in self._cols_to_drop]
             log.debug("There are %d column(s) to expand.",
                       len(self._cols_to_expand))
             # Update sentinels if the defaults are in the dataframe

diff --git a/civismlext/test/test_preprocessing.py b/civismlext/test/test_preprocessing.py
@@ -291,6 +291,17 @@ def test_create_col_names_numeric(data_raw):
     assert unexpanded == ['pid', 'fruits', 'age']
 
 
+def test_dropped_cols_no_levels(data_raw):
+    # If the user requests that we drop a column, we shouldn't create
+    # levels for it. That risks raising a warning for too many levels
+    # when it doesn't matter.
+    expander = DataFrameETL(cols_to_drop=['pid'])
+    expander.fit(data_raw)
+
+    assert 'animal' in expander.levels_
+    assert 'pid' not in expander.levels_
+
+
 def test_expand_col(data_raw):
     expander = DataFrameETL(cols_to_drop=['fruits'],
                             dummy_na=True,