From a11b4405ed54326726ac9b307fe0ce1702624899 Mon Sep 17 00:00:00 2001
From: Stephen Hoover <shoover@civisanalytics.com>
Date: Wed, 16 May 2018 18:01:13 -0500
Subject: [PATCH] BUG Don't calculate levels if we aren't expanding

If a user has excluded a column because they know it has too many levels, they shouldn't then get a warning about that column. Make sure we don't count the levels in columns which are going to be dropped anyway. This fix also makes the `_cols_to_expand` attribute more meaningful -- it no longer contains columns which we aren't going to expand.
---
 CHANGELOG.md                          |  6 +++++-
 civismlext/preprocessing.py           |  2 ++
 civismlext/test/test_preprocessing.py | 11 +++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 59f1521..5dbc25e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,7 +5,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
-
+### Fixed
+- In ``DataFrameETL``, don't check for levels to expand in columns which
+  are slated to be dropped. This will avoid raising a warning for too
+  many levels in a column if the user has intentionally excluded
+  that column (#39).
 
 ## [0.1.8] - 2018-04-19
 ### Fixed
diff --git a/civismlext/preprocessing.py b/civismlext/preprocessing.py
index 24476dc..fa2e8ef 100644
--- a/civismlext/preprocessing.py
+++ b/civismlext/preprocessing.py
@@ -241,6 +241,8 @@ def fit(self, X, y=None):
             else:
                 self._cols_to_expand = [c for c in self.cols_to_expand if
                                         c in X.columns]
+            self._cols_to_expand = [c for c in self._cols_to_expand if
+                                    c not in self._cols_to_drop]
             log.debug("There are %d column(s) to expand.",
                       len(self._cols_to_expand))
             # Update sentinels if the defaults are in the dataframe
diff --git a/civismlext/test/test_preprocessing.py b/civismlext/test/test_preprocessing.py
index 15c2fef..3783e1f 100644
--- a/civismlext/test/test_preprocessing.py
+++ b/civismlext/test/test_preprocessing.py
@@ -291,6 +291,17 @@ def test_create_col_names_numeric(data_raw):
     assert unexpanded == ['pid', 'fruits', 'age']
 
 
+def test_dropped_cols_no_levels(data_raw):
+    # If the user requests that we drop a column, we shouldn't create
+    # levels for it. That risks raising a warning for too many levels
+    # when it doesn't matter.
+    expander = DataFrameETL(cols_to_drop=['pid'])
+    expander.fit(data_raw)
+
+    assert 'animal' in expander.levels_
+    assert 'pid' not in expander.levels_
+
+
 def test_expand_col(data_raw):
     expander = DataFrameETL(cols_to_drop=['fruits'],
                             dummy_na=True,