Must set index when concatenating dataframes (#186)

georgian-io-archive · Dec 18, 2019 · 1693466 · 1693466
1 parent e6fd2fe
commit 1693466
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 29 deletions.
diff --git a/foreshadow/concrete/internals/cleaners/base.py b/foreshadow/concrete/internals/cleaners/base.py
@@ -177,18 +177,6 @@ def transform(self, X, y=None):
 
         """
         X = check_df(X, single_column=True)
-        # Problem:
-        # I can use .apply to perform all these transformations and that
-        # works beautifully, except when I want to define a funtion that
-        # will use the pandas.series.str.split operation. In which case,
-        # the .apply fails and I don't know why.
-
-        # I need each function to accept the row as an argument so that we
-        # can inspect how much of the text was matched (for determining if
-        # it should be used). however, doing this means I need to iterate
-        # over each row for a given column on my own, which requires me to
-        # leave
-
         logging.info("Starting cleaning rows...")
         out = X[X.columns[0]].apply(self.transform_row, return_tuple=False)
         logging.info("Ending cleaning rows...")
@@ -213,13 +201,15 @@ def transform(self, X, y=None):
                 )
             columns = self.output_columns
             if columns is None:
-                columns = [
-                    X.columns[0] + str(c) for c in range(len(out.iloc[0]))
-                ]
                 # by default, pandas would have given a unique integer to
                 # each column, instead, we keep the previous column name and
                 # add that integer.
-            X = pd.DataFrame([*out.values], columns=columns)
+                columns = [
+                    X.columns[0] + str(c) for c in range(len(out.iloc[0]))
+                ]
+            # We need to set the index. Otherwise, the new data frame might
+            # misalign with other columns.
+            X = pd.DataFrame([*out.values], index=out.index, columns=columns)
         elif any(
             [isinstance(out.iloc[i], (dict)) for i in range(out.shape[0])]
         ):  # out are dicts ==  named new columns

diff --git a/foreshadow/concrete/internals/cleaners/datetimes.py b/foreshadow/concrete/internals/cleaners/datetimes.py
@@ -46,7 +46,7 @@ def make_list_of_three(x):
 
 
 class YYYYMMDDDateCleaner(BaseCleaner):
-    """Clean financial data.
+    """Clean DateTime data.
 
     Note: requires pandas input dataframes.
 

diff --git a/foreshadow/concrete/internals/cleaners/drop.py b/foreshadow/concrete/internals/cleaners/drop.py
@@ -1,8 +1,8 @@
 """DropCleaner which detects when to drop cleaner."""
-import re
 
 import pandas as pd
 
+from foreshadow.metrics import MetricWrapper, regex_rows
 from foreshadow.utils.validation import check_df
 
 from .base import BaseCleaner
@@ -19,11 +19,8 @@ def drop_transform(text):
         Otherwise: None, original text.
 
     """
-    # TODO may want to optimize, no need for regex.
-    regex = "^$"
-    text = str(text)
-    res = re.search(regex, text)
-    if res is not None:
+    if text is None or text == "" or pd.isna(text):
+        # Check pd.isna last (np.isnan would raise on an empty string).
         res = 1
     else:
         res = 0
@@ -39,7 +36,30 @@ class DropCleaner(BaseCleaner):
 
     def __init__(self):
         transformations = [drop_transform]
-        super().__init__(transformations)
+        super().__init__(
+            transformations,
+            confidence_computation={MetricWrapper(regex_rows): 1},
+        )
+
+    def metric_score(self, X: pd.DataFrame) -> float:
+        """Compute the score for this cleaner using confidence_computation.
+
+        confidence_computation is passed through init for each subclass.
+        The confidence determines which cleaner/flattener is picked in an
+        OVR fashion.
+
+        Args:
+            X: input DataFrame.
+
+        Returns:
+            float: confidence value.
+
+        """
+        score = super().metric_score(X)
+        if score < 0.9:
+            # only drop a column if at least 90% of the data is NaN
+            return 0
+        return score
 
     def transform(self, X, y=None):
         """Clean string columns.

diff --git a/foreshadow/parallelprocessor.py b/foreshadow/parallelprocessor.py
@@ -509,17 +509,17 @@ def fit_transform(self, X, y=None, **fit_params):
             Xs += (Xo,)
 
         # Concatenate results
-        Xs = pd.concat(Xs, axis=1)
+        result = pd.concat(Xs, axis=1)
 
         # Convert multi index to single index if specified
         if self.collapse_index:
             try:
-                Xs.columns = Xs.columns.droplevel()
-                Xs.index.name = None
-                Xs.columns.name = None
+                result.columns = result.columns.droplevel()
+                result.index.name = None
+                result.columns.name = None
             except AttributeError:  # TODO figure out why this is needed
                 pass
-        return Xs
+        return result
 
     def inverse_transform(self, X, **inverse_params):
         """Perform both a fit and a transform.

diff --git a/foreshadow/tests/test_concrete/__init__.py b/foreshadow/tests/test_concrete/__init__.py