Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Must set index when concatenating dataframes (#186)
Browse files Browse the repository at this point in the history
  • Loading branch information
jzhang-gp committed Dec 18, 2019
1 parent e6fd2fe commit 1693466
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 29 deletions.
22 changes: 6 additions & 16 deletions foreshadow/concrete/internals/cleaners/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,18 +177,6 @@ def transform(self, X, y=None):
"""
X = check_df(X, single_column=True)
# Problem:
# I can use .apply to perform all these transformations and that
# works beautifully, except when I want to define a function that
# will use the pandas.series.str.split operation. In which case,
# the .apply fails and I don't know why.

# I need each function to accept the row as an argument so that we
# can inspect how much of the text was matched (for determining if
# it should be used). However, doing this means I need to iterate
# over each row for a given column on my own, which requires me to
# leave

logging.info("Starting cleaning rows...")
out = X[X.columns[0]].apply(self.transform_row, return_tuple=False)
logging.info("Ending cleaning rows...")
Expand All @@ -213,13 +201,15 @@ def transform(self, X, y=None):
)
columns = self.output_columns
if columns is None:
columns = [
X.columns[0] + str(c) for c in range(len(out.iloc[0]))
]
# by default, pandas would have given a unique integer to
# each column, instead, we keep the previous column name and
# add that integer.
X = pd.DataFrame([*out.values], columns=columns)
columns = [
X.columns[0] + str(c) for c in range(len(out.iloc[0]))
]
# We need to set the index. Otherwise, the new data frame might
# misalign with other columns.
X = pd.DataFrame([*out.values], index=out.index, columns=columns)
elif any(
[isinstance(out.iloc[i], (dict)) for i in range(out.shape[0])]
): # out are dicts == named new columns
Expand Down
2 changes: 1 addition & 1 deletion foreshadow/concrete/internals/cleaners/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def make_list_of_three(x):


class YYYYMMDDDateCleaner(BaseCleaner):
"""Clean financial data.
"""Clean DateTime data.
Note: requires pandas input dataframes.
Expand Down
34 changes: 27 additions & 7 deletions foreshadow/concrete/internals/cleaners/drop.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""DropCleaner which detects when to drop cleaner."""
import re

import pandas as pd

from foreshadow.metrics import MetricWrapper, regex_rows
from foreshadow.utils.validation import check_df

from .base import BaseCleaner
Expand All @@ -19,11 +19,8 @@ def drop_transform(text):
Otherwise: None, original text.
"""
# TODO may want to optimize, no need for regex.
regex = "^$"
text = str(text)
res = re.search(regex, text)
if res is not None:
if text is None or text == "" or pd.isna(text):
# Check pd.isna last; a NaN check such as np.isnan fails on an empty string.
res = 1
else:
res = 0
Expand All @@ -39,7 +36,30 @@ class DropCleaner(BaseCleaner):

def __init__(self):
transformations = [drop_transform]
super().__init__(transformations)
super().__init__(
transformations,
confidence_computation={MetricWrapper(regex_rows): 1},
)

def metric_score(self, X: pd.DataFrame) -> float:
"""Compute the score for this cleaner using confidence_computation.
confidence_computation is passed through init for each subclass.
The confidence determines which cleaner/flattener is picked in an
OVR fashion.
Args:
X: input DataFrame.
Returns:
float: confidence value.
"""
score = super().metric_score(X)
if score < 0.9:
# only drop a column if at least 90% of the data is NaN
return 0
return score

def transform(self, X, y=None):
"""Clean string columns.
Expand Down
10 changes: 5 additions & 5 deletions foreshadow/parallelprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,17 +509,17 @@ def fit_transform(self, X, y=None, **fit_params):
Xs += (Xo,)

# Concatenate results
Xs = pd.concat(Xs, axis=1)
result = pd.concat(Xs, axis=1)

# Convert multi index to single index if specified
if self.collapse_index:
try:
Xs.columns = Xs.columns.droplevel()
Xs.index.name = None
Xs.columns.name = None
result.columns = result.columns.droplevel()
result.index.name = None
result.columns.name = None
except AttributeError: # TODO figure out why this is needed
pass
return Xs
return result

def inverse_transform(self, X, **inverse_params):
"""Perform both a fit and a transform.
Expand Down
Empty file.

0 comments on commit 1693466

Please sign in to comment.