Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Merge pull request #24 from georgianpartners/issue_7
Browse files Browse the repository at this point in the history
Fixed OneHotEncoder to use version from sklearn-contrib/category_encoders
  • Loading branch information
jichaogp committed Nov 30, 2018
2 parents a9b2059 + 2da32a9 commit cd7c5ad
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 27 deletions.
22 changes: 14 additions & 8 deletions foreshadow/tests/test_transformers/test_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,28 +73,34 @@ def test_transformer_fancy_impute_invalid_params():

  def test_transformer_onehotencoder_fit_transform():
  import pandas as pd
- from foreshadow.transformers.internals import OneHotEncoder
+ from foreshadow.transformers.externals import OneHotEncoder

  df = pd.DataFrame({"neat": ["apple", "apple", "orange", "apple", "orange"]})
- ohe = OneHotEncoder()
+ ohe = OneHotEncoder(use_cat_names=True, cols=["neat"], handle_unknown="ignore")
  assert ohe.fit(df) == ohe
  assert list(ohe.transform(df)) == [
- "neat_OneHotEncoder_apple",
- "neat_OneHotEncoder_orange",
+ "neat_OneHotEncoder_neat_apple",
+ "neat_OneHotEncoder_neat_orange",
  ]


  def test_transformer_onehotencoder_fit_transform_keep_cols():
  import pandas as pd
- from foreshadow.transformers.internals import OneHotEncoder
+ from foreshadow.transformers.externals import OneHotEncoder

  df = pd.DataFrame({"neat": ["apple", "apple", "orange", "apple", "orange"]})
- ohe = OneHotEncoder(keep_columns=True, name="encoder")
+ ohe = OneHotEncoder(
+ keep_columns=True,
+ name="encoder",
+ use_cat_names=True,
+ cols=["neat"],
+ handle_unknown="ignore",
+ )
  assert ohe.fit(df) == ohe
  assert list(ohe.transform(df)) == [
  "neat_encoder_origin_0",
- "neat_encoder_apple",
- "neat_encoder_orange",
+ "neat_encoder_neat_apple",
+ "neat_encoder_neat_orange",
  ]


Expand Down
2 changes: 1 addition & 1 deletion foreshadow/tests/test_transformers/test_smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_smart_encoder_less_than_30_levels():
  import scipy.stats as ss

  from foreshadow.transformers.smart import Encoder
- from foreshadow.transformers.internals import OneHotEncoder
+ from foreshadow.transformers.externals import OneHotEncoder

  np.random.seed(0)
  leq_30_random_data = np.random.choice(30, size=500)
Expand Down
2 changes: 1 addition & 1 deletion foreshadow/transformers/externals.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

  from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Imputer
  from sklearn.decomposition import PCA
- from category_encoders import HashingEncoder
+ from category_encoders import HashingEncoder, OneHotEncoder

  from .transformers import _get_modules

Expand Down
15 changes: 0 additions & 15 deletions foreshadow/transformers/internals/encoders.py

This file was deleted.

10 changes: 8 additions & 2 deletions foreshadow/transformers/smart.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
  from sklearn.pipeline import Pipeline

  from ..transformers.base import SmartTransformer
- from ..transformers.internals import BoxCox, FancyImputer, OneHotEncoder
+ from ..transformers.internals import BoxCox, FancyImputer
  from ..transformers.externals import (
  MinMaxScaler,
  StandardScaler,
  RobustScaler,
  HashingEncoder,
+ OneHotEncoder,
  )


Expand Down Expand Up @@ -59,7 +60,12 @@ def _get_transformer(self, X, y=None, unique_num_cutoff=30, **fit_params):
  col_name = X.columns[0]
  unique_count = len(data.value_counts())
  if unique_count <= unique_num_cutoff:
- return OneHotEncoder()
+ return OneHotEncoder(
+ cols=[col_name],
+ return_df=True,
+ use_cat_names=True,
+ handle_unknown="ignore",
+ )
  else:
  return HashingEncoder(n_components=30, cols=[col_name])

Expand Down

0 comments on commit cd7c5ad

Please sign in to comment.