diff --git a/pytorch_tabnet/abstract_model.py b/pytorch_tabnet/abstract_model.py
index 4dd6762a..db5e9369 100644
--- a/pytorch_tabnet/abstract_model.py
+++ b/pytorch_tabnet/abstract_model.py
@@ -13,6 +13,7 @@
     create_dataloaders,
     define_device,
     ComplexEncoder,
+    check_input
 )
 from pytorch_tabnet.callbacks import (
     CallbackContainer,
@@ -22,7 +23,7 @@
 )
 from pytorch_tabnet.metrics import MetricContainer, check_metrics
 from sklearn.base import BaseEstimator
-from sklearn.utils import check_array
+
 from torch.utils.data import DataLoader
 import io
 import json
@@ -115,7 +116,7 @@ def fit(
         batch_size=1024,
         virtual_batch_size=128,
         num_workers=0,
-        drop_last=False,
+        drop_last=True,
         callbacks=None,
         pin_memory=True,
         from_unsupervised=None,
@@ -182,7 +183,7 @@
         else:
             self.loss_fn = loss_fn

-        check_array(X_train)
+        check_input(X_train)

         self.update_fit_params(
             X_train,
diff --git a/pytorch_tabnet/pretraining.py b/pytorch_tabnet/pretraining.py
index d4ca49e9..fd6fdc8c 100644
--- a/pytorch_tabnet/pretraining.py
+++ b/pytorch_tabnet/pretraining.py
@@ -1,12 +1,12 @@
 import torch
 import numpy as np
-from sklearn.utils import check_array
 from torch.utils.data import DataLoader
 from pytorch_tabnet import tab_network
 from pytorch_tabnet.utils import (
     create_explain_matrix,
     filter_weights,
-    PredictDataset
+    PredictDataset,
+    check_input
 )
 from torch.nn.utils import clip_grad_norm_
 from pytorch_tabnet.pretraining_utils import (
@@ -55,7 +55,7 @@ def fit(
         batch_size=1024,
         virtual_batch_size=128,
         num_workers=0,
-        drop_last=False,
+        drop_last=True,
         callbacks=None,
         pin_memory=True,
     ):
@@ -118,7 +118,7 @@
         else:
             self.loss_fn = loss_fn

-        check_array(X_train)
+        check_input(X_train)

         self.update_fit_params(
             weights,
diff --git a/pytorch_tabnet/pretraining_utils.py b/pytorch_tabnet/pretraining_utils.py
index 03c12a15..26d37a20 100644
--- a/pytorch_tabnet/pretraining_utils.py
+++ b/pytorch_tabnet/pretraining_utils.py
@@ -2,8 +2,8 @@
 from pytorch_tabnet.utils import (
     create_sampler,
     PredictDataset,
+    check_input
 )
-from sklearn.utils import check_array


 def create_dataloaders(
@@ -93,7 +93,7 @@ def validate_eval_set(eval_set, eval_name, X_train):
     ), "eval_set and eval_name have not the same length"
     for set_nb, X in enumerate(eval_set):
-        check_array(X)
+        check_input(X)
         msg = (
             f"Number of columns is different between eval set {set_nb}"
             + f"({X.shape[1]}) and X_train ({X_train.shape[1]})"
         )
diff --git a/pytorch_tabnet/utils.py b/pytorch_tabnet/utils.py
index 86a09c63..5eedae7e 100644
--- a/pytorch_tabnet/utils.py
+++ b/pytorch_tabnet/utils.py
@@ -5,6 +5,7 @@
 import scipy
 import json
 from sklearn.utils import check_array
+import pandas as pd


 class TorchDataset(Dataset):
@@ -271,7 +272,7 @@ def validate_eval_set(eval_set, eval_name, X_train, y_train):
         len(elem) == 2 for elem in eval_set
     ), "Each tuple of eval_set need to have two elements"
     for name, (X, y) in zip(eval_name, eval_set):
-        check_array(X)
+        check_input(X)
         msg = (
             f"Dimension mismatch between X_{name} "
             + f"{X.shape} and X_train {X_train.shape}"
         )
@@ -337,3 +338,15 @@ def default(self, obj):
             return int(obj)
         # Let the base class default method raise the TypeError
         return json.JSONEncoder.default(self, obj)
+
+
+def check_input(X):
+    """
+    Raise a clear error if X is a pandas DataFrame or Series,
+    then check the array according to scikit-learn rules.
+    """
+    if isinstance(X, (pd.DataFrame, pd.Series)):
+        err_message = "Pandas DataFrames are not supported: apply X.values when calling fit"
+        raise ValueError(err_message)
+    check_array(X)
+    return
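
A minimal usage sketch (not part of the patch) of the new check_input helper, assuming the corrected raise ValueError(...) above: plain NumPy arrays pass straight through to sklearn's check_array, while DataFrames fail fast with a readable message instead of an obscure error later in fit.

import numpy as np
import pandas as pd

from pytorch_tabnet.utils import check_input

X = np.random.rand(32, 4)

# ndarray input: silently validated by sklearn's check_array
check_input(X)

# DataFrame input: rejected with a clear hint to pass X.values instead
try:
    check_input(pd.DataFrame(X))
except ValueError as e:
    print(e)  # Pandas DataFrames are not supported: apply X.values when calling fit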