In [11]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline



In [12]:

# Column labels for the abalone file; they are supplied to read_csv via `names=`.
columns = [
    "sex",
    "length",
    "diam",
    "height",
    "whole",
    "shucked",
    "viscera",
    "shell",
    "age",
]
# Download the raw data and attach the labels above.
df = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",
    names=columns,
)

In [13]:

# Target is the `age` column; every remaining column is a feature.
y = df.age
X = df.drop(columns=['age'])
# Feature names split by dtype: numeric columns vs. object (string) columns.
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Create some missing values: blank out 1000 randomly drawn (row, column) cells.
# A seeded generator makes the injected NaNs -- and therefore everything
# computed from X afterwards -- identical on every Restart & Run All.
# Draws are independent, so the same cell may be picked more than once and
# fewer than 1000 distinct cells can end up missing.
rng = np.random.default_rng(0)
for i in range(1000):
    X.loc[rng.choice(X.index), rng.choice(X.columns)] = np.nan

In [14]:

# Hold out 25% of the rows as a test set; random_state=0 pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
# Categorical columns of the training rows only.
x_train_cat = x_train[cat_cols]

In [15]:

# Fill missing values in the categorical training features with each column's
# most frequent value (mode).
# `mode()` returns one row per tied mode, sorted; `.iloc[0]` takes the first
# row as a Series keyed by column name, so `fillna` imputes every column with
# its own mode rather than with the first column's mode.
x_train_fill_missing = x_train_cat.fillna(x_train_cat.mode().iloc[0])

In [16]:

# Fit a one-hot encoder on the imputed categorical training data.
# `sparse_output=False` requests a dense array; it is the current name of the
# older `sparse` keyword and matches the pipeline built later in this notebook.
# `drop='first'` omits the first category of each feature from the encoding.
ohe = OneHotEncoder(sparse_output=False, drop='first').fit(x_train_fill_missing)
# One-hot encode the training data after filling in missing values.
x_train_fill_missing_ohe = ohe.transform(x_train_fill_missing)



In [17]:

# Now do the same thing on the test set, reusing statistics learned from the
# training set only: impute with the per-column *training* mode (never the
# test set's own mode), then apply the encoder that was fitted on training data.
x_test_fill_missing = x_test[cat_cols].fillna(x_train_cat.mode().iloc[0])
x_test_fill_missing_ohe = ohe.transform(x_test_fill_missing)

In [19]:
# Bundle the two manual steps into one estimator: most-frequent imputation
# followed by dense one-hot encoding with the first category dropped.
categorical_steps = [
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(sparse_output=False, drop='first')),
]
pipeline = Pipeline(steps=categorical_steps)
# Fit on the categorical columns of the training rows only.
pipeline.fit(x_train[cat_cols])

In [20]:
# Compare the pipeline's output on the test set with the manually imputed and
# encoded test set; a sum of absolute differences of 0 means they are identical.
print('Verify pipeline transform test set is the same\nPrinting the sum of absolute differences:')
pipeline_test_ohe = pipeline.transform(x_test[cat_cols])
print(np.abs(pipeline_test_ohe - x_test_fill_missing_ohe).sum())

Verify pipeline transform test set is the same
Printing the sum of absolute differences:
0.0
