In [20]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


In [21]:
columns = ["sex","length","diam","height","whole","shucked","viscera","shell","age"]
df = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data",names=columns)

In [22]:
df.head()

Unnamed: 0,sex,length,diam,height,whole,shucked,viscera,shell,age
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [23]:
y = df.age
X=df.drop(columns=['age'])
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include=['object']).columns
#create some missing values
for i in range(1000):
    X.loc[np.random.choice(X.index),np.random.choice(X.columns)] = np.nan

In [24]:
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.25)

In [25]:
x_train_num = x_train[num_cols]
#fill missing values with mean on numeric features only
x_train_fill_missing = x_train_num.fillna(x_train_num.mean())
#fit standard scaler on x_train_fill_missing
scale = StandardScaler().fit(x_train_fill_missing)
#scale data after filling in missing values
x_train_fill_missing_scale = scale.transform(x_train_fill_missing)


In [26]:
#Now want to do the same thing on the test set! 
x_test_fill_missing = x_test[num_cols].fillna(x_train_num.mean())
x_test_fill_missing_scale = scale.transform(x_test_fill_missing)

In [31]:
#1. Rewrite using Pipelines!
pipeline = Pipeline([("imputer",SimpleImputer()), ("scale",StandardScaler())])
pipeline.fit(x_train[num_cols])

Pipeline(steps=[('imputer', SimpleImputer()), ('scale', StandardScaler())])

In [32]:
#2. Fit pipeline on the test and compare results
print('Verify pipeline transform test set is the same\nPrinting the sum of absolute differences:')
print(abs(x_test_fill_missing_scale - pipeline.transform(x_test[num_cols])).sum())

Verify pipeline transform test set is the same
Printing the sum of absolute differences:
7.569462873711624e-13


In [33]:
#3. Change imputer strategy to median and compare results
pipeline_median = Pipeline([("imputer",None), ("scale",None)])
pipeline_median.fit(x_train[num_cols])

Pipeline(steps=[('imputer', None), ('scale', None)])

In [34]:
print('Verify median pipeline transform is different\nPrinting the sum of absolute differences:')
print(abs(pipeline_median.transform(x_test[num_cols]) - pipeline.transform(x_test[num_cols])).sum())

Verify median pipeline transform is different
Printing the sum of absolute differences:
length     836.512662
diam       807.133645
height     730.977762
whole      918.587462
shucked    752.110236
viscera    784.606955
shell      773.062713
dtype: float64
