# Pipelines

* Pipelines sequentially apply **a list of transforms** and a **final estimator**.

    * Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods.
    * The final estimator only needs to implement fit.

* The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the iris data set and separate the feature matrix from the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Keep 80% of the samples for training and 20% for validation;
# the fixed random_state makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

## Simple Pipeline

The simple pipeline is composed of the following steps:
- Transformation
    - Scaling values between 0 and 1
    - PCA (we keep 2 components)
- Estimator
    - Logistic Regression

### Transformation


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

# Variant with an additional Normalizer step in front (kept for reference):
# preprocessing_transformer = Pipeline(steps=[('normalize', Normalizer()),
#                                             ('scale_01', MinMaxScaler(feature_range=(0, 1))),
#                                             ('PCA', PCA(n_components=2))])

# Two transforms applied in sequence: scale the values to the range (0, 1),
# then keep 2 PCA components.
preprocessing_transformer = Pipeline(
    steps=[
        ('scale_01', MinMaxScaler(feature_range=(0, 1))),
        ('PCA', PCA(n_components=2)),
    ]
)

### Estimator

In [4]:
from sklearn.linear_model import LogisticRegression

# Final estimator of the simple pipeline.
# 'multi_class' is intentionally left at its default: passing it explicitly is
# deprecated since scikit-learn 1.5 and only triggers a FutureWarning
# ('auto' was the default value, so the fitted model is the same).
model = LogisticRegression(solver='lbfgs')

### Creating and evaluating the Pipeline

In [5]:
from sklearn.metrics import accuracy_score

# Chain the preprocessing pipeline and the classifier into a single estimator.
# verbose=True prints the elapsed time of every step while fitting.
my_pipeline = Pipeline(
    steps=[
        ('preprocessing_transformer', preprocessing_transformer),
        ('model', model),
    ],
    verbose=True,
)

# Equivalent flat definition without the nested pipeline (kept for reference):
# my_pipeline = Pipeline(steps=[('scale_01', MinMaxScaler(feature_range=(0, 1))),
#                               ('PCA', PCA(n_components=2)),
#                               ('model', model)
#                              ], verbose = True)

In [6]:
# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

[Pipeline]  (step 1 of 2) Processing preprocessing_transformer, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s




0,1,2
,steps,"[('preprocessing_transformer', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,True

0,1,2
,steps,"[('scale_01', ...), ('PCA', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [7]:
# Run the validation samples through the fitted pipeline and get the predicted classes
preds = my_pipeline.predict(X_valid)

# Compare the predictions with the true validation labels
score = accuracy_score(y_valid, preds)
print('Accuracy Score:', score)

Accuracy Score: 0.8666666666666667


### Analyzing the transformation

In [8]:
X_train

array([[6.4, 3.1, 5.5, 1.8],
       [5.4, 3. , 4.5, 1.5],
       [5.2, 3.5, 1.5, 0.2],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.2],
       [5.2, 2.7, 3.9, 1.4],
       [5.7, 3.8, 1.7, 0.3],
       [6. , 2.7, 5.1, 1.6],
       [5.9, 3. , 4.2, 1.5],
       [5.8, 2.6, 4. , 1.2],
       [6.8, 3. , 5.5, 2.1],
       [4.7, 3.2, 1.3, 0.2],
       [6.9, 3.1, 5.1, 2.3],
       [5. , 3.5, 1.6, 0.6],
       [5.4, 3.7, 1.5, 0.2],
       [5. , 2. , 3.5, 1. ],
       [6.5, 3. , 5.5, 1.8],
       [6.7, 3.3, 5.7, 2.5],
       [6. , 2.2, 5. , 1.5],
       [6.7, 2.5, 5.8, 1.8],
       [5.6, 2.5, 3.9, 1.1],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.3, 4.7, 1.6],
       [5.5, 2.4, 3.8, 1.1],
       [6.3, 2.7, 4.9, 1.8],
       [6.3, 2.8, 5.1, 1.5],
       [4.9, 2.5, 4.5, 1.7],
       [6.3, 2.5, 5. , 1.9],
       [7. , 3.2, 4.7, 1.4],
       [6.5, 3. , 5.2, 2. ],
       [6. , 3.4, 4.5, 1.6],
       [4.8, 3.1, 1.6, 0.2],
       [5.8, 2.7, 5.1, 1.9],
       [5.6, 2.7, 4.2, 1.3],
       [5.6, 2

In [9]:
# Fit the preprocessing pipeline on the training data and keep the transformed output.
# NOTE(review): this is the same object that was passed as the first step of
# my_pipeline, so this call presumably re-fits that shared step (on the same X_train).
transformed_Dataset = preprocessing_transformer.fit_transform(X_train)

In [10]:
transformed_Dataset

array([[ 3.92209763e-01,  4.84561075e-02],
       [ 9.04869356e-02, -8.95853266e-02],
       [-6.29042414e-01,  1.33227096e-01],
       [ 2.97394765e-01, -1.69364128e-02],
       [ 5.25821952e-01, -7.19394322e-02],
       [-8.81287220e-03, -2.16744469e-01],
       [-5.36677344e-01,  3.00755516e-01],
       [ 2.68771542e-01, -1.40752213e-01],
       [ 1.18143450e-01, -2.69059275e-02],
       [ 2.51091885e-02, -1.81673754e-01],
       [ 5.25692275e-01,  5.32019212e-02],
       [-6.94577142e-01, -3.59042800e-02],
       [ 5.43323143e-01,  1.04376017e-01],
       [-5.34922624e-01,  1.01994571e-01],
       [-6.15543324e-01,  2.31944792e-01],
       [-1.46419376e-01, -4.91909893e-01],
       [ 4.09356632e-01,  2.26626713e-02],
       [ 6.26859126e-01,  1.45225238e-01],
       [ 2.57239670e-01, -3.25774678e-01],
       [ 4.91330874e-01, -1.45418497e-01],
       [-3.11137441e-02, -2.39957062e-01],
       [ 7.51055857e-01,  1.48508623e-01],
       [ 2.30644832e-01,  1.25073917e-01],
       [-4.

In [11]:
type(transformed_Dataset)

numpy.ndarray

In [12]:
preprocessing_transformer.transform(X_valid)

array([[ 0.45431919, -0.13956513],
       [ 0.01613957, -0.305207  ],
       [-0.64028327,  0.4328635 ],
       [ 0.59569967,  0.06930586],
       [-0.64774017,  0.07208554],
       [ 0.61115082,  0.09432855],
       [-0.64749817,  0.11056931],
       [ 0.26200464,  0.09865541],
       [ 0.27341333, -0.00194264],
       [ 0.07738683, -0.07277631],
       [ 0.28573193, -0.16961619],
       [ 0.19957168,  0.10339402],
       [ 0.12542036, -0.07996038],
       [ 0.24300937, -0.03638314],
       [ 0.17388238, -0.04559015],
       [-0.70761088,  0.13831337],
       [ 0.16737484, -0.05646523],
       [ 0.03204393, -0.22204303],
       [-0.63470643, -0.1021321 ],
       [-0.59366964,  0.30640174],
       [ 0.30171174, -0.15420814],
       [ 0.11438335, -0.06601991],
       [-0.62885722,  0.04349897],
       [-0.70413094, -0.18523611],
       [ 0.30904546, -0.0790507 ],
       [-0.75940452,  0.10638345],
       [-0.56014656,  0.22594574],
       [ 0.11622089, -0.02718333],
       [-0.18340504,

## ColumnTransformer: Managing different kinds of transformers on different columns:
Extracted and extended from a kaggle.com tutorial

Applies transformers to columns of an array or pandas DataFrame.

This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer.

In [15]:
dataset = pd.read_csv("data/melb_data.csv")
dataset.head(5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [16]:
dataset.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

In [17]:
dataset.shape

(13580, 21)

In [18]:
dataset.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [20]:
dataset['Price']

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [None]:
#dataset = dataset[dataset['Price'].isnull()==False]

In [21]:
columns = dataset.columns.to_list()
columns

['Suburb',
 'Address',
 'Rooms',
 'Type',
 'Price',
 'Method',
 'SellerG',
 'Date',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'CouncilArea',
 'Lattitude',
 'Longtitude',
 'Regionname',
 'Propertycount']

In [25]:
# 'Price' is the prediction target, so it must not be among the feature columns.
# Filtering leaves the list unchanged when 'Price' is already gone,
# which keeps this cell safe to re-run.
columns = [name for name in columns if name != 'Price']

X = dataset[columns]
y = dataset['Price']
y

0        1480000.0
1        1035000.0
2        1465000.0
3         850000.0
4        1600000.0
           ...    
13575    1245000.0
13576    1031000.0
13577    1170000.0
13578    2500000.0
13579    1285000.0
Name: Price, Length: 13580, dtype: float64

In [23]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

We construct the full pipeline in three steps.
* Step 1: Define Preprocessing Steps
* Step 2: Define the Model
* Step 3: Create and Evaluate the Pipeline

### Step 1: Define Preprocessing Steps

Similar to how a pipeline bundles together preprocessing and modeling steps, we use the ColumnTransformer class to bundle together different preprocessing steps. The code below:
- imputes missing values in numerical data, and
- imputes missing values and applies a one-hot encoding to categorical data.

In [26]:
# "Cardinality" is the number of unique values in a column.
# categorical_cols: object columns with fewer than 10 unique values
#                   (the threshold is convenient but arbitrary).
# numerical_cols:   columns stored as int64 or float64.
categorical_cols, numerical_cols = [], []
for cname in X_train.columns:
    column = X_train[cname]
    if column.nunique() < 10 and column.dtype == "object":
        categorical_cols.append(cname)
    if column.dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

In [27]:
categorical_cols

['Type', 'Method', 'Regionname']

In [28]:
numerical_cols

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [29]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values using the 'most_frequent' strategy.
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Categorical columns: fill missing values the same way, then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Send each group of columns to its own transformer; the resulting features
# are concatenated into a single feature space.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

### Step 2: Define the Model

In [30]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10, random_state=0)

### Step 3: Create and Evaluate the Pipeline

In [31]:
from sklearn.metrics import mean_absolute_error

# Chain the column-wise preprocessing and the regressor into a single estimator.
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit the preprocessing steps and the model on the training data.
my_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Preprocess the held-out test data with the fitted pipeline and get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model: mean absolute error between true and predicted prices
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 173085.5993863525


### Parameter tuning

Setting parameters of the various steps is enabled by using their names and the parameter name separated by a ‘__’

In [33]:
from sklearn.model_selection import GridSearchCV

# Each key is a path of step names joined by '__' that ends in the parameter to tune:
#   model__n_estimators                  -> n_estimators of the 'model' step
#   preprocessor__num__strategy          -> strategy of the 'num' transformer inside 'preprocessor'
#   preprocessor__cat__imputer__strategy -> strategy of the 'imputer' step inside the 'cat' transformer
# 3 x 2 x 2 = 12 parameter combinations are tried.
parameters = {
    'model__n_estimators': [1,5,10],
    'preprocessor__num__strategy': ['most_frequent','constant'],
    'preprocessor__cat__imputer__strategy': ['most_frequent','constant'],
}

# 5-fold cross-validated search over the whole pipeline, using all available cores (n_jobs=-1).
gs_clf = GridSearchCV(my_pipeline, parameters, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)

# NOTE(review): the search is fit on the full X, y (not only on X_train) - confirm this is intended.
gs_clf.fit(X, y)

0,1,2
,estimator,Pipeline(step...om_state=0))])
,param_grid,"{'model__n_estimators': [1, 5, ...], 'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'], 'preprocessor__num__strategy': ['most_frequent', 'constant']}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [34]:
gs_clf.best_params_

{'model__n_estimators': 10,
 'preprocessor__cat__imputer__strategy': 'most_frequent',
 'preprocessor__num__strategy': 'most_frequent'}

In [35]:
gs_clf.best_score_

np.float64(-190196.82772073775)

Applying the pipeline to some attributes only

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='most_frequent')

# Only the numerical columns get a transformer here; with remainder='passthrough'
# the remaining columns (including the text ones) are kept in the output as they are.
preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numerical_cols)],
        remainder='passthrough')

In [37]:
# This cell demonstrates an expected failure: the text columns reach the model
# untouched (remainder='passthrough') and cannot be converted to numbers.
# The error is caught and printed so that "Restart & Run All" can continue.

from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[
                              ('preprocessor', preprocessor),
                              ('model', model),
                             ])

try:
    # Preprocessing of training data, fit model
    my_pipeline.fit(X_train, y_train)

    # Preprocessing of test data, get predictions
    preds = my_pipeline.predict(X_test)

    # Evaluate the model
    score = mean_absolute_error(y_test, preds)
    print('MAE:', score)
except ValueError as err:
    # Show the same message the uncaught exception would have produced.
    print(f'{type(err).__name__}: {err}')

ValueError: could not convert string to float: 'St Kilda'

Remainder with estimator

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='most_frequent')


# The numerical columns go through the imputer; every column that is not listed
# is handled by the estimator given as 'remainder' (here a one-hot encoder).
preprocessor = ColumnTransformer(
        transformers=[('num', numerical_transformer, numerical_cols)],
        remainder=OneHotEncoder(handle_unknown='ignore', sparse_output = False))

In [39]:
from sklearn.metrics import mean_absolute_error

# Bundle the preprocessor (imputer + one-hot encoded remainder) and the model in a pipeline
my_pipeline = Pipeline(steps=[
                              ('preprocessor', preprocessor),
                              ('model', model),
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

# Preprocessing of the held-out test data, get predictions
preds = my_pipeline.predict(X_test)

# Evaluate the model: mean absolute error on the test set
score = mean_absolute_error(y_test, preds)
print('MAE:', score)

MAE: 168145.7458394698


In [40]:
my_pipeline

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,OneHotEncoder..._output=False)
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Alternative techniques to indicate columns

In [41]:
X_train.columns.get_indexer(numerical_cols)

array([ 2,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 19])

In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='most_frequent')


# Same numerical preprocessing as before, but the columns are designated by their
# integer positions (get_indexer maps the column names to positions) instead of by name.
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, X_train.columns.get_indexer(numerical_cols))])

In [43]:
preprocessor.fit_transform(X_train)

array([[ 1.0000000e+00,  5.0000000e+00,  3.1820000e+03, ...,
        -3.7859840e+01,  1.4498670e+02,  1.3240000e+04],
       [ 2.0000000e+00,  8.0000000e+00,  3.0160000e+03, ...,
        -3.7858000e+01,  1.4490050e+02,  6.3800000e+03],
       [ 3.0000000e+00,  1.2600000e+01,  3.0200000e+03, ...,
        -3.7798800e+01,  1.4482200e+02,  3.7550000e+03],
       ...,
       [ 4.0000000e+00,  6.7000000e+00,  3.0580000e+03, ...,
        -3.7735720e+01,  1.4497256e+02,  1.1204000e+04],
       [ 3.0000000e+00,  1.2000000e+01,  3.0730000e+03, ...,
        -3.7720570e+01,  1.4502615e+02,  2.1650000e+04],
       [ 4.0000000e+00,  6.4000000e+00,  3.0110000e+03, ...,
        -3.7794300e+01,  1.4488750e+02,  7.5700000e+03]],
      shape=(10864, 12))

Be careful!

In this case a SimpleImputer is first applied to the entire dataset and a ColumnTransformer comes after it; the ColumnTransformer therefore receives a NumPy array (values only, without column names).

In [44]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Normalizer is used below; import it here so the cell does not silently depend
# on an import made in a much earlier cell.
from sklearn.preprocessing import Normalizer


# Preprocessing for all dataset
simple=SimpleImputer(strategy='most_frequent')

# Preprocessing for numerical data
numerical_transformer = Normalizer()

# The columns are designated by NAME here.
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_cols)])

# Bundle preprocessing: the imputer runs first on the whole dataset,
# the ColumnTransformer then works on the imputer's output.
preprocessing_transformer = Pipeline(steps=[('simple', simple),
                                            ('preprocessor', preprocessor)
                                            ])

In [45]:
preprocessing_transformer

0,1,2
,steps,"[('simple', ...), ('preprocessor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,norm,'l2'
,copy,True


In [46]:
# Expected failure: the ColumnTransformer receives the imputer's output rather than
# the original DataFrame, so it cannot select columns by name.
# The error is caught and printed so that "Restart & Run All" can continue.
try:
    preprocessing_transformer.fit_transform(X_train)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: Specifying the columns using strings is only supported for dataframes.

Here, instead, the ColumnTransformer is given the numeric positions of the columns rather than their names, so it also works on the NumPy array produced by the imputer.

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Normalizer is used below; import it here so the cell does not silently depend
# on an import made in a much earlier cell.
from sklearn.preprocessing import Normalizer


# Preprocessing for all dataset
simple=SimpleImputer(strategy='most_frequent')

# Preprocessing for numerical data
numerical_transformer = Normalizer()

# The columns are designated by integer POSITION (get_indexer maps names to positions),
# which does not require the input to carry column names.
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, X_train.columns.get_indexer(numerical_cols))])

# Bundle preprocessing: the imputer runs first on the whole dataset,
# the ColumnTransformer then works on the imputer's output.
preprocessing_transformer = Pipeline(steps=[('simple', simple),
                                            ('preprocessor', preprocessor)
                                            ])

In [48]:
preprocessing_transformer.fit_transform(X_train)

array([[ 7.26963825e-05,  3.63481912e-04,  2.31319889e-01, ...,
        -2.75227341e-03,  1.05400086e-02,  9.62500104e-01],
       [ 2.72783243e-04,  1.09113297e-03,  4.11357131e-01, ...,
        -5.16351401e-03,  1.97632142e-02,  8.70178547e-01],
       [ 5.72636606e-04,  2.40507375e-03,  5.76454183e-01, ...,
        -7.21499218e-03,  2.76434595e-02,  7.16750152e-01],
       ...,
       [ 3.39067054e-04,  5.67937315e-04,  2.59216763e-01, ...,
        -3.19873485e-03,  1.22888547e-02,  9.49726818e-01],
       [ 1.36582374e-04,  5.46329495e-04,  1.39905878e-01, ...,
        -1.71732166e-03,  6.60267194e-03,  9.85669464e-01],
       [ 4.77479959e-04,  7.63967935e-04,  3.59423039e-01, ...,
        -4.51150521e-03,  1.72952194e-02,  9.03630823e-01]],
      shape=(10864, 12))

## FeatureUnion: Applying multiple transformers in parallel

Concatenates results of multiple transformer objects.

This estimator applies a list of transformer objects in parallel to the input data, then concatenates the results. This is useful to combine several feature extraction mechanisms into a single transformer.

In [49]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [50]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Switch back to the iris data set: feature matrix and class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# 80/20 train/validation split; the fixed random_state makes it reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [51]:
X.shape

(150, 4)

In [52]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [53]:
# Branch 1: PCA keeping 2 components.
pca = PCA(n_components=2)

# Branch 2: keep 2 of the original features, chosen by SelectKBest with its default score function.
selection = SelectKBest(k=2)

# Branch 3: all original features rescaled to the range (0, 1).
scaler = MinMaxScaler(feature_range=(0, 1))

# Apply the three transformers to the same input and concatenate their outputs
# (PCA + univariate selection + scaled features).
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection), ("normal", scaler)])

In [54]:
pca.fit_transform(X).shape[1]

2

In [55]:
selection.fit_transform(X, y).shape[1]

2

In [56]:
scaler.fit_transform(X).shape[1]

4

In [57]:
# Use combined features to transform dataset:
X_features = combined_features.fit(X, y).transform(X)
print("Combined space has", X_features.shape[1], "features")

Combined space has 8 features


In [58]:
X_features

array([[-2.68412563,  0.31939725,  1.4       , ...,  0.625     ,
         0.06779661,  0.04166667],
       [-2.71414169, -0.17700123,  1.4       , ...,  0.41666667,
         0.06779661,  0.04166667],
       [-2.88899057, -0.14494943,  1.3       , ...,  0.5       ,
         0.05084746,  0.04166667],
       ...,
       [ 1.76434572,  0.07885885,  5.2       , ...,  0.41666667,
         0.71186441,  0.79166667],
       [ 1.90094161,  0.11662796,  5.4       , ...,  0.58333333,
         0.74576271,  0.91666667],
       [ 1.39018886, -0.28266094,  5.1       , ...,  0.41666667,
         0.69491525,  0.70833333]], shape=(150, 8))

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Classifier with default settings.
model = LogisticRegression()

# Feed the concatenated features of the FeatureUnion into the classifier;
# verbose=True prints the elapsed time of every step while fitting.
my_pipeline = Pipeline(
    steps=[
        ('combined_features', combined_features),
        ('model', model),
    ],
    verbose=True,
)

# Fit the feature union and the model on the training data.
my_pipeline.fit(X_train, y_train)

[Pipeline] . (step 1 of 2) Processing combined_features, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s


0,1,2
,steps,"[('combined_features', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,True

0,1,2
,transformer_list,"[('pca', ...), ('univ_select', ...), ...]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,n_components,2
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,score_func,<function f_c...t 0x1188ce520>
,k,2

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [60]:
# Run the validation samples through the fitted pipeline and get the predicted classes
preds = my_pipeline.predict(X_valid)

# Compare the predictions with the true validation labels
score = accuracy_score(y_valid, preds)
print('Accuracy Score:', score)

Accuracy Score: 1.0


## FunctionTransformer: Constructs a transformer from an arbitrary callable.

Wraps a user-defined function so that it can be used as a transformer step inside a pipeline.

A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function.

In [62]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestRegressor

# Reload the Melbourne housing data; 'Price' is the target, all other columns are features.
dataset = pd.read_csv("data/melb_data.csv")
columns = dataset.columns.to_list()
columns.remove('Price')

X, y = dataset[columns], dataset['Price']

# 80/20 train/validation split; the fixed random_state makes it reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

def columns_num(X):
    """Return only the numerical columns of a DataFrame.

    A column counts as numerical when its dtype is ``int64`` or ``float64``.

    Parameters
    ----------
    X : pandas.DataFrame
        Input table; it is accessed through ``X.columns``, ``X[name]`` and ``X.loc``.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``X``, in their original order, with all rows kept.
    """
    wanted_dtypes = ['int64', 'float64']
    selected = []
    for cname in X.columns:
        if X[cname].dtype in wanted_dtypes:
            selected.append(cname)
    return X.loc[:, selected]

# Preprocessing in two steps: keep the numerical columns, then impute the missing values.
fill_na_transformer = Pipeline(steps=[ ('drop_cols', FunctionTransformer(columns_num, validate=False)), # keeps only the numerical columns by means of a user-defined function
                                       ('fill_na', SimpleImputer(strategy='most_frequent')) ], verbose=True)


model = RandomForestRegressor(n_estimators=100, random_state=0)



# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', fill_na_transformer),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model
my_pipeline.fit(X_train, y_train)

[Pipeline] ......... (step 1 of 2) Processing drop_cols, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing fill_na, total=   0.0s


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('drop_cols', ...), ('fill_na', ...)]"
,transform_input,
,memory,
,verbose,True

0,1,2
,func,<function col...t 0x118222480>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [63]:
# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 169857.39715039625


In [64]:
# Same preprocessing as before, but with validate=True on the FunctionTransformer.
# NOTE(review): with validate=True the input is presumably checked/converted before
# columns_num is called, which is what the failing cell that follows illustrates - confirm.
fill_na_transformer = Pipeline(steps=[ ('drop_cols', FunctionTransformer(columns_num, validate=True)),
                                       ('fill_na', SimpleImputer(strategy='most_frequent')) ])

In [65]:
# Expected failure: with validate=True the text columns of X cannot be converted
# to numbers. The error is caught and printed so that "Restart & Run All" can continue.
try:
    fill_na_transformer.fit_transform(X)
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: could not convert string to float: 'Abbotsford'

In [66]:
X

Unnamed: 0,Suburb,Address,Rooms,Type,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,S,Barry,26/08/2017,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,SP,Williams,26/08/2017,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,S,Raine,26/08/2017,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,PI,Sweeney,26/08/2017,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


In [67]:
columns_num(X)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.79960,144.99840,4019.0
1,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.80790,144.99340,4019.0
2,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.80930,144.99440,4019.0
3,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.79690,144.99690,4019.0
4,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.80720,144.99410,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...
13575,4,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,-37.90562,145.16761,7392.0
13576,3,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,-37.85927,144.87904,6380.0
13577,3,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,-37.85274,144.88738,6380.0
13578,4,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,-37.85908,144.89299,6380.0
