# Base

In [None]:
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import pipeline
from sklearn.pipeline import Pipeline

# Columns that the `simple_features` pipeline selects from the input frame.
simple_cols = [
    'BEDCERT',
    'RESTOT',
    'INHOSP',
    'CCRC_FACIL',
    'SFF',
    'CHOW_LAST_12MOS',
    'SPRINKLER_STATUS',
    'EXP_TOTAL',
    'ADJ_TOTAL',
]

class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts a fixed list of columns as an array."""

    def __init__(self, columns):
        # Column labels to keep; stored as given so get_params() can report them.
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the values of ``self.columns`` taken from ``X``.

        Input that is not already a DataFrame is wrapped in one first.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        selected = frame[self.columns]
        return selected.values
        
from sklearn.linear_model import LinearRegression

# Feature branch: pick the simple columns, then fill missing values with the column mean.
simple_features = Pipeline(
    steps=[
        ('cst', ColumnSelectTransformer(columns=simple_cols)),
        ('imputer', SimpleImputer(strategy='mean')),
    ]
)

# Full model: the feature branch followed by a linear regression.
simple_features_model = Pipeline(
    steps=[
        ('simple', simple_features),
        ('linear', LinearRegression()),
    ]
)

# The target is the boolean indicator "fine_counts > 0".
simple_features_model.fit(data, fine_counts > 0)

Pipeline(memory=None,
         steps=[('simple',
                 Pipeline(memory=None,
                          steps=[('cst',
                                  ColumnSelectTransformer(columns=['BEDCERT',
                                                                   'RESTOT',
                                                                   'INHOSP',
                                                                   'CCRC_FACIL',
                                                                   'SFF',
                                                                   'CHOW_LAST_12MOS',
                                                                   'SPRINKLER_STATUS',
                                                                   'EXP_TOTAL',
                                                                   'ADJ_TOTAL'])),
                                 ('imputer',
                                  SimpleImputer(add_indicator=False, copy=True,
                                                fill_value=None,
                                                missing_values=nan,
                                                strategy='mean', verbose=0))],
                                                
                          verbose=False)),
                ('linear',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)

# Kaggle

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Melbourne housing snapshot
data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')

# The sale price is the target; every other column is a candidate predictor
y = data['Price']
X = data.drop(columns=['Price'])

# 80/20 train/validation split with a fixed seed
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# "Cardinality" is the number of unique values in a column.
# Keep object-typed columns with fewer than 10 unique values (convenient but arbitrary).
categorical_cols = [
    cname
    for cname, dtype in X_train_full.dtypes.items()
    if dtype == "object" and X_train_full[cname].nunique() < 10
]

# Numerical columns are those stored as int64 or float64
numerical_cols = [
    cname
    for cname, dtype in X_train_full.dtypes.items()
    if dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns (copies, so the full frames stay untouched)
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant (fill_value left at its default)
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer and concatenate the results
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

# Random forest with a fixed seed so repeated runs give the same forest
model = RandomForestRegressor(n_estimators=100, random_state=0)

# One estimator that preprocesses and then predicts
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the preprocessing and the model on the training split
my_pipeline.fit(X_train, y_train)

# The fitted preprocessing is applied to the validation split before predicting
preds = my_pipeline.predict(X_valid)

# Mean absolute error on the validation split
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

In [None]:
# Preprocessing for numerical data: replace missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: most-frequent imputation, then one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model. The seed is fixed (as in the baseline model above) so that
# the forest, and therefore the validation score, is reproducible.
model = RandomForestRegressor(random_state=0)


# Deep dive into sklearn.pipelines

A Deep Dive Into Sklearn Pipelines -> article



- pipelines for repetitive experiments
- Pipelines are set up with the fit/transform/predict functionality, so you can fit a whole pipeline to the training data and transform to the test data, without having to do it individually for each thing you do.
- Two types of transformations: those that depend on the data (they must be fit first) and those that do not
- Split the columns into numeric features and object (text/categorical) features
- 

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
# select single cols
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of a data frame so later steps can work on it alone.
    Intended for text columns: the column is returned as a 1-D selection.
    """

    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Stateless: nothing to learn, return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]`` (single-label indexing)."""
        column = X[self.key]
        return column
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pick one column out of a data frame so later steps can work on it alone.
    Intended for numeric columns: the column is selected with a one-element
    list, so the result keeps a column axis.
    """

    def __init__(self, key):
        # Label of the column to extract.
        self.key = key

    def fit(self, X, y=None):
        """Stateless: nothing to learn, return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[[self.key]]`` (list indexing with a single label)."""
        labels = [self.key]
        return X[labels]

In [None]:
# process single cols
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler  # was used below without being imported


def _scaled_column(key):
    """Build a pipeline that selects numeric column ``key`` and standardizes it."""
    return Pipeline([
        ('selector', NumberSelector(key=key)),
        ('standard', StandardScaler()),
    ])


# Text branch: TF-IDF over the 'processed' column, English stop words removed.
text = Pipeline([
    ('selector', TextSelector(key='processed')),
    ('tfidf', TfidfVectorizer(stop_words='english')),
])

# Numeric branches: one standardized column each.
length = _scaled_column('length')
words = _scaled_column('words')
words_not_stopword = _scaled_column('words_not_stopword')
avg_word_length = _scaled_column('avg_word_length')
commas = _scaled_column('commas')

# Every branch sees the same input; the outputs are concatenated column-wise.
feats = FeatureUnion([
    ('text', text),
    ('length', length),
    ('words', words),
    ('words_not_stopword', words_not_stopword),
    ('avg_word_length', avg_word_length),
    ('commas', commas),
])

# Run the feature extraction on its own (the result is not stored).
feature_processing = Pipeline([('feats', feats)])
feature_processing.fit_transform(X_train)

# NOTE(review): the name `pipeline` rebinds the `sklearn.pipeline` module imported
# at the top of the notebook; it is kept because later cells refer to it.
pipeline = Pipeline([
    ('features', feats),
    ('classifier', RandomForestClassifier(random_state=42)),
])

pipeline.fit(X_train, y_train)

# NOTE(review): X_test / y_test are not defined in the visible cells - confirm
# they come from an earlier split.
preds = pipeline.predict(X_test)
# Fraction of test rows predicted correctly (accuracy).
np.mean(preds == y_test)


In [None]:
# GridSearch for pipelines
import numpy as np
from sklearn.model_selection import GridSearchCV

# Names of every parameter that can be addressed in the grid
# (nested as <step>__<sub-step>__<param>).
pipeline.get_params().keys()

hyperparameters = {
    'features__text__tfidf__max_df': [0.9, 0.95],
    'features__text__tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__max_depth': [50, 70],
    'classifier__min_samples_leaf': [1, 2],
}
clf = GridSearchCV(pipeline, hyperparameters, cv=5)

# Fit and tune model
clf.fit(X_train, y_train)

# No explicit refit step is needed: GridSearchCV uses refit=True by default, so
# after fit() the best parameter setting has already been refit on the whole
# training data. (`clf.refit` is only that flag, not a method call.)
preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)

# Accuracy of the tuned model on the test split.
np.mean(preds == y_test)


# Another article

In [None]:
# ColumnTransformer

# "This estimator allows different columns or column subsets
# of the input to be transformed separately and the features
# generated by each transformer will be concatenated to form a single feature space.

# This is useful for heterogeneous or columnar data, to combine several feature extraction
#     mechanisms or transformations into a single transformer."

# Everything this cell uses is imported here; MinMaxScaler in particular was
# only imported by a much later cell.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numeric columns: mean imputation, then min-max scaling.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# adjust if the installed version rejects `sparse=`.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Columns not listed in num_cols / cat_cols are dropped; transformers run in parallel.
# NOTE(review): num_cols and cat_cols are not defined in the visible cells.
col_trans = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols)
    ],
    remainder='drop',
    n_jobs=-1)

In [None]:
# Create a ColumnTransformer to apply the matching pipeline to each column set:
# ColumnTransformer(transformers=[('step name', transformer, cols), ...])


# SHAD

In [None]:
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
from typing import Optional, List
import pandas as pd
import numpy as np


class BaseDataPreprocessor(TransformerMixin):
    """Optionally restrict a frame to ``needed_columns`` and standardize it."""

    def __init__(self, needed_columns: Optional[List[str]] = None):
        """
        :param needed_columns: if not None select these columns from the dataframe
        """
        self.scaler = StandardScaler()
        self.needed_columns = needed_columns

    def fit(self, data, *args):
        """
        Fit the scaler on the (optionally column-restricted) data
        :param data: pd.DataFrame with all available columns
        :return: self
        """
        # A falsy needed_columns (None or empty) means "use every column".
        selected = data.loc[:, self.needed_columns] if self.needed_columns else data
        self.scaler.fit(X=selected)
        return self

    def transform(self, data: pd.DataFrame) -> np.array:
        """
        Apply the fitted scaler to the (optionally column-restricted) data
        :param data: pd.DataFrame with all available columns
        :return: np.array with preprocessed features
        """
        selected = data.loc[:, self.needed_columns] if self.needed_columns else data
        return self.scaler.transform(X=selected)
    
# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Scale only the continuous columns
preprocessor = BaseDataPreprocessor(needed_columns=continuous_columns)

# Fit the scaler on the training split, then reuse it for the test split
X_train = preprocessor.fit_transform(data_train)
X_test = preprocessor.transform(data_test)


class SmartDataPreprocessor(TransformerMixin):
    """Column selection, one generated feature, then standard scaling."""

    def __init__(self, needed_columns: Optional[List[str]] = None):
        """
        :param needed_columns: if not None select these columns from the dataframe
        """
        self.needed_columns = needed_columns
        self.scaler = StandardScaler()

    def add_features(self, data: pd.DataFrame):
        """
        Feature generation: add 'Generated_Feature' = Lot_Frontage + Lot_Area
        when both source columns are present.

        :param data: pd.DataFrame to extend
        :return: a new DataFrame with the extra column, or ``data`` itself when
                 the source columns are missing. The input is never modified.
        """
        if 'Lot_Frontage' in data.columns and 'Lot_Area' in data.columns:
            # assign() returns a copy, so neither the caller's frame nor a
            # .loc slice of it is written to.
            return data.assign(
                Generated_Feature=data['Lot_Frontage'] + data['Lot_Area']
            )
        return data

    def _prepare(self, data: pd.DataFrame) -> pd.DataFrame:
        """Select the needed columns (if any) and append the generated feature."""
        if self.needed_columns:
            data = data.loc[:, self.needed_columns]
        return self.add_features(data)

    def fit(self, data, *args):
        """
        Prepares the class for future transformations
        :param data: pd.DataFrame with all available columns
        :return: self
        """
        self.scaler.fit(X=self._prepare(data))
        return self

    def transform(self, data: pd.DataFrame) -> np.array:
        """
        Transforms features so that they can be fed into the regressors
        :param data: pd.DataFrame with all available columns
        :return: np.array with preprocessed features
        """
        return self.scaler.transform(X=self._prepare(data))


In [None]:


from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from typing import Optional, List
from sklearn.linear_model import LinearRegression, Ridge
import numpy as np
from sklearn.metrics import mean_absolute_error
import sklearn.base


def replace_zero(data):
    """Return a copy of ``data`` with zeros replaced by the mean of the non-zero entries.

    :param data: numpy array or pandas Series of numbers; it is not modified
    :return: a copy of ``data``. Integer input is returned as float when a
             replacement happens, so the mean is not truncated. If there are
             no zeros, or nothing but zeros, the copy is returned unchanged
             (the mean of an empty selection would be NaN).
    """
    mask = (data == 0)
    dc = data.copy()
    nonzero = data[~mask]
    if not mask.any() or nonzero.size == 0:
        return dc
    if dc.dtype.kind in "iu":
        # An integer container cannot hold a fractional mean.
        dc = dc.astype(float)
    dc[mask] = np.mean(nonzero)
    return dc


class BaseDataPreprocessor(TransformerMixin):
    """Standard-scale a frame, optionally restricted to ``needed_columns``."""

    def __init__(self, needed_columns: Optional[List[str]]=None):
        """
        :param needed_columns: if not None select these columns from the dataframe
        """
        self.needed_columns = needed_columns
        self.scaler = StandardScaler()

    def fit(self, data, *args):
        """
        Fit the scaler on the selected columns (all columns when none are given)
        :param data: pd.DataFrame with all available columns
        :return: self
        """
        features = data if self.needed_columns is None else data[self.needed_columns]
        self.scaler.fit(features)
        return self

    def transform(self, data: pd.DataFrame) -> np.array:
        """
        Scale the selected columns with the fitted scaler
        :param data: pd.DataFrame with all available columns
        :return: np.array with preprocessed features
        """
        features = data if self.needed_columns is None else data[self.needed_columns]
        scaled = self.scaler.transform(features)
        return np.array(scaled)

class OneHotPreprocessor(BaseDataPreprocessor):
    """Scaled continuous features plus one-hot encoded categorical features.

    Before scaling, a 'dtc' column is added: the Euclidean norm of
    (Latitude, Longitude) minus ``city_center``.
    """

    def __init__(self, cat_cols, gaps=None, **kwargs):
        """
        :param cat_cols: columns to one-hot encode
        :param gaps: optional columns whose zeros are replaced via replace_zero
        :param kwargs: forwarded to BaseDataPreprocessor (e.g. needed_columns)
        """
        super().__init__(**kwargs)
        # Categories unseen during fit are ignored at transform time.
        self.enc = OneHotEncoder(handle_unknown='ignore')
        self.city_center = (42.025333, -93.633801)
        self.cols = cat_cols
        self.gaps = gaps

    def _prepare(self, data):
        """Return a copy of ``data`` with gap columns filled and 'dtc' added.

        Shared by fit and transform so both always see identical features.
        """
        d = data.copy()
        if self.gaps is not None:
            for g in self.gaps:
                d[g] = replace_zero(d[g])
        d['dtc'] = np.linalg.norm(d[['Latitude', 'Longitude']] - self.city_center, axis=1)
        return d

    def fit(self, data, *args):
        """
        Fit the scaler (continuous part) and the encoder (categorical part)
        :param data: pd.DataFrame with all available columns
        :return: self
        """
        d = self._prepare(data)
        super().fit(d)
        self.enc.fit(d[self.cols])
        return self

    def transform(self, data):
        """
        :param data: pd.DataFrame with all available columns
        :return: np.array with the scaled continuous columns followed by the
                 one-hot columns
        """
        d = self._prepare(data)
        cont = np.array(super().transform(d))
        # toarray() yields a plain ndarray; todense() would go through np.matrix.
        cat = self.enc.transform(d[self.cols]).toarray()
        return np.hstack((cont, cat))


def make_ultimate_pipeline():
    """Build the preprocessing + Ridge regression pipeline.

    :return: an unfitted Pipeline with steps 'scaler' (OneHotPreprocessor over
             the continuous and categorical column lists below) and 'reg' (Ridge)
    """
    # 'dtc' is not an input column: OneHotPreprocessor creates it before scaling.
    continuous_columns = ['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add',
                          'Mas_Vnr_Area', 'BsmtFin_SF_1', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
                          'Total_Bsmt_SF', 'First_Flr_SF', 'Second_Flr_SF',
                          'Gr_Liv_Area', 'Bedroom_AbvGr', 'TotRms_AbvGrd',
                          'Garage_Cars', 'Garage_Area', 'Wood_Deck_SF',
                          'Open_Porch_SF', 'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch',
                          'Pool_Area', 'Mo_Sold', 'Year_Sold', 'dtc']

    categorical_columns = ['MS_SubClass', 'MS_Zoning', 'Street', 'Alley', 'Lot_Shape', 'Land_Contour',
                           'Utilities', 'Lot_Config', 'Land_Slope', 'Neighborhood', 'Condition_1',
                           'Condition_2', 'Bldg_Type', 'House_Style', 'Overall_Qual', 'Overall_Cond',
                           'Roof_Style', 'Roof_Matl', 'Exterior_1st', 'Exterior_2nd', 'Mas_Vnr_Type',
                           'Exter_Qual', 'Exter_Cond', 'Foundation', 'Bsmt_Qual', 'Bsmt_Cond',
                           'Bsmt_Exposure', 'BsmtFin_Type_1', 'BsmtFin_Type_2', 'Heating',
                           'Heating_QC', 'Central_Air', 'Electrical', 'Kitchen_Qual',
                           'Functional', 'Fireplace_Qu', 'Garage_Type', 'Garage_Finish',
                           'Garage_Qual', 'Garage_Cond', 'Paved_Drive', 'Pool_QC', 'Fence',
                           'Misc_Feature', 'Sale_Type', 'Sale_Condition']

    interesting_columns = ['Fence', 'MS_SubClass', 'Utilities', 'Sale_Type',
                           "Overall_Qual", "Garage_Qual", "Sale_Condition", "MS_Zoning",
                           'Bsmt_Half_Bath', 'Half_Bath', 'Bsmt_Full_Bath',
                           'Kitchen_AbvGr', 'Fireplaces', 'Full_Bath']

    # Union of both lists without duplicates; sorted so the one-hot column
    # order does not depend on set iteration order.
    interesting_columns = sorted(set(categorical_columns + interesting_columns))
    oh_proc = OneHotPreprocessor(needed_columns=continuous_columns, cat_cols=interesting_columns,
                                 gaps=None)
    return Pipeline(steps=[('scaler', oh_proc), ('reg', Ridge())])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# NOTE(review): `seed`, `data`, `continuous_columns`, `interesting_columns` and
# `SGDLinearRegressor` are not defined at module level in the visible cells -
# confirm that an earlier cell provides them.
target_column = "Sale_Price"
np.random.seed(seed)

test_size = 0.2
data_train, data_test, Y_train, Y_test = train_test_split(
    data[data.columns.drop(target_column)],
    np.array(data[target_column]),
    test_size=test_size,
    random_state=seed)

# OneHotPreprocessor takes the categorical columns as `cat_cols` and also needs
# `gaps`; the previous keyword `interesting_columns=` does not exist on it.
preprocessor = OneHotPreprocessor(
    needed_columns=continuous_columns, cat_cols=interesting_columns, gaps=None)

X_train = preprocessor.fit_transform(data_train)
X_test = preprocessor.transform(data_test)


model = SGDLinearRegressor()


def make_ultimate_pipeline():
    """Fit scaler + model on the preprocessed train split and print the test score.

    Reads the module-level X_train / Y_train / X_test / Y_test and `model`;
    returns None.
    """
    pipe = Pipeline([('preprocessor', StandardScaler()), ('model', model)])
    print(pipe.fit(X_train, Y_train).score(X_test, Y_test))


make_ultimate_pipeline()



In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_digits
from sklearn.decomposition import NMF, PCA
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# Digits dataset as a (features, labels) pair
X, y = load_digits(return_X_y=True)

# Candidate values shared by both halves of the grid
N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
reducer_labels = ["PCA", "NMF", "KBest(mutual_info_classif)"]

pipe = Pipeline(
    steps=[
        ("scaling", MinMaxScaler()),
        # placeholder step: the grid below swaps in the actual reducer
        ("reduce_dim", "passthrough"),
        ("classify", LinearSVC(dual=False, max_iter=10000)),
    ]
)

# Two sub-grids because the reducers name their size parameter differently
# (n_components for PCA/NMF, k for SelectKBest).
param_grid = [
    dict(
        reduce_dim=[PCA(iterated_power=7), NMF(max_iter=1_000)],
        reduce_dim__n_components=N_FEATURES_OPTIONS,
        classify__C=C_OPTIONS,
    ),
    dict(
        reduce_dim=[SelectKBest(mutual_info_classif)],
        reduce_dim__k=N_FEATURES_OPTIONS,
        classify__C=C_OPTIONS,
    ),
]

grid = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=1)
grid.fit(X, y)

In [None]:
import pandas as pd

# Results come back in param_grid iteration order (alphabetical by parameter
# name), so they can be laid out as (C value, reducer, number of features).
mean_scores = np.array(grid.cv_results_["mean_test_score"]).reshape(
    len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS)
)
# For every (reducer, number of features) pair keep the score of the best C
mean_scores = mean_scores.max(axis=0)
# Rows: number of features; columns: reducer - the layout the bar plot needs
mean_scores = pd.DataFrame(
    mean_scores.T, index=N_FEATURES_OPTIONS, columns=reducer_labels
)

ax = mean_scores.plot(kind="bar")
ax.set(
    title="Comparing feature reduction techniques",
    xlabel="Reduced number of features",
    ylabel="Digit classification accuracy",
    ylim=(0, 1),
)
ax.legend(loc="upper left")

plt.show()

`FeatureUnion`: this estimator applies a list of transformer objects in parallel to the input data,
then concatenates the results. This is useful to combine several feature extraction mechanisms into a single transformer.
During fitting, each of these transformers is fit to the data independently.

`ColumnTransformer`: apart from a scalar or a single-item list, the column selection can be specified as a list of multiple items, an integer array, a slice, a boolean mask, or with a `make_column_selector`. The `make_column_selector` is used to select columns based on data type or column name:



In [None]:
# Everything this cell uses is imported here so it runs on its own.
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: median imputation, then standard scaling.
numeric_features = ["age", "fare"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Categorical columns: one-hot encoding, then keep the top 50% of the encoded
# columns by chi2 score.
categorical_features = ["embarked", "sex", "pclass"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# The search keys below address steps named "preprocessor" and "classifier",
# so the searched estimator has to be a pipeline with exactly those steps.
# NOTE(review): LogisticRegression is assumed from the `classifier__C` key -
# confirm the intended classifier.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "preprocessor__cat__selector__percentile": [10, 30, 50, 70],
    "classifier__C": [0.1, 1.0, 10, 100],
}

# Sample 10 parameter combinations from the grid with a fixed seed.
search_cv = RandomizedSearchCV(clf, param_grid, n_iter=10, random_state=0)
search_cv