# Python scikit-learn Machine Learning Workflow 

# Setting up

In [1]:
# Import the libraries
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, SelectFromModel,\
    f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV,\
    train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import datasense as ds

In [2]:
# Set the global parameters
pd.options.display.max_rows = None
pd.options.display.max_columns = None
filename = 'lunch_and_learn.csv'
numrows = 500
target = 'Y'
features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7',
            'X8', 'X9', 'X10', 'X11', 'X12', 'X13']

# Cleaning the data

In [3]:
# Read the data file into a pandas DataFrame
data = pd.read_csv(filename, nrows=numrows)

In [4]:
# Set lower and upper values to remove outliers
mask_values = [
    ('X1', -20, 20),
    ('X2', -25, 25),
    ('X3', -5, 5),
    ('X4', -10, 10),
    ('X5', -3, 3),
    ('X6', -5, 5),
    ('X7', -13, 13),
    ('X8', -9, 15),
    ('X9', -17, 15),
    ('X10', -16, 15),
    ('X11', -16, 17),
    ('X12', -16, 17),
    ('X13', -20, 23)
]
# Replace outliers with NaN
for column, lowvalue, highvalue in mask_values:
    data[column]= data[column].mask(
        (data[column] <= lowvalue) |
        (data[column] >= highvalue)
    )

# Splitting the data

In [5]:
# Create training and testing data sets
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Workflow 1

- Impute using the mean
- Select features using SelectFromModel(DecisionTreeRegressor)
- Fit with LinearRegression

In [6]:
start_time = datetime.now()

## Creating a column transformer

In [7]:
# Create the imputer object
imp = SimpleImputer()

In [8]:
# Create the column transformer object
ct = make_column_transformer(
     (imp, features),
     remainder='passthrough'
)

## Creating a feature selection object

In [9]:
# Create objects to use for feature selection
linreg_selection = LinearRegression()
dtr_selection = DecisionTreeRegressor()
lasso_selection = Lasso()
rfr_selection = RandomForestRegressor()

In [10]:
# Create the feature selection object
selection = SelectFromModel(estimator=dtr_selection,
                            threshold='median')

## Creating a regression object

In [11]:
# Create objects to use for regression
linreg = LinearRegression()
dtr = DecisionTreeRegressor()
lasso = Lasso()
rfr = RandomForestRegressor()

## Create a workflow object

In [12]:
# Create the workflow object
pipe = make_pipeline(ct, selection, linreg)

In [13]:
# Determine the linear regression model
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(),
                                                  ['X1', 'X2', 'X3', 'X4', 'X5',
                                                   'X6', 'X7', 'X8', 'X9',
                                                   'X10', 'X11', 'X12',
                                                   'X13'])])),
                ('selectfrommodel',
                 SelectFromModel(estimator=DecisionTreeRegressor(),
                                 threshold='median')),
                ('linearregression', LinearRegression())])

In [14]:
# Display the regression coefficients of the features
pipe.named_steps.linearregression.coef_.round(3)

array([  7.001, -13.044,   6.358,  -9.018,   4.729,  -7.984,  -0.031])

In [15]:
# Display the regression intercept
pipe.named_steps.linearregression.intercept_.round(3)

69.012

In [16]:
# Show the selected features 
X.columns[selection.get_support()]

Index(['X1', 'X2', 'X3', 'X4', 'X6', 'X7', 'X10'], dtype='object')

## Hyperparameter optimization

In [17]:
pipe.named_steps.columntransformer.get_params

<bound method ColumnTransformer.get_params of ColumnTransformer(remainder='passthrough',
                  transformers=[('simpleimputer', SimpleImputer(),
                                 ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7',
                                  'X8', 'X9', 'X10', 'X11', 'X12', 'X13'])])>

In [18]:
pipe.named_steps.selectfrommodel.get_params

<bound method BaseEstimator.get_params of SelectFromModel(estimator=DecisionTreeRegressor(), threshold='median')>

In [19]:
pipe.named_steps.linearregression.get_params

<bound method BaseEstimator.get_params of LinearRegression()>

In [20]:
# Set the hyperparameters for optimization
# There are 4 x 2 x 3 x 2 = 48 combinations
hyperparams = {}
hyperparams['columntransformer__simpleimputer__strategy'] =\
    ['mean', 'median', 'most_frequent', 'constant']
hyperparams['selectfrommodel__estimator'] =\
    [linreg_selection, dtr_selection]
hyperparams['selectfrommodel__threshold'] = [None, 'mean', 'median']
hyperparams['linearregression__normalize'] = [False, True]

In [21]:
# Perform a grid search
grid = GridSearchCV(pipe, hyperparams, cv=5)
grid.fit(X_train, y_train);

In [22]:
# Present the results
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score');

In [23]:
# Access the best score
grid.best_score_.round(3)

0.993

In [24]:
# Access the best hyperparameters
grid.best_params_

{'columntransformer__simpleimputer__strategy': 'mean',
 'linearregression__normalize': False,
 'selectfrommodel__estimator': LinearRegression(),
 'selectfrommodel__threshold': 'median'}

In [25]:
# Cross-validate the updated pipeline
cross_val_score(pipe, X_train, y_train, cv=5).mean().round(3)

0.986

In [26]:
end_time = datetime.now()
round((end_time - start_time).total_seconds(), 3)

3.73

# Workflow 2

- Impute using the mean
- Select features using SelectFromModel(LinearRegression)
- Fit with DecisionTreeRegression

In [27]:
start_time = datetime.now()

## Creating a column transformer

In [28]:
# Create the imputer object
imp = SimpleImputer()

In [29]:
# Create the column transformer object
ct = make_column_transformer(
     (imp, features),
     remainder='passthrough'
)

## Creating a feature selection object

In [30]:
# Create objects to use for feature selection
linreg_selection = LinearRegression()
dtr_selection = DecisionTreeRegressor()
lasso_selection = Lasso()
rfr_selection = RandomForestRegressor()

In [31]:
# Create the feature selection object
selection = SelectFromModel(estimator=linreg_selection,
                            threshold='median')

## Creating a regression object

In [32]:
# Create objects to use for regression
linreg = LinearRegression()
dtr = DecisionTreeRegressor()
lasso = Lasso()
rfr = RandomForestRegressor()

## Create a workflow object

In [33]:
# Create the workflow object
pipe = make_pipeline(ct, selection, dtr)

In [34]:
# Determine the linear regression model
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(),
                                                  ['X1', 'X2', 'X3', 'X4', 'X5',
                                                   'X6', 'X7', 'X8', 'X9',
                                                   'X10', 'X11', 'X12',
                                                   'X13'])])),
                ('selectfrommodel',
                 SelectFromModel(estimator=LinearRegression(),
                                 threshold='median')),
                ('decisiontreeregressor', DecisionTreeRegressor())])

In [35]:
# Display the regression coefficients of the features
pipe.named_steps.decisiontreeregressor.feature_importances_

array([0.22765829, 0.46894971, 0.0093757 , 0.11506464, 0.01228666,
       0.01986888, 0.14679613])

In [36]:
# Show the selected features 
X.columns[selection.get_support()]

Index(['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7'], dtype='object')

## Hyperparameter optimization

In [37]:
pipe.named_steps.columntransformer.get_params

<bound method ColumnTransformer.get_params of ColumnTransformer(remainder='passthrough',
                  transformers=[('simpleimputer', SimpleImputer(),
                                 ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7',
                                  'X8', 'X9', 'X10', 'X11', 'X12', 'X13'])])>

In [38]:
pipe.named_steps.selectfrommodel.get_params

<bound method BaseEstimator.get_params of SelectFromModel(estimator=LinearRegression(), threshold='median')>

In [39]:
pipe.named_steps.decisiontreeregressor.get_params

<bound method BaseEstimator.get_params of DecisionTreeRegressor()>

In [40]:
# Set the hyperparameters for optimization
# There are 4 x 2 x 3 x 4 x 2 = 192 combinations
hyperparams = {}
hyperparams['columntransformer__simpleimputer__strategy'] =\
    ['mean', 'median', 'most_frequent', 'constant']
hyperparams['selectfrommodel__estimator'] =\
    [linreg_selection, dtr_selection]
hyperparams['selectfrommodel__threshold'] = [None, 'mean', 'median']
hyperparams['decisiontreeregressor__criterion'] =\
    ['mse', 'friedman_mse', 'mae']
hyperparams['decisiontreeregressor__splitter'] = ['best', 'random']

In [41]:
# Perform a grid search
grid = GridSearchCV(pipe, hyperparams, cv=5)
grid.fit(X_train, y_train);

In [42]:
# Present the results
pd.DataFrame(grid.cv_results_).sort_values('rank_test_score');

In [43]:
# Access the best score
grid.best_score_.round(3)

0.714

In [44]:
# Access the best hyperparameters
grid.best_params_

{'columntransformer__simpleimputer__strategy': 'mean',
 'decisiontreeregressor__criterion': 'mse',
 'decisiontreeregressor__splitter': 'best',
 'selectfrommodel__estimator': DecisionTreeRegressor(),
 'selectfrommodel__threshold': None}

In [45]:
# Cross-validate the updated pipeline
cross_val_score(pipe, X_train, y_train, cv=5).mean().round(3)

0.661

In [46]:
end_time = datetime.now()
round((end_time - start_time).total_seconds(), 3)

11.904