In [2]:
import pandas as pd
data=pd.read_csv("data/data_workflow.csv")
# Shuffling the data
data.head(5)

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [3]:
data.shape

(1338, 6)

In [4]:
# Defining the features and the target
X = data.drop(columns='charges')
y = data['charges']
# Train-Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size
= 0.20)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 5), (268, 5), (1070,), (268,))

In [5]:
 # Preprocess "age"
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Build the pipeline with the different steps
pipeline = Pipeline([
 ('imputer', SimpleImputer(strategy="median")),
 ('standard_scaler', StandardScaler())
])
pipeline.fit(X_train[['age']])
pipeline.transform(X_train[['age']])

array([[ 0.13039544],
       [-1.43010713],
       [ 1.26530639],
       ...,
       [ 1.05251059],
       [ 0.55598704],
       [ 1.12344252]])

In [6]:
 # Show the different steps of the pipeline
pipeline

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Impute then scale numerical values:
num_transformer = Pipeline([
 ('imputer', SimpleImputer(strategy="mean")),
 ('standard_scaler', StandardScaler())
])
# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')
# Parallelize "num_transformer" and "cat_transfomer"
preprocessor = ColumnTransformer([
 ('num_transformer', num_transformer, ['age', 'bmi']),
 ('cat_transformer', cat_transformer, ['smoker', 'region'])
])

In [8]:
 # Visualizing Pipelines in HTML
from sklearn import set_config; set_config(display='diagram')
preprocessor

In [9]:
X_train_transformed = preprocessor.fit_transform(X_train)
print("Original training set")
display(X_train.head(3))
print("Preprocessed training set")
display(pd.DataFrame(X_train_transformed).head(3))

Original training set


Unnamed: 0,age,bmi,children,smoker,region
1197,41.0,33.55,0,False,southeast
430,19.0,33.1,0,False,southwest
859,57.0,28.1,0,False,southwest


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.130363,0.462259,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.43014,0.388773,1.0,0.0,0.0,0.0,0.0,1.0
2,1.265274,-0.427735,1.0,0.0,0.0,0.0,0.0,1.0


In [10]:
# Get your features' names
preprocessor.get_feature_names_out()

array(['num_transformer__age', 'num_transformer__bmi',
       'cat_transformer__smoker_False', 'cat_transformer__smoker_True',
       'cat_transformer__region_northeast',
       'cat_transformer__region_northwest',
       'cat_transformer__region_southeast',
       'cat_transformer__region_southwest'], dtype=object)

In [11]:
 pd.DataFrame(
 X_train_transformed,
 columns=preprocessor.get_feature_names_out()
).head()

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__smoker_False,cat_transformer__smoker_True,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest
0,0.130363,0.462259,1.0,0.0,0.0,0.0,1.0,0.0
1,-1.43014,0.388773,1.0,0.0,0.0,0.0,0.0,1.0
2,1.265274,-0.427735,1.0,0.0,0.0,0.0,0.0,1.0
3,-0.933616,0.767632,1.0,0.0,0.0,0.0,1.0,0.0
4,1.052479,0.320186,1.0,0.0,1.0,0.0,0.0,0.0


In [12]:
preprocessor = ColumnTransformer([
 ('num_transformer', num_transformer, ['age','bmi']),
 ('cat_transformer', cat_transformer, ['region','smoker'])],
 remainder='passthrough'
)
preprocessor

In [13]:
 pd.DataFrame(preprocessor.fit_transform(X_train),
 columns=preprocessor.get_feature_names_out()).head(3)

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest,cat_transformer__smoker_False,cat_transformer__smoker_True,remainder__children
0,0.130363,0.462259,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.43014,0.388773,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1.265274,-0.427735,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [14]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np
# Create a transformer that compresses data to 2 digits (for instance!)
# rounder = FunctionTransformer(np.round)
# We can use a lambda function for more customizable functions
rounder = FunctionTransformer(lambda array: np.round(array, decimals=2))

In [15]:
# Add it at the end of our numerical transformer
num_transformer = Pipeline([
 ('imputer', SimpleImputer()),
 ('scaler', StandardScaler()),
 ('rounder', rounder)])
# Encode categorical values
cat_transformer = OneHotEncoder(drop='if_binary',
 handle_unknown='ignore')
preprocessor = ColumnTransformer([
 ('num_transformer', num_transformer, ['bmi', 'age']),
 ('cat_transformer', cat_transformer, ['region', 'smoker'])],
 remainder='passthrough')
preprocessor

In [16]:
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.46,0.13,0.0,0.0,1.0,0.0,0.0,0.0
1,0.39,-1.43,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.43,1.27,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
from sklearn.base import TransformerMixin, BaseEstimator
class MyCustomTranformer(TransformerMixin, BaseEstimator):
 # BaseEstimator generates the get_params() and set_params() methods that all Pipelines require
 # TransformerMixin creates the fit_transform() method from fit() and transform()
 def __init__(self):
     pass
 def fit(self, X, y=None):
 # Here you store what needs to be stored/learned during .fit(X_train) as instance attributes
 # Return "self" to allow chaining .fit().transform()
        pass
 def transform(self, X, y=None):
 # Return the result as a DataFrame for an integration intothe ColumnTransformer
     pass

In [18]:
X_train.head(3)

Unnamed: 0,age,bmi,children,smoker,region
1197,41.0,33.55,0,False,southeast
430,19.0,33.1,0,False,southwest
859,57.0,28.1,0,False,southwest


In [20]:
from sklearn.pipeline import FeatureUnion
# Create a custom transformer that multiplies/divides two columns
# Notice that we are creating this new feature completely randomlyjust as an example
bmi_age_ratio_constructor = FunctionTransformer(lambda df: pd.DataFrame(df["bmi"] / df["age"]))
union = FeatureUnion([
 ('preprocess', preprocessor), # columns 0-7
 ('bmi_age_ratio', bmi_age_ratio_constructor) # new column 8
])
union 

In [21]:
pd.DataFrame(union.fit_transform(X_train)).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.46,0.13,0.0,0.0,1.0,0.0,0.0,0.0,0.818293


In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

In [23]:
make_pipeline(SimpleImputer(), StandardScaler())

In [24]:
 num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()
preproc_basic = make_column_transformer(
 (num_transformer, ['age', 'bmi']),
 (cat_transformer, ['smoker', 'region']),
 remainder='passthrough'
)
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

In [25]:
 X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [26]:
from sklearn.compose import make_column_selector
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
num_col = make_column_selector(dtype_include=['float64'])
cat_transformer = OneHotEncoder()
cat_col = make_column_selector(dtype_include=['object','bool'])
preproc_basic = make_column_transformer(
 (num_transformer, num_col),
 (cat_transformer, cat_col),
 remainder='passthrough'
)
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

In [27]:
from sklearn.linear_model import Ridge
# Preprocessor
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()
preproc = make_column_transformer(
 (num_transformer, make_column_selector(dtype_include=['float64'])),
 (cat_transformer, make_column_selector(dtype_include=['object','bool'])),
 remainder='passthrough'
)
# Add estimator
pipeline = make_pipeline(preproc, Ridge())
pipeline

In [28]:
# Train Pipeline
pipeline.fit(X_train,y_train)
# Make predictions
pipeline.predict(X_test.iloc[0:1])
# Score model
pipeline.score(X_test,y_test)

0.7644739950372961

In [29]:
from sklearn.model_selection import cross_val_score
# Cross-validate Pipeline
cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2').mean()

0.7375828484435625

In [30]:
# Which parameters of the pipeline are GridSearch-able?
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000002C0F1CF5F90>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x000002C0F1CF6080>)])),
  ('ridge', Ridge())],
 'transform_input': None,
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer'

In [31]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(
 pipeline,
 param_grid={
 # Access any component of the Pipeline
 # and any available hyperparamater you want to optimize
 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
 'ridge__alpha': [0.1, 0.5, 1, 5, 10]
 },
 cv=5,
 scoring="r2")
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'mean',
 'ridge__alpha': 1}

In [32]:
pipeline_tuned = grid_search.best_estimator_
pipeline_tuned

In [33]:
pipeline_tuned.predict(X_test[0:1])

array([32832.78504306])

In [34]:
# Access the components of a Pipeline with `named_steps`
pipeline_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [35]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
pipeline_tuned.named_steps["columntransformer"].fit_transform(X_train).shape

Before preprocessing, X_train.shape = 
(1070, 5)
After preprocessing, X_train_preprocessed.shape = 


(1070, 9)

In [36]:
import pickle
# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
 pickle.dump(pipeline_tuned, file)
# Load Pipeline from pickle file
my_pipeline = pickle.load(open("pipeline.pkl","rb"))
my_pipeline.score(X_test, y_test)

0.7644739950372961

In [41]:
import os
from tpot import TPOTRegressor
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [54]:
from dask.distributed import Client

client = Client()  # lance un cluster local par défaut
print(client)
# Instantiate TPOTClassifier
tpot = TPOTRegressor(generations=4, population_size=20, verbose=2, scorers= ["r2"],scorers_weights=[1], n_jobs=-1, 
                     cv=2,search_space='linear-light', random_state=42 ,client=client)
# Process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

if hasattr(tpot, 'fitted_pipeline_'):
    print("Best pipeline:", tpot.fitted_pipeline_)
    print("Score on test set:", tpot.fitted_pipeline_.score(X_test_preproc, y_test))
else:
    print("TPOT did not find a valid pipeline.")

# Print score
print(tpot.fitted_pipeline_.score(X_test_preproc, y_test))


Perhaps you already have a cluster running?
Hosting the HTTP server on port 60771 instead
  next(self.gen)


<Client: 'tcp://127.0.0.1:60779' processes=4 threads=12, memory=15.72 GiB>


Generation: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:19<00:00, 79.77s/it]


Best pipeline: Pipeline(steps=[('maxabsscaler', MaxAbsScaler()),
                ('variancethreshold',
                 VarianceThreshold(threshold=0.0003615272574)),
                ('featureunion-1',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
                                                ('passthrough',
                                                 Passthrough())])),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
                                                ('passthrough',
                                                 Passthrough())])),
                ('decisiontreeregressor',
                 DecisionTreeRegressor(min_samples_leaf=19,
                                       min_samples_split=14,
                                       random_state=42))])
Sco

In [53]:
import tpot
print(tpot.__version__)

1.1.0
