In [1]:
import pandas as pd
# Load the workflow dataset and preview its first five rows
data = pd.read_csv("data/data_workflow.csv")
data.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [3]:
data.shape

(1338, 6)

In [4]:
# Define the features (every column except the target) and the target
X = data.drop(columns='charges')
y = data['charges']
# Train-Test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows. random_state pins the shuffle so that a
# Restart & Run All reproduces the same split, and therefore the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 5), (268, 5), (1070,), (268,))

In [5]:
 # Preprocess "age"
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Chain median imputation and standardisation into a single estimator
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('standard_scaler', StandardScaler()),
])
# Fit on the training "age" column, then transform that same column
age_only = X_train[['age']]
pipeline.fit(age_only).transform(age_only)

array([[ 1.70323017],
       [ 0.13159291],
       [ 0.27446902],
       ...,
       [ 0.70309737],
       [ 0.4887832 ],
       [-0.65422572]])

In [6]:
 # Display the pipeline object so its steps can be inspected in the cell output
pipeline

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
# Numerical columns: fill missing values with the mean, then standardise
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler()),
])
# Categorical columns: one-hot encode, tolerating categories unseen during fit
cat_transformer = OneHotEncoder(handle_unknown='ignore')
# Apply "num_transformer" and "cat_transformer" side by side, each on its own columns
preprocessor = ColumnTransformer(transformers=[
    ('num_transformer', num_transformer, ['age', 'bmi']),
    ('cat_transformer', cat_transformer, ['smoker', 'region']),
])

In [8]:
 # Render estimators as HTML diagrams in the notebook output
from sklearn import set_config
set_config(display='diagram')
preprocessor

In [9]:
# Fit the preprocessor on the training set and keep the transformed matrix
X_train_transformed = preprocessor.fit_transform(X_train)

# Show the first three rows before and after preprocessing
print("Original training set")
display(X_train.iloc[:3])
print("Preprocessed training set")
display(pd.DataFrame(data=X_train_transformed).iloc[:3])

Original training set


Unnamed: 0,age,bmi,children,smoker,region
109,63.0,35.09,0,True,southeast
228,41.0,31.635,1,False,northeast
1046,43.0,25.08,0,False,northeast


Preprocessed training set


Unnamed: 0,0,1,2,3,4,5,6,7
0,1.703408,0.72795,0.0,1.0,0.0,0.0,1.0,0.0
1,0.131763,0.157107,1.0,0.0,1.0,0.0,0.0,0.0
2,0.27464,-0.925925,1.0,0.0,1.0,0.0,0.0,0.0


In [10]:
# Get the names of the output columns produced by the fitted preprocessor
# (each name is prefixed with the name of the transformer that produced it)
preprocessor.get_feature_names_out()

array(['num_transformer__age', 'num_transformer__bmi',
       'cat_transformer__smoker_False', 'cat_transformer__smoker_True',
       'cat_transformer__region_northeast',
       'cat_transformer__region_northwest',
       'cat_transformer__region_southeast',
       'cat_transformer__region_southwest'], dtype=object)

In [11]:
 # Label the transformed training matrix with the generated feature names
pd.DataFrame(
    X_train_transformed,
    columns=preprocessor.get_feature_names_out(),
).head()

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__smoker_False,cat_transformer__smoker_True,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest
0,1.703408,0.72795,0.0,1.0,0.0,0.0,1.0,0.0
1,0.131763,0.157107,1.0,0.0,1.0,0.0,0.0,0.0
2,0.27464,-0.925925,1.0,0.0,1.0,0.0,0.0,0.0
3,-0.296868,-0.03042,1.0,0.0,0.0,0.0,0.0,1.0
4,1.703408,-1.490986,1.0,0.0,0.0,1.0,0.0,0.0


In [12]:
# Rebuild the preprocessor; remainder='passthrough' keeps the columns that no
# transformer claims (here "children") instead of dropping them
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, ['age', 'bmi']),
        ('cat_transformer', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)
preprocessor

In [13]:
# Refit on the training set and preview three rows with labelled columns
pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
).iloc[:3]

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest,cat_transformer__smoker_False,cat_transformer__smoker_True,remainder__children
0,1.703408,0.72795,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.131763,0.157107,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.27464,-0.925925,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np
# Create a transformer that rounds data to 2 decimals (for instance!)
# Extra arguments for the wrapped function are forwarded through kw_args.
# Unlike a lambda, a plain function reference keeps the transformer picklable.
rounder = FunctionTransformer(np.round, kw_args={'decimals': 2})

In [15]:
# Numerical branch: impute, scale, then round (the rounder is the last step)
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder),
])
# Categorical branch: one-hot encode; a binary feature keeps a single column
cat_transformer = OneHotEncoder(
    handle_unknown='ignore',
    drop='if_binary',
)
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, ['bmi', 'age']),
        ('cat_transformer', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)
preprocessor

In [16]:
# Refit the preprocessor on the training set and preview the first three transformed rows
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.73,1.7,0.0,0.0,1.0,0.0,1.0,0.0
1,0.16,0.13,1.0,0.0,0.0,0.0,0.0,1.0
2,-0.93,0.27,1.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.base import TransformerMixin, BaseEstimator
class MyCustomTranformer(TransformerMixin, BaseEstimator):
    """Skeleton for a custom scikit-learn transformer.

    BaseEstimator generates the get_params() and set_params() methods that
    all Pipelines require. TransformerMixin creates the fit_transform()
    method from fit() and transform().
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn from the training data and return the fitted transformer.

        Store here, as instance attributes, whatever needs to be
        stored/learned during .fit(X_train).
        """
        # Return "self" to allow chaining .fit().transform(); returning
        # None would break fit_transform() and any Pipeline using this class
        return self

    def transform(self, X, y=None):
        """Return the transformed data; this skeleton passes X through unchanged."""
        # Return the result as a DataFrame for an integration into the ColumnTransformer
        return X

In [18]:
# Look again at the raw training features before building a new feature from them
X_train.head(3)

Unnamed: 0,age,bmi,children,smoker,region
109,63.0,35.09,0,True,southeast
228,41.0,31.635,1,False,northeast
1046,43.0,25.08,0,False,northeast


In [20]:
from sklearn.pipeline import FeatureUnion
# Create a custom transformer that divides one column by another
# Notice that we are creating this new feature completely arbitrarily, just as an example
def bmi_age_ratio(df):
    """Return a one-column DataFrame holding bmi / age for each row of df."""
    # Rows whose "age" is missing yield NaN: nothing imputes before the division
    return pd.DataFrame(df["bmi"] / df["age"])


# A named function (unlike a lambda) keeps the transformer picklable
bmi_age_ratio_constructor = FunctionTransformer(bmi_age_ratio)
union = FeatureUnion([
    ('preprocess', preprocessor),  # columns 0-7
    ('bmi_age_ratio', bmi_age_ratio_constructor),  # new column 8
])
union

In [21]:
# Fit the union on the training set and show one row: the preprocessor's
# columns come first, followed by the bmi/age ratio column
pd.DataFrame(union.fit_transform(X_train)).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.73,1.7,0.0,0.0,1.0,0.0,1.0,0.0,0.556984


In [23]:
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer

In [24]:
# make_pipeline builds a Pipeline without explicit step names
make_pipeline(SimpleImputer(), StandardScaler())

In [25]:
 # Build the preprocessing with the make_* helpers (no explicit step names needed)
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()
preproc_basic = make_column_transformer(
    (num_transformer, ['age', 'bmi']),
    (cat_transformer, ['smoker', 'region']),
    remainder='passthrough',
)
# Append the bmi/age ratio feature next to the preprocessed columns
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

In [26]:
 # Inspect the dtype of each feature column
X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

In [27]:
from sklearn.compose import make_column_selector
# Numerical branch, applied to every float64 column
num_col = make_column_selector(dtype_include=['float64'])
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
# Categorical branch, applied to every object or bool column
cat_col = make_column_selector(dtype_include=['object', 'bool'])
cat_transformer = OneHotEncoder()
# Columns matched by neither selector are passed through unchanged
preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough',
)
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

In [28]:
from sklearn.linear_model import Ridge
# Preprocessor: columns are routed to a branch according to their dtype
numeric_columns = make_column_selector(dtype_include=['float64'])
categorical_columns = make_column_selector(dtype_include=['object', 'bool'])
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()
preproc = make_column_transformer(
    (num_transformer, numeric_columns),
    (cat_transformer, categorical_columns),
    remainder='passthrough',
)
# Add estimator: preprocessing followed by a Ridge regression
pipeline = make_pipeline(preproc, Ridge())
pipeline

In [29]:
# Fit the whole pipeline (preprocessing + Ridge) on the training set
pipeline.fit(X=X_train, y=y_train)
# Predict for the first test row (not displayed: only the last expression of a cell is shown)
pipeline.predict(X_test.iloc[:1])
# Score the model on the held-out test set
pipeline.score(X=X_test, y=y_test)

0.7743133264694606

In [30]:
from sklearn.model_selection import cross_val_score
# Cross-validate Pipeline: average the r2 score over 5 folds of the training set
fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
fold_scores.mean()

0.7354593779750912

In [31]:
# Which parameters of the pipeline are GridSearch-able?
# The keys of the returned dict are the names usable in a param_grid
pipeline.get_params()

{'memory': None,
 'steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('pipeline',
                                    Pipeline(steps=[('simpleimputer',
                                                     SimpleImputer()),
                                                    ('standardscaler',
                                                     StandardScaler())]),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x0000016296A7EDA0>),
                                   ('onehotencoder', OneHotEncoder(),
                                    <sklearn.compose._column_transformer.make_column_selector object at 0x0000016296A7E8C0>)])),
  ('ridge', Ridge())],
 'transform_input': None,
 'verbose': False,
 'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('pipeline',
                                  Pipeline(steps=[('simpleimputer'

In [32]:
from sklearn.model_selection import GridSearchCV
# Hyperparameters to search: access any component of the Pipeline, and any
# available hyperparameter you want to optimize, with "<step>__<parameter>"
search_space = {
    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
    'ridge__alpha': [0.1, 0.5, 1, 5, 10],
}
grid_search = GridSearchCV(pipeline, param_grid=search_space, cv=5, scoring="r2")
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [33]:
# Keep the best pipeline found by the grid search and display it
pipeline_tuned = grid_search.best_estimator_
pipeline_tuned

In [34]:
# Predict the charges for the first row of the test set
pipeline_tuned.predict(X_test.iloc[0:1])

array([8607.14388997])

In [35]:
# Access the components of a Pipeline with `named_steps`
# (the keys are the step names to use when indexing into the pipeline)
pipeline_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [36]:
# Check intermediate steps
print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
# Use transform(), not fit_transform(): this step is already fitted, and
# refitting it here would silently modify the tuned pipeline
pipeline_tuned.named_steps["columntransformer"].transform(X_train).shape

Before preprocessing, X_train.shape = 
(1070, 5)
After preprocessing, X_train_preprocessed.shape = 


(1070, 9)

In [37]:
import pickle
# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipeline_tuned, file)
# Load Pipeline from pickle file; the context manager closes the file handle
# NOTE: only unpickle files you trust - pickle.load can execute arbitrary code
with open("pipeline.pkl", "rb") as file:
    my_pipeline = pickle.load(file)
# The reloaded pipeline is scored on the test set to check the round trip
my_pipeline.score(X_test, y_test)

0.774297002889368

In [38]:
import os
from tpot import TPOTRegressor
# Fit the preprocessing on the training set only, then apply it unchanged to the test set
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [49]:
from sklearn.metrics import r2_score
from dask.distributed import Client
# Start a local Dask client. The context manager shuts it down when the cell
# finishes, so re-running the cell does not leave stray clusters behind.
# NOTE(review): this client is not passed to TPOTRegressor - confirm it is needed.
with Client() as client:
    # Instantiate TPOTRegressor
    tpot = TPOTRegressor(
        generations=4,
        population_size=20,
        verbose=2,  # at this level only the progress bar showed in the recorded run
        scorers=['r2'],
        n_jobs=1,
        cv=2,
        random_state=42,
    )
    # Run the AutoML search with TPOT
    tpot.fit(X_train_preproc, y_train)
    print(type(tpot))

    # Print score
    print(tpot.fitted_pipeline_.score(X_test_preproc, y_test))


Perhaps you already have a cluster running?
Hosting the HTTP server on port 65136 instead
  next(self.gen)
Perhaps you already have a cluster running?
Hosting the HTTP server on port 65191 instead
  next(self.gen)
Generation: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:52<00:00, 88.05s/it]


<class 'tpot.tpot_estimator.templates.tpottemplates.TPOTRegressor'>
0.8775898667658694


In [51]:
from sklearn.metrics import r2_score
from dask.distributed import Client
# Start a local Dask client. The context manager shuts it down when the cell
# finishes, so re-running the cell does not leave stray clusters behind.
# NOTE(review): this client is not passed to TPOTRegressor - confirm it is needed.
with Client() as client:
    # Instantiate TPOTRegressor
    tpot = TPOTRegressor(
        generations=4,
        population_size=20,
        verbose=3,  # shows how the best score evolves at each generation
        scorers=['r2'],
        n_jobs=1,
        cv=2,
        random_state=42,
    )
    # Run the AutoML search with TPOT
    tpot.fit(X_train_preproc, y_train)
    print(type(tpot))

    # Print score
    print(tpot.fitted_pipeline_.score(X_test_preproc, y_test))


Perhaps you already have a cluster running?
Hosting the HTTP server on port 49904 instead
  next(self.gen)
Perhaps you already have a cluster running?
Hosting the HTTP server on port 49969 instead
  next(self.gen)
Generation:  25%|████████████████████████████▌                                                                                     | 1/4 [02:25<07:16, 145.37s/it]

Generation:  1
Best r2_score score: 0.8381716314100627


Generation:  50%|█████████████████████████████████████████████████████████                                                         | 2/4 [04:00<03:51, 115.97s/it]

Generation:  2
Best r2_score score: 0.8424231029166962


Generation:  75%|█████████████████████████████████████████████████████████████████████████████████████▌                            | 3/4 [06:53<02:21, 141.86s/it]

Generation:  3
Best r2_score score: 0.8424231029166962


Generation: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [09:44<00:00, 146.12s/it]

Generation:  4
Best r2_score score: 0.846843178319884





<class 'tpot.tpot_estimator.templates.tpottemplates.TPOTRegressor'>
0.8775898667658694


In [53]:
from sklearn.metrics import r2_score
from dask.distributed import Client
# Start a local Dask client. The context manager shuts it down when the cell
# finishes, so re-running the cell does not leave stray clusters behind.
# NOTE(review): this client is not passed to TPOTRegressor - confirm it is needed.
with Client() as client:
    # Instantiate TPOTRegressor
    tpot = TPOTRegressor(
        generations=4,
        population_size=20,
        verbose=4,  # shows how the best score evolves at each generation
        scorers=['r2'],
        n_jobs=1,
        cv=2,
        random_state=42,
    )
    # Run the AutoML search with TPOT
    tpot.fit(X_train_preproc, y_train)
    print(type(tpot))

    # Print score
    print(tpot.fitted_pipeline_.score(X_test_preproc, y_test))


Perhaps you already have a cluster running?
Hosting the HTTP server on port 50643 instead
  next(self.gen)
Perhaps you already have a cluster running?
Hosting the HTTP server on port 50698 instead
  next(self.gen)
Generation:  25%|████████████████████████████▌                                                                                     | 1/4 [03:02<09:07, 182.60s/it]

Generation:  1
Best r2_score score: 0.8381716314100627


Generation:  50%|█████████████████████████████████████████████████████████                                                         | 2/4 [05:45<05:41, 170.97s/it]

Generation:  2
Best r2_score score: 0.8424231029166962


Generation:  75%|█████████████████████████████████████████████████████████████████████████████████████▌                            | 3/4 [08:39<02:52, 172.56s/it]

Generation:  3
Best r2_score score: 0.8424231029166962


Generation: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [11:53<00:00, 178.28s/it]

Generation:  4
Best r2_score score: 0.846843178319884



2025-07-24 01:25:55,269 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:50717' caused the cluster to lose scattered data, which can't be recovered: {'Series-4a5cb3149ca4fdb84abc10a80e99ca83', 'ndarray-ec304ebc5a7895bf5e9675d11be3d883'} (stimulus_id='handle-worker-cleanup-1753313155.2691047')


<class 'tpot.tpot_estimator.templates.tpottemplates.TPOTRegressor'>
0.8775898667658694
