In [2]:
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer

In [6]:
# Toy single-feature dataset (4 samples) with one missing value in the third row.
X = [[value] for value in (1, 2, np.nan, 3)]

In [7]:
# Stand-alone instances of the two preprocessing steps, both with default
# settings: a missing-value imputer and a power transformer.
cleaner, power = SimpleImputer(), PowerTransformer()

In [8]:
# Step 1 of the manual workflow: fill the missing entry of X.
# Fit the `cleaner` instance created earlier instead of a throwaway
# SimpleImputer(), so the fitted imputer stays available for transforming
# new data later on.
X_clean = cleaner.fit_transform(X)

In [11]:
# Step 2 of the manual workflow: power-transform the imputed data.
# Fit the `power` instance created earlier instead of a throwaway
# PowerTransformer(), so the fitted transformer can be reused afterwards.
X_final = power.fit_transform(X_clean)

In [12]:
# Display the imputed, power-transformed values.
X_final

array([[-1.43683574],
       [ 0.02299616],
       [ 0.02299616],
       [ 1.39084342]])

## The pipeline object

In [17]:
from sklearn.pipeline import make_pipeline, Pipeline

In [14]:
# Chain imputation and the power transform into a single estimator;
# make_pipeline generates the step names automatically.
pipeline = make_pipeline(
    SimpleImputer(),
    PowerTransformer(),
)

In [15]:
# Impute and power-transform X in one call; the output matches the manual
# two-step result (X_final).
pipeline.fit_transform(X)

array([[-1.43683574],
       [ 0.02299616],
       [ 0.02299616],
       [ 1.39084342]])

In [23]:
# Same two steps, now with explicit step names so that their parameters can be
# addressed later as '<step name>__<parameter>'.
pipeline = Pipeline(
    steps=[
        ('cleaning', SimpleImputer()),
        ('power_transformer', PowerTransformer()),
    ]
)

In [19]:
# Run the explicitly named pipeline on X; the output is the same as for the
# make_pipeline version.
pipeline.fit_transform(X)

array([[-1.43683574],
       [ 0.02299616],
       [ 0.02299616],
       [ 1.39084342]])

In [24]:
# Change the imputation strategy of the 'cleaning' step in place, using the
# '<step name>__<parameter>' naming scheme.
pipeline.set_params(
    cleaning__strategy='median',
)

In [25]:
# Refit with the median strategy. The observed values of X are 1, 2 and 3, whose
# mean and median are both 2, so the imputed value (and therefore the output)
# is identical to the mean-strategy run.
pipeline.fit_transform(X)

array([[-1.43683574],
       [ 0.02299616],
       [ 0.02299616],
       [ 1.39084342]])

# 4.2 Pipelines and ColumnTransformer together 

In [31]:
# Imports for the ColumnTransformer section.
# numpy is bound to its conventional alias `np` (the previous alias `ny` was a
# typo and inconsistent with the rest of the notebook).
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

In [27]:
# Read the sample dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="sample_dataset.csv")

In [40]:
# Categorical branch: fill missing entries with the most frequent category,
# then one-hot encode into a dense array.
cat_pipe = Pipeline(
    steps=[
        ('cleaner', SimpleImputer(strategy="most_frequent")),
        ('encoder', OneHotEncoder(sparse_output=False)),
    ]
)

# Route columns by dtype: every non-object column is mean-imputed, every
# object column goes through the categorical pipeline above.
transformer = ColumnTransformer(
    transformers=[
        (
            'numerical',
            SimpleImputer(strategy='mean'),
            make_column_selector(dtype_exclude='object'),
        ),
        (
            'categorical',
            cat_pipe,
            make_column_selector(dtype_include="object"),
        ),
    ]
)

In [41]:
# Fit both branches on df and display the combined array: the 'numerical'
# columns come first, followed by the 'categorical' one-hot columns.
transformer.fit_transform(df)

array([[ 14.05954772,  10.38      , 122.8       , ...,   1.        ,
          0.        ,   0.        ],
       [ 20.57      ,  17.77      , 132.9       , ...,   1.        ,
          0.        ,   0.        ],
       [ 19.69      ,  21.25      , 130.        , ...,   1.        ,
          0.        ,   0.        ],
       ...,
       [ 16.6       ,  28.08      , 108.3       , ...,   1.        ,
          0.        ,   0.        ],
       [ 20.6       ,  29.33      , 140.1       , ...,   1.        ,
          0.        ,   0.        ],
       [  7.76      ,  19.31182927,  47.92      , ...,   1.        ,
          0.        ,   0.        ]])

In [44]:
# Switch the 'numerical' imputer from the mean to a constant fill value of 0.
transformer.set_params(
    numerical__strategy='constant',
    numerical__fill_value=0,
)

In [46]:
# Reach into the nested pipeline ('categorical' -> 'cleaner') and fill missing
# categories with the constant 'N' instead of the most frequent value.
transformer.set_params(
    categorical__cleaner__strategy='constant',
    categorical__cleaner__fill_value='N',
)

In [48]:
# Refit with the constant-fill settings; numerical entries that were missing
# now show up as 0 instead of the column mean.
transformer.fit_transform(df)

array([[  0.  ,  10.38, 122.8 , ...,   0.  ,   0.  ,   0.  ],
       [ 20.57,  17.77, 132.9 , ...,   0.  ,   0.  ,   0.  ],
       [ 19.69,  21.25, 130.  , ...,   0.  ,   0.  ,   0.  ],
       ...,
       [ 16.6 ,  28.08, 108.3 , ...,   0.  ,   0.  ,   0.  ],
       [ 20.6 ,  29.33, 140.1 , ...,   0.  ,   0.  ,   0.  ],
       [  7.76,   0.  ,  47.92, ...,   0.  ,   0.  ,   0.  ]])

### Learnings from `4-pipelines.ipynb`

**Pseudocode:**
1. Import necessary libraries.
2. Create a small example array with a missing value and load the sample dataset.
3. Apply transformations using `SimpleImputer` and `PowerTransformer`.
4. Create and use a pipeline to streamline preprocessing.
5. Combine `Pipeline` and `ColumnTransformer` for complex transformations.

**Code:**
```python
# Libraries used in this notebook
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

# Toy array with one missing value, plus the sample dataset from disk
X = [[1], [2], [np.nan], [3]]
df = pd.read_csv("sample_dataset.csv")

# Manual two-step workflow: impute first, then power-transform
cleaner = SimpleImputer()
power = PowerTransformer()
X_clean = cleaner.fit_transform(X)
X_final = power.fit_transform(X_clean)

# The same two steps chained into a single pipeline
pipeline = make_pipeline(
    SimpleImputer(),
    PowerTransformer(),
)
pipeline.fit_transform(X)

# Categorical branch: most-frequent imputation followed by dense one-hot encoding
cat_pipe = Pipeline(
    steps=[
        ('cleaner', SimpleImputer(strategy="most_frequent")),
        ('encoder', OneHotEncoder(sparse_output=False)),
    ]
)

# Route columns by dtype: non-object columns are mean-imputed,
# object columns go through the categorical pipeline
transformer = ColumnTransformer(
    transformers=[
        ('numerical', SimpleImputer(strategy='mean'), make_column_selector(dtype_exclude='object')),
        ('categorical', cat_pipe, make_column_selector(dtype_include="object")),
    ]
)

# Fit with the initial settings, then switch both imputers to constant fills
# via set_params and fit again
transformed_df = transformer.fit_transform(df)
transformer.set_params(
    numerical__strategy='constant',
    numerical__fill_value=0,
)
transformer.set_params(
    categorical__cleaner__strategy='constant',
    categorical__cleaner__fill_value='N',
)
transformed_df = transformer.fit_transform(df)
```

**Learnings:**
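1. **SimpleImputer and PowerTransformer:** `SimpleImputer` fills in missing values (with the column mean by default), and `PowerTransformer` applies a power transform (Yeo-Johnson by default) to make numerical data more Gaussian-like.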
2. **Pipeline:** Simplifies the process of applying multiple transformations.
3. **ColumnTransformer:** Allows for different preprocessing steps on different columns.
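4. **Combining Pipelines:** A `Pipeline` can be nested inside a `ColumnTransformer`, and the nested parameters stay adjustable through `set_params` using double-underscore names (e.g. `categorical__cleaner__strategy`).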

---

**Exercise 1**

- Load `sample_dataset.csv` and select only the features: `mean radius`, `area error`, `mean perimeter`
- Apply the following transformations using `ColumnTransformer` and `Pipeline`:
  - **Numerical features:**
    - Cleaning using the mean value
    - Transformation using the Yeo-Johnson transformation
  - **Categorical features:**
    - Cleaning using the most frequent value
    - One-hot encoding with dense output

---

**Exercise 2**

- Modify the transformations of the previous exercise according to these settings and using `set_params`:
  - **Numerical features:** change the cleaning value to the median value
  - **Categorical features:** change the cleaning strategy to a constant fill value of `'N'`

---