# Import Libraries

In [2]:
# imports
from sklearn import set_config
import pandas as pd
import numpy as np

np.random.seed(42)
set_config(display='diagram')

# Load / Create Data

In [3]:
# Create a DataFrame with random numerical and categorical data
df = pd.DataFrame({
    'age': np.random.randint(18, 65, 100),  # Numerical
    'balance': np.random.randint(-2000, 5000, 100),  # Numerical
    'num_contacts': np.random.randint(0, 10, 100),  # Numerical
    'days_since_last_contact': np.random.randint(0, 30, 100),  # Numerical
    'duration': np.random.randint(1, 300, 100),  # Numerical
    'job': np.random.choice(['admin', 'technician', 'services', 'management'], 100),  # Categorical
    'marital': np.random.choice(['single', 'married', 'divorced'], 100),  # Categorical
    'education': np.random.choice(['primary', 'secondary', 'tertiary'], 100),  # Categorical
    'contact_type': np.random.choice(['cellular', 'telephone'], 100),  # Categorical
    'target': np.random.choice([0, 1], 100)  # Target variable (binary classification)
})

# Display the first few rows
df.head()

Unnamed: 0,age,balance,num_contacts,days_since_last_contact,duration,job,marital,education,contact_type,target
0,56,695,0,15,127,management,divorced,primary,cellular,0
1,46,-505,2,29,155,services,single,tertiary,telephone,1
2,32,3258,9,24,273,technician,divorced,primary,cellular,0
3,60,3618,7,2,104,technician,single,tertiary,telephone,1
4,25,4736,5,24,299,services,divorced,primary,telephone,1


# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

X = df[['age', 'balance', 'num_contacts', 'days_since_last_contact', 'duration']]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Note: We are only selecting 2 numeric columns for now ans spliting data, as we progress we will use entire data! 

In [6]:
X_train.shape ,y_train.shape

((80, 5), (80,))

# Pipeline

* As data is created by ourselves only we don't need to handle the data, only processing / manipulation is required , so that we can feed it to model.
* But this needs to be repeated for new data as well , which if not done can cause issues

So to fix the issue, scikit-learn provides the pipeline class which allows creation of pipelines to ease our workflow

Steps to use are simple
```
from sklearn.pipeline import Pipeline`

steps = [("name", processing fn ),(..),(..)]

pipe = Pipeline(steps)
```
Based on the ways used, it can be combined in multiple ways


## Example 1

Pipeline with standard scaler and logistic regression - simple use case

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [8]:
steps = [('scaler', StandardScaler()), 
         ('model', LogisticRegression()) ]

pipe_num = Pipeline(steps)

pipe_num.fit(X_train, y_train)

print(pipe_num)

Pipeline(steps=[('scaler', StandardScaler()), ('model', LogisticRegression())])


In [9]:
pipe_num.fit(X_train, y_train)

we can directly fit the data and it will scale the columns (numeric) and apply logistic regression on it

In [10]:
# take predictions
y_pred = pipe_num.predict(X_test)

y_pred

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0])

super easy!

# Pipeline Using Column Transformers

Most of times , data is not only numeric, it contains categorical columns, so it better to process all of them together.

Sci-kit learn provides ColumnTransformer class that can be used here.

**Column Transformer**
Helps apply transformations based on columns names or indices

Required data in format - ("name", operation, [column_names/indices])

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

In [13]:
numerical_features = ['age', 'balance', 'num_contacts', 'days_since_last_contact', 'duration']
categorical_features = ['job', 'marital', 'education', 'contact_type']

X = df.drop(columns='target')
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
num_pipeline = ('num', StandardScaler(), numerical_features)
cat_pipeline = ('cat', OneHotEncoder(), categorical_features)

pca_num = ('pca', PCA(n_components =0.95), numerical_features)

preprocessor = ColumnTransformer([num_pipeline, 
                                 cat_pipeline, 
                                 pca_num])

steps = [('preprocessor', preprocessor), 
         ('model', LogisticRegression(max_iter = 100000)) ]

pipe_ct = Pipeline(steps)

pipe_ct

In [15]:
# train model
pipe_ct.fit(X_train, y_train)

In [16]:
pipe_ct.predict(X_test)

array([1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [17]:
pipe_ct.score(X_test, y_test)

0.4

# Feature union

Feature union is not commonly used, but allow to run the transformation in parallel.

FeatureUnion combines several transformer objects into a new transformer that combines their output. A FeatureUnion takes a list of transformer objects. During fitting, each of these is fit to the data independentl

Runs very fast compared to pipeline which run sequentially.

**Feature Union**
```
from sklearn.pipline import FeatureUnion

var = FeatureUnion(transformer_list = [(...),(...),(...)])

print(var)
```

In [18]:
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor

In [19]:
data = {
    'age': [25, 30, 35, 40, 45],
    'salary': [50000, 60000, 70000, 80000, 90000],
    'experience': [1, 2, 3, 4, 5],
    'bonus': [5000, 6000, 7000, 8000, 9000],
    'city': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago'],
    'department': ['HR', 'Finance', 'IT', 'Finance', 'HR'],
    'gender': ['F', 'M', 'F', 'M', 'F']
}

df = pd.DataFrame(data)
df.head()

Unnamed: 0,age,salary,experience,bonus,city,department,gender
0,25,50000,1,5000,New York,HR,F
1,30,60000,2,6000,Los Angeles,Finance,M
2,35,70000,3,7000,Chicago,IT,F
3,40,80000,4,8000,New York,Finance,M
4,45,90000,5,9000,Chicago,HR,F


In [None]:
# Define the features
numerical_features = ['age', 'experience', 'bonus']
categorical_features = ['city', 'department', 'gender']

# Transformer for numerical features
numerical_transformer = StandardScaler()

# Transformer for categorical features
categorical_transformer = OneHotEncoder()

In [20]:
preprocessor = ColumnTransformer(transformers=[
    num_pipeline,  # Include numerical feature transformation
    cat_pipeline   # Include categorical feature transformation
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Use the preprocessor with ColumnTransformer
    ('model', LogisticRegression())   # Add the model
])

# Fit the pipeline to your training data
pipeline.fit(X_train, y_train)

In [None]:
# create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [None]:
# pipeline for Linear Regression
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing step
    ('regressor', LinearRegression())  # Estimator
])

# pipeline for Decision Tree Regression
tree_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing step
    ('regressor', DecisionTreeRegressor())  # Estimator
])


In [None]:
voting_regressor = VotingRegressor(estimators=[
    ('linear', linear_pipeline),
    ('tree', tree_pipeline)
])

In [None]:
X = df.drop('salary', axis=1)  # Features
y = df['salary']    

In [None]:
voting_regressor.fit(X, y)

In [None]:
predictions = voting_regressor.predict(X)
print("Predictions:")
print(predictions)

# Resources

Learn More at: [Feature Union](https://scikit-learn.org/0.18/modules/pipeline.html#:~:text=FeatureUnion%20combines%20several%20transformer%20objects%20into%20a%20new,of%20these%20is%20fit%20to%20the%20data%20independently.)

Notebook: [Github](https://github.com/devloperhs14/practical_ml)

Series Playlist : [Youtube](https://www.youtube.com/playlist?list=PLDfna1ApN44oZsHW1AAxoMkREFWOse7sV)

Image Used : [Image](https://excalidraw.com/#json=oBeUOVcD4jvlXZzoBJcbg,hH51wrkKCe9ldjmD2i2HNA)

---

**Thanks 🙏**