# Using column transformer with mixed types

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

## Setup libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_openml

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.impute import SimpleImputer

In [7]:
from sklearn.metrics import f1_score

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

In [10]:
import lightgbm as lgb

## Create utility functions

## Get data

In [11]:
titanic_data = fetch_openml("titanic", version=1, as_frame=True)
titanic: pd.DataFrame = titanic_data.frame

In [12]:
X = titanic.drop("survived", axis=1)
y = titanic["survived"].cat.codes

## Column transformers for numerical and categorical features

In [13]:
num_features = ["age", "fare"]
num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [14]:
cat_features = ["embarked", "sex", "pclass"]
cat_transformer = OneHotEncoder(handle_unknown="ignore")

In [15]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_features),
        ("cat", cat_transformer, cat_features),
    ]
)

## Create a pipeline

In [16]:
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier()),
    ]
)

## Train model

In [17]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [18]:
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['embarked', 'sex',
                                                   'pclass'])])),
                ('classifier', GradientBoostingClassifier())])

## Evaluate model

In [19]:
y_pred = clf.predict(X_test)

In [20]:
f1 = f1_score(y_test, y_pred)
print("model F1 score: {:.3f}".format(f1))

model F1 score: 0.727


## Grid search for best hyperparameters

In [21]:
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "classifier__n_estimators": [50, 100, 200],
}

In [22]:
grid = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring="f1")
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['age',
                                                                          'fare']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                  

In [23]:
print("Best params:", grid.best_params_)

Best params: {'classifier__n_estimators': 200, 'preprocessor__num__imputer__strategy': 'median'}


## Use best parameters

In [24]:
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier()),
    ]
)
clf.set_params(**grid.best_params_)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fare']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['embarked', 'sex',
                                                   'pclass'])])),
                ('classifier', GradientBoostingClassifier(n_estimators=200))])

In [25]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [26]:
f1 = f1_score(y_test, y_pred)
print("model F1 score - best parameters: {:.3f}".format(f1))

model F1 score - best parameters: 0.760


## Setup LightGBM dataset

In [27]:
features = num_features + cat_features

In [28]:
lgb_train = lgb.Dataset(X_train[features], y_train)
lgb_eval = lgb.Dataset(X_test[features], y_test)

## Train lightgbm model

In [29]:
params = {"boosting_type": "gbdt", "objective": "binary", "verbose": -1}

In [30]:
gbm = lgb.train(
    params, lgb_train, valid_sets=lgb_train,
    callbacks=[lgb.log_evaluation(period=0)])

## Evaluate lightgbm model

In [31]:
y_pred = gbm.predict(X_test[features])
y_pred_int = y_pred.round().astype("int")

In [32]:
f1 = f1_score(y_test, y_pred_int)
print("gbm model F1 score: {:.3f}".format(f1))

gbm model F1 score: 0.740
