<a href="https://colab.research.google.com/github/gingerheef003/colab/blob/main/tip1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [How to keep feature names in sklearn Pipeline](https://medium.com/@anderson.riciamorim/how-to-keep-feature-names-in-sklearn-pipeline-e00295359e31)

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import set_config

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Fetch the Titanic dataset (OpenML, version 1) as a pandas feature frame plus target.
X, y = fetch_openml(
    name='titanic',
    version=1,
    parser='pandas',
    as_frame=True,
    return_X_y=True,
)

# Split into train/test sets, stratified on the target, with a fixed seed
# so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=y,
)

# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
813,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S,,,
809,3,"Ford, Mr. Edward Watson",male,18.0,2,2,W./C. 6608,34.375,,S,,,"Rotherfield, Sussex, England Essex Co, MA"
906,3,"Jonsson, Mr. Nils Hilding",male,27.0,0,0,350408,7.8542,,S,,,
305,1,"Weir, Col. John",male,60.0,0,0,113800,26.55,,S,,,"England Salt Lake City, Utah"
988,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q,,,


In [None]:
# Columns handled by each preprocessing branch.
num_feat = ['age', 'fare']
cat_feat = ['sex', 'embarked', 'pclass']

# Ask scikit-learn transformers to emit pandas DataFrames (global setting),
# which is what lets the column names flow through to the final estimator.
set_config(transform_output='pandas')

# Numeric branch: fill missing values with SimpleImputer's default strategy,
# then standardize.
num_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ]
)

# Route numeric and categorical columns to their own preprocessing.
# The one-hot encoder returns a dense array (needed for pandas output),
# keeps a single column for binary features, and ignores categories
# that were not seen during fit.
transformer = ColumnTransformer(
    transformers=[
        ('numerical', num_pipe, num_feat),
        (
            'categorical',
            OneHotEncoder(
                sparse_output=False,
                drop='if_binary',
                handle_unknown='ignore',
            ),
            cat_feat,
        ),
    ],
    verbose_feature_names_out=True,  # prefix output columns with the branch name
)

In [None]:
# Full model: preprocessing followed by a random forest classifier.
rf_pipe = Pipeline(
    steps=[
        ('dataprep', transformer),
        (
            'rf_clf',
            RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                class_weight='balanced',
                random_state=123,  # fixed seed for reproducible forests
                verbose=0,
            ),
        ),
    ]
)

# Fit preprocessing and classifier together on the training split.
rf_pipe.fit(X_train, y_train)

In [None]:
# The classifier is the last step of the fitted pipeline.
clf = rf_pipe[-1]

# Pair every input feature name seen by the classifier with its importance score.
data = [
    (feature, importance)
    for feature, importance in zip(clf.feature_names_in_, clf.feature_importances_)
]

# Tabulate the pairs and rank them from most to least important.
df_importances = pd.DataFrame(data, columns=['Feature', 'Importance'])
df_importances = df_importances.sort_values(by='Importance', ascending=False)

df_importances

Unnamed: 0,Feature,Importance
2,categorical__sex_male,0.301248
0,numerical__age,0.276543
1,numerical__fare,0.273124
8,categorical__pclass_3,0.057166
6,categorical__pclass_1,0.039941
7,categorical__pclass_2,0.01665
3,categorical__embarked_C,0.014645
5,categorical__embarked_S,0.012511
4,categorical__embarked_Q,0.008172
