https://stackoverflow.com/questions/54150352/how-to-output-pandas-object-from-sklearn-pipeline

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

# Toy data set: two numeric columns and two integer-coded categorical columns.
# The matching *_features lists hold the column names of each frame.
numeric_features_vals = pd.DataFrame(
    dict(
        x1=[1, 2, 3, 4],
        x2=[0.15, 0.25, 0.5, 0.45],
    )
)
numeric_features = numeric_features_vals.columns.tolist()

categorical_features_vals = pd.DataFrame(
    dict(
        cat1=[0, 1, 1, 2],
        cat2=[2, 1, 5, 0],
    )
)
categorical_features = categorical_features_vals.columns.tolist()

In [3]:
# Display the numeric feature frame to check its contents.
numeric_features_vals

Unnamed: 0,x1,x2
0,1,0.15
1,2,0.25
2,3,0.5
3,4,0.45


In [5]:
# Display the categorical feature frame to check its contents.
categorical_features_vals

Unnamed: 0,cat1,cat2
0,0,2
1,1,1
2,1,5
3,2,0


In [2]:
# Training features: the numeric and categorical frames placed side by side.
X_train = pd.concat(
    (numeric_features_vals, categorical_features_vals),
    axis='columns',
)
# Two hand-written rows with the same four columns, used later for prediction.
X_test = pd.DataFrame(dict(x1=[2, 3], x2=[0.2, 0.3], cat1=[0, 1], cat2=[2, 1]))
# Target kept as a one-column frame named 'labels'.
y_train = pd.DataFrame(dict(labels=[10, 20, 30, 40]))

In [6]:
# Display the combined training frame (numeric columns first, then categorical).
X_train

Unnamed: 0,x1,x2,cat1,cat2
0,1,0.15,0,2
1,2,0.25,1,1
2,3,0.5,1,5
3,4,0.45,2,0


In [2]:
# Numeric columns: fill NaNs with the column mean, then standardize.
numeric_steps = [
    ('impute', SimpleImputer(strategy="mean", missing_values=np.nan)),
    ('scale', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill NaNs with the most frequent value, then one-hot
# encode into a dense array.
categorical_steps = [
    ('impute', SimpleImputer(strategy="most_frequent", missing_values=np.nan)),
    ('one_hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Full model: preprocessing followed by ridge regression.
clf = Pipeline(steps=[('transform', preprocessor), ('ridge', Ridge())])

In [41]:
# Fit the preprocessing ColumnTransformer on the training features and keep the
# result as `p`, which a later cell uses to reach the fitted sub-transformers.
p = preprocessor.fit(X_train)

In [56]:
# Stand-alone one-hot encoder with the same settings as the pipeline's 'one_hot' step.
enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [57]:
# Fit the stand-alone encoder on every column of X_train, numeric ones included,
# so the x1 and x2 values are treated as categories as well.
enc.fit(X_train)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='ignore',
              n_values=None, sparse=False)

In [65]:
# List the generated feature names; inputs are labelled positionally (x0..x3),
# not by their DataFrame column names.
# NOTE(review): newer scikit-learn releases expose this as get_feature_names_out —
# confirm against the installed version before upgrading.
enc.get_feature_names()

array(['x0_1', 'x0_2', 'x0_3', 'x0_4', 'x1_0.15', 'x1_0.25', 'x1_0.45',
       'x1_0.5', 'x2_0', 'x2_1', 'x2_2', 'x3_0', 'x3_1', 'x3_2', 'x3_5'],
      dtype=object)

In [66]:
# Feature names from the encoder inside the fitted ColumnTransformer's 'cat'
# branch, reached by step name rather than by position.
p.named_transformers_['cat'].named_steps['one_hot'].get_feature_names()

array(['x0_0', 'x0_1', 'x0_2', 'x1_0', 'x1_1', 'x1_2', 'x1_5'],
      dtype=object)

In [8]:
# Transform the training frame: the two scaled numeric columns come first,
# followed by the one-hot columns for cat1 and cat2.
preprocessor.transform(X_train)

array([[-1.34164079, -1.31055608,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ],
       [-0.4472136 , -0.61159284,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ],
       [ 0.4472136 ,  1.13581527,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.        ],
       [ 1.34164079,  0.78633365,  0.        ,  0.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ]])

In [9]:
# Two-fold shuffled CV with a fixed seed so the split is reproducible.
kf = KFold(n_splits=2, shuffle=True, random_state=44)

# Mean CV score of the pipeline with its default alpha. It is stored in a
# variable so the result can be inspected instead of being thrown away.
baseline_cv_score = cross_val_score(clf, X_train, y_train, cv=kf).mean()

# Candidate regularization strengths for the 'ridge' step of the pipeline.
param_grid = {
    'ridge__alpha': [.001, .1, 1.0, 5, 10, 100]
}

gs = GridSearchCV(clf, param_grid, cv=kf)
gs.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on the full training data, so it can predict without another fit().
model = gs.best_estimator_
predictions = model.predict(X_test)
print('coefficients : ',  model.named_steps['ridge'].coef_, '\n')

coefficients :  [[ 2.35558322  2.00582929 -0.76932764 -0.08412006  0.8534477   0.8534477
  -0.24151912 -0.76932764  0.15739906]] 



In [10]:
# create column names for categorical hot encoded data
columns_names_to_map = list(np.copy(numeric_features))
columns_names_to_map.extend('cat1_' + str(col) for col in pd.get_dummies(X_train['cat1']).columns)
columns_names_to_map.extend('cat2_' + str(col) for col in pd.get_dummies(X_train['cat2']).columns)

In [11]:
# Show the assembled column names.
columns_names_to_map

['x1',
 'x2',
 'cat1_0',
 'cat1_1',
 'cat1_2',
 'cat2_0',
 'cat2_1',
 'cat2_2',
 'cat2_5']

In [2]:
print('columns after preprocessing :', columns_names_to_map,  '\n')
print('#'*80)
# Label the transformed array by passing the names as `columns`. Unlike zipping
# names and columns into a dict, this raises when the counts differ instead of
# silently dropping the extras.
rescaled_features = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=columns_names_to_map,
)
print( '\n', 'dataframe of rescaled features with custom colum names: \n\n', rescaled_features)
print('#'*80)
# np.atleast_2d keeps this working whether coef_ is 2-D (as here, with a
# one-column target frame) or 1-D; either way the result is a one-row frame.
ridge_coefficients = pd.DataFrame(
    np.atleast_2d(model.named_steps['ridge'].coef_),
    columns=columns_names_to_map,
)
print( '\n', 'dataframe of ridge coefficients with custom colum names: \n\n', ridge_coefficients)

coefficients :  [[ 2.35558322  2.00582929 -0.76932764 -0.08412006  0.8534477   0.8534477
  -0.24151912 -0.76932764  0.15739906]] 

columns after preprocessing : ['x1', 'x2', 'cat1_0', 'cat1_1', 'cat1_2', 'cat2_0', 'cat2_1', 'cat2_2', 'cat2_5'] 

################################################################################

 dataframe of rescaled features with custom colum names: 

          x1        x2  cat1_0  cat1_1  cat1_2  cat2_0  cat2_1  cat2_2  cat2_5
0 -1.341641 -1.310556     1.0     0.0     0.0     0.0     0.0     1.0     0.0
1 -0.447214 -0.611593     0.0     1.0     0.0     0.0     1.0     0.0     0.0
2  0.447214  1.135815     0.0     1.0     0.0     0.0     0.0     0.0     1.0
3  1.341641  0.786334     0.0     0.0     1.0     1.0     0.0     0.0     0.0
################################################################################

 dataframe of ridge coefficients with custom colum names: 

          x1        x2    cat1_0   cat1_1    cat1_2    cat2_0    cat2_1  \
0  2.

NameError: name 'np' is not defined