<a href="https://colab.research.google.com/github/datafyi/datascience.fyi/blob/main/random_forest_classifier_with_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

# Cancer Dataset

In [93]:
from sklearn import set_config

set_config(display="diagram")

In [94]:
import pandas as pd

cancer_data = pd.read_csv("cancer_dataset.csv")

In [95]:
cancer_data.diagnosis = cancer_data.diagnosis.map({'M':1,'B':0})

In [96]:
cancer_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [97]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [98]:
X = cancer_data.drop(columns=['diagnosis'])
y = cancer_data.diagnosis

In [99]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [100]:
numeric_features = X.columns
# categorical_features = []

In [101]:
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder

## Drop unnecessary columns

In [102]:
drop_feat=['id']
drop_transformer = ColumnTransformer(transformers=[('drop_columns', 'drop', drop_feat)], remainder='passthrough')

In [103]:
from sklearn.pipeline import Pipeline

In [104]:
pipeline = Pipeline([('drop_column', drop_transformer)])

In [105]:
pipeline.fit(X_train)

In [106]:
transformed_train=pipeline.transform(X_train)
transformed_train

array([[1.152e+01, 1.493e+01, 7.387e+01, ..., 2.664e-01, 7.809e-02,
              nan],
       [1.305e+01, 1.931e+01, 8.261e+01, ..., 2.439e-01, 6.289e-02,
              nan],
       [1.420e+01, 2.053e+01, 9.241e+01, ..., 2.534e-01, 7.858e-02,
              nan],
       ...,
       [1.720e+01, 2.452e+01, 1.142e+02, ..., 3.313e-01, 1.339e-01,
              nan],
       [1.403e+01, 2.125e+01, 8.979e+01, ..., 2.226e-01, 7.617e-02,
              nan],
       [1.303e+01, 1.842e+01, 8.261e+01, ..., 1.987e-01, 6.169e-02,
              nan]])

## Numeric Transformer

In [107]:
numeric_transformer = Pipeline(steps=[
                                     ('meanimputer', SimpleImputer(strategy='mean')),
                                     ('robostscaler', RobustScaler())
                                     ])

## Categorical Transformer

In [108]:
# categorical_transformer = Pipeline(steps=[
#                                          ('onehotenc', OneHotEncoder(handle_unknown='ignore'))
#                                          ])

## Lets build the pipeline

In [109]:
col_transformer = ColumnTransformer(transformers=[('drop_columns', 'drop', drop_feat),
                                                   ('numeric_processing',numeric_transformer, numeric_features)
                                                   # ,('categorical_processing', categorical_transformer, categorical_features)
                                                  ], remainder='drop')

In [110]:
pipeline = Pipeline([('transform_column', col_transformer)])

In [111]:
pipeline.fit(X_train)

In [112]:
transformed_train=pipeline.transform(X_train)

In [113]:
pd.DataFrame(transformed_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,-0.002690,-0.395310,-0.685986,-0.402170,-0.357491,0.256156,-0.302049,-0.197001,-0.090011,0.273491,...,-0.353736,-0.439820,-0.380333,-0.305879,0.254314,-0.333803,-0.198326,-0.033074,-0.215955,-0.106452
1,-0.006032,-0.053601,0.071799,-0.119161,-0.047332,-0.851981,-0.926021,-0.609685,-0.551513,0.077922,...,-0.102544,-0.320585,-0.163000,-0.089993,-1.082652,-0.827535,-0.841015,-0.873217,-0.549907,-0.869696
2,0.000663,0.203238,0.282872,0.198170,0.186634,-0.385707,0.205946,-0.125778,-0.066315,-0.878533,...,0.250397,0.242970,0.344575,0.243544,-0.603088,0.613255,0.056510,0.340873,-0.408905,-0.081848
3,0.001510,-0.272473,-0.943772,-0.273942,-0.257696,0.721895,-0.115588,-0.250006,-0.114076,-0.417112,...,-0.226550,-1.049494,-0.240088,-0.211072,0.733878,-0.308166,-0.263835,-0.252083,-0.042301,0.015566
4,0.000037,-0.375209,-0.497405,-0.350684,-0.352617,0.657655,0.299100,0.071320,0.198025,0.282659,...,-0.355326,-0.610799,-0.355953,-0.332150,0.348774,-0.032304,-0.019077,0.109504,-0.033395,-0.298268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,-0.004930,1.451703,1.076990,1.428317,1.658158,0.266863,0.952725,1.849754,1.482618,1.248281,...,1.232909,0.953881,1.194404,1.484518,-0.159855,0.834765,1.194277,0.729452,0.735436,0.222948
394,-0.004609,-0.567281,-0.679066,-0.583826,-0.488327,-0.994914,-0.716426,-0.398265,-0.483398,-1.205500,...,-0.465024,-0.492688,-0.452081,-0.388447,-0.882834,-0.517882,-0.496536,-0.645606,-0.756215,-0.628672
395,-0.005795,0.873255,0.973183,0.903748,0.984479,0.566649,1.326890,1.023184,0.831228,0.407945,...,1.342607,0.980877,1.261740,1.634643,0.966394,2.646327,1.515702,0.894574,0.747310,2.695958
396,10.861553,0.165271,0.407439,0.113333,0.148153,-0.311296,-0.436035,-0.474721,-0.279770,-0.844920,...,0.072337,0.582677,0.023452,0.059152,-0.116258,-0.369183,-0.623378,-0.195724,-0.866048,-0.202862


In [114]:
pipeline.transform(X_test)

array([[ 3.03425610e-05,  3.12674484e-01, -8.50346021e-01, ...,
         1.12470646e-01,  2.59740260e-02,  5.96033141e-01],
       [-1.02046136e-01, -2.68006700e-02, -4.06574394e-02, ...,
         1.08144852e+00,  1.61855288e+00,  1.89254331e+00],
       [-1.82179721e-03, -7.59352317e-02, -4.97404844e-01, ...,
         6.10554938e-02,  8.46753247e-01,  7.84835551e-01],
       ...,
       [ 1.04089511e+01,  5.56113903e-01, -1.73875433e-01, ...,
         8.06575207e-01,  1.45825603e+00,  1.23474768e+00],
       [ 9.95358614e-01, -9.38023451e-02, -4.64532872e-01, ...,
        -4.11568409e-01,  1.17922078e+00, -4.81546573e-01],
       [-3.73064274e-06, -4.75711893e-01,  4.35121107e-01, ...,
        -5.57310592e-01,  2.47124304e-01, -4.50916395e-01]])

In [115]:
pipeline = Pipeline([
                     ('transform_column', col_transformer),
                     ('logistics', RandomForestClassifier())
                    ])

In [116]:
pipeline.fit(X_train, y_train)

## Score

In [117]:
pipeline.score(X_test, y_test)

0.9649122807017544