In [103]:
import numpy as np
import pandas as pd
import seaborn as sns

In [104]:
# Load seaborn's bundled 'diamonds' example dataset into a DataFrame.
df = sns.load_dataset(name='diamonds')

In [105]:
# Show the loaded frame as the cell's output (bare last expression).
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [106]:
# List the column labels of the loaded frame.
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

### . Separating categorical and continuous columns

In [107]:
# Target is the 'price' column; every other column is a feature.
y = df['price']
x = df.drop(columns='price')

# Split the features into the three categorical columns and the rest.
xCat = x[['cut', 'color', 'clarity']]
xCont = x.drop(columns=['cut', 'color', 'clarity'])

In [108]:
import sklearn
from sklearn.model_selection import train_test_split

In [109]:
# Hold out 32% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    random_state=80,
    test_size=0.32,
)

In [110]:
# Show only the feature columns whose dtype is object, bool or category.
x.select_dtypes(
    include=['object', 'bool', 'category'],
)

Unnamed: 0,cut,color,clarity
0,Ideal,E,SI2
1,Premium,E,SI1
2,Good,E,VS1
3,Premium,I,VS2
4,Good,J,SI2
...,...,...,...
53935,Ideal,D,SI1
53936,Good,D,SI1
53937,Very Good,D,SI1
53938,Premium,H,SI2


### . Assigning nominal and ordinal columns 

In [111]:
# Categorical columns routed to the one-hot (nominal) pipeline.
CatcolsNominal = [
    'color',
    'clarity',
]

# Categorical columns routed to the ordinal-encoding pipeline.
Catcolsordinal = [
    'cut',
]

# Numeric columns routed to the scaling / power-transform pipeline.
Contcols = [
    'carat',
    'depth',
    'table',
    'x',
    'y',
    'z',
]

In [112]:
import sklearn
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,RobustScaler,PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


### . NominalPipeline

In [113]:
# One-hot encode the nominal columns into a dense array; drop='first' omits
# the first category of each feature from the output.
CatNominalPipeline = Pipeline(
    [
        (
            'OneHotEncoding',
            OneHotEncoder(drop='first', sparse_output=False),
        ),
    ]
)

In [114]:
# Count how many rows fall into each 'cut' category.
df.loc[:, 'cut'].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

### . OrdinalPipeline

In [115]:
# Encode 'cut' as integer codes. With an explicit category list, each
# category's code is its position in that list ('Ideal' -> 0 ... 'Fair' -> 4).
# NOTE(review): the list matches the frequency order printed by
# value_counts(); presumably it is also intended as best-to-worst cut
# quality -- confirm the intended direction.
CatOrdinalPipeline = Pipeline(
    [
        (
            'OrdinalEncoding',
            OrdinalEncoder(
                categories=[
                    ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair'],
                ],
            ),
        ),
    ]
)

### . ContinuousPipeline

In [None]:
# Numeric columns: robust scaling first, then a Yeo-Johnson power transform
# ('yeo-johnson' is PowerTransformer's default method, spelled out here so
# the step name and the configuration visibly agree).
ContPipeline = Pipeline(
    [
        ('RobustScaler', RobustScaler()),
        ('yeojohnson', PowerTransformer(method='yeo-johnson')),
    ]
)

### . ColumnTransformer

In [117]:
# Route each column group to its own pipeline. Any column not named in one
# of the three groups is passed through unchanged.
preColTrans = ColumnTransformer(
    [
        ('catNominalPipeline', CatNominalPipeline, CatcolsNominal),
        ('catOrdinalPipeline', CatOrdinalPipeline, Catcolsordinal),
        ('contPipeline', ContPipeline, Contcols),
    ],
    remainder='passthrough',
)

### .FinalPipeline 

In [None]:
# Top-level pipeline; for now its only step is the column-wise preprocessing.
finalPipeline = Pipeline(
    [
        ('preColTrans', preColTrans),
    ]
)

### . Trainingdata

In [119]:
# Fit the preprocessing on the training features and show the transformed array.
finalPipeline.fit_transform(X=xTrain)

array([[ 0.        ,  1.        ,  0.        , ..., -1.14405054,
        -1.1378171 , -1.10583252],
       [ 0.        ,  0.        ,  1.        , ..., -0.48102811,
        -0.50367236, -0.42008106],
       [ 0.        ,  0.        ,  0.        , ..., -1.22270703,
        -1.20834094, -1.23283044],
       ...,
       [ 0.        ,  0.        ,  0.        , ..., -0.91257274,
        -0.88524668, -0.77137179],
       [ 1.        ,  0.        ,  0.        , ..., -1.37071689,
        -1.44836516, -1.45508817],
       [ 0.        ,  0.        ,  0.        , ...,  0.91398641,
         0.93231297,  0.89612174]])

### .Trainingdata Dataframe 

In [None]:
# Wrap the transformed training array in a DataFrame, labelling columns with
# the pipeline's output feature names and keeping the original row index.
# fit_transform runs first, so the feature names come from the fitted pipeline.
xTrainprocessed = pd.DataFrame(
    data=finalPipeline.fit_transform(xTrain),
    index=xTrain.index,
    columns=finalPipeline.get_feature_names_out(),
)

In [None]:
# Show the processed training features.
xTrainprocessed

In [124]:
# Apply the already-fitted preprocessing to the test features (transform
# only, no refit) and show the resulting array.
finalPipeline.transform(X=xTest)

array([[ 0.        ,  0.        ,  0.        , ...,  1.36621253,
         1.28944448,  1.4265397 ],
       [ 1.        ,  0.        ,  0.        , ..., -0.5218908 ,
        -0.43046525, -0.40385856],
       [ 0.        ,  0.        ,  0.        , ..., -0.54242539,
        -0.51421704, -0.65206475],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.18641863,
         0.23985943,  0.18982884],
       [ 0.        ,  1.        ,  0.        , ..., -0.48102811,
        -0.48264767, -0.37155579],
       [ 0.        ,  0.        ,  0.        , ...,  0.69666697,
         0.7379844 ,  0.65991089]])

### . Testdata Dataframe

In [125]:
# Wrap the transformed test array in a DataFrame, labelling columns with the
# pipeline's output feature names and keeping the original row index.
xTestprocessed = pd.DataFrame(
    data=finalPipeline.transform(xTest),
    index=xTest.index,
    columns=finalPipeline.get_feature_names_out(),
)

In [126]:
# Show the processed test features.
xTestprocessed

Unnamed: 0,catNominalPipeline__color_E,catNominalPipeline__color_F,catNominalPipeline__color_G,catNominalPipeline__color_H,catNominalPipeline__color_I,catNominalPipeline__color_J,catNominalPipeline__clarity_IF,catNominalPipeline__clarity_SI1,catNominalPipeline__clarity_SI2,catNominalPipeline__clarity_VS1,catNominalPipeline__clarity_VS2,catNominalPipeline__clarity_VVS1,catNominalPipeline__clarity_VVS2,catOrdinalPipeline__cut,contPipeline__carat,contPipeline__depth,contPipeline__table,contPipeline__x,contPipeline__y,contPipeline__z
21394,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.398383,0.947348,-2.478706,1.366213,1.289444,1.426540
45561,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.497124,0.587806,-0.097095,-0.521891,-0.430465,-0.403859
46725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,-0.671760,-1.206677,0.761274,-0.542425,-0.514217,-0.652065
33918,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.345456,0.803079,0.355249,-1.416756,-1.509540,-1.343251
40863,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.365084,0.021731,-0.611510,-0.390106,-0.297364,-0.339444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16638,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.852061,-1.072466,0.355249,0.936741,0.910353,0.785671
37296,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.973383,0.161997,-0.611510,-0.978012,-0.941808,-0.927899
681,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169622,-0.186894,-0.611510,0.186419,0.239859,0.189829
42830,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,-0.430252,0.875142,1.133835,-0.481028,-0.482648,-0.371556


### . Diagram representation

In [129]:
# Display the pipeline object itself as the cell's output.
finalPipeline