In [88]:
import numpy as np  # linear algebra
from sklearn.compose import ColumnTransformer

import pandas as pd  # data processing
import matplotlib.pyplot as plt
import seaborn as sns

In [89]:
df = pd.read_csv('./reference/100_day_ml_source_code/day28-column-transformer/covid_toy.csv')
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [90]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
# 
from sklearn.compose import ColumnTransformer

SimpleImputer : fever
StandardScaler : age
OrdinalEncoder : cough

In [91]:
## Fill Missing values 
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['fever']] = imp_mean.fit_transform(df[['fever']])
df[['fever']]

Unnamed: 0,fever
0,103.0
1,100.0
2,101.0
3,98.0
4,101.0
...,...
95,104.0
96,101.0
97,101.0
98,98.0


In [92]:
scaler = StandardScaler()
df[['age']] = scaler.fit_transform(df[['age']])
df['age']

0     0.637467
1    -0.695639
2    -0.089682
3    -0.534050
4     0.839453
        ...   
95   -1.301596
96    0.273893
97   -0.978419
98   -1.584376
99   -1.382390
Name: age, Length: 100, dtype: float64

In [93]:
encoder = OrdinalEncoder(categories=[['Mild', 'Strong']])
df[['cough_encoded']] = encoder.fit_transform(df[['cough']])
df[['cough', 'cough_encoded']]

Unnamed: 0,cough,cough_encoded
0,Mild,0.0
1,Mild,0.0
2,Mild,0.0
3,Mild,0.0
4,Mild,0.0
...,...,...
95,Mild,0.0
96,Strong,1.0
97,Mild,0.0
98,Strong,1.0


### Column Transformer

In [102]:
transformer = ColumnTransformer(transformers=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean'), ['age']),
    ('scaler', StandardScaler(), ['age']),
    ('encoder', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
])

transformer.fit_transform(df)

array([[ 0.63746697,  0.63746697,  0.        ],
       [-0.69563886, -0.69563886,  0.        ],
       [-0.08968166, -0.08968166,  0.        ],
       [-0.53405027, -0.53405027,  0.        ],
       [ 0.8394527 ,  0.8394527 ,  0.        ],
       [ 1.60699847,  1.60699847,  0.        ],
       [-1.22080176, -1.22080176,  1.        ],
       [-0.97841888, -0.97841888,  1.        ],
       [-1.01881603, -1.01881603,  1.        ],
       [ 0.79905555,  0.79905555,  0.        ],
       [ 1.24342416,  1.24342416,  0.        ],
       [ 0.8394527 ,  0.8394527 ,  0.        ],
       [-0.77643315, -0.77643315,  1.        ],
       [ 0.79905555,  0.79905555,  0.        ],
       [ 0.27389265,  0.27389265,  0.        ],
       [ 1.04143843,  1.04143843,  1.        ],
       [ 1.00104128,  1.00104128,  0.        ],
       [-0.17047596, -0.17047596,  1.        ],
       [ 0.79905555,  0.79905555,  0.        ],
       [-0.08968166, -0.08968166,  1.        ],
       [-1.30159605, -1.30159605,  1.   

In [111]:
transformer.transformers_

[('imputer', SimpleImputer(), ['age']),
 ('scaler', StandardScaler(), ['age']),
 ('encoder', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
 ('remainder', 'drop', [1, 2, 4, 5, 6])]