In [1]:
#                                                                 ColumnTransformer
# Tool to apply different preprocessing transformations to different columns within a dataset.

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Load the toy COVID dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('csv files/covid_toy.csv')

In [8]:
# Preview the loaded frame (the rendered display elides the middle rows).
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [9]:
# Missing values per column: only `fever` has any (10 rows), so it needs imputation.
df.isna().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [10]:
# Distribution of `cough`: two levels (Mild, Strong), later ordinal-encoded in that order.
df['cough'].value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),  # features: every column except the target
    df['has_covid'],                 # target labels
    test_size=0.2,
    random_state=42,
)

In [13]:
# Training features: the target column `has_covid` has been removed.
X_train

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi
69,73,Female,103.0,Mild,Delhi
...,...,...,...,...,...
60,24,Female,102.0,Strong,Bangalore
71,75,Female,104.0,Strong,Delhi
14,51,Male,104.0,Mild,Bangalore
92,82,Female,102.0,Strong,Kolkata


In [14]:
# Training labels (`has_covid`); the index matches the rows of X_train.
y_train

55    Yes
88     No
26    Yes
42    Yes
69     No
     ... 
60    Yes
71     No
14     No
92     No
51    Yes
Name: has_covid, Length: 80, dtype: object

In [15]:
# age and fever ---> numerical columns, fever has missing values  ---> SimpleImputer
# gender and city ---> nominal categorical columns  ---> One Hot Encoding, OHE
# cough --> ordinal categorical column ---> ordinal encoding (it's a feature column)
# has_covid ---> nominal categorical column ---> label encoding (it's a target column)

In [16]:
from sklearn.compose import ColumnTransformer 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [24]:
# ColumnTransformer(transformers=[...], remainder='passthrough' | 'drop')
# Each transformer is a (name, estimator, columns) tuple applied only to the listed columns.
column_transformer_obj = ColumnTransformer(
    transformers=[
        # fever: fill the missing readings (SimpleImputer's default strategy is the column mean).
        ('tnf1', SimpleImputer(), ['fever']),
        # cough: ordinal encoding with an explicit order, Mild -> 0, Strong -> 1.
        # `categories` expects a list of lists: one ordered list per encoded column.
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # gender, city: one-hot encoding.
        #   sparse_output=False -> dense NumPy array (every element stored explicitly);
        #   sparse_output=True  -> sparse matrix (only non-zero elements stored).
        #   drop='first' removes the first category of each column to avoid
        #   multicollinearity among the dummy columns.
        ('tnf3', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    # Columns not listed above (here: age) are left as they are; 'drop' would remove them.
    remainder='passthrough',
)

In [22]:
# Shape check: 80 training rows x 7 columns
# (fever, cough, 1 gender dummy, 3 city dummies, passthrough age).
column_transformer_obj.fit_transform(X_train).shape

(80, 7)

In [23]:
# Fit on the training data and show the transformed array.
# Column order: fever (imputed), cough (0 = Mild, 1 = Strong), gender_Male,
# city_Delhi, city_Kolkata, city_Mumbai, age (passthrough, appended last).
# NOTE(review): fit_transform re-fits on every call; held-out data such as X_test
# should go through .transform() on the already-fitted object instead.
column_transformer_obj.fit_transform(X_train)

array([[101.,   0.,   0.,   0.,   0.,   1.,  81.],
       [100.,   0.,   0.,   0.,   1.,   0.,   5.],
       [100.,   0.,   0.,   0.,   1.,   0.,  19.],
       [100.,   0.,   1.,   1.,   0.,   0.,  27.],
       [103.,   0.,   0.,   1.,   0.,   0.,  73.],
       [103.,   1.,   1.,   0.,   1.,   0.,  70.],
       [102.,   0.,   0.,   1.,   0.,   0.,  49.],
       [101.,   1.,   0.,   0.,   1.,   0.,  51.],
       [101.,   0.,   0.,   1.,   0.,   0.,  64.],
       [101.,   0.,   0.,   0.,   1.,   0.,  83.],
       [ 98.,   0.,   0.,   0.,   0.,   1.,  65.],
       [104.,   0.,   0.,   0.,   0.,   0.,  18.],
       [103.,   0.,   0.,   0.,   0.,   0.,  16.],
       [104.,   0.,   1.,   0.,   1.,   0.,  16.],
       [100.,   0.,   1.,   0.,   1.,   0.,  27.],
       [101.,   0.,   0.,   0.,   0.,   0.,  84.],
       [104.,   0.,   1.,   0.,   1.,   0.,  51.],
       [102.,   0.,   0.,   0.,   0.,   0.,  69.],
       [102.,   1.,   0.,   0.,   0.,   0.,  82.],
       [103.,   0.,   0.,   0.,

In [20]:
# from sklearn.compose import ColumnTransformer
# syntax: column_transformer_obj = ColumnTransformer(transformers, remainder)
# transformers ---> list of tuples, remainder ---> 'passthrough' or 'drop'
# each tuple has 3 elements: ('given_identification', transformer_instance(), ['name_of_column(s)_of_interest'])